cruft
·
2009-07-09
sa2ltx.py
1#! /usr/bin/env python3
2
3import optparse
4import xml.etree.ElementTree
5import re
6
# A straight-double-quoted span; rewritten as LaTeX ``...'' quotes.
quotes_re = re.compile(r'"([^"]+)"')
# A run of three or more dots; rewritten as {\ldots}.
dots_re = re.compile(r'\.\.\.+')
# The "edited by" paragraph, which is dropped from the output.  The match is
# non-greedy so it stops at that paragraph's own </p>; a greedy match would
# run on to the last </p> in the post and delete everything in between.
crap_re = re.compile(r'<p class="editedby">.*?</p>', re.DOTALL)
# Any markup tag; used to strip whatever tags are left after conversion.
tag_re = re.compile(r'<[^>]+>')
11
def by_class(e, classname):
    """Yield each element in the subtree rooted at *e* whose class matches.

    The walk is depth-first in document order and includes *e* itself.  An
    element matches when its ``class`` attribute is exactly *classname*.
    """
    # Explicit stack rather than recursion.  Children are pushed in reverse
    # so that they are popped (and visited) in document order.  list(node)
    # replaces Element.getchildren(), which was removed in Python 3.9.
    stack = [e]
    while stack:
        node = stack.pop()
        if node.get('class') == classname:
            yield node
        stack.extend(reversed(list(node)))
19
def first_by_class(e, classname):
    """Return the first element by_class() finds for *classname* under *e*.

    Returns None when no element matches.
    """
    return next(by_class(e, classname), None)
23
def table_to_ltx(t):
    r"""Print one post as LaTeX on stdout.

    *t* is the element for a single post.  The output is a line of ``%``
    characters, a ``\by{username}`` line, and then the post body with
    ``<br />`` turned into newlines, ``<i>``/``<b>`` turned into
    ``{\em ...}``/``{\bf ...}``, the "edited by" paragraph removed, all other
    tags stripped, dot runs replaced by ``{\ldots}`` and straight double
    quotes replaced by LaTeX quotes.

    Raises AttributeError if *t* has no element of class ``author`` or
    ``postbody``, since first_by_class() returns None in that case.
    """
    dt = first_by_class(t, 'author')

    username = dt.text
    if not username:
        # Moderators: the author cell has no leading text, so the name is
        # taken from the tail text after its last child element.
        # list(dt) replaces Element.getchildren(), removed in Python 3.9.
        username = list(dt)[-1].tail
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print('\\by{%s}' % username)

    body = first_by_class(t, 'postbody')

    # encoding='unicode' makes tostring() return str.  The default returns
    # bytes on Python 3, which the str-based substitutions below reject.
    s = xml.etree.ElementTree.tostring(body, encoding='unicode')
    s = s.replace('<br />', '\n')
    s = s.replace('<i>', '{\\em ')
    s = s.replace('</i>', '}')
    s = s.replace('<b>', '{\\bf ')
    s = s.replace('</b>', '}')
    s = crap_re.sub('', s)
    s = tag_re.sub('', s)
    # A callable replacement is inserted literally.  As a template string,
    # "\l" would be parsed as an escape, and re raises an error for unknown
    # letter escapes in replacement templates.
    s = dots_re.sub(lambda m: '{\\ldots}', s)
    s = quotes_re.sub(r"``\1''", s)
    print(s)
47
def doc_to_ltx(doc):
    """Print every post found in the parsed document *doc* as LaTeX.

    Visits all ``table`` elements in document order and hands those whose
    ``class`` attribute is ``post`` to table_to_ltx().
    """
    # iter() replaces getiterator(), which was removed in Python 3.9.
    for e in doc.iter('table'):
        if e.get('class') == 'post':
            table_to_ltx(e)
52
def main(argv=None):
    """Convert each input file named in the arguments to LaTeX on stdout.

    *argv* is an optional list of command-line arguments; when None,
    optparse falls back to ``sys.argv[1:]``.  Every positional argument is
    opened as an ISO-8859-1 text file, parsed as XML and passed to
    doc_to_ltx().
    """
    p = optparse.OptionParser()
    (_opts, args) = p.parse_args(argv)

    for a in args:
        # XMLParser replaces XMLTreeBuilder, which was removed in Python 3.9.
        # A fresh parser is needed per file; a parser cannot be reused.
        parser = xml.etree.ElementTree.XMLParser()
        # Map the HTML named entities used in the input to LaTeX or plain
        # text replacements so the XML parser can resolve them.
        parser.entity.update(nbsp=" ",
                             lsaquo="<", rsaquo=">",
                             lsquo="`", rsquo="'",
                             ldquo="``", rdquo="''",
                             hellip="{\\ldots}",
                             ndash="---",
                             mdash="---",
                             iexcl="{\\!`}",
                             copy="{\\copyright}",
                             eacute="\\'e",
                             ccedil="\\,c",
                             )
        # The with-block closes the file once parsing has finished.
        with open(a, encoding='iso-8859-1') as f:
            doc = xml.etree.ElementTree.parse(f, parser)
        doc_to_ltx(doc)
74
# Run the converter only when executed as a script, so that importing this
# module does not parse the command line or produce output.
if __name__ == '__main__':
    main()