horrors2

Awful horror fiction
git clone https://git.woozle.org/neale/horrors2.git

cruft  ·  2009-07-09

sa2ltx.py

 1#! /usr/bin/env python3
 2
 3import optparse
 4import xml.etree.ElementTree
 5import re
 6
 7quotes_re = re.compile(r'"([^"]+)"')
 8dots_re = re.compile(r'\.\.\.+')
 9crap_re = re.compile(r'<p class="editedby">.*</p>', re.DOTALL)
10tag_re = re.compile(r'<[^>]+>')
11
def by_class(e, classname):
    """Yield every element in the tree rooted at *e* with the given class.

    The root itself is considered, followed by all of its descendants in
    document (depth-first, pre-order) order.

    Args:
        e: Root element of the subtree to search.
        classname: Exact value the element's ``class`` attribute must have.

    Yields:
        Each matching element, in document order.
    """
    # Element.iter() walks the element and its descendants in document order.
    # It replaces the manual work list built on Element.getchildren(), which
    # no longer exists in Python 3.9+.
    for node in e.iter():
        if node.get('class') == classname:
            yield node
19
def first_by_class(e, classname):
    """Return the first element under *e* (inclusive) with the given class.

    Elements are examined in the order by_class() produces them.  Returns
    None when nothing matches.
    """
    return next(by_class(e, classname), None)
23
def table_to_ltx(t):
    """Print one post as LaTeX on standard output.

    Writes a separator line of '%' characters, a ``by`` macro naming the
    author, and then the post body with its markup converted: line breaks
    become newlines, italic/bold become ``em``/``bf`` groups, the
    "editedby" paragraph and all remaining tags are dropped, runs of three
    or more periods become ``ldots``, and double-quoted text is wrapped in
    LaTeX-style quotes.

    Args:
        t: Element for a single post.  It must contain a descendant with
            class 'author' and one with class 'postbody'; if either is
            missing, an AttributeError is raised.
    """
    dt = first_by_class(t, 'author')

    username = dt.text
    if not username:
        # Moderators: the name is the tail text after the last child element.
        # Indexing the element replaces getchildren(), removed in Python 3.9.
        username = dt[-1].tail
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print('\\by{%s}' % username)

    body = first_by_class(t, 'postbody')

    # encoding='unicode' makes tostring() return str.  The default returns
    # bytes, which the str-based replace() calls below cannot operate on.
    s = xml.etree.ElementTree.tostring(body, encoding='unicode')
    s = s.replace('<br />', '\n')
    s = s.replace('<i>', '{\\em ')
    s = s.replace('</i>', '}')
    s = s.replace('<b>', '{\\bf ')
    s = s.replace('</b>', '}')
    s = crap_re.sub('', s)
    s = tag_re.sub('', s)
    # The backslash is doubled because a bare '\l' is a bad escape in a
    # replacement template; the emitted text is still {\ldots}.
    s = dots_re.sub(r'{\\ldots}', s)
    s = quotes_re.sub(r"``\1''", s)
    print(s)
47
def doc_to_ltx(doc):
    """Convert every post in *doc* to LaTeX, in document order.

    Args:
        doc: Parsed document (an ElementTree or Element).  Each ``table``
            element whose ``class`` attribute is 'post' is passed to
            table_to_ltx(); all other tables are skipped.
    """
    # iter() replaces getiterator(), which was removed in Python 3.9.
    for e in doc.iter('table'):
        if e.get('class') == 'post':
            table_to_ltx(e)
52
def main():
    """Convert each file named on the command line to LaTeX on stdout.

    Every positional argument is opened as ISO-8859-1 text, parsed as XML
    with a table of extra named entities mapped to LaTeX-friendly text, and
    handed to doc_to_ltx().
    """
    p = optparse.OptionParser()
    (opts, args) = p.parse_args()

    for a in args:
        # The with-block closes each input file once it has been parsed.
        with open(a, encoding='iso-8859-1') as f:
            # XMLParser replaces the XMLTreeBuilder alias, which was removed
            # in Python 3.9.
            parser = xml.etree.ElementTree.XMLParser()
            # NOTE(review): presumably the input pages carry a DOCTYPE; the
            # underlying parser may reject undefined entities outright
            # otherwise, before this table is consulted -- confirm.
            parser.entity.update(nbsp=" ",
                                 lsaquo="<", rsaquo=">",
                                 lsquo="`",  rsquo="'",
                                 ldquo="``", rdquo="''",
                                 hellip="{\\ldots}",
                                 ndash="---",
                                 mdash="---",
                                 iexcl="{\\!`}",
                                 copy="{\\copyright}",
                                 eacute="\\'e",
                                 ccedil="\\,c",
                                 )
            doc = xml.etree.ElementTree.parse(f, parser)
        doc_to_ltx(doc)
74
# Run the conversion only when executed as a script, so that importing this
# module does not parse sys.argv or produce output.
if __name__ == '__main__':
    main()