#! /usr/bin/python3 import optparse import xml.dom.minidom import xml.etree.ElementTree import re quotes_re = re.compile(r'"([^"]+)"') dots_re = re.compile(r'\.\.\.+') crap_re = re.compile(r'

.*

', re.DOTALL) tag_re = re.compile(r'<[^>]+>') def by_class(e, classname): todo = [e] while todo: i = todo.pop(0) if i.get('class') == classname: yield i todo = i.getchildren() + todo def first_by_class(e, classname): for i in by_class(e, classname): return i def table_to_ltx(t): dt = first_by_class(t, 'author') username = dt.text if not username: # Moderators username = dt.getchildren()[-1].tail print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%') print('\\by{%s}' % username) body = first_by_class(t, 'postbody') s = xml.etree.ElementTree.tostring(body) s = s.replace('
', '\n') s = s.replace('', '{\\em ') s = s.replace('', '}') s = s.replace('', '{\\bf ') s = s.replace('', '}') s = crap_re.sub('', s) s = tag_re.sub('', s) s = dots_re.sub('{\ldots}', s) s = quotes_re.sub(r"``\1''", s) print(s) def doc_to_ltx(doc): for e in doc.getiterator('table'): if e.get('class') == 'post': table_to_ltx(e) def main(): p = optparse.OptionParser() (opts, args) = p.parse_args() for a in args: f = open(a, encoding='iso-8859-1') parser = xml.etree.ElementTree.XMLTreeBuilder() parser.entity.update(nbsp=" ", rsaquo=">", lsquo="`", rsquo="'", ldquo="``", rdquo="''", hellip="{\\ldots}", ndash="---", mdash="---", iexcl="{\\!`}", copy="{\\copyright}", ) doc = xml.etree.ElementTree.parse(f, parser) doc_to_ltx(doc) main()