Horrors2/sa2ltx.py

77 lines
2.1 KiB
Python
Raw Normal View History

2009-07-09 01:03:38 -06:00
#! /usr/bin/python3
import optparse
import xml.dom.minidom
import xml.etree.ElementTree
import re
quotes_re = re.compile(r'"([^"]+)"')
dots_re = re.compile(r'\.\.\.+')
crap_re = re.compile(r'<p class="editedby">.*</p>', re.DOTALL)
tag_re = re.compile(r'<[^>]+>')
def by_class(e, classname):
todo = [e]
while todo:
i = todo.pop(0)
if i.get('class') == classname:
yield i
todo = i.getchildren() + todo
def first_by_class(e, classname):
for i in by_class(e, classname):
return i
def table_to_ltx(t):
dt = first_by_class(t, 'author')
username = dt.text
if not username:
# Moderators
username = dt.getchildren()[-1].tail
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
print('\\by{%s}' % username)
body = first_by_class(t, 'postbody')
s = xml.etree.ElementTree.tostring(body)
s = s.replace('<br />', '\n')
s = s.replace('<i>', '{\\em ')
s = s.replace('</i>', '}')
s = s.replace('<b>', '{\\bf ')
s = s.replace('</b>', '}')
s = crap_re.sub('', s)
s = tag_re.sub('', s)
s = dots_re.sub('{\ldots}', s)
s = quotes_re.sub(r"``\1''", s)
print(s)
def doc_to_ltx(doc):
for e in doc.getiterator('table'):
if e.get('class') == 'post':
table_to_ltx(e)
def main():
p = optparse.OptionParser()
(opts, args) = p.parse_args()
for a in args:
f = open(a, encoding='iso-8859-1')
parser = xml.etree.ElementTree.XMLTreeBuilder()
parser.entity.update(nbsp=" ",
rsaquo=">",
lsquo="`",
rsquo="'",
ldquo="``",
rdquo="''",
hellip="{\\ldots}",
ndash="---",
mdash="---",
iexcl="{\\!`}",
copy="{\\copyright}",
)
doc = xml.etree.ElementTree.parse(f, parser)
doc_to_ltx(doc)
main()