import re

from lxml import etree

# Identity stylesheet serialised with the XSLT "html" output method.  The
# html output method is what makes the serialiser emit a
# <meta http-equiv="Content-Type"> element in <head> (see tostring()).
# NOTE(review): the stylesheet body was missing from this file (the literal
# was blank, which makes etree.XML() fail at import time); this is a
# reconstruction of a minimal identity transform -- confirm against history.
html_xsl = """
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <xsl:output method="html" encoding="UTF-8" />
  <xsl:template match="/">
    <xsl:copy-of select="." />
  </xsl:template>
</xsl:transform>
"""

# TODO: this should be xsl for real formatting
pretty_html_xsl = html_xsl

# Compile both stylesheets once at import time; the compiled transforms are
# reused by every tostring() call.
html_transform = etree.XSLT(etree.XML(html_xsl))
pretty_html_transform = etree.XSLT(etree.XML(pretty_html_xsl))


def tostring(doc, pretty=False, doctype_pair=None):
    """
    Return the HTML string representation of the given document.

    doc          -- lxml element or tree to serialise.
    pretty       -- when true, serialise through pretty_html_transform
                    (currently the same stylesheet as the plain one).
    doctype_pair -- optional sequence whose first two items are the public
                    and system identifiers of a DOCTYPE declaration to
                    prepend to the output; nothing is prepended when it is
                    None or empty.

    note: this will create a meta http-equiv="Content" tag in the head
    and may replace any that are present
    """
    transform = pretty_html_transform if pretty else html_transform
    html = str(transform(doc))

    if doctype_pair:
        # One placeholder per identifier plus one for the document itself;
        # the previous format string had a single placeholder for three
        # arguments and raised TypeError.
        html = '<!DOCTYPE html PUBLIC "%s" "%s">\n%s' % (
            doctype_pair[0],
            doctype_pair[1],
            html,
        )

    return html


# Disabled charset sniffing patterns, kept for reference (see
# decodeAndParseHTML).  The named group is "charset", which is what the
# disabled code reads back with m.group('charset').
#HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*charset\s*=\s*(?P<charset>[\w-]+)([^\>])*\>",re.I|re.M)
#OTHER_HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*charset\s*=\s*(?P<charset>[\w-]+)([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*\>",re.I|re.M)


def decodeAndParseHTML(text):
    """
    Parse text as HTML with etree.HTML and return the resulting root
    element.

    The original intent was: if an html meta tag specifying a charset can
    be matched, decode the text to a python unicode string before parsing.

    XXX - this is disabled and in camelCase for no good reason

    Raises AssertionError if the parser returns None instead of an element.
    """
    # Disabled charset detection, kept for reference:
    # m = HTTP_EQUIV_MATCHER_PAT.search(text)
    # if not m:
    #     m = OTHER_HTTP_EQUIV_MATCHER_PAT.search(text)
    #
    # if m:
    #     charset = m.group('charset')
    #     text = text.decode(charset)

    content = etree.HTML(text)
    # Kept as an assert so the exception type seen by callers is unchanged.
    assert content is not None, "etree.HTML() returned no document"
    return content