from lxml import etree
import re
# Identity stylesheet: copies the input document through unchanged and
# leaves the HTML-specific serialization to the XSLT "html" output method.
# NOTE(review): minimal stylesheet -- confirm the output settings
# (encoding, indentation) are the ones callers expect.
html_xsl = """
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
  <xsl:output method="html" encoding="UTF-8" />
  <xsl:template match="/">
    <xsl:copy-of select="." />
  </xsl:template>
</xsl:transform>
"""
# TODO: this should be xsl for real formatting
pretty_html_xsl = html_xsl
# Compile both stylesheets once at import time; tostring() reuses them.
html_transform = etree.XSLT(etree.XML(html_xsl))
pretty_html_transform = etree.XSLT(etree.XML(pretty_html_xsl))
def tostring(doc, pretty=False, doctype_pair=None):
    """
    return HTML string representation of the document given

    doc is run through the module-level XSLT transform (the "pretty"
    one when pretty is true) and the result is converted with str().

    doctype_pair, when given, is indexed as (public id, system id) and
    a matching <!DOCTYPE html PUBLIC ...> line is prepended to the
    serialized document.

    note: this will create a meta http-equiv="Content-Type" tag in the
    head and may replace any that are present
    """
    transform = pretty_html_transform if pretty else html_transform
    html = str(transform(doc))
    if doctype_pair:
        # One placeholder per argument: public id, system id, document.
        html = '<!DOCTYPE html PUBLIC "%s" "%s">\n%s' % (
            doctype_pair[0], doctype_pair[1], html)
    return html
#HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*charset\s*=\s*(?P<charset>[\w-]+)([^\>])*\>",re.I|re.M)
#OTHER_HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*charset\s*=\s*(?P<charset>[\w-]+)([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*\>",re.I|re.M)
def decodeAndParseHTML(text):
    """
    parse text with etree.HTML and return the result

    The intended behaviour was: if an html meta tag specifying a
    charset can be matched, decode the text to a python unicode string
    before parsing.
    XXX - that decoding step is disabled (kept below as comments), so
    text is handed to the parser as-is; the name is in camelCase for
    no good reason.

    raises AssertionError if etree.HTML returns None
    """
    # m = HTTP_EQUIV_MATCHER_PAT.search(text)
    # if not m:
    #     m = OTHER_HTTP_EQUIV_MATCHER_PAT.search(text)
    #
    # if m:
    #     charset = m.group('charset')
    #     text = text.decode(charset)
    content = etree.HTML(text)
    if content is None:
        # Raised explicitly instead of via `assert` so the check still
        # runs under `python -O`; the exception type is unchanged.
        raise AssertionError("etree.HTML() returned None for the given text")
    return content