[Lxml-checkins] r45127 - in lxml/branch/html: . doc src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Mon Jul 16 15:12:51 CEST 2007
Author: scoder
Date: Mon Jul 16 15:12:51 2007
New Revision: 45127
Added:
lxml/branch/html/doc/elementsoup.txt
lxml/branch/html/src/lxml/html/ElementSoup.py
Modified:
lxml/branch/html/CHANGES.txt
Log:
BeautifulSoup support
Modified: lxml/branch/html/CHANGES.txt
==============================================================================
--- lxml/branch/html/CHANGES.txt (original)
+++ lxml/branch/html/CHANGES.txt Mon Jul 16 15:12:51 2007
@@ -8,6 +8,8 @@
Features added
--------------
+* HTML tag soup parser based on BeautifulSoup
+
* Entity support through an ``Entity`` factory and element classes. XML
parsers now have a ``resolve_entities`` keyword argument that can be set to
False to keep entities in the document.
Added: lxml/branch/html/doc/elementsoup.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/doc/elementsoup.txt Mon Jul 16 15:12:51 2007
@@ -0,0 +1,47 @@
+====================
+BeautifulSoup Parser
+====================
+
+:Author:
+ Stefan Behnel
+
+BeautifulSoup_ is a Python package that parses broken HTML. While libxml2
+(and thus lxml) can also parse broken HTML, BeautifulSoup is much more
+forgiving and has superior `support for encoding detection`_.
+
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit
+
+lxml can benefit from the parsing capabilities of BeautifulSoup through the
+`lxml.html.ElementSoup` module. It provides two main functions: `parse()` to
+parse a file using BeautifulSoup, and `convert_tree()` to convert a
+BeautifulSoup tree into a list of top-level Elements.
+
+Here is a document full of tag soup, similar to, but not quite like, HTML::
+
+ >>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>'
+
+All you need to do is pass it to the `parse()` function::
+
+ >>> from lxml.html.ElementSoup import parse
+ >>> from StringIO import StringIO
+ >>> root = parse(StringIO(tag_soup))
+
+To see what we have here, you can serialise it::
+
+ >>> from lxml.etree import tostring
+ >>> print tostring(root, pretty_print=True)
+ <html>
+ <meta/>
+ <head>
+ <title>Hello</title>
+ </head>
+ <body onload="crash()">Hi all<p/></body>
+ </html>
+
+Not quite what you'd expect from an HTML page, but, well, it was broken
+already, right? BeautifulSoup did its best, and so now it's a tree.
+
+To control which Element implementation is used, you can pass a
+``makeelement`` factory function to ``parse()``. By default, this is based on
+the HTML parser defined in ``lxml.html``.
Added: lxml/branch/html/src/lxml/html/ElementSoup.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/ElementSoup.py Mon Jul 16 15:12:51 2007
@@ -0,0 +1,94 @@
+__doc__ = """External interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["parse", "convert_tree"]
+
+from lxml import etree, html
+from BeautifulSoup import \
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
+
+
+def parse(file, beautifulsoup=None, makeelement=None):
+    """Parse a file with BeautifulSoup and return the tree as an Element.
+
+    ``file`` is either an object with a ``read`` attribute, which is passed
+    on unchanged, or anything else, which is handed to ``open()`` first.
+    ``beautifulsoup`` is the callable that builds the soup tree (default:
+    ``BeautifulSoup``).  ``makeelement`` is the factory used to create the
+    root Element (default: the one of the ``lxml.html`` parser).
+
+    Returns the ``html`` root Element of the converted document.
+    """
+    if beautifulsoup is None:
+        beautifulsoup = BeautifulSoup
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    if hasattr(file, 'read'):
+        # file-like object: its lifetime is the caller's business
+        tree = beautifulsoup(file)
+    else:
+        # We opened this file ourselves, so we also have to close it again.
+        # NOTE(review): this assumes that the soup factory consumes the file
+        # while constructing the tree (a lazy custom factory would break).
+        opened = open(file)
+        try:
+            tree = beautifulsoup(opened)
+        finally:
+            opened.close()
+    root = _convert_tree(tree, makeelement)
+    # from ET: wrap the document in a html root element, if necessary
+    if len(root) == 1 and root[0].tag == "html":
+        return root[0]
+    root.tag = "html"
+    return root
+
+def convert_tree(beautiful_soup_tree, makeelement=None):
+    """Convert a BeautifulSoup tree into a list of top-level Elements.
+
+    The tree is converted below a temporary root Element that is created
+    with ``makeelement`` (default: the factory of the ``lxml.html`` parser).
+    The converted children are then removed from that root and returned
+    as a list.
+    """
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    container = _convert_tree(beautiful_soup_tree, makeelement)
+    top_level = container.getchildren()
+    # unlink the results from the temporary container before handing them out
+    for element in top_level:
+        container.remove(element)
+    return top_level
+
+
+# helpers
+
+def _convert_tree(beautiful_soup_tree, makeelement):
+    """Create an Element for a soup node and convert the node's content.
+
+    The Element is built by ``makeelement`` from the node's name and
+    attributes; its children are filled in by ``_convert_children()``.
+    """
+    tag_name = beautiful_soup_tree.name
+    attributes = dict(beautiful_soup_tree.attrs)
+    element = makeelement(tag_name, attrib=attributes)
+    _convert_children(element, beautiful_soup_tree, makeelement)
+    return element
+
+def _convert_children(parent, beautiful_soup_tree, makeelement):
+    """Recursively append the content of a soup node to ``parent``.
+
+    Tags become sub-elements with entity-unescaped attribute values,
+    comments and processing instructions become the corresponding lxml
+    nodes, and all other strings are unescaped and stored as text.
+    ``makeelement`` is only passed along through the recursion.
+    """
+    SubElement = etree.SubElement
+    # Last node appended to ``parent``.  Text that follows it belongs into
+    # its tail; while it is None, text goes into ``parent.text``.
+    et_child = None
+    for child in beautiful_soup_tree:
+        if isinstance(child, Tag):
+            et_child = SubElement(parent, child.name, attrib=dict(
+                [(k, unescape(v)) for (k, v) in child.attrs]))
+            _convert_children(et_child, child, makeelement)
+        elif type(child) is NavigableString:
+            # Exact type check on purpose -- presumably the other string
+            # kinds handled below are subclasses of NavigableString.
+            _append_text(parent, et_child, unescape(unicode(child)))
+        elif isinstance(child, Comment):
+            # Track the comment as the last sibling so that text following
+            # it is not attached in front of it.
+            et_child = etree.Comment(child.string)
+            parent.append(et_child)
+        elif isinstance(child, ProcessingInstruction):
+            # "target data" -> (target, data); a bare target stays alone
+            et_child = etree.ProcessingInstruction(
+                *child.string.split(' ', 1))
+            parent.append(et_child)
+        else:  # CData
+            _append_text(parent, et_child, unescape(unicode(child)))
+
+def _append_text(parent, element, text):
+    """Append ``text`` behind ``element``, or to ``parent`` if there is none.
+
+    With an ``element``, the text is added to its ``tail``; with ``None``,
+    it is added to the ``text`` of ``parent``.  A missing (None) previous
+    value is treated as the empty string.
+    """
+    if element is not None:
+        element.tail = (element.tail or '') + text
+        return
+    parent.text = (parent.text or '') + text
+
+
+# copied from ET's ElementSoup
+
+import htmlentitydefs, re
+
+# Substitution function of the pattern that matches named entity references
+# of the form "&name;"; the entity name is captured as group 1.
+handle_entities = re.compile("&(\w+);").sub
+
+# Mapping of entity names to code points, taken from the standard library
+# where it is available.
+name2codepoint = getattr(htmlentitydefs, 'name2codepoint', None)
+if name2codepoint is None:
+    # Emulate name2codepoint for Python 2.2 and earlier
+    name2codepoint = {}
+    for entity_name, definition in htmlentitydefs.entitydefs.items():
+        if len(definition) == 1:
+            # the definition is the character itself
+            codepoint = ord(definition)
+        else:
+            # presumably a character reference like "&#NNN;": drop the
+            # first two and the last character and keep the number
+            codepoint = int(definition[2:-1])
+        name2codepoint[entity_name] = codepoint
+
+def unescape(string):
+    """Replace known named entities in ``string`` by their characters.
+
+    Entity references whose name is not in ``name2codepoint`` are kept
+    as they are.
+    """
+    # work around oddities in BeautifulSoup's entity handling
+    def replace_entity(match):
+        entity_name = match.group(1)
+        if entity_name in name2codepoint:
+            return unichr(name2codepoint[entity_name])
+        return match.group(0)  # use as is
+    return handle_entities(replace_entity, string)
More information about the lxml-checkins
mailing list