[Lxml-checkins] r45127 - in lxml/branch/html: . doc src/lxml/html

scoder at codespeak.net scoder at codespeak.net
Mon Jul 16 15:12:51 CEST 2007


Author: scoder
Date: Mon Jul 16 15:12:51 2007
New Revision: 45127

Added:
   lxml/branch/html/doc/elementsoup.txt
   lxml/branch/html/src/lxml/html/ElementSoup.py
Modified:
   lxml/branch/html/CHANGES.txt
Log:
BeautifulSoup support

Modified: lxml/branch/html/CHANGES.txt
==============================================================================
--- lxml/branch/html/CHANGES.txt	(original)
+++ lxml/branch/html/CHANGES.txt	Mon Jul 16 15:12:51 2007
@@ -8,6 +8,8 @@
 Features added
 --------------
 
+* HTML tag soup parser based on BeautifulSoup
+
 * Entity support through an ``Entity`` factory and element classes. XML
   parsers now have a ``resolve_entities`` keyword argument that can be set to
   False to keep entities in the document.

Added: lxml/branch/html/doc/elementsoup.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/doc/elementsoup.txt	Mon Jul 16 15:12:51 2007
@@ -0,0 +1,47 @@
+====================
+BeautifulSoup Parser
+====================
+
+:Author:
+  Stefan Behnel
+
+BeautifulSoup_ is a Python package that parses broken HTML.  While libxml2
+(and thus lxml) can also parse broken HTML, BeautifulSoup is much more
+forgiving and has superior `support for encoding detection`_.
+
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit
+
+lxml can benefit from the parsing capabilities of BeautifulSoup through the
+`lxml.html.ElementSoup` module.  It provides two main functions: `parse()` to
+parse a file using BeautifulSoup, and `convert_tree()` to convert a
+BeautifulSoup tree into a list of top-level Elements.
+
+Here is a document full of tag soup, similar to, but not quite like, HTML::
+
+    >>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>'
+
+All you need to do is pass it to the `parse()` function::
+
+    >>> from lxml.html.ElementSoup import parse
+    >>> from StringIO import StringIO
+    >>> root = parse(StringIO(tag_soup))
+
+To see what we have here, you can serialise it::
+
+    >>> from lxml.etree import tostring
+    >>> print tostring(root, pretty_print=True)
+    <html>
+      <meta/>
+      <head>
+        <title>Hello</title>
+      </head>
+      <body onload="crash()">Hi all<p/></body>
+    </html>
+
+Not quite what you'd expect from an HTML page, but, well, it was broken
+already, right?  BeautifulSoup did its best, and so now it's a tree.
+
+To control which Element implementation is used, you can pass a
+``makeelement`` factory function to ``parse()``. By default, this is based on
+the HTML parser defined in ``lxml.html``.

Added: lxml/branch/html/src/lxml/html/ElementSoup.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/ElementSoup.py	Mon Jul 16 15:12:51 2007
@@ -0,0 +1,94 @@
+__doc__ = """External interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["parse", "convert_tree"]
+
+from lxml import etree, html
+from BeautifulSoup import \
+     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
+
+
+def parse(file, beautifulsoup=None, makeelement=None):
+    if beautifulsoup is None:
+        beautifulsoup = BeautifulSoup
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    if not hasattr(file, 'read'):
+        file = open(file)
+    tree = beautifulsoup(file)
+    root = _convert_tree(tree, makeelement)
+    # from ET: wrap the document in a html root element, if necessary
+    if len(root) == 1 and root[0].tag == "html":
+        return root[0]
+    root.tag = "html"
+    return root
+
+def convert_tree(beautiful_soup_tree, makeelement=None):
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    root = _convert_tree(beautiful_soup_tree, makeelement)
+    children = root.getchildren()
+    for child in children:
+        root.remove(child)
+    return children
+
+
+# helpers
+
+def _convert_tree(beautiful_soup_tree, makeelement):
+    root = makeelement(beautiful_soup_tree.name,
+                       attrib=dict(beautiful_soup_tree.attrs))
+    _convert_children(root, beautiful_soup_tree, makeelement)
+    return root
+
+def _convert_children(parent, beautiful_soup_tree, makeelement):
+    SubElement = etree.SubElement
+    et_child = None
+    for child in beautiful_soup_tree:
+        if isinstance(child, Tag):
+            et_child = SubElement(parent, child.name, attrib=dict(
+                [(k, unescape(v)) for (k,v) in child.attrs]))
+            _convert_children(et_child, child, makeelement)
+        elif type(child) is NavigableString:
+            _append_text(parent, et_child, unescape(unicode(child)))
+        else:
+            if isinstance(child, Comment):
+                parent.append(etree.Comment(child.string))
+            elif isinstance(child, ProcessingInstruction):
+                parent.append(etree.ProcessingInstruction(
+                    *child.string.split(' ', 1)))
+            else: # CData
+                _append_text(parent, et_child, unescape(unicode(child)))
+
+def _append_text(parent, element, text):
+    if element is None:
+        parent.text = (parent.text or '') + text
+    else:
+        element.tail = (element.tail or '') + text
+
+
+# copied from ET's ElementSoup
+
+import htmlentitydefs, re
+
+handle_entities = re.compile("&(\w+);").sub
+
+try:
+    name2codepoint = htmlentitydefs.name2codepoint
+except AttributeError:
+    # Emulate name2codepoint for Python 2.2 and earlier
+    name2codepoint = {}
+    for name, entity in htmlentitydefs.entitydefs.items():
+        if len(entity) == 1:
+            name2codepoint[name] = ord(entity)
+        else:
+            name2codepoint[name] = int(entity[2:-1])
+
+def unescape(string):
+    # work around oddities in BeautifulSoup's entity handling
+    def unescape_entity(m):
+        try:
+            return unichr(name2codepoint[m.group(1)])
+        except KeyError:
+            return m.group(0) # use as is
+    return handle_entities(unescape_entity, string)


More information about the lxml-checkins mailing list