[Lxml-checkins] r52576 - in lxml/trunk: . doc src/lxml/html

scoder at codespeak.net scoder at codespeak.net
Sun Mar 16 11:50:28 CET 2008


Author: scoder
Date: Sun Mar 16 11:50:28 2008
New Revision: 52576

Added:
   lxml/trunk/src/lxml/html/soupparser.py
      - copied, changed from r51012, lxml/trunk/src/lxml/html/ElementSoup.py
Removed:
   lxml/trunk/src/lxml/html/ElementSoup.py
Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/doc/elementsoup.txt
Log:
 r3788 at delle:  sbehnel | 2008-03-16 11:49:19 +0100
 split of ElementSoup module: soupparser.py with consistent API, ElementSoup.py as legacy module


Modified: lxml/trunk/doc/elementsoup.txt
==============================================================================
--- lxml/trunk/doc/elementsoup.txt	(original)
+++ lxml/trunk/doc/elementsoup.txt	Sun Mar 16 11:50:28 2008
@@ -2,9 +2,6 @@
 BeautifulSoup Parser
 ====================
 
-:Author:
-  Stefan Behnel
-
 BeautifulSoup_ is a Python package that parses broken HTML.  While libxml2
 (and thus lxml) can also parse broken HTML, BeautifulSoup is much more
 forgiving and has superiour `support for encoding detection`_.
@@ -12,24 +9,28 @@
 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
 .. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit
 
-lxml can benefit from the parsing capabilities of BeautifulSoup through the
-`lxml.html.ElementSoup` module.  It provides two main functions: `parse()` to
-parse a file using BeautifulSoup, and `convert_tree()` to convert a
+lxml can benefit from the parsing capabilities of BeautifulSoup
+through the ``lxml.html.soupparser`` module.  It provides three main
+functions: ``fromstring()`` and ``parse()`` to parse a string or file
+using BeautifulSoup, and ``convert_tree()`` to convert an existing
 BeautifulSoup tree into a list of top-level Elements.
 
+The functions ``fromstring()`` and ``parse()`` behave as known from
+ElementTree.  The first returns a root Element, the latter returns an
+ElementTree.
+
 Here is a document full of tag soup, similar to, but not quite like, HTML:
 
 .. sourcecode:: pycon
 
     >>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>'
 
-all you need to do is pass it to the `parse()` function:
+all you need to do is pass it to the ``fromstring()`` function:
 
 .. sourcecode:: pycon
 
-    >>> from lxml.html.ElementSoup import parse
-    >>> from StringIO import StringIO
-    >>> root = parse(StringIO(tag_soup))
+    >>> from lxml.html.soupparser import fromstring
+    >>> root = fromstring(tag_soup)
 
 To see what we have here, you can serialise it:
 
@@ -49,5 +50,10 @@
 already, right?  BeautifulSoup did its best, and so now it's a tree.
 
 To control which Element implementation is used, you can pass a
-``makeelement`` factory function to ``parse()``. By default, this is based on
-the HTML parser defined in ``lxml.html``.
+``makeelement`` factory function to ``parse()`` and ``fromstring()``.
+By default, this is based on the HTML parser defined in ``lxml.html``.
+
+There is also a legacy module called ``ElementSoup``, which mimics the
+interface provided by ElementTree's own ElementSoup_ module.
+
+.. _ElementSoup: http://effbot.org/zone/element-soup.htm

Deleted: /lxml/trunk/src/lxml/html/ElementSoup.py
==============================================================================
--- /lxml/trunk/src/lxml/html/ElementSoup.py	Sun Mar 16 11:50:28 2008
+++ (empty file)
@@ -1,94 +0,0 @@
-__doc__ = """External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["parse", "convert_tree"]
-
-from lxml import etree, html
-from BeautifulSoup import \
-     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
-
-
-def parse(file, beautifulsoup=None, makeelement=None):
-    if beautifulsoup is None:
-        beautifulsoup = BeautifulSoup
-    if makeelement is None:
-        makeelement = html.html_parser.makeelement
-    if not hasattr(file, 'read'):
-        file = open(file)
-    tree = beautifulsoup(file)
-    root = _convert_tree(tree, makeelement)
-    # from ET: wrap the document in a html root element, if necessary
-    if len(root) == 1 and root[0].tag == "html":
-        return root[0]
-    root.tag = "html"
-    return root
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
-    if makeelement is None:
-        makeelement = html.html_parser.makeelement
-    root = _convert_tree(beautiful_soup_tree, makeelement)
-    children = root.getchildren()
-    for child in children:
-        root.remove(child)
-    return children
-
-
-# helpers
-
-def _convert_tree(beautiful_soup_tree, makeelement):
-    root = makeelement(beautiful_soup_tree.name,
-                       attrib=dict(beautiful_soup_tree.attrs))
-    _convert_children(root, beautiful_soup_tree, makeelement)
-    return root
-
-def _convert_children(parent, beautiful_soup_tree, makeelement):
-    SubElement = etree.SubElement
-    et_child = None
-    for child in beautiful_soup_tree:
-        if isinstance(child, Tag):
-            et_child = SubElement(parent, child.name, attrib=dict(
-                [(k, unescape(v)) for (k,v) in child.attrs]))
-            _convert_children(et_child, child, makeelement)
-        elif type(child) is NavigableString:
-            _append_text(parent, et_child, unescape(unicode(child)))
-        else:
-            if isinstance(child, Comment):
-                parent.append(etree.Comment(child.string))
-            elif isinstance(child, ProcessingInstruction):
-                parent.append(etree.ProcessingInstruction(
-                    *child.string.split(' ', 1)))
-            else: # CData
-                _append_text(parent, et_child, unescape(unicode(child)))
-
-def _append_text(parent, element, text):
-    if element is None:
-        parent.text = (parent.text or '') + text
-    else:
-        element.tail = (element.tail or '') + text
-
-
-# copied from ET's ElementSoup
-
-import htmlentitydefs, re
-
-handle_entities = re.compile("&(\w+);").sub
-
-try:
-    name2codepoint = htmlentitydefs.name2codepoint
-except AttributeError:
-    # Emulate name2codepoint for Python 2.2 and earlier
-    name2codepoint = {}
-    for name, entity in htmlentitydefs.entitydefs.items():
-        if len(entity) == 1:
-            name2codepoint[name] = ord(entity)
-        else:
-            name2codepoint[name] = int(entity[2:-1])
-
-def unescape(string):
-    # work around oddities in BeautifulSoup's entity handling
-    def unescape_entity(m):
-        try:
-            return unichr(name2codepoint[m.group(1)])
-        except KeyError:
-            return m.group(0) # use as is
-    return handle_entities(unescape_entity, string)

Copied: lxml/trunk/src/lxml/html/soupparser.py (from r51012, lxml/trunk/src/lxml/html/ElementSoup.py)
==============================================================================
--- lxml/trunk/src/lxml/html/ElementSoup.py	(original)
+++ lxml/trunk/src/lxml/html/soupparser.py	Sun Mar 16 11:50:28 2008
@@ -1,29 +1,50 @@
 __doc__ = """External interface to the BeautifulSoup HTML parser.
 """
 
-__all__ = ["parse", "convert_tree"]
+__all__ = ["fromstring", "parse", "convert_tree"]
 
 from lxml import etree, html
 from BeautifulSoup import \
      BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
 
 
+def fromstring(data, beautifulsoup=None, makeelement=None):
+    """Parse a string of HTML data into an Element tree using the
+    BeautifulSoup parser.
+
+    Returns the root ``<html>`` Element of the tree.
+
+    You can pass a different BeautifulSoup parser through the
+    `beautifulsoup` keyword, and a different Element factory function
+    through the `makeelement` keyword.  By default, the standard
+    ``BeautifulSoup`` class and the default factory of `lxml.html` are
+    used.
+    """
+    return _parse(data, beautifulsoup, makeelement)
+
 def parse(file, beautifulsoup=None, makeelement=None):
-    if beautifulsoup is None:
-        beautifulsoup = BeautifulSoup
-    if makeelement is None:
-        makeelement = html.html_parser.makeelement
+    """Parse a file into an ElementTree using the BeautifulSoup parser.
+
+    You can pass a different BeautifulSoup parser through the
+    `beautifulsoup` keyword, and a different Element factory function
+    through the `makeelement` keyword.  By default, the standard
+    ``BeautifulSoup`` class and the default factory of `lxml.html` are
+    used.
+    """
     if not hasattr(file, 'read'):
         file = open(file)
-    tree = beautifulsoup(file)
-    root = _convert_tree(tree, makeelement)
-    # from ET: wrap the document in a html root element, if necessary
-    if len(root) == 1 and root[0].tag == "html":
-        return root[0]
-    root.tag = "html"
-    return root
+    root = _parse(file, beautifulsoup, makeelement)
+    return etree.ElementTree(root)
 
 def convert_tree(beautiful_soup_tree, makeelement=None):
+    """Convert a BeautifulSoup tree to a list of Element trees.
+
+    Returns a list instead of a single root Element to support
+    HTML-like soup with more than one root element.
+
+    You can pass a different Element factory through the `makeelement`
+    keyword.
+    """
     if makeelement is None:
         makeelement = html.html_parser.makeelement
     root = _convert_tree(beautiful_soup_tree, makeelement)
@@ -35,6 +56,19 @@
 
 # helpers
 
+def _parse(source, beautifulsoup, makeelement):
+    if beautifulsoup is None:
+        beautifulsoup = BeautifulSoup
+    if makeelement is None:
+        makeelement = html.html_parser.makeelement
+    tree = beautifulsoup(source)
+    root = _convert_tree(tree, makeelement)
+    # from ET: wrap the document in a html root element, if necessary
+    if len(root) == 1 and root[0].tag == "html":
+        return root[0]
+    root.tag = "html"
+    return root
+
 def _convert_tree(beautiful_soup_tree, makeelement):
     root = makeelement(beautiful_soup_tree.name,
                        attrib=dict(beautiful_soup_tree.attrs))


More information about the lxml-checkins mailing list