[Lxml-checkins] r52576 - in lxml/trunk: . doc src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Sun Mar 16 11:50:28 CET 2008
Author: scoder
Date: Sun Mar 16 11:50:28 2008
New Revision: 52576
Added:
lxml/trunk/src/lxml/html/soupparser.py
- copied, changed from r51012, lxml/trunk/src/lxml/html/ElementSoup.py
Removed:
lxml/trunk/src/lxml/html/ElementSoup.py
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/elementsoup.txt
Log:
r3788 at delle: sbehnel | 2008-03-16 11:49:19 +0100
split of ElementSoup module: soupparser.py with consistent API, ElementSoup.py as legacy module
Modified: lxml/trunk/doc/elementsoup.txt
==============================================================================
--- lxml/trunk/doc/elementsoup.txt (original)
+++ lxml/trunk/doc/elementsoup.txt Sun Mar 16 11:50:28 2008
@@ -2,9 +2,6 @@
BeautifulSoup Parser
====================
-:Author:
- Stefan Behnel
-
BeautifulSoup_ is a Python package that parses broken HTML. While libxml2
(and thus lxml) can also parse broken HTML, BeautifulSoup is much more
forgiving and has superior `support for encoding detection`_.
@@ -12,24 +9,28 @@
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit
-lxml can benefit from the parsing capabilities of BeautifulSoup through the
-`lxml.html.ElementSoup` module. It provides two main functions: `parse()` to
-parse a file using BeautifulSoup, and `convert_tree()` to convert a
+lxml can benefit from the parsing capabilities of BeautifulSoup
+through the ``lxml.html.soupparser`` module. It provides three main
+functions: ``fromstring()`` and ``parse()`` to parse a string or file
+using BeautifulSoup, and `convert_tree()` to convert an existing
BeautifulSoup tree into a list of top-level Elements.
+The functions ``fromstring()`` and ``parse()`` behave as known from
+ElementTree. The first returns a root Element, the latter returns an
+ElementTree.
+
Here is a document full of tag soup, similar to, but not quite like, HTML:
.. sourcecode:: pycon
>>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>'
-all you need to do is pass it to the `parse()` function:
+all you need to do is pass it to the ``fromstring()`` function:
.. sourcecode:: pycon
- >>> from lxml.html.ElementSoup import parse
- >>> from StringIO import StringIO
- >>> root = parse(StringIO(tag_soup))
+ >>> from lxml.html.soupparser import fromstring
+ >>> root = fromstring(tag_soup)
To see what we have here, you can serialise it:
@@ -49,5 +50,10 @@
already, right? BeautifulSoup did its best, and so now it's a tree.
To control which Element implementation is used, you can pass a
-``makeelement`` factory function to ``parse()``. By default, this is based on
-the HTML parser defined in ``lxml.html``.
+``makeelement`` factory function to ``parse()`` and ``fromstring()``.
+By default, this is based on the HTML parser defined in ``lxml.html``.
+
+There is also a legacy module called ``ElementSoup``, which mimics the
+interface provided by ElementTree's own ElementSoup_ module.
+
+.. _ElementSoup: http://effbot.org/zone/element-soup.htm
Deleted: /lxml/trunk/src/lxml/html/ElementSoup.py
==============================================================================
--- /lxml/trunk/src/lxml/html/ElementSoup.py Sun Mar 16 11:50:28 2008
+++ (empty file)
@@ -1,94 +0,0 @@
-__doc__ = """External interface to the BeautifulSoup HTML parser.
-"""
-
-__all__ = ["parse", "convert_tree"]
-
-from lxml import etree, html
-from BeautifulSoup import \
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
-
-
-def parse(file, beautifulsoup=None, makeelement=None):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- if not hasattr(file, 'read'):
- file = open(file)
- tree = beautifulsoup(file)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
-
-def convert_tree(beautiful_soup_tree, makeelement=None):
- if makeelement is None:
- makeelement = html.html_parser.makeelement
- root = _convert_tree(beautiful_soup_tree, makeelement)
- children = root.getchildren()
- for child in children:
- root.remove(child)
- return children
-
-
-# helpers
-
-def _convert_tree(beautiful_soup_tree, makeelement):
- root = makeelement(beautiful_soup_tree.name,
- attrib=dict(beautiful_soup_tree.attrs))
- _convert_children(root, beautiful_soup_tree, makeelement)
- return root
-
-def _convert_children(parent, beautiful_soup_tree, makeelement):
- SubElement = etree.SubElement
- et_child = None
- for child in beautiful_soup_tree:
- if isinstance(child, Tag):
- et_child = SubElement(parent, child.name, attrib=dict(
- [(k, unescape(v)) for (k,v) in child.attrs]))
- _convert_children(et_child, child, makeelement)
- elif type(child) is NavigableString:
- _append_text(parent, et_child, unescape(unicode(child)))
- else:
- if isinstance(child, Comment):
- parent.append(etree.Comment(child.string))
- elif isinstance(child, ProcessingInstruction):
- parent.append(etree.ProcessingInstruction(
- *child.string.split(' ', 1)))
- else: # CData
- _append_text(parent, et_child, unescape(unicode(child)))
-
-def _append_text(parent, element, text):
- if element is None:
- parent.text = (parent.text or '') + text
- else:
- element.tail = (element.tail or '') + text
-
-
-# copied from ET's ElementSoup
-
-import htmlentitydefs, re
-
-handle_entities = re.compile("&(\w+);").sub
-
-try:
- name2codepoint = htmlentitydefs.name2codepoint
-except AttributeError:
- # Emulate name2codepoint for Python 2.2 and earlier
- name2codepoint = {}
- for name, entity in htmlentitydefs.entitydefs.items():
- if len(entity) == 1:
- name2codepoint[name] = ord(entity)
- else:
- name2codepoint[name] = int(entity[2:-1])
-
-def unescape(string):
- # work around oddities in BeautifulSoup's entity handling
- def unescape_entity(m):
- try:
- return unichr(name2codepoint[m.group(1)])
- except KeyError:
- return m.group(0) # use as is
- return handle_entities(unescape_entity, string)
Copied: lxml/trunk/src/lxml/html/soupparser.py (from r51012, lxml/trunk/src/lxml/html/ElementSoup.py)
==============================================================================
--- lxml/trunk/src/lxml/html/ElementSoup.py (original)
+++ lxml/trunk/src/lxml/html/soupparser.py Sun Mar 16 11:50:28 2008
@@ -1,29 +1,50 @@
__doc__ = """External interface to the BeautifulSoup HTML parser.
"""
-__all__ = ["parse", "convert_tree"]
+__all__ = ["fromstring", "parse", "convert_tree"]
from lxml import etree, html
from BeautifulSoup import \
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString
+def fromstring(data, beautifulsoup=None, makeelement=None):
+ """Parse a string of HTML data into an Element tree using the
+ BeautifulSoup parser.
+
+ Returns the root ``<html>`` Element of the tree.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ return _parse(data, beautifulsoup, makeelement)
+
def parse(file, beautifulsoup=None, makeelement=None):
- if beautifulsoup is None:
- beautifulsoup = BeautifulSoup
- if makeelement is None:
- makeelement = html.html_parser.makeelement
+ """Parse a file into an ElementTree using the BeautifulSoup parser.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
if not hasattr(file, 'read'):
file = open(file)
- tree = beautifulsoup(file)
- root = _convert_tree(tree, makeelement)
- # from ET: wrap the document in a html root element, if necessary
- if len(root) == 1 and root[0].tag == "html":
- return root[0]
- root.tag = "html"
- return root
+ root = _parse(file, beautifulsoup, makeelement)
+ return etree.ElementTree(root)
def convert_tree(beautiful_soup_tree, makeelement=None):
+ """Convert a BeautifulSoup tree to a list of Element trees.
+
+ Returns a list instead of a single root Element to support
+ HTML-like soup with more than one root element.
+
+ You can pass a different Element factory through the `makeelement`
+ keyword.
+ """
if makeelement is None:
makeelement = html.html_parser.makeelement
root = _convert_tree(beautiful_soup_tree, makeelement)
@@ -35,6 +56,19 @@
# helpers
+def _parse(source, beautifulsoup, makeelement):
+ if beautifulsoup is None:
+ beautifulsoup = BeautifulSoup
+ if makeelement is None:
+ makeelement = html.html_parser.makeelement
+ tree = beautifulsoup(source)
+ root = _convert_tree(tree, makeelement)
+ # from ET: wrap the document in a html root element, if necessary
+ if len(root) == 1 and root[0].tag == "html":
+ return root[0]
+ root.tag = "html"
+ return root
+
def _convert_tree(beautiful_soup_tree, makeelement):
root = makeelement(beautiful_soup_tree.name,
attrib=dict(beautiful_soup_tree.attrs))
More information about the lxml-checkins
mailing list