[Lxml-checkins] r54311 - in lxml/trunk: . src/lxml/html src/lxml/html/tests
scoder at codespeak.net
scoder at codespeak.net
Fri May 2 09:49:51 CEST 2008
Author: scoder
Date: Fri May 2 09:49:49 2008
New Revision: 54311
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/html/__init__.py
lxml/trunk/src/lxml/html/tests/test_basic.txt
Log:
r4115 at delle: sbehnel | 2008-05-02 09:48:17 +0200
'parser' keyword in lxml.html parse functions, XHTMLParser class
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Fri May 2 09:49:49 2008
@@ -8,6 +8,12 @@
Features added
--------------
+* All parse functions in lxml.html take a ``parser`` keyword argument.
+
+* lxml.html has a new parser class ``XHTMLParser`` and a module
+ attribute ``xhtml_parser`` that provide XML parsers that are
+ pre-configured for the lxml.html package.
+
Bugs fixed
----------
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Fri May 2 09:49:49 2008
@@ -443,14 +443,17 @@
# parsing
################################################################################
-def document_fromstring(html, **kw):
- value = etree.HTML(html, html_parser, **kw)
+def document_fromstring(html, parser=None, **kw):
+ if parser is None:
+ parser = html_parser
+ value = etree.fromstring(html, parser, **kw)
if value is None:
raise etree.ParserError(
"Document is empty")
return value
-def fragments_fromstring(html, no_leading_text=False, base_url=None, **kw):
+def fragments_fromstring(html, no_leading_text=False, base_url=None,
+ parser=None, **kw):
"""
Parses several HTML elements, returning a list of elements.
@@ -461,11 +464,13 @@
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
"""
+ if parser is None:
+ parser = html_parser
# FIXME: check what happens when you give html with a body, head, etc.
start = html[:20].lstrip().lower()
if not start.startswith('<html') and not start.startswith('<!doctype'):
html = '<html><body>%s</body></html>' % html
- doc = document_fromstring(html, base_url=base_url, **kw)
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
assert doc.tag == 'html'
bodies = [e for e in doc if e.tag == 'body']
assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
@@ -481,7 +486,8 @@
# would be nice
return elements
-def fragment_fromstring(html, create_parent=False, base_url=None, **kw):
+def fragment_fromstring(html, create_parent=False, base_url=None,
+ parser=None, **kw):
"""
Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
@@ -492,12 +498,16 @@
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
"""
+ if parser is None:
+ parser = html_parser
if create_parent:
if not isinstance(create_parent, basestring):
create_parent = 'div'
return fragment_fromstring('<%s>%s</%s>' % (
- create_parent, html, create_parent), base_url=base_url, **kw)
- elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
+ create_parent, html, create_parent),
+ parser=parser, base_url=base_url, **kw)
+ elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
+ base_url=base_url, **kw)
if not elements:
raise etree.ParserError(
"No elements found")
@@ -512,7 +522,7 @@
el.tail = None
return el
-def fromstring(html, base_url=None, **kw):
+def fromstring(html, base_url=None, parser=None, **kw):
"""
Parse the html, returning a single element/document.
@@ -521,12 +531,14 @@
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
"""
+ if parser is None:
+ parser = html_parser
start = html[:10].lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
# Looks like a full HTML document
- return document_fromstring(html, base_url=base_url, **kw)
+ return document_fromstring(html, parser=parser, base_url=base_url, **kw)
# otherwise, lets parse it out...
- doc = document_fromstring(html, base_url=base_url, **kw)
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
bodies = doc.findall('body')
if bodies:
body = bodies[0]
@@ -1341,8 +1353,18 @@
super(HTMLParser, self).__init__(**kwargs)
self.set_element_class_lookup(HtmlElementClassLookup())
+class XHTMLParser(etree.XMLParser):
+ def __init__(self, **kwargs):
+ super(XHTMLParser, self).__init__(**kwargs)
+ self.set_element_class_lookup(HtmlElementClassLookup())
+
def Element(*args, **kw):
+ """Create a new HTML Element.
+
+ This can also be used for XHTML documents.
+ """
v = html_parser.makeelement(*args, **kw)
return v
html_parser = HTMLParser()
+xhtml_parser = XHTMLParser()
Modified: lxml/trunk/src/lxml/html/tests/test_basic.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_basic.txt (original)
+++ lxml/trunk/src/lxml/html/tests/test_basic.txt Fri May 2 09:49:49 2008
@@ -96,3 +96,16 @@
<div>footer</div>
</body>
</html>
+
+lxml.html has two parsers, one for HTML, one for XHTML:
+
+ >>> from lxml.html import HTMLParser, XHTMLParser
+ >>> html = "<html><body><p>Hi!</p></body></html>"
+
+ >>> root = document_fromstring(html, parser=HTMLParser())
+ >>> print root.tag
+ html
+
+ >>> root = document_fromstring(html, parser=XHTMLParser())
+ >>> print root.tag
+ html
More information about the lxml-checkins
mailing list