[Lxml-checkins] r54311 - in lxml/trunk: . src/lxml/html src/lxml/html/tests

scoder at codespeak.net scoder at codespeak.net
Fri May 2 09:49:51 CEST 2008


Author: scoder
Date: Fri May  2 09:49:49 2008
New Revision: 54311

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/html/__init__.py
   lxml/trunk/src/lxml/html/tests/test_basic.txt
Log:
 r4115 at delle:  sbehnel | 2008-05-02 09:48:17 +0200
 'parser' keyword in lxml.html parse functions, XHTMLParser class


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Fri May  2 09:49:49 2008
@@ -8,6 +8,12 @@
 Features added
 --------------
 
+* All parse functions in lxml.html take a ``parser`` keyword argument.
+
+* lxml.html has a new parser class ``XHTMLParser`` and a module
+  attribute ``xhtml_parser`` that provide XML parsers that are
+  pre-configured for the lxml.html package.
+
 Bugs fixed
 ----------
 

Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py	(original)
+++ lxml/trunk/src/lxml/html/__init__.py	Fri May  2 09:49:49 2008
@@ -443,14 +443,17 @@
 # parsing
 ################################################################################
 
-def document_fromstring(html, **kw):
-    value = etree.HTML(html, html_parser, **kw)
+def document_fromstring(html, parser=None, **kw):
+    if parser is None:
+        parser = html_parser
+    value = etree.fromstring(html, parser, **kw)
     if value is None:
         raise etree.ParserError(
             "Document is empty")
     return value
 
-def fragments_fromstring(html, no_leading_text=False, base_url=None, **kw):
+def fragments_fromstring(html, no_leading_text=False, base_url=None,
+                         parser=None, **kw):
     """
     Parses several HTML elements, returning a list of elements.
 
@@ -461,11 +464,13 @@
 
     base_url will set the document's base_url attribute (and the tree's docinfo.URL)
     """
+    if parser is None:
+        parser = html_parser
     # FIXME: check what happens when you give html with a body, head, etc.
     start = html[:20].lstrip().lower()
     if not start.startswith('<html') and not start.startswith('<!doctype'):
         html = '<html><body>%s</body></html>' % html
-    doc = document_fromstring(html, base_url=base_url, **kw)
+    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
     assert doc.tag == 'html'
     bodies = [e for e in doc if e.tag == 'body']
     assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
@@ -481,7 +486,8 @@
     # would be nice
     return elements
 
-def fragment_fromstring(html, create_parent=False, base_url=None, **kw):
+def fragment_fromstring(html, create_parent=False, base_url=None,
+                        parser=None, **kw):
     """
     Parses a single HTML element; it is an error if there is more than
     one element, or if anything but whitespace precedes or follows the
@@ -492,12 +498,16 @@
 
     base_url will set the document's base_url attribute (and the tree's docinfo.URL)
     """
+    if parser is None:
+        parser = html_parser
     if create_parent:
         if not isinstance(create_parent, basestring):
             create_parent = 'div'
         return fragment_fromstring('<%s>%s</%s>' % (
-            create_parent, html, create_parent), base_url=base_url, **kw)
-    elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
+            create_parent, html, create_parent),
+                                   parser=parser, base_url=base_url, **kw)
+    elements = fragments_fromstring(html, parser=parser, no_leading_text=True,
+                                    base_url=base_url, **kw)
     if not elements:
         raise etree.ParserError(
             "No elements found")
@@ -512,7 +522,7 @@
     el.tail = None
     return el
 
-def fromstring(html, base_url=None, **kw):
+def fromstring(html, base_url=None, parser=None, **kw):
     """
     Parse the html, returning a single element/document.
 
@@ -521,12 +531,14 @@
 
     base_url will set the document's base_url attribute (and the tree's docinfo.URL)
     """
+    if parser is None:
+        parser = html_parser
     start = html[:10].lstrip().lower()
     if start.startswith('<html') or start.startswith('<!doctype'):
         # Looks like a full HTML document
-        return document_fromstring(html, base_url=base_url, **kw)
+        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
     # otherwise, lets parse it out...
-    doc = document_fromstring(html, base_url=base_url, **kw)
+    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
     bodies = doc.findall('body')
     if bodies:
         body = bodies[0]
@@ -1341,8 +1353,18 @@
         super(HTMLParser, self).__init__(**kwargs)
         self.set_element_class_lookup(HtmlElementClassLookup())
 
+class XHTMLParser(etree.XMLParser):
+    def __init__(self, **kwargs):
+        super(XHTMLParser, self).__init__(**kwargs)
+        self.set_element_class_lookup(HtmlElementClassLookup())
+
 def Element(*args, **kw):
+    """Create a new HTML Element.
+
+    This can also be used for XHTML documents.
+    """
     v = html_parser.makeelement(*args, **kw)
     return v
 
 html_parser = HTMLParser()
+xhtml_parser = XHTMLParser()

Modified: lxml/trunk/src/lxml/html/tests/test_basic.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_basic.txt	(original)
+++ lxml/trunk/src/lxml/html/tests/test_basic.txt	Fri May  2 09:49:49 2008
@@ -96,3 +96,16 @@
       <div>footer</div>
      </body>
     </html>
+
+lxml.html has two parsers, one for HTML, one for XHTML:
+
+    >>> from lxml.html import HTMLParser, XHTMLParser
+    >>> html = "<html><body><p>Hi!</p></body></html>"
+
+    >>> root = document_fromstring(html, parser=HTMLParser())
+    >>> print root.tag
+    html
+
+    >>> root = document_fromstring(html, parser=XHTMLParser())
+    >>> print root.tag
+    html


More information about the lxml-checkins mailing list