[Lxml-checkins] r43988 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Sat Jun 2 03:37:12 CEST 2007


Author: ianb
Date: Sat Jun  2 03:37:12 2007
New Revision: 43988

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
Log:
Fix the parsing of fragments a bit, when there's just a single element that looks like a head element, and nothing that looks like a body element.  Add a custom processing-instruction (PI) element class.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Sat Jun  2 03:37:12 2007
@@ -261,9 +261,12 @@
 class HtmlElement(etree.ElementBase, HtmlMixin):
     pass
 
+class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
+    pass
+
 html_parser = etree.HTMLParser()
 html_parser.setElementClassLookup(etree.ElementDefaultClassLookup(
-    element=HtmlElement, comment=HtmlComment))
+    element=HtmlElement, comment=HtmlComment, pi=HtmlProcessingInstruction))
 
 def HTML(html):
     # FIXME: should this notice a fragment and parse accordingly?
@@ -345,20 +348,23 @@
     # otherwise, lets parse it out...
     doc = HTML(html)
     bodies = doc.findall('body')
-    body = bodies[0]
-    if len(bodies) > 1:
-        # Somehow there are multiple bodies, which is bad, but just
-        # smash them into one body
-        for other_body in bodies[1:]:
-            if other_body.text:
-                if len(body):
-                    body[-1].tail = (body[-1].tail or '') + other_body.text
-                else:
-                    body.text = (body.text or '') + other_body.text
-            body.extend(other_body)
-            # We'll ignore tail
-            # I guess we are ignoring attributes too
-            other_body.drop_element()
+    if bodies:
+        body = bodies[0]
+        if len(bodies) > 1:
+            # Somehow there are multiple bodies, which is bad, but just
+            # smash them into one body
+            for other_body in bodies[1:]:
+                if other_body.text:
+                    if len(body):
+                        body[-1].tail = (body[-1].tail or '') + other_body.text
+                    else:
+                        body.text = (body.text or '') + other_body.text
+                body.extend(other_body)
+                # We'll ignore tail
+                # I guess we are ignoring attributes too
+                other_body.drop_element()
+    else:
+        body = None
     heads = doc.findall('head')
     if heads:
         # Well, we have some sort of structure, so lets keep it all
@@ -369,7 +375,6 @@
                 # We don't care about text or tail in a head
                 other_head.drop_element()
         return doc
-    
     if (len(body) == 1 and (not body.text or not body.text.strip())
         and (not body[-1].tail or not body[-1].tail.strip())):
         # The body has just one element, so it was probably a single


More information about the lxml-checkins mailing list