[Lxml-checkins] r43988 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Sat Jun 2 03:37:12 CEST 2007
Author: ianb
Date: Sat Jun 2 03:37:12 2007
New Revision: 43988
Modified:
lxml/branch/html/src/lxml/html/__init__.py
Log:
Fix the parsing of fragments a bit, when there's just a single element that looks like a head element, and nothing that looks like a body element. Add a custom PI (processing instruction) element class.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Sat Jun 2 03:37:12 2007
@@ -261,9 +261,12 @@
class HtmlElement(etree.ElementBase, HtmlMixin):
pass
+class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
+ pass
+
html_parser = etree.HTMLParser()
html_parser.setElementClassLookup(etree.ElementDefaultClassLookup(
- element=HtmlElement, comment=HtmlComment))
+ element=HtmlElement, comment=HtmlComment, pi=HtmlProcessingInstruction))
def HTML(html):
# FIXME: should this notice a fragment and parse accordingly?
@@ -345,20 +348,23 @@
# otherwise, lets parse it out...
doc = HTML(html)
bodies = doc.findall('body')
- body = bodies[0]
- if len(bodies) > 1:
- # Somehow there are multiple bodies, which is bad, but just
- # smash them into one body
- for other_body in bodies[1:]:
- if other_body.text:
- if len(body):
- body[-1].tail = (body[-1].tail or '') + other_body.text
- else:
- body.text = (body.text or '') + other_body.text
- body.extend(other_body)
- # We'll ignore tail
- # I guess we are ignoring attributes too
- other_body.drop_element()
+ if bodies:
+ body = bodies[0]
+ if len(bodies) > 1:
+ # Somehow there are multiple bodies, which is bad, but just
+ # smash them into one body
+ for other_body in bodies[1:]:
+ if other_body.text:
+ if len(body):
+ body[-1].tail = (body[-1].tail or '') + other_body.text
+ else:
+ body.text = (body.text or '') + other_body.text
+ body.extend(other_body)
+ # We'll ignore tail
+ # I guess we are ignoring attributes too
+ other_body.drop_element()
+ else:
+ body = None
heads = doc.findall('head')
if heads:
# Well, we have some sort of structure, so lets keep it all
@@ -369,7 +375,6 @@
# We don't care about text or tail in a head
other_head.drop_element()
return doc
-
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
# The body has just one element, so it was probably a single
More information about the lxml-checkins
mailing list