[Lxml-checkins] r46535 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Thu Sep 13 12:52:26 CEST 2007
Author: scoder
Date: Thu Sep 13 12:52:26 2007
New Revision: 46535
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/selftest.py
lxml/trunk/src/lxml/_elementpath.py
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/tests/test_etree.py
Log:
ET 1.3 compatibility updates: iterfind(), new ElementPath implementation, updated selftest.py, fix for itertext()
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Sep 13 12:52:26 2007
@@ -8,6 +8,9 @@
Features added
--------------
+* ``iterfind()`` method on Elements returns an iterator equivalent to
+ ``findall()``
+
* ``itertext()`` method on Elements
* Setting a QName object as value of the .text property or as an attribute
Modified: lxml/trunk/selftest.py
==============================================================================
--- lxml/trunk/selftest.py (original)
+++ lxml/trunk/selftest.py Thu Sep 13 12:52:26 2007
@@ -1,4 +1,4 @@
-# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
+# $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $
# -*- coding: iso-8859-1 -*-
# elementtree selftest program
@@ -14,6 +14,7 @@
from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
from lxml import ElementInclude
+ET = ElementTree
#from elementtree import ElementTree
#from elementtree import ElementPath
@@ -26,14 +27,11 @@
xml_data = xml_data.replace(' />', '/>')
return xml_data
-def serialize(elem, encoding=None):
+def serialize(elem, **options):
import StringIO
file = StringIO.StringIO()
tree = ElementTree.ElementTree(elem)
- if encoding:
- tree.write(file, encoding)
- else:
- tree.write(file)
+ tree.write(file, **options)
return fix_compatibility( file.getvalue() )
def summarize(elem):
@@ -106,18 +104,21 @@
# --------------------------------------------------------------------
# element tree tests
-## def sanity():
-## """
-## >>> from elementtree.ElementTree import *
-## >>> from elementtree.ElementInclude import *
-## >>> from elementtree.ElementPath import *
-## >>> from elementtree.HTMLTreeBuilder import *
-## >>> from elementtree.SimpleXMLTreeBuilder import *
-## >>> from elementtree.SimpleXMLWriter import *
-## >>> from elementtree.TidyHTMLTreeBuilder import *
-## >>> from elementtree.TidyTools import *
-## >>> from elementtree.XMLTreeBuilder import *
-## """
+def sanity():
+ """
+ >>> from elementtree.ElementTree import *
+ >>> from elementtree.ElementInclude import *
+ >>> from elementtree.ElementPath import *
+ >>> from elementtree.HTMLTreeBuilder import *
+ >>> from elementtree.SimpleXMLTreeBuilder import *
+ >>> from elementtree.SimpleXMLWriter import *
+ >>> from elementtree.TidyHTMLTreeBuilder import *
+ >>> from elementtree.TidyTools import *
+ >>> from elementtree.XMLTreeBuilder import *
+ """
+
+# doesn't work with lxml.etree
+del sanity
def interface():
"""
@@ -129,38 +130,41 @@
>>> check_element_tree(tree)
"""
-## def simplefind():
-## """
-## Test find methods using the elementpath fallback.
+def simplefind():
+ """
+ Test find methods using the elementpath fallback.
-## >>> CurrentElementPath = ElementTree.ElementPath
-## >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
-## >>> elem = SAMPLE_XML
-## >>> elem.find("tag").tag
-## 'tag'
-## >>> ElementTree.ElementTree(elem).find("tag").tag
-## 'tag'
-## >>> elem.findtext("tag")
-## 'text'
-## >>> elem.findtext("tog")
-## >>> elem.findtext("tog", "default")
-## 'default'
-## >>> ElementTree.ElementTree(elem).findtext("tag")
-## 'text'
-## >>> summarize_list(elem.findall("tag"))
-## ['tag', 'tag']
-## >>> summarize_list(elem.findall(".//tag"))
-## ['tag', 'tag', 'tag']
+ >>> CurrentElementPath = ElementTree.ElementPath
+ >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
+ >>> elem = SAMPLE_XML
+ >>> elem.find("tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("tag").tag
+ 'tag'
+ >>> elem.findtext("tag")
+ 'text'
+ >>> elem.findtext("tog")
+ >>> elem.findtext("tog", "default")
+ 'default'
+ >>> ElementTree.ElementTree(elem).findtext("tag")
+ 'text'
+ >>> summarize_list(elem.findall("tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
-## Path syntax doesn't work in this case.
+ Path syntax doesn't work in this case.
-## >>> elem.find("section/tag")
-## >>> elem.findtext("section/tag")
-## >>> elem.findall("section/tag")
-## []
+ >>> elem.find("section/tag")
+ >>> elem.findtext("section/tag")
+ >>> elem.findall("section/tag")
+ []
-## >>> ElementTree.ElementPath = CurrentElementPath
-## """
+ >>> ElementTree.ElementPath = CurrentElementPath
+ """
+
+# doesn't work with lxml.etree
+del simplefind
def find():
"""
@@ -216,10 +220,31 @@
['tag', 'tag', 'tag']
>>> summarize_list(elem.findall("././tag"))
['tag', 'tag']
+
+## >>> summarize_list(elem.findall(".//tag[@class]"))
+## ['tag', 'tag', 'tag']
+## >>> summarize_list(elem.findall(".//tag[@class='a']"))
+## ['tag']
+## >>> summarize_list(elem.findall(".//tag[@class='b']"))
+## ['tag', 'tag']
+## >>> summarize_list(elem.findall(".//tag[@id]"))
+## ['tag']
+## >>> summarize_list(elem.findall(".//section[tag]"))
+## ['section']
+## >>> summarize_list(elem.findall(".//section[element]"))
+## []
+## >>> summarize_list(elem.findall("../tag"))
+## []
+## >>> summarize_list(elem.findall("section/../tag"))
+## ['tag', 'tag']
+## >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
+## ['tag', 'tag']
+
+ FIXME: ET's Path module handles this case incorrectly; this gives
+ a warning in 1.3, and the behaviour will be modified in 1.4.
+
>>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
['tag', 'tag']
- >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
- ['tag', 'tag']
"""
def bad_find():
@@ -230,15 +255,9 @@
>>> elem.findall("/tag")
Traceback (most recent call last):
SyntaxError: cannot use absolute path on element
- >>> elem.findall("../tag")
- Traceback (most recent call last):
- SyntaxError: unsupported path syntax (..)
>>> elem.findall("section//")
Traceback (most recent call last):
- SyntaxError: path cannot end with //
- >>> elem.findall("tag[tag]")
- Traceback (most recent call last):
- SyntaxError: expected path separator ([)
+ SyntaxError: invalid path
"""
def parsefile():
@@ -261,6 +280,12 @@
<element>text</element>tail
<empty-element/>
</root>
+
+## <ns0:root xmlns:ns0="namespace">
+## <ns0:element key="value">text</ns0:element>
+## <ns0:element>text</ns0:element>tail
+## <ns0:empty-element/>
+## </ns0:root>
"""
## def parsehtml():
@@ -282,6 +307,12 @@
>>> element = ElementTree.fromstring("<html><body>text</body></html>")
>>> ElementTree.ElementTree(element).write(sys.stdout)
<html><body>text</body></html>
+
+## >>> sequence = ["<html><body>", "text</bo", "dy></html>"]
+## >>> element = ElementTree.fromstringlist(sequence)
+## >>> ElementTree.ElementTree(element).write(sys.stdout)
+## <html><body>text</body></html>
+
>>> print ElementTree.tostring(element)
<html><body>text</body></html>
@@ -426,6 +457,11 @@
>>> ElementTree.SubElement(elem, "subtag").text = "subtext"
>>> serialize(elem)
'<tag>text<subtag>subtext</subtag></tag>'
+
+## Test tag suppression
+## >>> elem.tag = None
+## >>> serialize(elem)
+## 'text<subtag>subtext</subtag>'
"""
def writestring():
@@ -446,58 +482,95 @@
>>> elem.text = u"abc"
>>> serialize(elem)
'<tag>abc</tag>'
- >>> serialize(elem, "utf-8")
+ >>> serialize(elem, encoding="utf-8")
'<tag>abc</tag>'
- >>> serialize(elem, "us-ascii")
+ >>> serialize(elem, encoding="us-ascii")
'<tag>abc</tag>'
- >>> serialize(elem, "ISO-8859-1")
+ >>> serialize(elem, encoding="ISO-8859-1")
"<?xml version='1.0' encoding='ISO-8859-1'?>\n<tag>abc</tag>"
>>> elem.text = "<&\"\'>"
>>> serialize(elem)
'<tag>&lt;&amp;"\'&gt;</tag>'
- >>> serialize(elem, "utf-8")
+ >>> serialize(elem, encoding="utf-8")
'<tag>&lt;&amp;"\'&gt;</tag>'
- >>> serialize(elem, "us-ascii") # cdata characters
+ >>> serialize(elem, encoding="us-ascii") # cdata characters
'<tag>&lt;&amp;"\'&gt;</tag>'
- >>> serialize(elem, "ISO-8859-1")
+ >>> serialize(elem, encoding="ISO-8859-1")
'<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>'
## >>> elem.attrib["key"] = "<&\"\'>"
## >>> elem.text = None
## >>> serialize(elem)
## '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-## >>> serialize(elem, "utf-8")
+## >>> serialize(elem, encoding="utf-8")
## '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-## >>> serialize(elem, "us-ascii")
+## >>> serialize(elem, encoding="us-ascii")
## '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-## >>> serialize(elem, "iso-8859-1")
+## >>> serialize(elem, encoding="iso-8859-1")
## '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
>>> elem.text = u'\xe5\xf6\xf6<>'
>>> elem.attrib.clear()
>>> serialize(elem)
'<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
- >>> serialize(elem, "utf-8")
+ >>> serialize(elem, encoding="utf-8")
'<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>'
- >>> serialize(elem, "us-ascii")
+ >>> serialize(elem, encoding="us-ascii")
'<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
- >>> serialize(elem, "ISO-8859-1")
+ >>> serialize(elem, encoding="ISO-8859-1")
"<?xml version='1.0' encoding='ISO-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>"
## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
## >>> elem.text = None
## >>> serialize(elem)
## '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
-## >>> serialize(elem, "utf-8")
+## >>> serialize(elem, encoding="utf-8")
## '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>'
-## >>> serialize(elem, "us-ascii")
+## >>> serialize(elem, encoding="us-ascii")
## '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
-## >>> serialize(elem, "ISO-8859-1")
+## >>> serialize(elem, encoding="ISO-8859-1")
## '<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>'
"""
+def methods():
+ r"""
+ Test serialization methods.
+
+ >>> e = ET.XML("<html><link/><script>1 &lt; 2</script></html>")
+ >>> e.tail = "\n"
+ >>> serialize(e)
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method=None)
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method="xml")
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method="html")
+ '<html><link><script>1 < 2</script></html>\n'
+ >>> serialize(e, method="text")
+ '1 < 2\n'
+
+ """
+
+# doesn't work with lxml.etree
+del methods
+
+def iterators():
+ """
+ Test iterators.
+
+ >>> e = ET.XML("<html><body>this is a <i>paragraph</i>.</body>..</html>")
+ >>> summarize_list(e.iter())
+ ['html', 'body', 'i']
+ >>> summarize_list(e.find("body").iter())
+ ['body', 'i']
+ >>> "".join(e.itertext())
+ 'this is a paragraph...'
+ >>> "".join(e.find("body").itertext())
+ 'this is a paragraph.'
+ """
+
ENTITY_XML = """\
<!DOCTYPE points [
<!ENTITY % user-entities SYSTEM 'user-entities.xml'>
@@ -506,40 +579,56 @@
<document>&entity;</document>
"""
-## def entity():
-## """
-## Test entity handling.
+def entity():
+ """
+ Test entity handling.
-## 1) bad entities
+ 1) bad entities
-## >>> ElementTree.XML("<document>&entity;</document>")
-## Traceback (most recent call last):
-## ExpatError: undefined entity: line 1, column 10
+ >>> ElementTree.XML("<document>&entity;</document>")
+ Traceback (most recent call last):
+ ExpatError: undefined entity: line 1, column 10
-## >>> ElementTree.XML(ENTITY_XML)
-## Traceback (most recent call last):
-## ExpatError: undefined entity &entity;: line 5, column 10
+ >>> ElementTree.XML(ENTITY_XML)
+ Traceback (most recent call last):
+ ExpatError: undefined entity &entity;: line 5, column 10
-## (add more tests here)
+ (add more tests here)
-## """
+ """
+
+# doesn't work with lxml.etree
+del entity
-def xmllang():
+def error(xml):
"""
- This appears to be a problem; in underlying libxml2?
-
- 1) xml namespace
+ Test error handling.
+
+ >>> error("foo").position
+ (1, 0)
+ >>> error("<tag>&foo;</tag>").position
+ (1, 5)
+ >>> error("foobar<").position
+ (1, 6)
- >>> elem = ElementTree.XML("<tag xml:lang='en' />")
- >>> serialize(elem) # 1.1
- '<tag xml:lang="en"/>'
"""
-
+ try:
+ ET.XML(xml)
+ except ET.ParseError:
+ return sys.exc_value
+
+# doesn't work with lxml.etree
+del error
+
def namespace():
"""
Test namespace issues.
+ 1) xml namespace
+ >>> elem = ElementTree.XML("<tag xml:lang='en' />")
+ >>> serialize(elem) # 1.1
+ '<tag xml:lang="en"/>'
2) other "well-known" namespaces
@@ -634,15 +723,15 @@
>>> xpath_tokenizer("/doc/chapter[5]/section[2]")
['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
>>> xpath_tokenizer("chapter//para")
- ['chapter', '/', '/', 'para']
+ ['chapter', '//', 'para']
>>> xpath_tokenizer("//para")
- ['/', '/', 'para']
+ ['//', 'para']
>>> xpath_tokenizer("//olist/item")
- ['/', '/', 'olist', '/', 'item']
+ ['//', 'olist', '/', 'item']
>>> xpath_tokenizer(".")
['.']
>>> xpath_tokenizer(".//para")
- ['.', '/', '/', 'para']
+ ['.', '//', 'para']
>>> xpath_tokenizer("..")
['..']
>>> xpath_tokenizer("../@lang")
@@ -658,7 +747,7 @@
>>> xpath_tokenizer("./spam.egg")
['.', '/', 'spam.egg']
>>> xpath_tokenizer(".//{http://spam}egg")
- ['.', '/', '/', '{http://spam}egg']
+ ['.', '//', '{http://spam}egg']
"""
out = []
for op, tag in ElementPath.xpath_tokenizer(p):
@@ -811,70 +900,76 @@
#
# xmlwriter
-## def xmlwriter():
-## r"""
-## >>> file = StringIO.StringIO()
-## >>> w = SimpleXMLWriter.XMLWriter(file)
-## >>> html = w.start("html")
-## >>> x = w.start("head")
-## >>> w.element("title", "my document")
-## >>> w.data("\n")
-## >>> w.element("meta", name="hello", value="goodbye")
-## >>> w.data("\n")
-## >>> w.end()
-## >>> x = w.start("body")
-## >>> w.element("h1", "this is a heading")
-## >>> w.data("\n")
-## >>> w.element("p", u"this is a paragraph")
-## >>> w.data("\n")
-## >>> w.element("p", u"reserved characters: <&>")
-## >>> w.data("\n")
-## >>> w.element("p", u"detta är också ett stycke")
-## >>> w.data("\n")
-## >>> w.close(html)
-## >>> print file.getvalue()
-## <html><head><title>my document</title>
-## <meta name="hello" value="goodbye" />
-## </head><body><h1>this is a heading</h1>
-## <p>this is a paragraph</p>
-## <p>reserved characters: &lt;&amp;&gt;</p>
-## <p>detta &#228;r ocks&#229; ett stycke</p>
-## </body></html>
-## """
+def xmlwriter():
+ r"""
+ >>> file = StringIO.StringIO()
+ >>> w = SimpleXMLWriter.XMLWriter(file)
+ >>> html = w.start("html")
+ >>> x = w.start("head")
+ >>> w.element("title", "my document")
+ >>> w.data("\n")
+ >>> w.element("meta", name="hello", value="goodbye")
+ >>> w.data("\n")
+ >>> w.end()
+ >>> x = w.start("body")
+ >>> w.element("h1", "this is a heading")
+ >>> w.data("\n")
+ >>> w.element("p", u"this is a paragraph")
+ >>> w.data("\n")
+ >>> w.element("p", u"reserved characters: <&>")
+ >>> w.data("\n")
+ >>> w.element("p", u"detta är också ett stycke")
+ >>> w.data("\n")
+ >>> w.close(html)
+ >>> print file.getvalue()
+ <html><head><title>my document</title>
+ <meta name="hello" value="goodbye" />
+ </head><body><h1>this is a heading</h1>
+ <p>this is a paragraph</p>
+ <p>reserved characters: &lt;&amp;&gt;</p>
+ <p>detta &#228;r ocks&#229; ett stycke</p>
+ </body></html>
+ """
+
+# doesn't work with lxml.etree
+del xmlwriter
# --------------------------------------------------------------------
# reported bugs
-## def bug_xmltoolkit21():
-## """
-## marshaller gives obscure errors for non-string values
+def bug_xmltoolkit21():
+ """
+ marshaller gives obscure errors for non-string values
-## >>> elem = ElementTree.Element(123)
-## >>> serialize(elem) # tag
-## Traceback (most recent call last):
-## TypeError: cannot serialize 123 (type int)
-## >>> elem = ElementTree.Element("elem")
-## >>> elem.text = 123
-## >>> serialize(elem) # text
-## Traceback (most recent call last):
-## TypeError: cannot serialize 123 (type int)
-## >>> elem = ElementTree.Element("elem")
-## >>> elem.tail = 123
-## >>> serialize(elem) # tail
-## Traceback (most recent call last):
-## TypeError: cannot serialize 123 (type int)
-## >>> elem = ElementTree.Element("elem")
-## >>> elem.set(123, "123")
-## >>> serialize(elem) # attribute key
-## Traceback (most recent call last):
-## TypeError: cannot serialize 123 (type int)
-## >>> elem = ElementTree.Element("elem")
-## >>> elem.set("123", 123)
-## >>> serialize(elem) # attribute value
-## Traceback (most recent call last):
-## TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element(123)
+ >>> serialize(elem) # tag
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.text = 123
+ >>> serialize(elem) # text
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.tail = 123
+ >>> serialize(elem) # tail
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.set(123, "123")
+ >>> serialize(elem) # attribute key
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.set("123", 123)
+ >>> serialize(elem) # attribute value
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
-## """
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit21
def bug_xmltoolkit25():
"""
@@ -898,92 +993,199 @@
['tbody']
"""
-## def bug_xmltoolkitX1():
-## """
-## dump() doesn't flush the output buffer
+def bug_xmltoolkitX1():
+ """
+ dump() doesn't flush the output buffer
-## >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
-## >>> ElementTree.dump(tree); sys.stdout.write("tail")
-## <doc><table><tbody /></table></doc>
-## tail
-## """
+ >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
+ >>> ElementTree.dump(tree); sys.stdout.write("tail")
+ <doc><table><tbody /></table></doc>
+ tail
+ """
-## def bug_xmltoolkit39():
-## """
-## non-ascii element and attribute names doesn't work
+# doesn't work with lxml.etree
+del bug_xmltoolkitX1
-## >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
-## >>> ElementTree.tostring(tree, "utf-8")
-## '<t\\xc3\\xa4g />'
-
-## >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='välue' />")
-## >>> tree.attrib
-## {u'\\xe4ttr': u'v\\xe4lue'}
-## >>> ElementTree.tostring(tree, "utf-8")
-## '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
-
-## >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
-## >>> ElementTree.tostring(tree, "utf-8")
-## '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'
-
-## >>> tree = ElementTree.Element(u"täg")
-## >>> ElementTree.tostring(tree, "utf-8")
-## '<t\\xc3\\xa4g />'
-
-## >>> tree = ElementTree.Element("tag")
-## >>> tree.set(u"ättr", u"välue")
-## >>> ElementTree.tostring(tree, "utf-8")
-## '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
+def bug_xmltoolkit39():
+ """
+ non-ascii element and attribute names doesn't work
-## """
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g />'
-## def bug_xmltoolkit45():
-## """
-## problems parsing mixed unicode/non-ascii html documents
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='välue' />")
+ >>> tree.attrib
+ {u'\\xe4ttr': u'v\\xe4lue'}
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
-## latin-1 text
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<p>välue</p>")
-## >>> serialize(p.close())
-## '<p>v&#228;lue</p>'
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'
-## utf-8 text
-## >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
-## >>> p.feed("<p>v\xc3\xa4lue</p>")
-## >>> serialize(p.close())
-## '<p>v&#228;lue</p>'
+ >>> tree = ElementTree.Element(u"täg")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g />'
-## utf-8 text using meta tag
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
-## >>> serialize(p.close().find("p"))
-## '<p>v&#228;lue</p>'
-
-## latin-1 character references
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<p>v&#228;lue</p>")
-## >>> serialize(p.close())
-## '<p>v&#228;lue</p>'
+ >>> tree = ElementTree.Element("tag")
+ >>> tree.set(u"ättr", u"välue")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
-## latin-1 character entities
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<p>v&auml;lue</p>")
-## >>> serialize(p.close())
-## '<p>v&#228;lue</p>'
+ """
-## mixed latin-1 text and unicode entities
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<p>&#8221;välue&#8221;</p>")
-## >>> serialize(p.close())
-## '<p>&#8221;v&#228;lue&#8221;</p>'
+# doesn't work with lxml.etree
+del bug_xmltoolkit39
-## mixed unicode and latin-1 entities
-## >>> p = HTMLTreeBuilder.TreeBuilder()
-## >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
-## >>> serialize(p.close())
-## '<p>&#8221;v&#228;lue&#8221;</p>'
+def bug_xmltoolkit45():
+ """
+ problems parsing mixed unicode/non-ascii html documents
-## """
+ latin-1 text
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>välue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ utf-8 text
+ >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
+ >>> p.feed("<p>v\xc3\xa4lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ utf-8 text using meta tag
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
+ >>> serialize(p.close().find("p"))
+ '<p>v&#228;lue</p>'
+
+ latin-1 character references
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>v&#228;lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ latin-1 character entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>v&auml;lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ mixed latin-1 text and unicode entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>&#8221;välue&#8221;</p>")
+ >>> serialize(p.close())
+ '<p>&#8221;v&#228;lue&#8221;</p>'
+
+ mixed unicode and latin-1 entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
+ >>> serialize(p.close())
+ '<p>&#8221;v&#228;lue&#8221;</p>'
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit45
+
+def bug_xmltoolkit46():
+ """
+ problems parsing open BR tags
+
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>key<br>value</p>")
+ >>> serialize(p.close())
+ '<p>key<br />value</p>'
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit46
+
+def bug_xmltoolkit54():
+ """
+ problems handling internally defined entities
+
+ >>> e = ElementTree.XML("<!DOCTYPE doc [<!ENTITY ldots '&#x8230;'>]><doc>&ldots;</doc>")
+ >>> serialize(e)
+ '<doc>&#33328;</doc>'
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit54
+
+def bug_xmltoolkit55():
+ """
+ make sure we're reporting the first error, not the last
+
+ >>> e = ElementTree.XML("<!DOCTYPE doc SYSTEM 'doc.dtd'><doc>&ldots;&ndots;&rdots;</doc>")
+ Traceback (most recent call last):
+ ParseError: undefined entity &ldots;: line 1, column 36
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit55
+
+def bug_200708_version():
+ """
+ >>> parser = ET.XMLParser()
+ >>> parser.version
+ 'Expat 2.0.0'
+ >>> parser.feed(open("samples/simple.xml").read())
+ >>> print serialize(parser.close())
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element />
+ </root>
+ """
+
+# doesn't work with lxml.etree
+del bug_200708_version
+
+def bug_200708_newline():
+ r"""
+
+ Preserve newlines in attributes.
+
+ >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
+ >>> ET.tostring(e)
+ '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ >>> ET.XML(ET.tostring(e)).get("text")
+ 'def _f():\n return 3\n'
+ >>> ET.tostring(ET.XML(ET.tostring(e)))
+ '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ """
+
+# doesn't work with lxml.etree
+del bug_200708_newline
+
+def bug_200709_default_namespace():
+ """
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> serialize(e, default_namespace="default") # 1
+ '<elem xmlns="default"><elem /></elem>'
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> s = ET.SubElement(e, "{not-default}elem")
+ >>> serialize(e, default_namespace="default") # 2
+ '<elem xmlns="default" xmlns:ns1="not-default"><elem /><ns1:elem /></elem>'
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> s = ET.SubElement(e, "elem") # unprefixed name
+ >>> serialize(e, default_namespace="default") # 3
+ Traceback (most recent call last):
+ ValueError: cannot use non-qualified names with default_namespace option
+
+ """
+
+# doesn't work with lxml.etree
+del bug_200709_default_namespace
# --------------------------------------------------------------------
Modified: lxml/trunk/src/lxml/_elementpath.py
==============================================================================
--- lxml/trunk/src/lxml/_elementpath.py (original)
+++ lxml/trunk/src/lxml/_elementpath.py Thu Sep 13 12:52:26 2007
@@ -1,4 +1,6 @@
-# This file is taken from ElementTree directly, unchanged beyond this line.
+#
+# ElementTree
+# $Id: ElementPath.py 3276 2007-09-12 06:52:30Z fredrik $
#
# limited xpath support for element trees
#
@@ -6,8 +8,9 @@
# 2003-05-23 fl created
# 2003-05-28 fl added support for // etc
# 2003-08-27 fl fixed parsing of periods in element names
+# 2007-09-10 fl new selection engine
#
-# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved.
+# Copyright (c) 2003-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik at pythonware.com
# http://www.pythonware.com
@@ -15,7 +18,7 @@
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
-# Copyright (c) 1999-2004 by Fredrik Lundh
+# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
@@ -49,146 +52,178 @@
import re
xpath_tokenizer = re.compile(
- "(::|\.\.|\(\)|[/.*:\[\]\(\)@=])|((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|\s+"
+ "("
+ "'[^']*'|\"[^\"]*\"|"
+ "::|"
+ "//?|"
+ "\.\.|"
+ "\(\)|"
+ "[/.*:\[\]\(\)@=])|"
+ "((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|"
+ "\s+"
).findall
-class xpath_descendant_or_self:
- pass
-
-##
-# Wrapper for a compiled XPath.
-
-class Path:
-
- ##
- # Create an Path instance from an XPath expression.
-
- def __init__(self, path):
- tokens = xpath_tokenizer(path)
- # the current version supports 'path/path'-style expressions only
- self.path = []
- self.tag = None
- if tokens and tokens[0][0] == "/":
- raise SyntaxError("cannot use absolute path on element")
- while tokens:
- op, tag = tokens.pop(0)
- if tag or op == "*":
- self.path.append(tag or op)
- elif op == ".":
- pass
- elif op == "/":
- self.path.append(xpath_descendant_or_self())
- continue
- else:
- raise SyntaxError("unsupported path syntax (%s)" % op)
- if tokens:
- op, tag = tokens.pop(0)
- if op != "/":
- raise SyntaxError(
- "expected path separator (%s)" % (op or tag)
- )
- if self.path and isinstance(self.path[-1], xpath_descendant_or_self):
- raise SyntaxError("path cannot end with //")
- if len(self.path) == 1 and isinstance(self.path[0], type("")):
- self.tag = self.path[0]
-
- ##
- # Find first matching object.
-
- def find(self, element):
- tag = self.tag
- if tag is None:
- nodeset = self.findall(element)
- if not nodeset:
- return None
- return nodeset[0]
- for elem in element:
- if elem.tag == tag:
- return elem
- return None
-
- ##
- # Find text for first matching object.
-
- def findtext(self, element, default=None):
- tag = self.tag
- if tag is None:
- nodeset = self.findall(element)
- if not nodeset:
- return default
- return nodeset[0].text or ""
- for elem in element:
- if elem.tag == tag:
- return elem.text or ""
- return default
-
- ##
- # Find all matching objects.
-
- def findall(self, element):
- nodeset = [element]
- index = 0
- while 1:
- try:
- path = self.path[index]
- index = index + 1
- except IndexError:
- return nodeset
- set = []
- if isinstance(path, xpath_descendant_or_self):
- try:
- tag = self.path[index]
- if not isinstance(tag, type("")):
- tag = None
- else:
- index = index + 1
- except IndexError:
- tag = None # invalid path
- for node in nodeset:
- new = list(node.getiterator(tag))
- if new and new[0] is node:
- set.extend(new[1:])
- else:
- set.extend(new)
+def prepare_tag(next, token):
+ tag = token[1]
+ def select(context, result):
+ for elem in result:
+ for e in elem:
+ if e.tag == tag:
+ yield e
+ return select
+
+def prepare_star(next, token):
+ def select(context, result):
+ for elem in result:
+ for e in elem:
+ yield e
+ return select
+
+def prepare_dot(next, token):
+ def select(context, result):
+ for elem in result:
+ yield elem
+ return select
+
+def prepare_iter(next, token):
+ token = next()
+ if token[0] == "*":
+ tag = "*"
+ elif not token[0]:
+ tag = token[1]
+ else:
+ raise SyntaxError
+ def select(context, result):
+ for elem in result:
+ for e in elem.iter(tag):
+ if e is not elem:
+ yield e
+ return select
+
+def prepare_dot_dot(next, token):
+ def select(context, result):
+ parent_map = context.parent_map
+ if parent_map is None:
+ context.parent_map = parent_map = {}
+ for p in context.root.iter():
+ for e in p:
+ parent_map[e] = p
+ for elem in result:
+ if elem in parent_map:
+ yield parent_map[elem]
+ return select
+
+def prepare_predicate(next, token):
+ # this one should probably be refactored...
+ token = next()
+ if token[0] == "@":
+ # attribute
+ token = next()
+ if token[0]:
+ raise SyntaxError("invalid attribute predicate")
+ key = token[1]
+ token = next()
+ if token[0] == "]":
+ def select(context, result):
+ for elem in result:
+ if elem.get(key) is not None:
+ yield elem
+ elif token[0] == "=":
+ value = next()[0]
+ if value[:1] == "'" or value[:1] == '"':
+ value = value[1:-1]
else:
- for node in nodeset:
- for node in node:
- if path == "*" or node.tag == path:
- set.append(node)
- if not set:
- return []
- nodeset = set
+ raise SyntaxError("invalid comparision target")
+ token = next()
+ def select(context, result):
+ for elem in result:
+ if elem.get(key) == value:
+ yield elem
+ if token[0] != "]":
+ raise SyntaxError("invalid attribute predicate")
+ elif not token[0]:
+ tag = token[1]
+ token = next()
+ if token[0] != "]":
+ raise SyntaxError("invalid node predicate")
+ def select(context, result):
+ for elem in result:
+ if elem.find(tag) is not None:
+ yield elem
+ else:
+ raise SyntaxError("invalid predicate")
+ return select
+
+ops = {
+ "": prepare_tag,
+ "*": prepare_star,
+ ".": prepare_dot,
+ "..": prepare_dot_dot,
+ "//": prepare_iter,
+ "[": prepare_predicate,
+ }
_cache = {}
-##
-# (Internal) Compile path.
+class _SelectorContext:
+ parent_map = None
+ def __init__(self, root):
+ self.root = root
-def _compile(path):
- p = _cache.get(path)
- if p is not None:
- return p
- p = Path(path)
- if len(_cache) >= 100:
- _cache.clear()
- _cache[path] = p
- return p
+# --------------------------------------------------------------------
##
# Find first matching object.
-def find(element, path):
- return _compile(path).find(element)
+def find(elem, path):
+ try:
+ return iterfind(elem, path).next()
+ except StopIteration:
+ return None
##
-# Find text for first matching object.
+# Find all matching objects.
-def findtext(element, path, default=None):
- return _compile(path).findtext(element, default)
+def findall(elem, path):
+ return list(iterfind(elem, path))
-##
-# Find all matching objects.
+def iterfind(elem, path):
+ # compile selector pattern
+ try:
+ selector = _cache[path]
+ except KeyError:
+ if len(_cache) > 100:
+ _cache.clear()
+ if path[:1] == "/":
+ raise SyntaxError("cannot use absolute path on element")
+ stream = iter(xpath_tokenizer(path))
+ next = stream.next; token = next()
+ selector = []
+ while 1:
+ try:
+ selector.append(ops[token[0]](next, token))
+ except StopIteration:
+ raise SyntaxError("invalid path")
+ try:
+ token = next()
+ if token[0] == "/":
+ token = next()
+ except StopIteration:
+ break
+ _cache[path] = selector
+ # execute selector pattern
+ result = [elem]
+ context = _SelectorContext(elem)
+ for select in selector:
+ result = select(context, result)
+ return result
-def findall(element, path):
- return _compile(path).findall(element)
+##
+# Find text for first matching object.
+def findtext(elem, path, default=None):
+ try:
+ elem = iterfind(elem, path).next()
+ return elem.text
+ except StopIteration:
+ return default
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Thu Sep 13 12:52:26 2007
@@ -1130,6 +1130,13 @@
path = (<QName>path).text
return _elementpath.findall(self, path)
+ def iterfind(self, path):
+ """Iterates over all matching subelements, by tag name or path.
+ """
+ if isinstance(path, QName):
+ path = (<QName>path).text
+ return _elementpath.iterfind(self, path)
+
def xpath(self, _path, namespaces=None, extensions=None, **_variables):
"""Evaluate an xpath expression using the element as context node.
"""
@@ -1423,8 +1430,8 @@
return root.iter(tag)
def iter(self, tag=None):
- """Creates an iterator for the root element. The iterator loops over all elements
- in this tree, in document order.
+ """Creates an iterator for the root element. The iterator loops over
+ all elements in this tree, in document order.
"""
root = self.getroot()
if root is None:
@@ -1432,7 +1439,8 @@
return root.iter(tag)
def find(self, path):
- """Finds the first toplevel element with given tag. Same as getroot().find(path).
+ """Finds the first element matching the ElementPath expression.
+ Same as getroot().find(path).
"""
self._assertHasRoot()
root = self.getroot()
@@ -1441,7 +1449,8 @@
return root.find(path)
def findtext(self, path, default=None):
- """Finds the element text for the first toplevel element with given tag. Same as getroot().findtext(path)
+ """Finds the text for the first element matching the ElementPath
+ expression. Same as getroot().findtext(path, default).
"""
self._assertHasRoot()
root = self.getroot()
@@ -1450,14 +1459,25 @@
return root.findtext(path, default)
def findall(self, path):
- """Finds all toplevel elements with the given tag. Same as getroot().findall(path).
+ """Finds all elements matching the ElementPath expression. Same as
+ getroot().findall(path).
"""
self._assertHasRoot()
root = self.getroot()
if path[:1] == "/":
path = "." + path
return root.findall(path)
-
+
+ def iterfind(self, path):
+ """Iterates over all elements matching the ElementPath expression.
+ Same as getroot().iterfind(path).
+ """
+ self._assertHasRoot()
+ root = self.getroot()
+ if path[:1] == "/":
+ path = "." + path
+ return root.iterfind(path)
+
def xpath(self, _path, namespaces=None, extensions=None, **_variables):
"""XPath evaluate in context of document.
@@ -1918,11 +1938,13 @@
tail text.
"""
cdef object _nextEvent
+ cdef _Element _start_element
def __init__(self, _Element element not None, tag=None, with_tail=True):
if with_tail:
events = ("start", "end")
else:
events = ("start",)
+ self._start_element = element
self._nextEvent = iterwalk(element, events=events, tag=tag).next
def __iter__(self):
@@ -1931,10 +1953,10 @@
def __next__(self):
cdef _Element element
while result is None:
- event, element = self._nextEvent()
+ event, element = self._nextEvent() # raises StopIteration
if event == "start":
result = element.text
- else:
+ elif element is not self._start_element:
result = element.tail
return result
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Thu Sep 13 12:52:26 2007
@@ -1429,6 +1429,15 @@
self.assertEquals(["RTEXT", "ATAIL", "CTEXT", "CTAIL"],
text)
+ def test_itertext_child(self):
+ # ET 1.3+
+ XML = self.etree.XML
+ root = XML("<root>RTEXT<a></a>ATAIL<b/><c>CTEXT</c>CTAIL</root>")
+
+ text = list(root[2].itertext())
+ self.assertEquals(["CTEXT"],
+ text)
+
def test_findall_ns(self):
XML = self.etree.XML
root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')
More information about the lxml-checkins
mailing list