[Lxml-checkins] r46535 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Thu Sep 13 12:52:26 CEST 2007


Author: scoder
Date: Thu Sep 13 12:52:26 2007
New Revision: 46535

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/selftest.py
   lxml/trunk/src/lxml/_elementpath.py
   lxml/trunk/src/lxml/etree.pyx
   lxml/trunk/src/lxml/tests/test_etree.py
Log:
ET 1.3 compatibility updates: iterfind(), new ElementPath implementation, updated selftest.py, fix for itertext()

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Thu Sep 13 12:52:26 2007
@@ -8,6 +8,9 @@
 Features added
 --------------
 
+* ``iterfind()`` method on Elements returns an iterator equivalent to
+  ``findall()``
+
 * ``itertext()`` method on Elements
 
 * Setting a QName object as value of the .text property or as an attribute

Modified: lxml/trunk/selftest.py
==============================================================================
--- lxml/trunk/selftest.py	(original)
+++ lxml/trunk/selftest.py	Thu Sep 13 12:52:26 2007
@@ -1,4 +1,4 @@
-# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
+# $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $
 # -*- coding: iso-8859-1 -*-
 # elementtree selftest program
 
@@ -14,6 +14,7 @@
 from lxml import etree as ElementTree
 from lxml import _elementpath as ElementPath
 from lxml import ElementInclude
+ET = ElementTree
 
 #from elementtree import ElementTree
 #from elementtree import ElementPath
@@ -26,14 +27,11 @@
     xml_data = xml_data.replace(' />', '/>')
     return xml_data
 
-def serialize(elem, encoding=None):
+def serialize(elem, **options):
     import StringIO
     file = StringIO.StringIO()
     tree = ElementTree.ElementTree(elem)
-    if encoding:
-        tree.write(file, encoding)
-    else:
-        tree.write(file)
+    tree.write(file, **options)
     return fix_compatibility( file.getvalue() )
 
 def summarize(elem):
@@ -106,18 +104,21 @@
 # --------------------------------------------------------------------
 # element tree tests
 
-## def sanity():
-##     """
-##     >>> from elementtree.ElementTree import *
-##     >>> from elementtree.ElementInclude import *
-##     >>> from elementtree.ElementPath import *
-##     >>> from elementtree.HTMLTreeBuilder import *
-##     >>> from elementtree.SimpleXMLTreeBuilder import *
-##     >>> from elementtree.SimpleXMLWriter import *
-##     >>> from elementtree.TidyHTMLTreeBuilder import *
-##     >>> from elementtree.TidyTools import *
-##     >>> from elementtree.XMLTreeBuilder import *
-##     """
+def sanity():
+    """
+    >>> from elementtree.ElementTree import *
+    >>> from elementtree.ElementInclude import *
+    >>> from elementtree.ElementPath import *
+    >>> from elementtree.HTMLTreeBuilder import *
+    >>> from elementtree.SimpleXMLTreeBuilder import *
+    >>> from elementtree.SimpleXMLWriter import *
+    >>> from elementtree.TidyHTMLTreeBuilder import *
+    >>> from elementtree.TidyTools import *
+    >>> from elementtree.XMLTreeBuilder import *
+    """
+
+# doesn't work with lxml.etree
+del sanity
 
 def interface():
     """
@@ -129,38 +130,41 @@
     >>> check_element_tree(tree)
     """
 
-## def simplefind():
-##     """
-##     Test find methods using the elementpath fallback.
+def simplefind():
+    """
+    Test find methods using the elementpath fallback.
 
-##     >>> CurrentElementPath = ElementTree.ElementPath
-##     >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
-##     >>> elem = SAMPLE_XML
-##     >>> elem.find("tag").tag
-##     'tag'
-##     >>> ElementTree.ElementTree(elem).find("tag").tag
-##     'tag'
-##     >>> elem.findtext("tag")
-##     'text'
-##     >>> elem.findtext("tog")
-##     >>> elem.findtext("tog", "default")
-##     'default'
-##     >>> ElementTree.ElementTree(elem).findtext("tag")
-##     'text'
-##     >>> summarize_list(elem.findall("tag"))
-##     ['tag', 'tag']
-##     >>> summarize_list(elem.findall(".//tag"))
-##     ['tag', 'tag', 'tag']
+    >>> CurrentElementPath = ElementTree.ElementPath
+    >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
+    >>> elem = SAMPLE_XML
+    >>> elem.find("tag").tag
+    'tag'
+    >>> ElementTree.ElementTree(elem).find("tag").tag
+    'tag'
+    >>> elem.findtext("tag")
+    'text'
+    >>> elem.findtext("tog")
+    >>> elem.findtext("tog", "default")
+    'default'
+    >>> ElementTree.ElementTree(elem).findtext("tag")
+    'text'
+    >>> summarize_list(elem.findall("tag"))
+    ['tag', 'tag']
+    >>> summarize_list(elem.findall(".//tag"))
+    ['tag', 'tag', 'tag']
 
-##     Path syntax doesn't work in this case.
+    Path syntax doesn't work in this case.
 
-##     >>> elem.find("section/tag")
-##     >>> elem.findtext("section/tag")
-##     >>> elem.findall("section/tag")
-##     []
+    >>> elem.find("section/tag")
+    >>> elem.findtext("section/tag")
+    >>> elem.findall("section/tag")
+    []
 
-##     >>> ElementTree.ElementPath = CurrentElementPath
-##     """
+    >>> ElementTree.ElementPath = CurrentElementPath
+    """
+
+# doesn't work with lxml.etree
+del simplefind
 
 def find():
     """
@@ -216,10 +220,31 @@
     ['tag', 'tag', 'tag']
     >>> summarize_list(elem.findall("././tag"))
     ['tag', 'tag']
+
+##     >>> summarize_list(elem.findall(".//tag[@class]"))
+##     ['tag', 'tag', 'tag']
+##     >>> summarize_list(elem.findall(".//tag[@class='a']"))
+##     ['tag']
+##     >>> summarize_list(elem.findall(".//tag[@class='b']"))
+##     ['tag', 'tag']
+##     >>> summarize_list(elem.findall(".//tag[@id]"))
+##     ['tag']
+##     >>> summarize_list(elem.findall(".//section[tag]"))
+##     ['section']
+##     >>> summarize_list(elem.findall(".//section[element]"))
+##     []
+##     >>> summarize_list(elem.findall("../tag"))
+##     []
+##     >>> summarize_list(elem.findall("section/../tag"))
+##     ['tag', 'tag']
+##     >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
+##     ['tag', 'tag']
+
+    FIXME: ET's Path module handles this case incorrectly; this gives
+    a warning in 1.3, and the behaviour will be modified in 1.4.
+
     >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
     ['tag', 'tag']
-    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
-    ['tag', 'tag']
     """
 
 def bad_find():
@@ -230,15 +255,9 @@
     >>> elem.findall("/tag")
     Traceback (most recent call last):
     SyntaxError: cannot use absolute path on element
-    >>> elem.findall("../tag")
-    Traceback (most recent call last):
-    SyntaxError: unsupported path syntax (..)
     >>> elem.findall("section//")
     Traceback (most recent call last):
-    SyntaxError: path cannot end with //
-    >>> elem.findall("tag[tag]")
-    Traceback (most recent call last):
-    SyntaxError: expected path separator ([)
+    SyntaxError: invalid path
     """
 
 def parsefile():
@@ -261,6 +280,12 @@
        <element>text</element>tail
        <empty-element/>
     </root>
+
+##     <ns0:root xmlns:ns0="namespace">
+##        <ns0:element key="value">text</ns0:element>
+##        <ns0:element>text</ns0:element>tail
+##        <ns0:empty-element/>
+##     </ns0:root>
     """
 
 ## def parsehtml():
@@ -282,6 +307,12 @@
     >>> element = ElementTree.fromstring("<html><body>text</body></html>")
     >>> ElementTree.ElementTree(element).write(sys.stdout)
     <html><body>text</body></html>
+
+##     >>> sequence = ["<html><body>", "text</bo", "dy></html>"]
+##     >>> element = ElementTree.fromstringlist(sequence)
+##     >>> ElementTree.ElementTree(element).write(sys.stdout)
+##     <html><body>text</body></html>
+
     >>> print ElementTree.tostring(element)
     <html><body>text</body></html>
 
@@ -426,6 +457,11 @@
     >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
     >>> serialize(elem)
     '<tag>text<subtag>subtext</subtag></tag>'
+
+##     Test tag suppression
+##     >>> elem.tag = None
+##     >>> serialize(elem)
+##     'text<subtag>subtext</subtag>'
     """
 
 def writestring():
@@ -446,58 +482,95 @@
     >>> elem.text = u"abc"
     >>> serialize(elem)
     '<tag>abc</tag>'
-    >>> serialize(elem, "utf-8")
+    >>> serialize(elem, encoding="utf-8")
     '<tag>abc</tag>'
-    >>> serialize(elem, "us-ascii")
+    >>> serialize(elem, encoding="us-ascii")
     '<tag>abc</tag>'
-    >>> serialize(elem, "ISO-8859-1")
+    >>> serialize(elem, encoding="ISO-8859-1")
     "<?xml version='1.0' encoding='ISO-8859-1'?>\n<tag>abc</tag>"
 
     >>> elem.text = "<&\"\'>"
     >>> serialize(elem)
     '<tag>&lt;&amp;"\'&gt;</tag>'
-    >>> serialize(elem, "utf-8")
+    >>> serialize(elem, encoding="utf-8")
     '<tag>&lt;&amp;"\'&gt;</tag>'
-    >>> serialize(elem, "us-ascii") # cdata characters
+    >>> serialize(elem, encoding="us-ascii") # cdata characters
     '<tag>&lt;&amp;"\'&gt;</tag>'
-    >>> serialize(elem, "ISO-8859-1")
+    >>> serialize(elem, encoding="ISO-8859-1")
     '<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>'
 
 ##     >>> elem.attrib["key"] = "<&\"\'>"
 ##     >>> elem.text = None
 ##     >>> serialize(elem)
 ##     '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-##     >>> serialize(elem, "utf-8")
+##     >>> serialize(elem, encoding="utf-8")
 ##     '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-##     >>> serialize(elem, "us-ascii")
+##     >>> serialize(elem, encoding="us-ascii")
 ##     '<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
-##     >>> serialize(elem, "iso-8859-1")
+##     >>> serialize(elem, encoding="iso-8859-1")
 ##     '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;&apos;&gt;"/>'
 
     >>> elem.text = u'\xe5\xf6\xf6<>'
     >>> elem.attrib.clear()
     >>> serialize(elem)
     '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
-    >>> serialize(elem, "utf-8")
+    >>> serialize(elem, encoding="utf-8")
     '<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>'
-    >>> serialize(elem, "us-ascii")
+    >>> serialize(elem, encoding="us-ascii")
     '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
-    >>> serialize(elem, "ISO-8859-1")
+    >>> serialize(elem, encoding="ISO-8859-1")
     "<?xml version='1.0' encoding='ISO-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>"
 
 ##     >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
 ##     >>> elem.text = None
 ##     >>> serialize(elem)
 ##     '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
-##     >>> serialize(elem, "utf-8")
+##     >>> serialize(elem, encoding="utf-8")
 ##     '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>'
-##     >>> serialize(elem, "us-ascii")
+##     >>> serialize(elem, encoding="us-ascii")
 ##     '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
-##     >>> serialize(elem, "ISO-8859-1")
+##     >>> serialize(elem, encoding="ISO-8859-1")
 ##     '<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>'
 
     """
 
+def methods():
+    r"""
+    Test serialization methods.
+
+    >>> e = ET.XML("<html><link/><script>1 &lt; 2</script></html>")
+    >>> e.tail = "\n"
+    >>> serialize(e)
+    '<html><link /><script>1 &lt; 2</script></html>\n'
+    >>> serialize(e, method=None)
+    '<html><link /><script>1 &lt; 2</script></html>\n'
+    >>> serialize(e, method="xml")
+    '<html><link /><script>1 &lt; 2</script></html>\n'
+    >>> serialize(e, method="html")
+    '<html><link><script>1 < 2</script></html>\n'
+    >>> serialize(e, method="text")
+    '1 < 2\n'
+
+    """
+
+# doesn't work with lxml.etree
+del methods
+
+def iterators():
+    """
+    Test iterators.
+
+    >>> e = ET.XML("<html><body>this is a <i>paragraph</i>.</body>..</html>")
+    >>> summarize_list(e.iter())
+    ['html', 'body', 'i']
+    >>> summarize_list(e.find("body").iter())
+    ['body', 'i']
+    >>> "".join(e.itertext())
+    'this is a paragraph...'
+    >>> "".join(e.find("body").itertext())
+    'this is a paragraph.'
+    """
+
 ENTITY_XML = """\
 <!DOCTYPE points [
 <!ENTITY % user-entities SYSTEM 'user-entities.xml'>
@@ -506,40 +579,56 @@
 <document>&entity;</document>
 """
 
-## def entity():
-##     """
-##     Test entity handling.
+def entity():
+    """
+    Test entity handling.
 
-##     1) bad entities
+    1) bad entities
 
-##     >>> ElementTree.XML("<document>&entity;</document>")
-##     Traceback (most recent call last):
-##     ExpatError: undefined entity: line 1, column 10
+    >>> ElementTree.XML("<document>&entity;</document>")
+    Traceback (most recent call last):
+    ExpatError: undefined entity: line 1, column 10
 
-##     >>> ElementTree.XML(ENTITY_XML)
-##     Traceback (most recent call last):
-##     ExpatError: undefined entity &entity;: line 5, column 10
+    >>> ElementTree.XML(ENTITY_XML)
+    Traceback (most recent call last):
+    ExpatError: undefined entity &entity;: line 5, column 10
 
-##     (add more tests here)
+    (add more tests here)
 
-##     """
+    """
+
+# doesn't work with lxml.etree
+del entity
 
-def xmllang():
+def error(xml):
     """
-    This appears to be a problem; in underlying libxml2?
-    
-    1) xml namespace
+    Test error handling.
+
+    >>> error("foo").position
+    (1, 0)
+    >>> error("<tag>&foo;</tag>").position
+    (1, 5)
+    >>> error("foobar<").position
+    (1, 6)
 
-    >>> elem = ElementTree.XML("<tag xml:lang='en' />")
-    >>> serialize(elem) # 1.1
-    '<tag xml:lang="en"/>'
     """
-    
+    try:
+        ET.XML(xml)
+    except ET.ParseError:
+        return sys.exc_value
+
+# doesn't work with lxml.etree
+del error
+
 def namespace():
     """
     Test namespace issues.
 
+    1) xml namespace
 
+    >>> elem = ElementTree.XML("<tag xml:lang='en' />")
+    >>> serialize(elem) # 1.1
+    '<tag xml:lang="en"/>'
 
     2) other "well-known" namespaces
 
@@ -634,15 +723,15 @@
     >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
     ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
     >>> xpath_tokenizer("chapter//para")
-    ['chapter', '/', '/', 'para']
+    ['chapter', '//', 'para']
     >>> xpath_tokenizer("//para")
-    ['/', '/', 'para']
+    ['//', 'para']
     >>> xpath_tokenizer("//olist/item")
-    ['/', '/', 'olist', '/', 'item']
+    ['//', 'olist', '/', 'item']
     >>> xpath_tokenizer(".")
     ['.']
     >>> xpath_tokenizer(".//para")
-    ['.', '/', '/', 'para']
+    ['.', '//', 'para']
     >>> xpath_tokenizer("..")
     ['..']
     >>> xpath_tokenizer("../@lang")
@@ -658,7 +747,7 @@
     >>> xpath_tokenizer("./spam.egg")
     ['.', '/', 'spam.egg']
     >>> xpath_tokenizer(".//{http://spam}egg")
-    ['.', '/', '/', '{http://spam}egg']
+    ['.', '//', '{http://spam}egg']
     """
     out = []
     for op, tag in ElementPath.xpath_tokenizer(p):
@@ -811,70 +900,76 @@
 #
 # xmlwriter
 
-## def xmlwriter():
-##     r"""
-##     >>> file = StringIO.StringIO()
-##     >>> w = SimpleXMLWriter.XMLWriter(file)
-##     >>> html = w.start("html")
-##     >>> x = w.start("head")
-##     >>> w.element("title", "my document")
-##     >>> w.data("\n")
-##     >>> w.element("meta", name="hello", value="goodbye")
-##     >>> w.data("\n")
-##     >>> w.end()
-##     >>> x = w.start("body")
-##     >>> w.element("h1", "this is a heading")
-##     >>> w.data("\n")
-##     >>> w.element("p", u"this is a paragraph")
-##     >>> w.data("\n")
-##     >>> w.element("p", u"reserved characters: <&>")
-##     >>> w.data("\n")
-##     >>> w.element("p", u"detta är också ett stycke")
-##     >>> w.data("\n")
-##     >>> w.close(html)
-##     >>> print file.getvalue()
-##     <html><head><title>my document</title>
-##     <meta name="hello" value="goodbye" />
-##     </head><body><h1>this is a heading</h1>
-##     <p>this is a paragraph</p>
-##     <p>reserved characters: &lt;&amp;&gt;</p>
-##     <p>detta &#228;r ocks&#229; ett stycke</p>
-##     </body></html>
-##     """
+def xmlwriter():
+    r"""
+    >>> file = StringIO.StringIO()
+    >>> w = SimpleXMLWriter.XMLWriter(file)
+    >>> html = w.start("html")
+    >>> x = w.start("head")
+    >>> w.element("title", "my document")
+    >>> w.data("\n")
+    >>> w.element("meta", name="hello", value="goodbye")
+    >>> w.data("\n")
+    >>> w.end()
+    >>> x = w.start("body")
+    >>> w.element("h1", "this is a heading")
+    >>> w.data("\n")
+    >>> w.element("p", u"this is a paragraph")
+    >>> w.data("\n")
+    >>> w.element("p", u"reserved characters: <&>")
+    >>> w.data("\n")
+    >>> w.element("p", u"detta är också ett stycke")
+    >>> w.data("\n")
+    >>> w.close(html)
+    >>> print file.getvalue()
+    <html><head><title>my document</title>
+    <meta name="hello" value="goodbye" />
+    </head><body><h1>this is a heading</h1>
+    <p>this is a paragraph</p>
+    <p>reserved characters: &lt;&amp;&gt;</p>
+    <p>detta &#228;r ocks&#229; ett stycke</p>
+    </body></html>
+    """
+
+# doesn't work with lxml.etree
+del xmlwriter
 
 # --------------------------------------------------------------------
 # reported bugs
 
-## def bug_xmltoolkit21():
-##     """
-##     marshaller gives obscure errors for non-string values
+def bug_xmltoolkit21():
+    """
+    marshaller gives obscure errors for non-string values
 
-##     >>> elem = ElementTree.Element(123)
-##     >>> serialize(elem) # tag
-##     Traceback (most recent call last):
-##     TypeError: cannot serialize 123 (type int)
-##     >>> elem = ElementTree.Element("elem")
-##     >>> elem.text = 123
-##     >>> serialize(elem) # text
-##     Traceback (most recent call last):
-##     TypeError: cannot serialize 123 (type int)
-##     >>> elem = ElementTree.Element("elem")
-##     >>> elem.tail = 123
-##     >>> serialize(elem) # tail
-##     Traceback (most recent call last):
-##     TypeError: cannot serialize 123 (type int)
-##     >>> elem = ElementTree.Element("elem")
-##     >>> elem.set(123, "123")
-##     >>> serialize(elem) # attribute key
-##     Traceback (most recent call last):
-##     TypeError: cannot serialize 123 (type int)
-##     >>> elem = ElementTree.Element("elem")
-##     >>> elem.set("123", 123)
-##     >>> serialize(elem) # attribute value
-##     Traceback (most recent call last):
-##     TypeError: cannot serialize 123 (type int)
+    >>> elem = ElementTree.Element(123)
+    >>> serialize(elem) # tag
+    Traceback (most recent call last):
+    TypeError: cannot serialize 123 (type int)
+    >>> elem = ElementTree.Element("elem")
+    >>> elem.text = 123
+    >>> serialize(elem) # text
+    Traceback (most recent call last):
+    TypeError: cannot serialize 123 (type int)
+    >>> elem = ElementTree.Element("elem")
+    >>> elem.tail = 123
+    >>> serialize(elem) # tail
+    Traceback (most recent call last):
+    TypeError: cannot serialize 123 (type int)
+    >>> elem = ElementTree.Element("elem")
+    >>> elem.set(123, "123")
+    >>> serialize(elem) # attribute key
+    Traceback (most recent call last):
+    TypeError: cannot serialize 123 (type int)
+    >>> elem = ElementTree.Element("elem")
+    >>> elem.set("123", 123)
+    >>> serialize(elem) # attribute value
+    Traceback (most recent call last):
+    TypeError: cannot serialize 123 (type int)
 
-##     """
+    """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit21
 
 def bug_xmltoolkit25():
     """
@@ -898,92 +993,199 @@
     ['tbody']
     """
 
-## def bug_xmltoolkitX1():
-##     """
-##     dump() doesn't flush the output buffer
+def bug_xmltoolkitX1():
+    """
+    dump() doesn't flush the output buffer
 
-##     >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
-##     >>> ElementTree.dump(tree); sys.stdout.write("tail")
-##     <doc><table><tbody /></table></doc>
-##     tail
-##     """
+    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
+    >>> ElementTree.dump(tree); sys.stdout.write("tail")
+    <doc><table><tbody /></table></doc>
+    tail
+    """
 
-## def bug_xmltoolkit39():
-##     """
-##     non-ascii element and attribute names doesn't work
+# doesn't work with lxml.etree
+del bug_xmltoolkitX1
 
-##     >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
-##     >>> ElementTree.tostring(tree, "utf-8")
-##     '<t\\xc3\\xa4g />'
-
-##     >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='v&#228;lue' />")
-##     >>> tree.attrib
-##     {u'\\xe4ttr': u'v\\xe4lue'}
-##     >>> ElementTree.tostring(tree, "utf-8")
-##     '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
-
-##     >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
-##     >>> ElementTree.tostring(tree, "utf-8")
-##     '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'
-
-##     >>> tree = ElementTree.Element(u"täg")
-##     >>> ElementTree.tostring(tree, "utf-8")
-##     '<t\\xc3\\xa4g />'
-
-##     >>> tree = ElementTree.Element("tag")
-##     >>> tree.set(u"ättr", u"välue")
-##     >>> ElementTree.tostring(tree, "utf-8")
-##     '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
+def bug_xmltoolkit39():
+    """
+    non-ascii element and attribute names doesn't work
 
-##     """
+    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
+    >>> ElementTree.tostring(tree, "utf-8")
+    '<t\\xc3\\xa4g />'
 
-## def bug_xmltoolkit45():
-##     """
-##     problems parsing mixed unicode/non-ascii html documents
+    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='v&#228;lue' />")
+    >>> tree.attrib
+    {u'\\xe4ttr': u'v\\xe4lue'}
+    >>> ElementTree.tostring(tree, "utf-8")
+    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
 
-##     latin-1 text
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<p>välue</p>")
-##     >>> serialize(p.close())
-##     '<p>v&#228;lue</p>'
+    >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
+    >>> ElementTree.tostring(tree, "utf-8")
+    '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'
 
-##     utf-8 text
-##     >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
-##     >>> p.feed("<p>v\xc3\xa4lue</p>")
-##     >>> serialize(p.close())
-##     '<p>v&#228;lue</p>'
+    >>> tree = ElementTree.Element(u"täg")
+    >>> ElementTree.tostring(tree, "utf-8")
+    '<t\\xc3\\xa4g />'
 
-##     utf-8 text using meta tag
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
-##     >>> serialize(p.close().find("p"))
-##     '<p>v&#228;lue</p>'
-
-##     latin-1 character references
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<p>v&#228;lue</p>")
-##     >>> serialize(p.close())
-##     '<p>v&#228;lue</p>'
+    >>> tree = ElementTree.Element("tag")
+    >>> tree.set(u"ättr", u"välue")
+    >>> ElementTree.tostring(tree, "utf-8")
+    '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
 
-##     latin-1 character entities
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<p>v&auml;lue</p>")
-##     >>> serialize(p.close())
-##     '<p>v&#228;lue</p>'
+    """
 
-##     mixed latin-1 text and unicode entities
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<p>&#8221;välue&#8221;</p>")
-##     >>> serialize(p.close())
-##     '<p>&#8221;v&#228;lue&#8221;</p>'
+# doesn't work with lxml.etree
+del bug_xmltoolkit39
 
-##     mixed unicode and latin-1 entities
-##     >>> p = HTMLTreeBuilder.TreeBuilder()
-##     >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
-##     >>> serialize(p.close())
-##     '<p>&#8221;v&#228;lue&#8221;</p>'
+def bug_xmltoolkit45():
+    """
+    problems parsing mixed unicode/non-ascii html documents
 
-##     """
+    latin-1 text
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>välue</p>")
+    >>> serialize(p.close())
+    '<p>v&#228;lue</p>'
+
+    utf-8 text
+    >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
+    >>> p.feed("<p>v\xc3\xa4lue</p>")
+    >>> serialize(p.close())
+    '<p>v&#228;lue</p>'
+
+    utf-8 text using meta tag
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
+    >>> serialize(p.close().find("p"))
+    '<p>v&#228;lue</p>'
+
+    latin-1 character references
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>v&#228;lue</p>")
+    >>> serialize(p.close())
+    '<p>v&#228;lue</p>'
+
+    latin-1 character entities
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>v&auml;lue</p>")
+    >>> serialize(p.close())
+    '<p>v&#228;lue</p>'
+
+    mixed latin-1 text and unicode entities
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>&#8221;välue&#8221;</p>")
+    >>> serialize(p.close())
+    '<p>&#8221;v&#228;lue&#8221;</p>'
+
+    mixed unicode and latin-1 entities
+    >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
+    >>> serialize(p.close())
+    '<p>&#8221;v&#228;lue&#8221;</p>'
+
+    """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit45
+
+def bug_xmltoolkit46():
+    """
+    problems parsing open BR tags
+
+   >>> p = HTMLTreeBuilder.TreeBuilder()
+    >>> p.feed("<p>key<br>value</p>")
+    >>> serialize(p.close())
+    '<p>key<br />value</p>'
+
+    """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit46
+
+def bug_xmltoolkit54():
+    """
+    problems handling internally defined entities
+
+    >>> e = ElementTree.XML("<!DOCTYPE doc [<!ENTITY ldots '&#x8230;'>]><doc>&ldots;</doc>")
+    >>> serialize(e)
+    '<doc>&#33328;</doc>'
+    """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit54
+
+def bug_xmltoolkit55():
+    """
+    make sure we're reporting the first error, not the last
+
+    >>> e = ElementTree.XML("<!DOCTYPE doc SYSTEM 'doc.dtd'><doc>&ldots;&ndots;&rdots;</doc>")
+    Traceback (most recent call last):
+    ParseError: undefined entity &ldots;: line 1, column 36
+    """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit55
+
+def bug_200708_version():
+    """
+    >>> parser = ET.XMLParser()
+    >>> parser.version
+    'Expat 2.0.0'
+    >>> parser.feed(open("samples/simple.xml").read())
+    >>> print serialize(parser.close())
+    <root>
+       <element key="value">text</element>
+       <element>text</element>tail
+       <empty-element />
+    </root>
+    """
+
+# doesn't work with lxml.etree
+del bug_200708_version
+
+def bug_200708_newline():
+    r"""
+
+    Preserve newlines in attributes.
+
+    >>> e = ET.Element('SomeTag', text="def _f():\n  return 3\n")
+    >>> ET.tostring(e)
+    '<SomeTag text="def _f():&#10;  return 3&#10;" />'
+    >>> ET.XML(ET.tostring(e)).get("text")
+    'def _f():\n  return 3\n'
+    >>> ET.tostring(ET.XML(ET.tostring(e)))
+    '<SomeTag text="def _f():&#10;  return 3&#10;" />'
+    """
+
+# doesn't work with lxml.etree
+del bug_200708_newline
+
+def bug_200709_default_namespace():
+    """
+
+    >>> e = ET.Element("{default}elem")
+    >>> s = ET.SubElement(e, "{default}elem")
+    >>> serialize(e, default_namespace="default") # 1
+    '<elem xmlns="default"><elem /></elem>'
+
+    >>> e = ET.Element("{default}elem")
+    >>> s = ET.SubElement(e, "{default}elem")
+    >>> s = ET.SubElement(e, "{not-default}elem")
+    >>> serialize(e, default_namespace="default") # 2
+    '<elem xmlns="default" xmlns:ns1="not-default"><elem /><ns1:elem /></elem>'
+
+    >>> e = ET.Element("{default}elem")
+    >>> s = ET.SubElement(e, "{default}elem")
+    >>> s = ET.SubElement(e, "elem") # unprefixed name
+    >>> serialize(e, default_namespace="default") # 3
+    Traceback (most recent call last):
+    ValueError: cannot use non-qualified names with default_namespace option
+
+    """
+
+# doesn't work with lxml.etree
+del bug_200709_default_namespace
 
 # --------------------------------------------------------------------
 

Modified: lxml/trunk/src/lxml/_elementpath.py
==============================================================================
--- lxml/trunk/src/lxml/_elementpath.py	(original)
+++ lxml/trunk/src/lxml/_elementpath.py	Thu Sep 13 12:52:26 2007
@@ -1,4 +1,6 @@
-# This file is taken from ElementTree directly, unchanged beyond this line.
+#
+# ElementTree
+# $Id: ElementPath.py 3276 2007-09-12 06:52:30Z fredrik $
 #
 # limited xpath support for element trees
 #
@@ -6,8 +8,9 @@
 # 2003-05-23 fl   created
 # 2003-05-28 fl   added support for // etc
 # 2003-08-27 fl   fixed parsing of periods in element names
+# 2007-09-10 fl   new selection engine
 #
-# Copyright (c) 2003-2004 by Fredrik Lundh.  All rights reserved.
+# Copyright (c) 2003-2007 by Fredrik Lundh.  All rights reserved.
 #
 # fredrik at pythonware.com
 # http://www.pythonware.com
@@ -15,7 +18,7 @@
 # --------------------------------------------------------------------
 # The ElementTree toolkit is
 #
-# Copyright (c) 1999-2004 by Fredrik Lundh
+# Copyright (c) 1999-2007 by Fredrik Lundh
 #
 # By obtaining, using, and/or copying this software and/or its
 # associated documentation, you agree that you have read, understood,
@@ -49,146 +52,178 @@
 import re
 
 xpath_tokenizer = re.compile(
-    "(::|\.\.|\(\)|[/.*:\[\]\(\)@=])|((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|\s+"
+    "("
+    "'[^']*'|\"[^\"]*\"|"
+    "::|"
+    "//?|"
+    "\.\.|"
+    "\(\)|"
+    "[/.*:\[\]\(\)@=])|"
+    "((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|"
+    "\s+"
     ).findall
 
-class xpath_descendant_or_self:
-    pass
-
-##
-# Wrapper for a compiled XPath.
-
-class Path:
-
-    ##
-    # Create an Path instance from an XPath expression.
-
-    def __init__(self, path):
-        tokens = xpath_tokenizer(path)
-        # the current version supports 'path/path'-style expressions only
-        self.path = []
-        self.tag = None
-        if tokens and tokens[0][0] == "/":
-            raise SyntaxError("cannot use absolute path on element")
-        while tokens:
-            op, tag = tokens.pop(0)
-            if tag or op == "*":
-                self.path.append(tag or op)
-            elif op == ".":
-                pass
-            elif op == "/":
-                self.path.append(xpath_descendant_or_self())
-                continue
-            else:
-                raise SyntaxError("unsupported path syntax (%s)" % op)
-            if tokens:
-                op, tag = tokens.pop(0)
-                if op != "/":
-                    raise SyntaxError(
-                        "expected path separator (%s)" % (op or tag)
-                        )
-        if self.path and isinstance(self.path[-1], xpath_descendant_or_self):
-            raise SyntaxError("path cannot end with //")
-        if len(self.path) == 1 and isinstance(self.path[0], type("")):
-            self.tag = self.path[0]
-
-    ##
-    # Find first matching object.
-
-    def find(self, element):
-        tag = self.tag
-        if tag is None:
-            nodeset = self.findall(element)
-            if not nodeset:
-                return None
-            return nodeset[0]
-        for elem in element:
-            if elem.tag == tag:
-                return elem
-        return None
-
-    ##
-    # Find text for first matching object.
-
-    def findtext(self, element, default=None):
-        tag = self.tag
-        if tag is None:
-            nodeset = self.findall(element)
-            if not nodeset:
-                return default
-            return nodeset[0].text or ""
-        for elem in element:
-            if elem.tag == tag:
-                return elem.text or ""
-        return default
-
-    ##
-    # Find all matching objects.
-
-    def findall(self, element):
-        nodeset = [element]
-        index = 0
-        while 1:
-            try:
-                path = self.path[index]
-                index = index + 1
-            except IndexError:
-                return nodeset
-            set = []
-            if isinstance(path, xpath_descendant_or_self):
-                try:
-                    tag = self.path[index]
-                    if not isinstance(tag, type("")):
-                        tag = None
-                    else:
-                        index = index + 1
-                except IndexError:
-                    tag = None # invalid path
-                for node in nodeset:
-                    new = list(node.getiterator(tag))
-                    if new and new[0] is node:
-                        set.extend(new[1:])
-                    else:
-                        set.extend(new)
+def prepare_tag(next, token):
+    tag = token[1]
+    def select(context, result):
+        for elem in result:
+            for e in elem:
+                if e.tag == tag:
+                    yield e
+    return select
+
+def prepare_star(next, token):
+    def select(context, result):
+        for elem in result:
+            for e in elem:
+                yield e
+    return select
+
+def prepare_dot(next, token):
+    def select(context, result):
+        for elem in result:
+            yield elem
+    return select
+
+def prepare_iter(next, token):
+    token = next()
+    if token[0] == "*":
+        tag = "*"
+    elif not token[0]:
+        tag = token[1]
+    else:
+        raise SyntaxError
+    def select(context, result):
+        for elem in result:
+            for e in elem.iter(tag):
+                if e is not elem:
+                    yield e
+    return select
+
+def prepare_dot_dot(next, token):
+    def select(context, result):
+        parent_map = context.parent_map
+        if parent_map is None:
+            context.parent_map = parent_map = {}
+            for p in context.root.iter():
+                for e in p:
+                    parent_map[e] = p
+        for elem in result:
+            if elem in parent_map:
+                yield parent_map[elem]
+    return select
+
+def prepare_predicate(next, token):
+    # this one should probably be refactored...
+    token = next()
+    if token[0] == "@":
+        # attribute
+        token = next()
+        if token[0]:
+            raise SyntaxError("invalid attribute predicate")
+        key = token[1]
+        token = next()
+        if token[0] == "]":
+            def select(context, result):
+                for elem in result:
+                    if elem.get(key) is not None:
+                        yield elem
+        elif token[0] == "=":
+            value = next()[0]
+            if value[:1] == "'" or value[:1] == '"':
+                value = value[1:-1]
             else:
-                for node in nodeset:
-                    for node in node:
-                        if path == "*" or node.tag == path:
-                            set.append(node)
-            if not set:
-                return []
-            nodeset = set
+                raise SyntaxError("invalid comparision target")
+            token = next()
+            def select(context, result):
+                for elem in result:
+                    if elem.get(key) == value:
+                        yield elem
+        if token[0] != "]":
+            raise SyntaxError("invalid attribute predicate")
+    elif not token[0]:
+        tag = token[1]
+        token = next()
+        if token[0] != "]":
+            raise SyntaxError("invalid node predicate")
+        def select(context, result):
+            for elem in result:
+                if elem.find(tag) is not None:
+                    yield elem
+    else:
+        raise SyntaxError("invalid predicate")
+    return select
+
+ops = {
+    "": prepare_tag,
+    "*": prepare_star,
+    ".": prepare_dot,
+    "..": prepare_dot_dot,
+    "//": prepare_iter,
+    "[": prepare_predicate,
+    }
 
 _cache = {}
 
-##
-# (Internal) Compile path.
+class _SelectorContext:
+    parent_map = None
+    def __init__(self, root):
+        self.root = root
 
-def _compile(path):
-    p = _cache.get(path)
-    if p is not None:
-        return p
-    p = Path(path)
-    if len(_cache) >= 100:
-        _cache.clear()
-    _cache[path] = p
-    return p
+# --------------------------------------------------------------------
 
 ##
 # Find first matching object.
 
-def find(element, path):
-    return _compile(path).find(element)
+def find(elem, path):
+    try:
+        return iterfind(elem, path).next()
+    except StopIteration:
+        return None
 
 ##
-# Find text for first matching object.
+# Find all matching objects.
 
-def findtext(element, path, default=None):
-    return _compile(path).findtext(element, default)
+def findall(elem, path):
+    return list(iterfind(elem, path))
 
-##
-# Find all matching objects.
+def iterfind(elem, path):
+    # compile selector pattern
+    try:
+        selector = _cache[path]
+    except KeyError:
+        if len(_cache) > 100:
+            _cache.clear()
+        if path[:1] == "/":
+            raise SyntaxError("cannot use absolute path on element")
+        stream = iter(xpath_tokenizer(path))
+        next = stream.next; token = next()
+        selector = []
+        while 1:
+            try:
+                selector.append(ops[token[0]](next, token))
+            except StopIteration:
+                raise SyntaxError("invalid path")
+            try:
+                token = next()
+                if token[0] == "/":
+                    token = next()
+            except StopIteration:
+                break
+        _cache[path] = selector
+    # execute selector pattern
+    result = [elem]
+    context = _SelectorContext(elem)
+    for select in selector:
+        result = select(context, result)
+    return result
 
-def findall(element, path):
-    return _compile(path).findall(element)
+##
+# Find text for first matching object.
 
+def findtext(elem, path, default=None):
+    try:
+        elem = iterfind(elem, path).next()
+        return elem.text
+    except StopIteration:
+        return default

Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx	(original)
+++ lxml/trunk/src/lxml/etree.pyx	Thu Sep 13 12:52:26 2007
@@ -1130,6 +1130,13 @@
             path = (<QName>path).text
         return _elementpath.findall(self, path)
 
+    def iterfind(self, path):
+        """Iterates over all matching subelements, by tag name or path.
+        """
+        if isinstance(path, QName):
+            path = (<QName>path).text
+        return _elementpath.iterfind(self, path)
+
     def xpath(self, _path, namespaces=None, extensions=None, **_variables):
         """Evaluate an xpath expression using the element as context node.
         """
@@ -1423,8 +1430,8 @@
         return root.iter(tag)
 
     def iter(self, tag=None):
-        """Creates an iterator for the root element. The iterator loops over all elements
-        in this tree, in document order.
+        """Creates an iterator for the root element.  The iterator loops over
+        all elements in this tree, in document order.
         """
         root = self.getroot()
         if root is None:
@@ -1432,7 +1439,8 @@
         return root.iter(tag)
 
     def find(self, path):
-        """Finds the first toplevel element with given tag. Same as getroot().find(path).
+        """Finds the first toplevel element with given tag.  Same as
+        getroot().find(path).
         """
         self._assertHasRoot()
         root = self.getroot()
@@ -1441,7 +1449,8 @@
         return root.find(path)
 
     def findtext(self, path, default=None):
-        """Finds the element text for the first toplevel element with given tag. Same as getroot().findtext(path)
+        """Finds the text for the first element matching the ElementPath
+        expression.  Same as getroot().findtext(path)
         """
         self._assertHasRoot()
         root = self.getroot()
@@ -1450,14 +1459,25 @@
         return root.findtext(path, default)
 
     def findall(self, path):
-        """Finds all toplevel elements with the given tag. Same as getroot().findall(path).
+        """Finds all elements matching the ElementPath expression.  Same as
+        getroot().findall(path).
         """
         self._assertHasRoot()
         root = self.getroot()
         if path[:1] == "/":
             path = "." + path
         return root.findall(path)
-    
+
+    def iterfind(self, path):
+        """Iterates over all elements matching the ElementPath expression.
+        Same as getroot().iterfind(path).
+        """
+        self._assertHasRoot()
+        root = self.getroot()
+        if path[:1] == "/":
+            path = "." + path
+        return root.iterfind(path)
+
     def xpath(self, _path, namespaces=None, extensions=None, **_variables):
         """XPath evaluate in context of document.
 
@@ -1918,11 +1938,13 @@
     tail text.
     """
     cdef object _nextEvent
+    cdef _Element _start_element
     def __init__(self, _Element element not None, tag=None, with_tail=True):
         if with_tail:
             events = ("start", "end")
         else:
             events = ("start",)
+        self._start_element = element
         self._nextEvent = iterwalk(element, events=events, tag=tag).next
 
     def __iter__(self):
@@ -1931,10 +1953,10 @@
     def __next__(self):
         cdef _Element element
         while result is None:
-            event, element = self._nextEvent()
+            event, element = self._nextEvent() # raises StopIteration
             if event == "start":
                 result = element.text
-            else:
+            elif element is not self._start_element:
                 result = element.tail
         return result
 

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Thu Sep 13 12:52:26 2007
@@ -1429,6 +1429,15 @@
         self.assertEquals(["RTEXT", "ATAIL", "CTEXT", "CTAIL"],
                           text)
 
+    def test_itertext_child(self):
+        # ET 1.3+
+        XML = self.etree.XML
+        root = XML("<root>RTEXT<a></a>ATAIL<b/><c>CTEXT</c>CTAIL</root>")
+
+        text = list(root[2].itertext())
+        self.assertEquals(["CTEXT"],
+                          text)
+
     def test_findall_ns(self):
         XML = self.etree.XML
         root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')


More information about the lxml-checkins mailing list