# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
# -*- coding: iso-8859-1 -*-
# elementtree selftest program
# this test script uses Python's "doctest" module to check that the
# *test script* works as expected.
# TODO: add more elementtree method tests
# TODO: add xml/html parsing tests
# TODO: etc
import sys, string, StringIO
from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath
#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter
def serialize(elem, encoding=None):
    """Write *elem* to an in-memory buffer and return the buffer contents.

    The element is wrapped in an ``ElementTree`` and written with its
    ``write`` method.  *encoding* is forwarded to ``write`` only when it is
    truthy; otherwise ``write`` is called with the buffer alone.
    """
    import StringIO

    stream = StringIO.StringIO()
    # Build the positional arguments once so there is a single write() call.
    write_args = [stream]
    if encoding:
        write_args.append(encoding)
    ElementTree.ElementTree(elem).write(*write_args)
    return stream.getvalue()
def summarize(elem):
    """Return the ``tag`` attribute of *elem* as its one-value summary."""
    tag = elem.tag
    return tag
def summarize_list(seq):
    """Return a list holding ``summarize(item)`` for every item of *seq*.

    A real list is built (rather than returning ``map(...)``) because the
    doctests in this file compare the result against list literals; on
    Python 3 ``map`` returns a lazy iterator, which would never compare or
    print equal to those literals.
    """
    return [summarize(item) for item in seq]
def normalize_crlf(tree):
    """Replace CRLF line endings with LF in the text of every element.

    Walks everything yielded by ``tree.getiterator()`` and rewrites the
    ``text`` and ``tail`` attributes in place.  Attributes that are empty or
    ``None`` are left untouched.  Returns ``None``.
    """
    for elem in tree.getiterator():
        # Use the str method instead of string.replace(): the module-level
        # function only exists on Python 2, the method works everywhere.
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")
# Shared fixture for the find/findtext/findall doctests in this file: two
# <tag> children (the first holding "text") followed by a <section> that
# wraps a third <tag> holding "subtext".
# NOTE(review): the markup was missing from this literal, which made
# ElementTree.XML() fail at import time.  The structure is reconstructed from
# what the find() doctests expect; the root element's name is not checked by
# any visible test -- confirm it against the upstream file.
SAMPLE_XML = ElementTree.XML("""
<body>
  <tag>text</tag>
  <tag />
  <section>
    <tag>subtext</tag>
  </section>
</body>
""")
#
# interface tests
def check_string(string):
    """Exercise the string interface of *string*.

    Calls ``len``, iterates over the value, concatenates it with ``""`` and
    ``" "`` and takes an empty slice.  Any of these raises if the object does
    not support the operation.  For every iterated item whose length is not 1
    a diagnostic line is printed.  Returns ``None``.
    """
    len(string)
    for char in string:
        if len(char) != 1:
            # Parenthesised single-argument print works on Python 2 and 3.
            # The 1-tuple keeps a tuple-valued item from being unpacked as
            # several format arguments (which would raise TypeError).
            print("expected one-character string, got %r" % (char,))
    # The results are irrelevant; the operations only have to succeed.
    string + ""
    string + " "
    string[:0]
def check_string_or_none(value):
    """Run ``check_string`` on *value* unless it is ``None``.

    ``None`` is accepted as-is and nothing is checked; any other value is
    handed to ``check_string`` and that call's result is returned.
    """
    if value is not None:
        return check_string(value)
    return None
def check_mapping(mapping):
    """Exercise the mapping interface of *mapping*.

    Calls ``len``, ``keys()`` and ``items()``, looks up every key, then
    stores ``"value"`` under ``"key"`` and reads it back.  A diagnostic line
    is printed when the value read back differs.  Note that the mapping is
    modified: it contains the entry ``"key": "value"`` afterwards.
    Returns ``None``.
    """
    len(mapping)
    keys = mapping.keys()
    mapping.items()  # only has to be callable; the result is not needed
    for key in keys:
        mapping[key]  # lookup must succeed for every reported key
    mapping["key"] = "value"
    stored = mapping["key"]
    if stored != "value":
        # Parenthesised single-argument print works on Python 2 and 3.
        # The 1-tuple keeps a tuple-valued result from being unpacked as
        # several format arguments (which would raise TypeError).
        print("expected value string, got %r" % (stored,))
def check_element(element):
    """Recursively exercise the element interface of *element*.

    Prints one diagnostic line for each of the ``tag``, ``attrib``, ``text``
    and ``tail`` attributes that is missing, then runs the string/mapping
    checks on those attributes and repeats the whole check for every item
    obtained by iterating over the element.

    A missing attribute is only reported, not skipped: the attribute access
    that follows still raises ``AttributeError`` for it.
    """
    for member in ("tag", "attrib", "text", "tail"):
        if not hasattr(element, member):
            # Parenthesised single-argument print works on Python 2 and 3.
            print("no %s member" % member)
    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)
    for child in element:
        check_element(child)
def check_element_tree(tree):
    """Run ``check_element`` on the element returned by ``tree.getroot()``."""
    root = tree.getroot()
    check_element(root)
# --------------------------------------------------------------------
# element tree tests
## def sanity():
## """
## >>> from elementtree.ElementTree import *
## >>> from elementtree.ElementInclude import *
## >>> from elementtree.ElementPath import *
## >>> from elementtree.HTMLTreeBuilder import *
## >>> from elementtree.SimpleXMLTreeBuilder import *
## >>> from elementtree.SimpleXMLWriter import *
## >>> from elementtree.TidyHTMLTreeBuilder import *
## >>> from elementtree.TidyTools import *
## >>> from elementtree.XMLTreeBuilder import *
## """
# Doctest holder: the function has no body besides its docstring.  The
# examples are run by the doctest.testmod() call in this file's __main__
# block; check_element() and check_element_tree() print a line for each
# problem they find, so the examples expect no output at all.
def interface():
    """
    Test element tree interface.
    >>> element = ElementTree.Element("tag")
    >>> check_element(element)
    >>> tree = ElementTree.ElementTree(element)
    >>> check_element_tree(tree)
    """
## def simplefind():
## """
## Test find methods using the elementpath fallback.
## >>> CurrentElementPath = ElementTree.ElementPath
## >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
## >>> elem = SAMPLE_XML
## >>> elem.find("tag").tag
## 'tag'
## >>> ElementTree.ElementTree(elem).find("tag").tag
## 'tag'
## >>> elem.findtext("tag")
## 'text'
## >>> elem.findtext("tog")
## >>> elem.findtext("tog", "default")
## 'default'
## >>> ElementTree.ElementTree(elem).findtext("tag")
## 'text'
## >>> summarize_list(elem.findall("tag"))
## ['tag', 'tag']
## >>> summarize_list(elem.findall(".//tag"))
## ['tag', 'tag', 'tag']
## Path syntax doesn't work in this case.
## >>> elem.find("section/tag")
## >>> elem.findtext("section/tag")
## >>> elem.findall("section/tag")
## []
## >>> ElementTree.ElementPath = CurrentElementPath
## """
# Doctest holder: the function has no body besides its docstring.  Every
# example queries the module-level SAMPLE_XML fixture, either directly on the
# element or through an ElementTree wrapped around it, and summarize_list()
# reduces each findall() result to a list of tag names.
def find():
    """
    Test find methods (including xpath syntax).
    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.find("section/tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("section/tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> elem.findtext("section/tag")
    'subtext'
    >>> ElementTree.ElementTree(elem).findtext("section/tag")
    'subtext'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall("*"))
    ['tag', 'tag', 'section']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("section/tag"))
    ['tag']
    >>> summarize_list(elem.findall("section//tag"))
    ['tag']
    >>> summarize_list(elem.findall("section/*"))
    ['tag']
    >>> summarize_list(elem.findall("section//*"))
    ['tag']
    >>> summarize_list(elem.findall("section/.//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/*"))
    ['tag']
    >>> summarize_list(elem.findall("*//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/tag"))
    ['tag']
    >>> summarize_list(elem.findall("*/./tag"))
    ['tag']
    >>> summarize_list(elem.findall("./tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("././tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
    ['tag', 'tag']
    """
# Doctest holder: the function has no body besides its docstring.  Each
# example passes a path expression to findall() on the SAMPLE_XML fixture and
# expects a SyntaxError with the exact message shown.
def bad_find():
    """
    Check bad or unsupported path expressions.
    >>> elem = SAMPLE_XML
    >>> elem.findall("/tag")
    Traceback (most recent call last):
    SyntaxError: cannot use absolute path on element
    >>> elem.findall("../tag")
    Traceback (most recent call last):
    SyntaxError: unsupported path syntax (..)
    >>> elem.findall("section//")
    Traceback (most recent call last):
    SyntaxError: path cannot end with //
    >>> elem.findall("tag[tag]")
    Traceback (most recent call last):
    SyntaxError: expected path separator ([)
    """
# Doctest holder: the function has no body besides its docstring.  The
# examples parse two files given by relative path ("samples/simple.xml" and
# "samples/simple-ns.xml"), so they depend on the current working directory.
# NOTE(review): the expected output after each tree.write(sys.stdout) holds
# only character data ("text", "texttail") and no element markup, which an
# XML writer would not produce -- the tags appear to have been lost from this
# docstring.  Restore the expected output from the sample files before
# relying on this test.
def parsefile():
    """
    Test parsing from file.
    >>> tree = ElementTree.parse("samples/simple.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text
    texttail
    >>> tree = ElementTree.parse("samples/simple-ns.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text
    texttail
    """
## def parsehtml():
## """
## Test HTML parsing.
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p><p>spam<b>egg</b></p>")
## >>> serialize(p.close())
## '<p>spam<b>egg</b></p>'
## """
## def parseliteral():
## r"""
## >>> element = ElementTree.XML("text")
## >>> ElementTree.ElementTree(element).write(sys.stdout)
## text
## >>> element = ElementTree.fromstring("text")
## >>> ElementTree.ElementTree(element).write(sys.stdout)
## text
## >>> print ElementTree.tostring(element)
## text
## >>> print ElementTree.tostring(element, "ascii")
##
## text
## >>> _, ids = ElementTree.XMLID("text")
## >>> len(ids)
## 0
## >>> _, ids = ElementTree.XMLID("text")
## >>> len(ids)
## 1
## >>> ids["body"].tag
## 'body'
## """
## def simpleparsefile():
## """
## Test the xmllib-based parser.
## >>> from elementtree import SimpleXMLTreeBuilder
## >>> parser = SimpleXMLTreeBuilder.TreeBuilder()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## >>> normalize_crlf(tree)
## >>> tree.write(sys.stdout)
##
## text
## texttail
##
##
## """
## def fancyparsefile():
## """
## Test the "fancy" parser.
## Sanity check.
## >>> from elementtree import XMLTreeBuilder
## >>> parser = XMLTreeBuilder.FancyTreeBuilder()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## >>> normalize_crlf(tree)
## >>> tree.write(sys.stdout)
##
## text
## texttail
##
##
## Callback check.
## >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder):
## ... def start(self, elem):
## ... print "START", elem.tag
## ... def end(self, elem):
## ... print "END", elem.tag
## >>> parser = MyFancyParser()
## >>> tree = ElementTree.parse("samples/simple.xml", parser)
## START root
## START element
## END element
## START element
## END element
## START empty-element
## END empty-element
## END root
## """
# Doctest holder: the function has no body besides its docstring.  It builds
# an element in code and checks what serialize() writes for it, before and
# after a child element is added.
# The expected strings spell out the markup for the elements created in the
# examples ("tag" and "subtag"); they previously contained only the text
# content, which no serializer would produce.
def writefile():
    """
    >>> elem = ElementTree.Element("tag")
    >>> elem.text = "text"
    >>> serialize(elem)
    '<tag>text</tag>'
    >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
    >>> serialize(elem)
    '<tag>text<subtag>subtext</subtag></tag>'
    """
# Doctest holder: the function has no body besides its docstring.  It parses
# a literal with XML() and with fromstring() and checks that tostring()
# gives the same markup back.
# NOTE(review): the literals had lost their markup (a bare "text" is not a
# well-formed document, so both parse calls raised).  The <html><body>
# wrapper below is a reconstruction -- confirm the element names against the
# upstream file.
def writestring():
    """
    >>> elem = ElementTree.XML("<html><body>text</body></html>")
    >>> ElementTree.tostring(elem)
    '<html><body>text</body></html>'
    >>> elem = ElementTree.fromstring("<html><body>text</body></html>")
    >>> ElementTree.tostring(elem)
    '<html><body>text</body></html>'
    """
## def encoding():
## r"""
## Test encoding issues.
## >>> elem = ElementTree.Element("tag")
## >>> elem.text = u"abc"
## >>> serialize(elem)
## 'abc'
## >>> serialize(elem, "utf-8")
## 'abc'
## >>> serialize(elem, "us-ascii")
## 'abc'
## >>> serialize(elem, "iso-8859-1")
## "\nabc"
## >>> elem.text = "<&\"\'>"
## >>> serialize(elem)
## '<&"\'>'
## >>> serialize(elem, "utf-8")
## '<&"\'>'
## >>> serialize(elem, "us-ascii") # cdata characters
## '<&"\'>'
## >>> serialize(elem, "iso-8859-1")
## '\n<&"\'>'
## >>> elem.attrib["key"] = "<&\"\'>"
## >>> elem.text = None
## >>> serialize(elem)
## ''
## >>> serialize(elem, "utf-8")
## ''
## >>> serialize(elem, "us-ascii")
## ''
## >>> serialize(elem, "iso-8859-1")
## '\n'
## >>> elem.text = u'\xe5\xf6\xf6<>'
## >>> elem.attrib.clear()
## >>> serialize(elem)
## 'åöö<>'
## >>> serialize(elem, "utf-8")
## '\xc3\xa5\xc3\xb6\xc3\xb6<>'
## >>> serialize(elem, "us-ascii")
## 'åöö<>'
## >>> serialize(elem, "iso-8859-1")
## "\n\xe5\xf6\xf6<>"
## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
## >>> elem.text = None
## >>> serialize(elem)
## ''
## >>> serialize(elem, "utf-8")
## ''
## >>> serialize(elem, "us-ascii")
## ''
## >>> serialize(elem, "iso-8859-1")
## '\n'
## """
# Document that references "&entity;", which is only declared (if at all) in
# an external parameter-entity file, so a parser that does not load that file
# reports it as undefined.  The reference sits on line 5 at column 10, which
# is the position quoted by the commented-out entity() test.
# NOTE(review): the DOCTYPE and element markup were missing from this
# literal; the names "points" and "document" are reconstructed -- confirm
# them against the upstream file.
ENTITY_XML = """\
<!DOCTYPE points [
<!ENTITY % user-entities SYSTEM 'user-entities.xml'>
%user-entities;
]>
<document>&entity;</document>
"""
## def entity():
## """
## Test entity handling.
## 1) bad entities
## >>> ElementTree.XML("&entity;")
## Traceback (most recent call last):
## ExpatError: undefined entity: line 1, column 10
## >>> ElementTree.XML(ENTITY_XML)
## Traceback (most recent call last):
## ExpatError: undefined entity &entity;: line 5, column 10
## (add more tests here)
## """
## def xmllang():
## """
## This appears to be a problem; in underlying libxml2?
## 1) xml namespace
## >>> elem = ElementTree.XML("")
## >>> serialize(elem) # 1.1
## ''
## """
# Doctest holder: the function has no body besides its docstring.  Each
# example parses a literal and checks what serialize() writes for it.
# NOTE(review): every XML("") literal and every expected '' result is empty,
# and an empty string is not a well-formed document -- the markup appears to
# have been lost from this docstring.  The section numbering also starts at
# "2)" and section "3)" has no examples; with no blank line before it, doctest
# reads "3) unknown namespaces" as part of the preceding expected output.
# Restore the examples from the upstream file before relying on this test.
def namespace():
    """
    Test namespace issues.
    2) other "well-known" namespaces
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.1
    ''
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.2
    ''
    >>> elem = ElementTree.XML("")
    >>> serialize(elem) # 2.3
    ''
    3) unknown namespaces
    """
## def qname():
## """
## Test QName handling.
## 1) decorated tags
## >>> elem = ElementTree.Element("{uri}tag")
## >>> serialize(elem) # 1.1
## ''
## >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag"))
## >>> serialize(elem) # 1.2
## ''
## >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag"))
## >>> serialize(elem) # 1.3
## ''
## 2) decorated attributes
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = "value"
## >>> serialize(elem) # 2.1
## ''
## >>> elem.clear()
## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value"
## >>> serialize(elem) # 2.2
## ''
## 3) decorated values are not converted by default, but the
## QName wrapper can be used for values
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = "{uri}value"
## >>> serialize(elem) # 3.1
## ''
## >>> elem.clear()
## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value")
## >>> serialize(elem) # 3.2
## ''
## >>> elem.clear()
## >>> subelem = ElementTree.Element("tag")
## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value")
## >>> elem.append(subelem)
## >>> elem.append(subelem)
## >>> serialize(elem) # 3.3
## ''
## """
# Thin wrapper used by its own doctests: flattens the (op, tag) pairs produced
# by ElementPath.xpath_tokenizer() into a plain list, keeping the operator
# when it is truthy and the tag otherwise.
def xpath_tokenizer(p):
    """
    Test the XPath tokenizer.
    >>> # tests from the xml specification
    >>> xpath_tokenizer("*")
    ['*']
    >>> xpath_tokenizer("text()")
    ['text', '()']
    >>> xpath_tokenizer("@name")
    ['@', 'name']
    >>> xpath_tokenizer("@*")
    ['@', '*']
    >>> xpath_tokenizer("para[1]")
    ['para', '[', '1', ']']
    >>> xpath_tokenizer("para[last()]")
    ['para', '[', 'last', '()', ']']
    >>> xpath_tokenizer("*/para")
    ['*', '/', 'para']
    >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
    ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
    >>> xpath_tokenizer("chapter//para")
    ['chapter', '/', '/', 'para']
    >>> xpath_tokenizer("//para")
    ['/', '/', 'para']
    >>> xpath_tokenizer("//olist/item")
    ['/', '/', 'olist', '/', 'item']
    >>> xpath_tokenizer(".")
    ['.']
    >>> xpath_tokenizer(".//para")
    ['.', '/', '/', 'para']
    >>> xpath_tokenizer("..")
    ['..']
    >>> xpath_tokenizer("../@lang")
    ['..', '/', '@', 'lang']
    >>> xpath_tokenizer("chapter[title]")
    ['chapter', '[', 'title', ']']
    >>> xpath_tokenizer("employee[@secretary and @assistant]")
    ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']']
    >>> # additional tests
    >>> xpath_tokenizer("{http://spam}egg")
    ['{http://spam}egg']
    >>> xpath_tokenizer("./spam.egg")
    ['.', '/', 'spam.egg']
    >>> xpath_tokenizer(".//{http://spam}egg")
    ['.', '/', '/', '{http://spam}egg']
    """
    return [op or tag for op, tag in ElementPath.xpath_tokenizer(p)]
#
# xinclude tests (samples from appendix C of the xinclude specification)
# In-memory resources served by xinclude_loader(), keyed by href.
# NOTE(review): the XML samples had lost all of their markup, leaving text
# that xinclude_loader() cannot parse.  They are reconstructed here from the
# examples in appendix C of the XInclude specification and from the expected
# output in the commented-out xinclude() tests; indentation and the
# "default.xml" include target are best-effort -- confirm against upstream.
XINCLUDE = {}

# C.1: basic inclusion of another XML document.
XINCLUDE["C1.xml"] = """\
<?xml version='1.0'?>
<document xmlns:xi="http://www.w3.org/2001/XInclude">
  <p>120 Mz is adequate for an average home user.</p>
  <xi:include href="disclaimer.xml"/>
</document>
"""

XINCLUDE["disclaimer.xml"] = """\
<?xml version='1.0'?>
<disclaimer>
  <p>The opinions represented herein represent those of the individual
  and should not be interpreted as official policy endorsed by this
  organization.</p>
</disclaimer>
"""

# C.2: textual inclusion of a plain-text resource.
XINCLUDE["C2.xml"] = """\
<?xml version='1.0'?>
<document xmlns:xi="http://www.w3.org/2001/XInclude">
  <p>This document has been accessed
  <xi:include href="count.txt" parse="text"/> times.</p>
</document>
"""

XINCLUDE["count.txt"] = "324387"

# C.3: textual inclusion of an XML resource (included as escaped text).
XINCLUDE["C3.xml"] = """\
<?xml version='1.0'?>
<document xmlns:xi="http://www.w3.org/2001/XInclude">
  <p>The following is the source of the "data.xml" resource:</p>
  <example><xi:include href="data.xml" parse="text"/></example>
</document>
"""

XINCLUDE["data.xml"] = """\
<?xml version='1.0'?>
<data>
  <item><![CDATA[Brooks & Shields]]></item>
</data>
"""

# C.5: nested fallbacks; neither referenced resource exists in this table.
XINCLUDE["C5.xml"] = """\
<?xml version='1.0'?>
<div xmlns:xi="http://www.w3.org/2001/XInclude">
  <xi:include href="example.txt" parse="text">
    <xi:fallback>
      <xi:include href="fallback-example.txt" parse="text">
        <xi:fallback><a href="mailto:bob@example.org">Report error</a></xi:fallback>
      </xi:include>
    </xi:fallback>
  </xi:include>
</div>
"""

# Used with the default loader, so the include target is a file path.
XINCLUDE["default.xml"] = """\
<?xml version='1.0'?>
<document xmlns:xi="http://www.w3.org/2001/XInclude">
  <p>Example.</p>
  <xi:include href="samples/simple.xml"/>
</document>
"""
def xinclude_loader(href, parse="xml", encoding=None):
    """Load an include resource from the in-memory XINCLUDE table.

    *href* is looked up in ``XINCLUDE``; an unknown name raises ``IOError``
    with the message "resource not found".  When *parse* is ``"xml"`` the
    stored text is parsed with ``ElementTree.XML`` and the resulting element
    is returned; for any other *parse* value the stored text is returned
    unchanged.  *encoding* is accepted but not used.
    """
    if href not in XINCLUDE:
        raise IOError("resource not found")
    data = XINCLUDE[href]
    if parse != "xml":
        return data
    return ElementTree.XML(data)
## def xinclude():
## r"""
## Basic inclusion example (XInclude C.1)
## >>> document = xinclude_loader("C1.xml")
## >>> ElementInclude.include(document, xinclude_loader)
## >>> print serialize(document) # C1
##
## 120 Mz is adequate for an average home user.
##
## The opinions represented herein represent those of the individual
## and should not be interpreted as official policy endorsed by this
## organization.
##
##
## Textual inclusion example (XInclude C.2)
## >>> document = xinclude_loader("C2.xml")
## >>> ElementInclude.include(document, xinclude_loader)
## >>> print serialize(document) # C2
##
## This document has been accessed
## 324387 times.
##
## Textual inclusion of XML example (XInclude C.3)
## >>> document = xinclude_loader("C3.xml")
## >>> ElementInclude.include(document, xinclude_loader)
## >>> print serialize(document) # C3
##
## The following is the source of the "data.xml" resource:
## <?xml version='1.0'?>
## <data>
## <item><![CDATA[Brooks & Shields]]></item>
## </data>
##
##
## Fallback example (XInclude C.5)
## Note! Fallback support is not yet implemented
## >>> document = xinclude_loader("C5.xml")
## >>> ElementInclude.include(document, xinclude_loader)
## Traceback (most recent call last):
## IOError: resource not found
## >>> # print serialize(document) # C5
## """
## def xinclude_default():
## """
## >>> document = xinclude_loader("default.xml")
## >>> ElementInclude.include(document)
## >>> print serialize(document) # default
##
## Example.
##
## text
## texttail
##
##
##
## """
#
# xmlwriter
## def xmlwriter():
## r"""
## >>> file = StringIO.StringIO()
## >>> w = SimpleXMLWriter.XMLWriter(file)
## >>> html = w.start("html")
## >>> x = w.start("head")
## >>> w.element("title", "my document")
## >>> w.data("\n")
## >>> w.element("meta", name="hello", value="goodbye")
## >>> w.data("\n")
## >>> w.end()
## >>> x = w.start("body")
## >>> w.element("h1", "this is a heading")
## >>> w.data("\n")
## >>> w.element("p", u"this is a paragraph")
## >>> w.data("\n")
## >>> w.element("p", u"reserved characters: <&>")
## >>> w.data("\n")
## >>> w.element("p", u"detta är också ett stycke")
## >>> w.data("\n")
## >>> w.close(html)
## >>> print file.getvalue()
## my document
##
## this is a heading
## this is a paragraph
## reserved characters: <&>
## detta är också ett stycke
##
## """
# --------------------------------------------------------------------
# reported bugs
## def bug_xmltoolkit21():
## """
## marshaller gives obscure errors for non-string values
## >>> elem = ElementTree.Element(123)
## >>> serialize(elem) # tag
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.text = 123
## >>> serialize(elem) # text
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.tail = 123
## >>> serialize(elem) # tail
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.set(123, "123")
## >>> serialize(elem) # attribute key
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## >>> elem = ElementTree.Element("elem")
## >>> elem.set("123", 123)
## >>> serialize(elem) # attribute value
## Traceback (most recent call last):
## TypeError: cannot serialize 123 (type int)
## """
# Doctest holder for a reported bug: the function has no body besides its
# docstring.  It calls findtext() on an ElementTree wrapped around the
# SAMPLE_XML fixture, once for a direct child and once for a nested path.
def bug_xmltoolkit25():
    """
    typo in ElementTree.findtext
    >>> tree = ElementTree.ElementTree(SAMPLE_XML)
    >>> tree.findtext("tag")
    'text'
    >>> tree.findtext("section/tag")
    'subtext'
    """
# Doctest holder for a reported bug: the function has no body besides its
# docstring.  It runs ".//" searches for a tag that is absent (thead) and one
# that is present (tbody).
# NOTE(review): the parsed literal was empty, which is not a well-formed
# document.  The markup below is reconstructed so that it contains exactly one
# <tbody> and no <thead>, as the expected results require; the names of the
# enclosing elements are not checked -- confirm them against upstream.
def bug_xmltoolkit28():
    """
    .//tag causes exceptions
    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
    >>> summarize_list(tree.findall(".//thead"))
    []
    >>> summarize_list(tree.findall(".//tbody"))
    ['tbody']
    """
## def bug_xmltoolkitX1():
## """
## dump() doesn't flush the output buffer
## >>> tree = ElementTree.XML("")
## >>> ElementTree.dump(tree); sys.stdout.write("tail")
##
## tail
## """
## def bug_xmltoolkit39():
## """
## non-ascii element and attribute names doesn't work
## >>> tree = ElementTree.XML("")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.XML("")
## >>> tree.attrib
## {u'\\xe4ttr': u'v\\xe4lue'}
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.XML("text")
## >>> ElementTree.tostring(tree, "utf-8")
## 'text'
## >>> tree = ElementTree.Element(u"täg")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## >>> tree = ElementTree.Element("tag")
## >>> tree.set(u"ättr", u"välue")
## >>> ElementTree.tostring(tree, "utf-8")
## ''
## """
## def bug_xmltoolkit45():
## """
## problems parsing mixed unicode/non-ascii html documents
## latin-1 text
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p>välue</p>")
## >>> serialize(p.close())
## '<p>v&#228;lue</p>'
## utf-8 text
## >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
## >>> p.feed("<p>v\xc3\xa4lue</p>")
## >>> serialize(p.close())
## '<p>v&#228;lue</p>'
## utf-8 text using meta tag
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
## >>> serialize(p.close().find("p"))
## '<p>v&#228;lue</p>'
## latin-1 character references
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p>v&#228;lue</p>")
## >>> serialize(p.close())
## '<p>v&#228;lue</p>'
## latin-1 character entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p>v&auml;lue</p>")
## >>> serialize(p.close())
## '<p>v&#228;lue</p>'
## mixed latin-1 text and unicode entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p>&#8221;välue&#8221;</p>")
## >>> serialize(p.close())
## '<p>&#8221;v&#228;lue&#8221;</p>'
## mixed unicode and latin-1 entities
## >>> p = HTMLTreeBuilder.TreeBuilder()
## >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
## >>> serialize(p.close())
## '<p>&#8221;v&#228;lue&#8221;</p>'
## """
# --------------------------------------------------------------------
if __name__ == "__main__":
    # Run every doctest in this module and report how many passed.
    # The module is imported under its own name ("selftest"), so this only
    # works when the file is saved as selftest.py and is importable.
    import doctest
    import selftest

    failed, tested = doctest.testmod(selftest)
    # One pre-formatted argument keeps the output identical ("N tests ok.")
    # while being valid syntax on both Python 2 and Python 3.
    print("%d tests ok." % (tested - failed))