# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $
# -*- coding: iso-8859-1 -*-
# elementtree selftest program

# this test script uses Python's "doctest" module to check that the
# *test script* works as expected.

# TODO: add more elementtree method tests
# TODO: add xml/html parsing tests
# TODO: etc

import sys
import string
import StringIO

from lxml import etree as ElementTree
from lxml import _elementpath as ElementPath

#from elementtree import ElementTree
#from elementtree import ElementPath
#from elementtree import ElementInclude
#from elementtree import HTMLTreeBuilder
#from elementtree import SimpleXMLWriter


def serialize(elem, encoding=None):
    """Write *elem* to an in-memory file and return the resulting string.

    *encoding* is forwarded to ``tree.write`` only when it is truthy;
    otherwise ``tree.write`` is called with its default encoding.
    """
    # Uses the module-level StringIO import; the buffer is deliberately not
    # called "file" so the builtin is not shadowed.
    output = StringIO.StringIO()
    tree = ElementTree.ElementTree(elem)
    if encoding:
        tree.write(output, encoding)
    else:
        tree.write(output)
    return output.getvalue()


def summarize(elem):
    """Return the tag of *elem*."""
    return elem.tag


def summarize_list(seq):
    """Return the list of tags of the elements in *seq*."""
    return [summarize(elem) for elem in seq]


def normalize_crlf(tree):
    """Replace CR+LF with LF, in place, in the text and tail of every
    element returned by ``tree.getiterator()``."""
    for elem in tree.getiterator():
        if elem.text:
            elem.text = elem.text.replace("\r\n", "\n")
        if elem.tail:
            elem.tail = elem.tail.replace("\r\n", "\n")


# NOTE(review): the markup of this sample had been lost (only the words
# "text" and "subtext" survived), which is not well-formed XML and would make
# the module fail on import.  The structure below is reconstructed from what
# find() and bug_xmltoolkit25() expect: root children tag, tag, section; the
# first tag has text "text"; section/tag has text "subtext".  The root element
# name ("body") is not constrained by any test here -- confirm it against the
# upstream selftest.
SAMPLE_XML = ElementTree.XML("""
<body>
  <tag>text</tag>
  <tag />
  <section>
    <tag>subtext</tag>
  </section>
</body>
""")

#
# interface tests

def check_string(string):
    """Exercise the string protocol on *string*.

    Calls len(), iterates, concatenates and slices the value; prints a
    message for every item yielded by iteration whose length is not 1.
    Returns None.
    """
    len(string)
    for char in string:
        if len(char) != 1:
            print("expected one-character string, got %r" % char)
    # The results are irrelevant; only the operations must succeed.
    string + ""
    string + " "
    string[:0]


def check_string_or_none(value):
    """Like check_string(), but accept None as a valid value."""
    if value is None:
        return
    return check_string(value)


def check_mapping(mapping):
    """Exercise the mapping protocol on *mapping*.

    Side effect: stores ``mapping["key"] = "value"`` and reads it back,
    printing a message if the stored value differs.
    """
    len(mapping)
    keys = mapping.keys()
    mapping.items()
    for key in keys:
        mapping[key]
    mapping["key"] = "value"
    if mapping["key"] != "value":
        print("expected value string, got %r" % mapping["key"])


def check_element(element):
    """Check that *element* and, recursively, each of its children expose
    the tag/attrib/text/tail members with the expected protocols.

    A missing member is reported with print; the subsequent attribute
    access is still attempted, as in the individual checks below.
    """
    for member in ("tag", "attrib", "text", "tail"):
        if not hasattr(element, member):
            print("no %s member" % member)
    check_string(element.tag)
    check_mapping(element.attrib)
    check_string_or_none(element.text)
    check_string_or_none(element.tail)
    for elem in element:
        check_element(elem)


def check_element_tree(tree):
    """Run check_element() on the root element of *tree*."""
    check_element(tree.getroot())

# --------------------------------------------------------------------
# element tree tests

## def sanity():
##     """
##     >>> from elementtree.ElementTree import *
##     >>> from elementtree.ElementInclude import *
##     >>> from elementtree.ElementPath import *
##     >>> from elementtree.HTMLTreeBuilder import *
##     >>> from elementtree.SimpleXMLTreeBuilder import *
##     >>> from elementtree.SimpleXMLWriter import *
##     >>> from elementtree.TidyHTMLTreeBuilder import *
##     >>> from elementtree.TidyTools import *
##     >>> from elementtree.XMLTreeBuilder import *
##     """

def interface():
    """
    Test element tree interface.

    >>> element = ElementTree.Element("tag")
    >>> check_element(element)
    >>> tree = ElementTree.ElementTree(element)
    >>> check_element_tree(tree)
    """

## def simplefind():
##     """
##     Test find methods using the elementpath fallback.
##     >>> CurrentElementPath = ElementTree.ElementPath
##     >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
##     >>> elem = SAMPLE_XML
##     >>> elem.find("tag").tag
##     'tag'
##     >>> ElementTree.ElementTree(elem).find("tag").tag
##     'tag'
##     >>> elem.findtext("tag")
##     'text'
##     >>> elem.findtext("tog")
##     >>> elem.findtext("tog", "default")
##     'default'
##     >>> ElementTree.ElementTree(elem).findtext("tag")
##     'text'
##     >>> summarize_list(elem.findall("tag"))
##     ['tag', 'tag']
##     >>> summarize_list(elem.findall(".//tag"))
##     ['tag', 'tag', 'tag']
##     Path syntax doesn't work in this case.
##     >>> elem.find("section/tag")
##     >>> elem.findtext("section/tag")
##     >>> elem.findall("section/tag")
##     []
##     >>> ElementTree.ElementPath = CurrentElementPath
##     """

def find():
    """
    Test find methods (including xpath syntax).

    >>> elem = SAMPLE_XML
    >>> elem.find("tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("tag").tag
    'tag'
    >>> elem.find("section/tag").tag
    'tag'
    >>> ElementTree.ElementTree(elem).find("section/tag").tag
    'tag'
    >>> elem.findtext("tag")
    'text'
    >>> elem.findtext("tog")
    >>> elem.findtext("tog", "default")
    'default'
    >>> ElementTree.ElementTree(elem).findtext("tag")
    'text'
    >>> elem.findtext("section/tag")
    'subtext'
    >>> ElementTree.ElementTree(elem).findtext("section/tag")
    'subtext'
    >>> summarize_list(elem.findall("tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall("*"))
    ['tag', 'tag', 'section']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("section/tag"))
    ['tag']
    >>> summarize_list(elem.findall("section//tag"))
    ['tag']
    >>> summarize_list(elem.findall("section/*"))
    ['tag']
    >>> summarize_list(elem.findall("section//*"))
    ['tag']
    >>> summarize_list(elem.findall("section/.//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/*"))
    ['tag']
    >>> summarize_list(elem.findall("*//*"))
    ['tag']
    >>> summarize_list(elem.findall("*/tag"))
    ['tag']
    >>> summarize_list(elem.findall("*/./tag"))
    ['tag']
    >>> summarize_list(elem.findall("./tag"))
    ['tag', 'tag']
    >>> summarize_list(elem.findall(".//tag"))
    ['tag', 'tag', 'tag']
    >>> summarize_list(elem.findall("././tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
    ['tag', 'tag']
    >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
    ['tag', 'tag']
    """

def bad_find():
    """
    Check bad or unsupported path expressions.

    >>> elem = SAMPLE_XML
    >>> elem.findall("/tag")
    Traceback (most recent call last):
    SyntaxError: cannot use absolute path on element
    >>> elem.findall("../tag")
    Traceback (most recent call last):
    SyntaxError: unsupported path syntax (..)
    >>> elem.findall("section//")
    Traceback (most recent call last):
    SyntaxError: path cannot end with //
    >>> elem.findall("tag[tag]")
    Traceback (most recent call last):
    SyntaxError: expected path separator ([)
    """

# NOTE(review): the expected output of the two tree.write() calls below
# ("text texttail") looks like serialized XML whose markup has been lost.
# It is kept as found; restore it from samples/simple.xml and
# samples/simple-ns.xml, which are not visible from this file.
def parsefile():
    """
    Test parsing from file.

    >>> tree = ElementTree.parse("samples/simple.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text texttail
    >>> tree = ElementTree.parse("samples/simple-ns.xml")
    >>> normalize_crlf(tree)
    >>> tree.write(sys.stdout)
    text texttail
    """

## def parsehtml():
##     """
##     Test HTML parsing.
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("

# NOTE(review): the rest of this commented-out section seems to have lost the
# markup inside its string literals and expected output; the surviving text
# is kept as found and re-commented so it is not executed.
## spamegg
##
##     ")
##     >>> serialize(p.close())
##     '
##
## spamegg
##
##     '
##     """

## def parseliteral():
##     r"""
##     >>> element = ElementTree.XML("text")
##     >>> ElementTree.ElementTree(element).write(sys.stdout)
##     text
##     >>> element = ElementTree.fromstring("text")
##     >>> ElementTree.ElementTree(element).write(sys.stdout)
##     text
##     >>> print ElementTree.tostring(element)
##     text
##     >>> print ElementTree.tostring(element, "ascii")
##
##     text
##     >>> _, ids = ElementTree.XMLID("text")
##     >>> len(ids)
##     0
##     >>> _, ids = ElementTree.XMLID("text")
##     >>> len(ids)
##     1
##     >>> ids["body"].tag
##     'body'
##     """

## def simpleparsefile():
##     """
##     Test the xmllib-based parser.
##     >>> from elementtree import SimpleXMLTreeBuilder
##     >>> parser = SimpleXMLTreeBuilder.TreeBuilder()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     >>> normalize_crlf(tree)
##     >>> tree.write(sys.stdout)
##
##     text
##     texttail
##
##
##     """

## def fancyparsefile():
##     """
##     Test the "fancy" parser.
##     Sanity check.
##     >>> from elementtree import XMLTreeBuilder
##     >>> parser = XMLTreeBuilder.FancyTreeBuilder()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     >>> normalize_crlf(tree)
##     >>> tree.write(sys.stdout)
##
##     text
##     texttail
##
##
##     Callback check.
##     >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder):
##     ...     def start(self, elem):
##     ...         print "START", elem.tag
##     ...     def end(self, elem):
##     ...
##     ...         print "END", elem.tag
##     >>> parser = MyFancyParser()
##     >>> tree = ElementTree.parse("samples/simple.xml", parser)
##     START root
##     START element
##     END element
##     START element
##     END element
##     START empty-element
##     END empty-element
##     END root
##     """

# The expected values below had lost their markup ('text', 'textsubtext').
# They are restored from the element this doctest builds itself: a "tag"
# element with text "text" and one "subtag" child with text "subtext".
def writefile():
    """
    >>> elem = ElementTree.Element("tag")
    >>> elem.text = "text"
    >>> serialize(elem)
    '<tag>text</tag>'
    >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
    >>> serialize(elem)
    '<tag>text<subtag>subtext</subtag></tag>'
    """

# NOTE(review): "text" is not well-formed XML, so the input literals and the
# expected output below appear to have lost their markup.  The original
# document cannot be recovered from this file; kept as found.
def writestring():
    """
    >>> elem = ElementTree.XML("text")
    >>> ElementTree.tostring(elem)
    'text'
    >>> elem = ElementTree.fromstring("text")
    >>> ElementTree.tostring(elem)
    'text'
    """

## def encoding():
##     r"""
##     Test encoding issues.
##     >>> elem = ElementTree.Element("tag")
##     >>> elem.text = u"abc"
##     >>> serialize(elem)
##     'abc'
##     >>> serialize(elem, "utf-8")
##     'abc'
##     >>> serialize(elem, "us-ascii")
##     'abc'
##     >>> serialize(elem, "iso-8859-1")
##     "\nabc"
##     >>> elem.text = "<&\"\'>"
##     >>> serialize(elem)
##     '<&"\'>'
##     >>> serialize(elem, "utf-8")
##     '<&"\'>'
##     >>> serialize(elem, "us-ascii") # cdata characters
##     '<&"\'>'
##     >>> serialize(elem, "iso-8859-1")
##     '\n<&"\'>'
##     >>> elem.attrib["key"] = "<&\"\'>"
##     >>> elem.text = None
##     >>> serialize(elem)
##     ''
##     >>> serialize(elem, "utf-8")
##     ''
##     >>> serialize(elem, "us-ascii")
##     ''
##     >>> serialize(elem, "iso-8859-1")
##     '\n'
##     >>> elem.text = u'\xe5\xf6\xf6<>'
##     >>> elem.attrib.clear()
##     >>> serialize(elem)
##     'åöö<>'
##     >>> serialize(elem, "utf-8")
##     '\xc3\xa5\xc3\xb6\xc3\xb6<>'
##     >>> serialize(elem, "us-ascii")
##     'åöö<>'
##     >>> serialize(elem, "iso-8859-1")
##     "\n\xe5\xf6\xf6<>"
##     >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
##     >>> elem.text = None
##     >>> serialize(elem)
##     ''
##     >>> serialize(elem, "utf-8")
##     ''
##     >>> serialize(elem, "us-ascii")
##     ''
##     >>> serialize(elem, "iso-8859-1")
##     '\n'
##     """

# NOTE(review): the DOCTYPE/ENTITY declarations and the document element
# appear to be missing from this literal; it is only referenced by the
# commented-out entity() test.  Kept as found.
ENTITY_XML = """\
%user-entities;
]>
&entity;
"""

## def entity():
##     """
##     Test entity handling.
## 1) bad entities ## >>> ElementTree.XML("&entity;") ## Traceback (most recent call last): ## ExpatError: undefined entity: line 1, column 10 ## >>> ElementTree.XML(ENTITY_XML) ## Traceback (most recent call last): ## ExpatError: undefined entity &entity;: line 5, column 10 ## (add more tests here) ## """ ## def xmllang(): ## """ ## This appears to be a problem; in underlying libxml2? ## 1) xml namespace ## >>> elem = ElementTree.XML("") ## >>> serialize(elem) # 1.1 ## '' ## """ def namespace(): """ Test namespace issues. 2) other "well-known" namespaces >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.1 '' >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.2 '' >>> elem = ElementTree.XML("") >>> serialize(elem) # 2.3 '' 3) unknown namespaces """ ## def qname(): ## """ ## Test QName handling. ## 1) decorated tags ## >>> elem = ElementTree.Element("{uri}tag") ## >>> serialize(elem) # 1.1 ## '' ## >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag")) ## >>> serialize(elem) # 1.2 ## '' ## >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag")) ## >>> serialize(elem) # 1.3 ## '' ## 2) decorated attributes ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "value" ## >>> serialize(elem) # 2.1 ## '' ## >>> elem.clear() ## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value" ## >>> serialize(elem) # 2.2 ## '' ## 3) decorated values are not converted by default, but the ## QName wrapper can be used for values ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "{uri}value" ## >>> serialize(elem) # 3.1 ## '' ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value") ## >>> serialize(elem) # 3.2 ## '' ## >>> elem.clear() ## >>> subelem = ElementTree.Element("tag") ## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value") ## >>> elem.append(subelem) ## >>> elem.append(subelem) ## >>> serialize(elem) # 3.3 ## '' ## """ def xpath_tokenizer(p): """ Test the XPath tokenizer. 
>>> # tests from the xml specification >>> xpath_tokenizer("*") ['*'] >>> xpath_tokenizer("text()") ['text', '()'] >>> xpath_tokenizer("@name") ['@', 'name'] >>> xpath_tokenizer("@*") ['@', '*'] >>> xpath_tokenizer("para[1]") ['para', '[', '1', ']'] >>> xpath_tokenizer("para[last()]") ['para', '[', 'last', '()', ']'] >>> xpath_tokenizer("*/para") ['*', '/', 'para'] >>> xpath_tokenizer("/doc/chapter[5]/section[2]") ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']'] >>> xpath_tokenizer("chapter//para") ['chapter', '/', '/', 'para'] >>> xpath_tokenizer("//para") ['/', '/', 'para'] >>> xpath_tokenizer("//olist/item") ['/', '/', 'olist', '/', 'item'] >>> xpath_tokenizer(".") ['.'] >>> xpath_tokenizer(".//para") ['.', '/', '/', 'para'] >>> xpath_tokenizer("..") ['..'] >>> xpath_tokenizer("../@lang") ['..', '/', '@', 'lang'] >>> xpath_tokenizer("chapter[title]") ['chapter', '[', 'title', ']'] >>> xpath_tokenizer("employee[@secretary and @assistant]") ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']'] >>> # additional tests >>> xpath_tokenizer("{http://spam}egg") ['{http://spam}egg'] >>> xpath_tokenizer("./spam.egg") ['.', '/', 'spam.egg'] >>> xpath_tokenizer(".//{http://spam}egg") ['.', '/', '/', '{http://spam}egg'] """ out = [] for op, tag in ElementPath.xpath_tokenizer(p): out.append(op or tag) return out # # xinclude tests (samples from appendix C of the xinclude specification) XINCLUDE = {} XINCLUDE["C1.xml"] = """\

120 Mz is adequate for an average home user.

""" XINCLUDE["disclaimer.xml"] = """\

The opinions represented herein represent those of the individual and should not be interpreted as official policy endorsed by this organization.

""" XINCLUDE["C2.xml"] = """\

This document has been accessed times.

""" XINCLUDE["count.txt"] = "324387" XINCLUDE["C3.xml"] = """\

The following is the source of the "data.xml" resource:

""" XINCLUDE["data.xml"] = """\ """ XINCLUDE["C5.xml"] = """\ """ XINCLUDE["default.xml"] = """\

Example.

""" def xinclude_loader(href, parse="xml", encoding=None): try: data = XINCLUDE[href] except KeyError: raise IOError("resource not found") if parse == "xml": return ElementTree.XML(data) return data ## def xinclude(): ## r""" ## Basic inclusion example (XInclude C.1) ## >>> document = xinclude_loader("C1.xml") ## >>> ElementInclude.include(document, xinclude_loader) ## >>> print serialize(document) # C1 ## ##

# NOTE(review): the expected output in the commented-out tests below seems to
# have lost its XML markup; the surviving text is kept as found and
# re-commented so it is not executed.
## 120 Mz is adequate for an average home user.
##
##
##
##
## The opinions represented herein represent those of the individual
## and should not be interpreted as official policy endorsed by this
## organization.
##
##
##
##     Textual inclusion example (XInclude C.2)
##     >>> document = xinclude_loader("C2.xml")
##     >>> ElementInclude.include(document, xinclude_loader)
##     >>> print serialize(document) # C2
##
##
##
## This document has been accessed
## 324387 times.
##
##
##     Textual inclusion of XML example (XInclude C.3)
##     >>> document = xinclude_loader("C3.xml")
##     >>> ElementInclude.include(document, xinclude_loader)
##     >>> print serialize(document) # C3
##
##
##
## The following is the source of the "data.xml" resource:
##
## <?xml version='1.0'?>
## <data>
## <item><![CDATA[Brooks & Shields]]></item>
## </data>
##
##
##     Fallback example (XInclude C.5)
##     Note! Fallback support is not yet implemented
##     >>> document = xinclude_loader("C5.xml")
##     >>> ElementInclude.include(document, xinclude_loader)
##     Traceback (most recent call last):
##     IOError: resource not found
##     >>> # print serialize(document) # C5
##     """

## def xinclude_default():
##     """
##     >>> document = xinclude_loader("default.xml")
##     >>> ElementInclude.include(document)
##     >>> print serialize(document) # default
##
##
##
## Example.
##
##
## text
## texttail
##
##
##
##     """

#
# xmlwriter

## def xmlwriter():
##     r"""
##     >>> file = StringIO.StringIO()
##     >>> w = SimpleXMLWriter.XMLWriter(file)
##     >>> html = w.start("html")
##     >>> x = w.start("head")
##     >>> w.element("title", "my document")
##     >>> w.data("\n")
##     >>> w.element("meta", name="hello", value="goodbye")
##     >>> w.data("\n")
##     >>> w.end()
##     >>> x = w.start("body")
##     >>> w.element("h1", "this is a heading")
##     >>> w.data("\n")
##     >>> w.element("p", u"this is a paragraph")
##     >>> w.data("\n")
##     >>> w.element("p", u"reserved characters: <&>")
##     >>> w.data("\n")
##     >>> w.element("p", u"detta är också ett stycke")
##     >>> w.data("\n")
##     >>> w.close(html)
##     >>> print file.getvalue()
##     my document
##
##
##
## this is a heading
##
##
##
## this is a paragraph
##
##
##
## reserved characters: <&>
##
##
##
## detta är också ett stycke
##
##
##
##     """

# --------------------------------------------------------------------
# reported bugs

## def bug_xmltoolkit21():
##     """
##     marshaller gives obscure errors for non-string values
##     >>> elem = ElementTree.Element(123)
##     >>> serialize(elem) # tag
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.text = 123
##     >>> serialize(elem) # text
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.tail = 123
##     >>> serialize(elem) # tail
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.set(123, "123")
##     >>> serialize(elem) # attribute key
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     >>> elem = ElementTree.Element("elem")
##     >>> elem.set("123", 123)
##     >>> serialize(elem) # attribute value
##     Traceback (most recent call last):
##     TypeError: cannot serialize 123 (type int)
##     """

def bug_xmltoolkit25():
    """
    typo in ElementTree.findtext

    >>> tree = ElementTree.ElementTree(SAMPLE_XML)
    >>> tree.findtext("tag")
    'text'
    >>> tree.findtext("section/tag")
    'subtext'
    """

# NOTE(review): the XML literal of this test had been lost (an empty string
# split across two lines).  The document below is reconstructed so that it
# satisfies the expectations: it has a "tbody" descendant and no "thead".
# The other element names are assumptions -- confirm against the upstream
# selftest.
def bug_xmltoolkit28():
    """
    .//tag causes exceptions

    >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
    >>> summarize_list(tree.findall(".//thead"))
    []
    >>> summarize_list(tree.findall(".//tbody"))
    ['tbody']
    """

## def bug_xmltoolkitX1():
##     """
##     dump() doesn't flush the output buffer
##     >>> tree = ElementTree.XML("
##     ")
##     >>> ElementTree.dump(tree); sys.stdout.write("tail")
##
# NOTE(review): the markup inside the string literals and expected output of
# the commented-out tests below seems to have been lost; the surviving text
# is kept as found and re-commented so it is not executed.
##     tail
##     """

## def bug_xmltoolkit39():
##     """
##     non-ascii element and attribute names doesn't work
##     >>> tree = ElementTree.XML("")
##     >>> ElementTree.tostring(tree, "utf-8")
##     ''
##     >>> tree = ElementTree.XML("")
##     >>> tree.attrib
##     {u'\\xe4ttr': u'v\\xe4lue'}
##     >>> ElementTree.tostring(tree, "utf-8")
##     ''
##     >>> tree = ElementTree.XML("text")
##     >>> ElementTree.tostring(tree, "utf-8")
##     'text'
##     >>> tree = ElementTree.Element(u"täg")
##     >>> ElementTree.tostring(tree, "utf-8")
##     ''
##     >>> tree = ElementTree.Element("tag")
##     >>> tree.set(u"ättr", u"välue")
##     >>> ElementTree.tostring(tree, "utf-8")
##     ''
##     """

## def bug_xmltoolkit45():
##     """
##     problems parsing mixed unicode/non-ascii html documents
##     latin-1 text
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## välue
##
##     ")
##     >>> serialize(p.close())
##     '
##
## välue
##
##     '
##     utf-8 text
##     >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
##     >>> p.feed("
##
## v\xc3\xa4lue
##
##     ")
##     >>> serialize(p.close())
##     '
##
## välue
##
##     '
##     utf-8 text using meta tag
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## v\xc3\xa4lue
##
##     ")
##     >>> serialize(p.close().find("p"))
##     '
##
## välue
##
##     '
##     latin-1 character references
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## välue
##
##     ")
##     >>> serialize(p.close())
##     '
##
## välue
##
##     '
##     latin-1 character entities
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## välue
##
##     ")
##     >>> serialize(p.close())
##     '
##
## välue
##
##     '
##     mixed latin-1 text and unicode entities
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## ”välue”
##
##     ")
##     >>> serialize(p.close())
##     '
##
## ”välue”
##
##     '
##     mixed unicode and latin-1 entities
##     >>> p = HTMLTreeBuilder.TreeBuilder()
##     >>> p.feed("
##
## ”välue”
##
##     ")
##     >>> serialize(p.close())
##     '
##
## ”välue”
##
##     '
##     """

# --------------------------------------------------------------------

if __name__ == "__main__":
    import doctest
    # The module imports itself by name so that doctest collects the
    # docstrings from the importable module object; this relies on the file
    # being importable as "selftest".
    import selftest
    failed, tested = doctest.testmod(selftest)
    # Single-argument form: prints the same "<n> tests ok." line whether
    # print is a statement or a function.
    print("%d tests ok." % (tested - failed))