From scoder at codespeak.net Mon May 1 16:39:21 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 1 16:39:23 2006 Subject: [Lxml-checkins] r26635 - lxml/trunk/src/lxml Message-ID: <20060501143921.DA068100AD@code0.codespeak.net> Author: scoder Date: Mon May 1 16:39:21 2006 New Revision: 26635 Modified: lxml/trunk/src/lxml/xslt.pxi Log: doc typo Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Mon May 1 16:39:21 2006 @@ -336,7 +336,7 @@ result._xslt = xslt return result -# do not register all libxslt extra function, provide only "node-set" +# do not register all libxslt extra functions, provide only "node-set" # functions like "output" and "write" are a potential security risk #xslt.xsltRegisterAllExtras() xslt.xsltRegisterExtModuleFunction("node-set", From scoder at codespeak.net Mon May 1 17:08:53 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 1 17:08:54 2006 Subject: [Lxml-checkins] r26636 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060501150853.5D92D100AD@code0.codespeak.net> Author: scoder Date: Mon May 1 17:08:50 2006 New Revision: 26636 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: doctype property for _ElementTree, returns (public ID, system URL) tuple based on libxml2 parsed DTD information Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 1 17:08:50 2006 @@ -7,6 +7,9 @@ Features added -------------- +* Read-only 'doctype' attribute in ElementTree class that holds a tuple + (public ID, system URL) as seen by the parser + * etree module can be compiled without libxslt by commenting out the line 'include "xslt.pxi"' at the end of the etree.pyx source file Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 1 17:08:50 2006 @@ -129,6 +129,24 @@ return None return _elementFactory(self, c_node) + cdef getdoctype(self): + cdef tree.xmlDtd* dtd + public_id = None + sys_url = None + dtd = self._c_doc.intSubset + if dtd is not NULL: + if dtd.ExternalID is not NULL: + public_id = funicode(dtd.ExternalID) + if dtd.SystemID is not NULL: + sys_url = funicode(dtd.SystemID) + dtd = self._c_doc.extSubset + if dtd is not NULL: + if not public_id and dtd.ExternalID is not NULL: + public_id = funicode(dtd.ExternalID) + if not sys_url and dtd.SystemID is not NULL: + sys_url = funicode(dtd.SystemID) + return (public_id, sys_url) + cdef buildNewPrefix(self): ns = python.PyString_FromFormat("ns%d", self._ns_counter) self._ns_counter = self._ns_counter + 1 @@ -233,7 +251,16 @@ def getroot(self): return self._context_node - + + property doctype: + """A tuple (public ID, system URL) of the DOCTYPE seen 
by the parser. + Any of the two may be None. This value is only defined for + ElementTree objects based on the root node of a parsed document (e.g. + those returned by the parse functions). + """ + def __get__(self): + return self._doc.getdoctype() + def write(self, file, encoding='us-ascii'): if not hasattr(file, 'write'): # file is a filename, we want a file object Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon May 1 17:08:50 2006 @@ -379,7 +379,36 @@ 8, e.index(e[8], -12, -1)) self.assertEquals( 0, e.index(e[0], -12, -1)) - + + def test_doctype_public(self): + etree = self.etree + pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" + sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + xml = '''\ + + +''' % (pub_id, sys_id) + + tree = etree.parse(StringIO(xml)) + self.assertEquals(tree.doctype, (pub_id, sys_id)) + + def test_doctype_system(self): + etree = self.etree + sys_id = "some.dtd" + xml = '''\ + + +''' % sys_id + + tree = etree.parse(StringIO(xml)) + self.assertEquals(tree.doctype, (None, sys_id)) + + def test_doctype_empty(self): + etree = self.etree + xml = '' + tree = etree.parse(StringIO(xml)) + self.assertEquals(tree.doctype, (None, None)) + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. 
""" Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 1 17:08:50 2006 @@ -65,7 +65,11 @@ char* content xmlAttr* properties xmlNs* ns - + + ctypedef struct xmlDtd: + char* ExternalID + char* SystemID + ctypedef struct xmlDoc: xmlElementType type char* name @@ -79,6 +83,8 @@ xmlHashTable* ids char* URL void* _private + xmlDtd* intSubset + xmlDtd* extSubset ctypedef struct xmlAttr: void* _private From scoder at codespeak.net Tue May 2 07:46:18 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 07:46:19 2006 Subject: [Lxml-checkins] r26645 - in lxml/trunk: . src/lxml Message-ID: <20060502054618.2116010091@code0.codespeak.net> Author: scoder Date: Tue May 2 07:46:16 2006 New Revision: 26645 Modified: lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/xmlparser.pxd lxml/trunk/version.txt Log: do not destroy mal-formed parser results if the recover option is set Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 2 07:46:16 2006 @@ -149,7 +149,7 @@ cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result, char* c_filename) except NULL: cdef _ResolverContext context - if ctxt.wellFormed: + if ctxt.wellFormed or (ctxt.options & xmlparser.XML_PARSE_RECOVER): __GLOBAL_PARSER_CONTEXT._initDocDict(result) elif result is not NULL: # free broken document @@ -158,7 +158,11 @@ if ctxt._private is not NULL: context = <_ResolverContext>ctxt._private - context._raise_if_stored() + if context._has_raised(): + if result is not NULL: + tree.xmlFreeDoc(result) + result = NULL + context._raise_if_stored() if result is NULL: if c_filename is not NULL and \ Modified: lxml/trunk/src/lxml/xmlparser.pxd 
============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Tue May 2 07:46:16 2006 @@ -15,6 +15,7 @@ xmlDict* dict void* _private int wellFormed + int options xmlError lastError ctypedef enum xmlParserOption: Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue May 2 07:46:16 2006 @@ -1 +1 @@ -0.9.1 +0.9.2 From scoder at codespeak.net Tue May 2 07:47:47 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 07:47:49 2006 Subject: [Lxml-checkins] r26646 - lxml/trunk Message-ID: <20060502054747.D4DB510091@code0.codespeak.net> Author: scoder Date: Tue May 2 07:47:46 2006 New Revision: 26646 Modified: lxml/trunk/version.txt Log: oops, one file slipped through Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue May 2 07:47:46 2006 @@ -1 +1 @@ -0.9.2 +0.9.1 From scoder at codespeak.net Tue May 2 07:56:31 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 07:56:33 2006 Subject: [Lxml-checkins] r26647 - lxml/trunk/doc Message-ID: <20060502055631.E09E010091@code0.codespeak.net> Author: scoder Date: Tue May 2 07:56:31 2006 New Revision: 26647 Modified: lxml/trunk/doc/api.txt Log: forgot to merge doctests from htmlparser branch Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 2 07:56:31 2006 @@ -19,13 +19,14 @@ >>> from StringIO import StringIO -XMLParser ---------- +Parsers +------- -One of the differences is the parser. It is based on libxml2 and therefore -only supports options that are backed by the library. 
Parsers take a number -of keyword arguments. The following is an example for namespace cleanup -during parsing, first with the default parser, then with a parametrized one:: +One of the differences is the parser. There is support for both XML and +(broken) HTML. Both are based on libxml2 and therefore only support options +that are backed by the library. Parsers take a number of keyword arguments. +The following is an example for namespace cleanup during parsing, first with +the default parser, then with a parametrized one:: >>> xml = '' @@ -38,6 +39,23 @@ >>> print lxml.etree.tostring(et.getroot()) +HTML parsing is similarly simple:: + + >>> broken_html = "test<body><h1>page title</body></html>" + + >>> parser = lxml.etree.HTMLParser() + >>> et = lxml.etree.parse(StringIO(broken_html), parser) + + >>> print lxml.etree.tostring(et.getroot()) + <html><head><title>test

page title

+ +Lxml has an HTML function, similar to the XML shortcut known from +ElementTree:: + + >>> html = lxml.etree.HTML(broken_html) + >>> print lxml.etree.tostring(html) + test

page title

+ Error handling on exceptions ---------------------------- From scoder at codespeak.net Tue May 2 08:04:43 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 08:04:46 2006 Subject: [Lxml-checkins] r26648 - lxml/trunk/doc Message-ID: <20060502060443.6527B10091@code0.codespeak.net> Author: scoder Date: Tue May 2 08:04:40 2006 New Revision: 26648 Modified: lxml/trunk/doc/api.txt Log: mention recover option for parsers in api.txt Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 2 08:04:40 2006 @@ -39,9 +39,11 @@ >>> print lxml.etree.tostring(et.getroot()) -HTML parsing is similarly simple:: +HTML parsing is similarly simple. The parsers have a ``recover`` keyword +argument that the HTMLParser sets by default. It lets libxml2 try its best to +return something usable without raising an exception:: - >>> broken_html = "test<body><h1>page title</body></html>" + >>> broken_html = "<html><head><title>test<body><h1>page title" >>> parser = lxml.etree.HTMLParser() >>> et = lxml.etree.parse(StringIO(broken_html), parser) From scoder at codespeak.net Tue May 2 08:04:58 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 08:04:59 2006 Subject: [Lxml-checkins] r26649 - lxml/trunk/doc Message-ID: <20060502060458.2389E10091@code0.codespeak.net> Author: scoder Date: Tue May 2 08:04:56 2006 New Revision: 26649 Modified: lxml/trunk/doc/resolvers.txt Log: ReST updates Modified: lxml/trunk/doc/resolvers.txt ============================================================================== --- lxml/trunk/doc/resolvers.txt (original) +++ lxml/trunk/doc/resolvers.txt Tue May 2 08:04:56 2006 @@ -14,21 +14,21 @@ ... '<!ENTITY myentity "[resolved text: %s]">' % url, context) This defines a resolver that always returns a dynamically generated DTD -fragment defining an entity. 
The 'url' argument passes the system URL of the -requested document, the 'id' argument is the public ID. Note that any of -these may be None. The context object is not normally used by client code. +fragment defining an entity. The ``url`` argument passes the system URL of +the requested document, the ``id`` argument is the public ID. Note that any +of these may be None. The context object is not normally used by client code. Resolving is based on three methods of the Resolver object that build internal representations of the result document. The following methods exist: -* 'resolve_string' takes a parsable string as result document -* 'resolve_filename' takes a filename -* 'resolve_file' takes an open file-like object that has at least a read() method -* 'resolve_empty' resolves into an empty document +* ``resolve_string`` takes a parsable string as result document +* ``resolve_filename`` takes a filename +* ``resolve_file`` takes an open file-like object that has at least a read() method +* ``resolve_empty`` resolves into an empty document -The 'resolve' method may choose to return None, in which case the next +The ``resolve`` method may choose to return None, in which case the next registered resolver (or the default resolver) is consulted. It is never -called if the resolver returns the result of any of the above 'resolve_*' +called if the resolver returns the result of any of the above ``resolve_*`` methods. Resolvers are registered local to a parser:: @@ -90,11 +90,11 @@ ... </xsl:stylesheet> ... """ -Note that it needs to resolve two URIs: 'honk:test' when compiling the XSLT -document (i.e. when resolving xsl:import and xsl:include elements) and -'hoi:test' at transformation time, when calls to the 'document' function are -resolved. If we now register different resolvers with two different parsers, -we can parse our document twice in different resolver contexts:: +Note that it needs to resolve two URIs: ``honk:test`` when compiling the XSLT +document (i.e. 
when resolving ``xsl:import`` and ``xsl:include`` elements) and +``hoi:test`` at transformation time, when calls to the ``document`` function +are resolved. If we now register different resolvers with two different +parsers, we can parse our document twice in different resolver contexts:: >>> hoi_parser = etree.XMLParser() >>> normal_doc = etree.parse(StringIO(xml_text), hoi_parser) @@ -109,7 +109,8 @@ These contexts are important for the further behaviour of the documents. They memorise their original parser so that the correct set of resolvers is used in subsequent lookups. To compile the stylesheet, XSLT must resolve the -honk:test URI in the xsl:include element. The "hoi" resolver cannot do that:: +``honk:test`` URI in the ``xsl:include`` element. The ``hoi`` resolver cannot +do that:: >>> transform = etree.XSLT(normal_doc) Traceback (most recent call last): @@ -121,15 +122,15 @@ [...] XSLTParseError: Cannot resolve URI honk:test -However, if we use the "honk" resolver associated with the respective +However, if we use the ``honk`` resolver associated with the respective document, everything works fine:: >>> transform = etree.XSLT(honk_doc) Resolving url honk:test as prefix honk ... done Running the transform accesses the same parser context again, but since it now -needs to resolve the "hoi" URI in the call to the document function, its -"honk" resolver will fail to do so:: +needs to resolve the ``hoi`` URI in the call to the document function, its +``honk`` resolver will fail to do so:: >>> result = transform(normal_doc) Traceback (most recent call last): @@ -146,7 +147,7 @@ [...] XSLTApplyError: Cannot resolve URI hoi:test -This can only be solved by adding a "hoi" resolver to the parser. Note that +This can only be solved by adding a ``hoi`` resolver to the parser. 
Note that adding it after parsing the XSL document will not work as parsed documents remember the state of the parser at the time of their creation:: @@ -169,10 +170,10 @@ <?xml version="1.0"?> <test>hoi-TEST</test> -We can see that the "hoi" resolver was called to generate a document that was -then inserted into the result document by the XSLT transformation. Note that -this is completely independent of the XML file you transform, as the URI is -resolved from within the stylesheet context:: +We can see that the ``hoi`` resolver was called to generate a document that +was then inserted into the result document by the XSLT transformation. Note +that this is completely independent of the XML file you transform, as the URI +is resolved from within the stylesheet context:: >>> result = transform(normal_doc) Resolving url hoi:test as prefix honk ... failed From scoder at codespeak.net Tue May 2 08:56:01 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 08:56:03 2006 Subject: [Lxml-checkins] r26652 - in lxml/trunk: doc src/lxml src/lxml/tests Message-ID: <20060502065601.63F42100A7@code0.codespeak.net> Author: scoder Date: Tue May 2 08:55:58 2006 New Revision: 26652 Modified: lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: make ElementTree.doctype return a DocType object instead of a tuple, enables "<!DOCTYPE ...>" string building via str() Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 2 08:55:58 2006 @@ -58,6 +58,24 @@ >>> print lxml.etree.tostring(html) <html><head><title>test

page title

+The use of the libxml2 parsers makes some additional information available at +the API level. Currently, ElementTree objects can access the DOCTYPE +information provided by a parsed document:: + + >>> pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" + >>> sys_url = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + >>> doctype_string = '' % (pub_id, sys_url) + >>> xhtml = doctype_string + '' + + >>> et = lxml.etree.parse(StringIO(xhtml)) + >>> doctype = et.doctype + >>> print doctype.public_id + -//W3C//DTD XHTML 1.0 Transitional//EN + >>> print doctype.system_url + http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd + >>> str(doctype) == doctype_string + True + Error handling on exceptions ---------------------------- Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 2 08:55:58 2006 @@ -131,6 +131,7 @@ cdef getdoctype(self): cdef tree.xmlDtd* dtd + cdef xmlNode* c_root_node public_id = None sys_url = None dtd = self._c_doc.intSubset @@ -145,7 +146,12 @@ public_id = funicode(dtd.ExternalID) if not sys_url and dtd.SystemID is not NULL: sys_url = funicode(dtd.SystemID) - return (public_id, sys_url) + c_root_node = tree.xmlDocGetRootElement(self._c_doc) + if c_root_node is NULL: + root_name = None + else: + root_name = funicode(c_root_node.name) + return (root_name, public_id, sys_url) cdef buildNewPrefix(self): ns = python.PyString_FromFormat("ns%d", self._ns_counter) @@ -215,7 +221,33 @@ parser = __DEFAULT_PARSER result._parser = parser.copy() return result - + +cdef class DocType: + "Hold Public ID and System URL of a DOCTYPE declaration." 
+ cdef readonly object root_name + cdef readonly object public_id + cdef readonly object system_url + def __init__(self, tree): + cdef _Document doc + doc = _documentOrRaise(tree) + self.root_name, self.public_id, self.system_url = doc.getdoctype() + if not self.root_name and (self.public_id or self.system_url): + raise ValueError, "Could not find root node" + + def __str__(self): + if self.public_id: + if self.system_url: + return '' % ( + self.root_name, self.public_id, self.system_url) + else: + return '' % ( + self.root_name, self.public_id) + elif self.system_url: + return '' % ( + self.root_name, self.system_url) + else: + return "" + cdef class _NodeBase: """Base class to reference a document object and a libxml node. @@ -259,7 +291,7 @@ those returned by the parse functions). """ def __get__(self): - return self._doc.getdoctype() + return DocType(self._doc) def write(self, file, encoding='us-ascii'): if not hasattr(file, 'write'): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue May 2 08:55:58 2006 @@ -384,30 +384,39 @@ etree = self.etree pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" - xml = '''\ - - -''' % (pub_id, sys_id) + doctype_string = '' % (pub_id, sys_id) + + xml = doctype_string + '' tree = etree.parse(StringIO(xml)) - self.assertEquals(tree.doctype, (pub_id, sys_id)) + doctype = tree.doctype + self.assertEquals(doctype.public_id, pub_id) + self.assertEquals(doctype.system_url, sys_id) + self.assertEquals(doctype.root_name, 'html') + self.assertEquals(str(doctype), doctype_string) def test_doctype_system(self): etree = self.etree sys_id = "some.dtd" - xml = '''\ - - -''' % sys_id + doctype_string = '' % sys_id + xml = doctype_string + '' tree = etree.parse(StringIO(xml)) - 
self.assertEquals(tree.doctype, (None, sys_id)) + doctype = tree.doctype + self.assertEquals(doctype.public_id, None) + self.assertEquals(doctype.system_url, sys_id) + self.assertEquals(doctype.root_name, 'html') + self.assertEquals(str(doctype), doctype_string) def test_doctype_empty(self): etree = self.etree xml = '' tree = etree.parse(StringIO(xml)) - self.assertEquals(tree.doctype, (None, None)) + doctype = tree.doctype + self.assertEquals(doctype.public_id, None) + self.assertEquals(doctype.system_url, None) + self.assertEquals(doctype.root_name, 'html') + self.assertEquals(str(doctype), '') def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. From scoder at codespeak.net Tue May 2 09:01:38 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 09:01:39 2006 Subject: [Lxml-checkins] r26653 - in lxml/trunk: . src/lxml Message-ID: <20060502070138.94411100A7@code0.codespeak.net> Author: scoder Date: Tue May 2 09:01:37 2006 New Revision: 26653 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx Log: doc updates Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue May 2 09:01:37 2006 @@ -7,8 +7,8 @@ Features added -------------- -* Read-only 'doctype' attribute in ElementTree class that holds a tuple - (public ID, system URL) as seen by the parser +* Read-only 'doctype' attribute in ElementTree class holds DOCTYPE information + as seen by the parser * etree module can be compiled without libxslt by commenting out the line 'include "xslt.pxi"' at the end of the etree.pyx source file Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 2 09:01:37 2006 @@ -228,6 +228,7 @@ cdef readonly object public_id cdef readonly object 
system_url def __init__(self, tree): + "Create a DocType object for an ElementTree object or root Element." cdef _Document doc doc = _documentOrRaise(tree) self.root_name, self.public_id, self.system_url = doc.getdoctype() From scoder at codespeak.net Tue May 2 09:14:46 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 09:14:46 2006 Subject: [Lxml-checkins] r26654 - lxml/trunk/doc Message-ID: <20060502071446.380AB100A0@code0.codespeak.net> Author: scoder Date: Tue May 2 09:14:44 2006 New Revision: 26654 Modified: lxml/trunk/doc/compatibility.txt Log: doc updates Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Tue May 2 09:14:44 2006 @@ -2,7 +2,7 @@ ============================= A lot of care has been taken to ensure compatibility between etree and -ElementTree. Nonetheless some differences and incompatibilities exist: +ElementTree. Nonetheless some differences and incompatibilities exist: * Importing etree is obviously different; etree uses a lower case package name, while ElementTree a combination of upper-case and @@ -25,22 +25,22 @@ # use from lxml import etree as ElementTree -* Some of the API of ElementTree has not yet been implemented and is - thus missing in lxml.etree. Feel free to help out! +* Some minor parts of the API of ElementTree have not yet been implemented and + are thus missing in lxml.etree. Feel free to help out! * Then again, lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer. * ElementTree allows you to place an Element in two different trees as the - same time. Thus, this:: + same time. 
Thus, this:: a = Element('a') b = SubElement(a, 'b') c = Element('c') c.append(b) - Will result in the following tree a:: + will result in the following tree a:: @@ -48,11 +48,10 @@ - In lxml, this behavior is different, because of lxml is built on top - of a tree that maintains parent relationships for elements (like W3C - DOM). This means an element can only exist in a single tree at the - same time. Adding an element in some tree to another tree will cause - this element to be moved. + In lxml, this behavior is different, because lxml is built on top of a tree + that maintains parent relationships for elements (like W3C DOM). This means + an element can only exist in a single tree at the same time. Adding an + element in some tree to another tree will cause this element to be moved. So, for tree a we will get:: @@ -62,44 +61,43 @@ - Unfortunately this is a rather fundamental difference in behavior, - which will be hard to solve. It won't affect some applications, but - if you want to port code you do unfortunately have to make sure that - it doesn't. - -* ElementTree has a bug when serializing an empty Comment (no text - argument given) to XML, etree serializes this successfully. - -* When trying to set a subelement using __setitem__ that is in fact - not an Element but some other object, etree raises a TypeError, and - ElementTree raises an AssertionError. - -* ElementTree ignores comments when parsing XML, while etree will read - them in and treat them as Comment elements. - -* Because etree is built on top of libxml2, which is namespace prefix - aware, etree preserves namespaces declarations and prefixes while - ElementTree tends to come up with its own prefixes (ns0, ns1, - etc). When no namespace prefix is given however, etree creates - ElementTree style prefixes as well. - -* etree has a 'prefix' attribute (read-only) on elements giving the - Element's prefix, if this is known, and None otherwise (in case of - no namespace at all, or default namespace). 
etree also allows a - 'nsmap' dictionary which maps namespace prefix to namespace URI to - be passed to the Element and SubElement element factories. - - These will be translated into namespace declarations on that - element. This means that in the probably rare case that you need to - construct an attribute called 'nsmap', you need to be aware that - unlike in ElementTree, you cannot pass it as a keyword argument to - the Element and SubElement factories directly. - -* etree elements can be copied using copy.deepcopy() and copy.copy(), - just like ElementTree's. copy.copy() however does *not* create a - shallow copy where elements are shared between trees, as this makes - no sense in the context of libxml2 trees. + Unfortunately this is a rather fundamental difference in behavior, which + will be hard to solve. It won't affect some applications, but if you want + to port code you must unfortunately make sure that it doesn't. + +* ElementTree has a bug when serializing an empty Comment (no text argument + given) to XML, etree serializes this successfully. + +* When trying to set a subelement using __setitem__ that is in fact not an + Element but some other object, etree raises a TypeError, and ElementTree + raises an AssertionError. + +* ElementTree ignores comments when parsing XML, while etree will read them in + and treat them as Comment elements. + +* Because etree is built on top of libxml2, which is namespace prefix aware, + etree preserves namespaces declarations and prefixes while ElementTree tends + to come up with its own prefixes (ns0, ns1, etc). When no namespace prefix + is given however, etree creates ElementTree style prefixes as well. + +* etree has a 'prefix' attribute (read-only) on elements giving the Element's + prefix, if this is known, and None otherwise (in case of no namespace at + all, or default namespace). 
+ + etree further allows passing an 'nsmap' dictionary to the Element and + SubElement element factories to explicitly map namespace prefixes to + namespace URIs. These will be translated into namespace declarations on + that element. This means that in the probably rare case that you need to + construct an attribute called 'nsmap', you need to be aware that unlike in + ElementTree, you cannot pass it as a keyword argument to the Element and + SubElement factories directly. + +* etree elements can be copied using copy.deepcopy() and copy.copy(), just + like ElementTree's. copy.copy() however does *not* create a shallow copy + where elements are shared between trees, as this makes no sense in the + context of libxml2 trees. Note that lxml can deep-copy trees considerably + faster than ElementTree. * etree allows navigation to the parent of a node by the ``getparent()`` - method. This is not possible in ElementTree as the underlying tree - model does not have this information. + method. This is not possible in ElementTree as the underlying tree model + does not have this information. 
From scoder at codespeak.net Tue May 2 13:44:04 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 13:44:05 2006 Subject: [Lxml-checkins] r26657 - lxml/trunk/src/lxml/tests Message-ID: <20060502114404.A2267100B5@code0.codespeak.net> Author: scoder Date: Tue May 2 13:44:03 2006 New Revision: 26657 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: stop running api.txt doctests twice (they should not be part of the ET test cases) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue May 2 13:44:03 2006 @@ -1891,8 +1891,6 @@ suite.addTests([unittest.makeSuite(ETreeTestCase)]) if ElementTree: suite.addTests([unittest.makeSuite(ElementTreeTestCase)]) - suite.addTests( - [doctest.DocFileSuite('../../../doc/api.txt')]) return suite if __name__ == '__main__': From scoder at codespeak.net Tue May 2 13:45:02 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 13:45:03 2006 Subject: [Lxml-checkins] r26658 - lxml/trunk/src/lxml Message-ID: <20060502114502.AB696100B5@code0.codespeak.net> Author: scoder Date: Tue May 2 13:45:01 2006 New Revision: 26658 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tree.pxd Log: allow accessing URL, XML version and original encoding through DocType object Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 2 13:45:01 2006 @@ -153,6 +153,25 @@ root_name = funicode(c_root_node.name) return (root_name, public_id, sys_url) + cdef getxmlinfo(self): + cdef xmlDoc* c_doc + c_doc = self._c_doc + if c_doc.version is NULL: + version = None + else: + version = c_doc.version + if c_doc.encoding is NULL: + encoding = None + else: + encoding = 
c_doc.encoding + return (version, encoding) + + cdef getURL(self): + if self._c_doc.URL is NULL: + return None + else: + return self._c_doc.URL + cdef buildNewPrefix(self): ns = python.PyString_FromFormat("ns%d", self._ns_counter) self._ns_counter = self._ns_counter + 1 @@ -227,6 +246,9 @@ cdef readonly object root_name cdef readonly object public_id cdef readonly object system_url + cdef readonly object xml_version + cdef readonly object encoding + cdef readonly object URL def __init__(self, tree): "Create a DocType object for an ElementTree object or root Element." cdef _Document doc @@ -234,6 +256,8 @@ self.root_name, self.public_id, self.system_url = doc.getdoctype() if not self.root_name and (self.public_id or self.system_url): raise ValueError, "Could not find root node" + self.xml_version, self.encoding = doc.getxmlinfo() + self.URL = doc.getURL() def __str__(self): if self.public_id: Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Tue May 2 13:45:01 2006 @@ -81,6 +81,8 @@ xmlDoc* doc xmlDict* dict xmlHashTable* ids + char* version + char* encoding char* URL void* _private xmlDtd* intSubset From scoder at codespeak.net Tue May 2 13:57:14 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 13:57:15 2006 Subject: [Lxml-checkins] r26660 - lxml/trunk/src/lxml Message-ID: <20060502115714.5ABA6100AA@code0.codespeak.net> Author: scoder Date: Tue May 2 13:57:13 2006 New Revision: 26660 Modified: lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/python.pxd Log: C-ification in _unwrapXPathObject(bool) Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Tue May 2 13:57:13 2006 @@ -227,7 +227,7 @@ elif xpathObj.type == xpath.XPATH_NODESET: 
return _createNodeSetResult(xpathObj, doc) elif xpathObj.type == xpath.XPATH_BOOLEAN: - return bool(xpathObj.boolval) + return python.PyBool_FromLong(xpathObj.boolval) elif xpathObj.type == xpath.XPATH_NUMBER: return xpathObj.floatval elif xpathObj.type == xpath.XPATH_STRING: Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Tue May 2 13:57:13 2006 @@ -17,6 +17,7 @@ cdef object PyString_FromStringAndSize(char* s, int size) cdef object PyString_FromString(char* s) cdef object PyString_FromFormat(char* format, ...) + cdef object PyBool_FromLong(long value) cdef int PyList_GET_SIZE(object l) cdef int PyList_Append(object l, object obj) From scoder at codespeak.net Tue May 2 13:57:43 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 13:57:44 2006 Subject: [Lxml-checkins] r26661 - lxml/trunk/doc Message-ID: <20060502115743.73997100AA@code0.codespeak.net> Author: scoder Date: Tue May 2 13:57:42 2006 New Revision: 26661 Modified: lxml/trunk/doc/api.txt Log: doc updates, more API doc tests Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 2 13:57:42 2006 @@ -60,12 +60,14 @@ The use of the libxml2 parsers makes some additional information available at the API level. 
Currently, ElementTree objects can access the DOCTYPE -information provided by a parsed document:: +information provided by a parsed document, as well as the XML version and the +original encoding:: >>> pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" >>> sys_url = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" >>> doctype_string = '' % (pub_id, sys_url) - >>> xhtml = doctype_string + '' + >>> xml_header = '' + >>> xhtml = xml_header + doctype_string + '' >>> et = lxml.etree.parse(StringIO(xhtml)) >>> doctype = et.doctype @@ -76,6 +78,11 @@ >>> str(doctype) == doctype_string True + >>> print doctype.xml_version + 1.0 + >>> print doctype.encoding + ascii + Error handling on exceptions ---------------------------- @@ -109,23 +116,20 @@ xpath method on ElementTree, Element ------------------------------------ -lxml.etree extends the ElementTree and Element interfaces with an -xpath method. For ElementTree, the xpath method performs a global -xpath query against the document. When xpath is used on an element, -the xpath expression is performed taking the element as the xpath -context node. - -You call the xpath() method with the XPath expression to use, and -optionally a second namespaces argument, which should be a dictionary -mapping namespace prefixes to be used in the XPath expression to -namespace URIs. +lxml.etree extends the ElementTree and Element interfaces with an xpath +method. For ElementTree, the xpath method performs a global xpath query +against the document. When xpath is used on an element, the xpath expression +is performed taking the element as the xpath context node. + +You call the xpath() method with the XPath expression to use. Optionally, you +can provide a second argument, which should be a dictionary mapping the +namespace prefixes used in the XPath expression to namespace URIs. 
-The return values of xpath vary, depending on the XPath expression -used: +The return values of xpath vary, depending on the XPath expression used: -* 1 or 0, when the XPath expression has a boolean result +* True or False, when the XPath expression has a boolean result -* a float, when the XPath expression has a floating point result +* a float, when the XPath expression has a numeric result (integer or float) * a (unicode) string, when the XPath expression has a string result. @@ -189,7 +193,7 @@ >>> doc = lxml.etree.parse(f) >>> result = transform(doc) -The result object can accessed like a normal ElementTree document:: +The result object can be accessed like a normal ElementTree document:: >>> result.getroot().text 'Text' @@ -230,9 +234,9 @@ '\nText\n' There's also a convenience method on the tree object for doing XSL -transformations. This is less efficient if you want to apply the same XSL -transformation to multiple documents, but is shorter to write, as you do not -have to instantiate a stylesheet yourself:: +transformations. This is less efficient if you want to apply the same XSL +transformation to multiple documents, but is shorter to write for one-shot +operations, as you do not have to instantiate a stylesheet yourself:: >>> result = doc.xslt(xslt_doc, a="'A'") >>> str(result) @@ -281,13 +285,18 @@ invalid! If you prefer getting an exception when validating, you can use the -assertValid method:: +``assert_`` or ``assertValid`` methods:: >>> relaxng.assertValid(doc2) Traceback (most recent call last): [...] DocumentInvalid: Document does not comply with schema + >>> relaxng.assert_(doc2) + Traceback (most recent call last): + [...] + AssertionError: Document does not comply with schema + Starting with version 0.9, lxml now has a simple API to report the errors generated by libxml2. 
If you want to find out why the validation failed in the second case, you can look up the error log of the validation process and check @@ -300,7 +309,9 @@ You can see that the error (ERROR) happened during RelaxNG validation (RELAXNGV). The message then tells you what went wrong. Note that this error is local to the RelaxNG object. It will only contain log entries that -appeares during the validation. +appeares during the validation. The DocumentInvalid exception raised by the +``assertValid`` method above provides access to the global error log (like all +other lxml exceptions). Similar to XSLT, there's also a less efficient but easier shortcut method to do one-shot RelaxNG validation:: @@ -356,13 +367,18 @@ invalid! If you prefer getting an exception when validating, you can use the -assertValid method:: +``assert_`` or ``assertValid`` methods:: >>> xmlschema.assertValid(doc2) Traceback (most recent call last): [...] DocumentInvalid: Document does not comply with schema + >>> xmlschema.assert_(doc2) + Traceback (most recent call last): + [...] + AssertionError: Document does not comply with schema + Error reporting works like for the RelaxNG class:: >>> log = xmlschema.error_log From scoder at codespeak.net Tue May 2 20:32:39 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 2 20:32:41 2006 Subject: [Lxml-checkins] r26682 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20060502183239.E5E36100BB@code0.codespeak.net> Author: scoder Date: Tue May 2 20:32:38 2006 New Revision: 26682 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: renamed 'doctype' attribute to 'docinfo': more generic name for more generic information Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue May 2 20:32:38 2006 @@ -7,8 +7,8 @@ Features added -------------- -* Read-only 'doctype' attribute in ElementTree class holds DOCTYPE information - as seen by the parser +* Read-only 'docinfo' attribute in ElementTree class holds DOCTYPE + information, original encoding and XML version as seen by the parser * etree module can be compiled without libxslt by commenting out the line 'include "xslt.pxi"' at the end of the etree.pyx source file Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 2 20:32:38 2006 @@ -70,17 +70,17 @@ >>> xhtml = xml_header + doctype_string + '' >>> et = lxml.etree.parse(StringIO(xhtml)) - >>> doctype = et.doctype - >>> print doctype.public_id + >>> docinfo = et.docinfo + >>> print docinfo.public_id -//W3C//DTD XHTML 1.0 Transitional//EN - >>> print doctype.system_url + >>> print docinfo.system_url http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd - >>> str(doctype) == doctype_string + >>> docinfo.doctype == doctype_string True - >>> print doctype.xml_version + >>> print docinfo.xml_version 1.0 - >>> print doctype.encoding + >>> print docinfo.encoding ascii Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 2 
20:32:38 2006 @@ -241,8 +241,8 @@ result._parser = parser.copy() return result -cdef class DocType: - "Hold Public ID and System URL of a DOCTYPE declaration." +cdef class DocInfo: + "Document information provided by parser and DTD." cdef readonly object root_name cdef readonly object public_id cdef readonly object system_url @@ -250,7 +250,7 @@ cdef readonly object encoding cdef readonly object URL def __init__(self, tree): - "Create a DocType object for an ElementTree object or root Element." + "Create a DocInfo object for an ElementTree object or root Element." cdef _Document doc doc = _documentOrRaise(tree) self.root_name, self.public_id, self.system_url = doc.getdoctype() @@ -259,19 +259,20 @@ self.xml_version, self.encoding = doc.getxmlinfo() self.URL = doc.getURL() - def __str__(self): - if self.public_id: - if self.system_url: - return '' % ( - self.root_name, self.public_id, self.system_url) + property doctype: + def __get__(self): + if self.public_id: + if self.system_url: + return '' % ( + self.root_name, self.public_id, self.system_url) + else: + return '' % ( + self.root_name, self.public_id) + elif self.system_url: + return '' % ( + self.root_name, self.system_url) else: - return '' % ( - self.root_name, self.public_id) - elif self.system_url: - return '' % ( - self.root_name, self.system_url) - else: - return "" + return "" cdef class _NodeBase: """Base class to reference a document object and a libxml node. @@ -309,14 +310,13 @@ def getroot(self): return self._context_node - property doctype: - """A tuple (public ID, system URL) of the DOCTYPE seen by the parser. - Any of the two may be None. This value is only defined for - ElementTree objects based on the root node of a parsed document (e.g. - those returned by the parse functions). + property docinfo: + """Information about the document provided by parser and DTD. This + value is only defined for ElementTree objects based on the root node + of a parsed document (e.g. 
those returned by the parse functions). """ def __get__(self): - return DocType(self._doc) + return DocInfo(self._doc) def write(self, file, encoding='us-ascii'): if not hasattr(file, 'write'): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue May 2 20:32:38 2006 @@ -380,43 +380,51 @@ self.assertEquals( 0, e.index(e[0], -12, -1)) - def test_doctype_public(self): + def test_docinfo_public(self): etree = self.etree + xml_header = '' pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" doctype_string = '' % (pub_id, sys_id) - xml = doctype_string + '' + xml = xml_header + doctype_string + '' tree = etree.parse(StringIO(xml)) - doctype = tree.doctype - self.assertEquals(doctype.public_id, pub_id) - self.assertEquals(doctype.system_url, sys_id) - self.assertEquals(doctype.root_name, 'html') - self.assertEquals(str(doctype), doctype_string) + docinfo = tree.docinfo + self.assertEquals(docinfo.encoding, "ascii") + self.assertEquals(docinfo.xml_version, "1.0") + self.assertEquals(docinfo.public_id, pub_id) + self.assertEquals(docinfo.system_url, sys_id) + self.assertEquals(docinfo.root_name, 'html') + self.assertEquals(docinfo.doctype, doctype_string) - def test_doctype_system(self): + def test_docinfo_system(self): etree = self.etree + xml_header = '' sys_id = "some.dtd" doctype_string = '' % sys_id - xml = doctype_string + '' + xml = xml_header + doctype_string + '' tree = etree.parse(StringIO(xml)) - doctype = tree.doctype - self.assertEquals(doctype.public_id, None) - self.assertEquals(doctype.system_url, sys_id) - self.assertEquals(doctype.root_name, 'html') - self.assertEquals(str(doctype), doctype_string) + docinfo = tree.docinfo + self.assertEquals(docinfo.encoding, "UTF-8") + self.assertEquals(docinfo.xml_version, "1.0") + 
self.assertEquals(docinfo.public_id, None) + self.assertEquals(docinfo.system_url, sys_id) + self.assertEquals(docinfo.root_name, 'html') + self.assertEquals(docinfo.doctype, doctype_string) - def test_doctype_empty(self): + def test_docinfo_empty(self): etree = self.etree xml = '' tree = etree.parse(StringIO(xml)) - doctype = tree.doctype - self.assertEquals(doctype.public_id, None) - self.assertEquals(doctype.system_url, None) - self.assertEquals(doctype.root_name, 'html') - self.assertEquals(str(doctype), '') + docinfo = tree.docinfo + self.assertEquals(docinfo.encoding, None) + self.assertEquals(docinfo.xml_version, "1.0") + self.assertEquals(docinfo.public_id, None) + self.assertEquals(docinfo.system_url, None) + self.assertEquals(docinfo.root_name, 'html') + self.assertEquals(docinfo.doctype, '') def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. From scoder at codespeak.net Wed May 3 09:10:53 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 09:10:55 2006 Subject: [Lxml-checkins] r26702 - lxml/trunk Message-ID: <20060503071053.0F074100BB@code0.codespeak.net> Author: scoder Date: Wed May 3 09:10:51 2006 New Revision: 26702 Modified: lxml/trunk/setup.py Log: check for SVN revision and store lxml version in lxml-version.h Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Wed May 3 09:10:51 2006 @@ -1,12 +1,33 @@ -import os +import sys, os, os.path, re def flags(cmd): wf, rf, ef = os.popen3(cmd) return rf.read().strip().split(' ') +src_dir = os.path.join(os.getcwd(), os.path.dirname(sys.argv[0])) +version = open(os.path.join(src_dir, 'version.txt')).read().strip() + +try: + svn_entries = open(os.path.join(src_dir, '.svn', 'entries')).read() +except IOError: + svn_version = version +else: + revision = re.search("]*name=\"\"[^>]*revision=\"([^\"]+)\"", + svn_entries).group(1) + 
svn_version = version + '-' + revision + +version_h = open(os.path.join(src_dir, 'src', 'lxml', 'lxml-version.h'), 'w') +version_h.write('''\ +#ifndef LXML_VERSION_STRING +#define LXML_VERSION_STRING "%s" +#endif +''' % svn_version) +version_h.close() + +print "Building lxml version", svn_version + setup_args = {} changelog_text = "" -version = open('version.txt').read().strip() try: from setuptools import setup From scoder at codespeak.net Wed May 3 09:12:57 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 09:13:00 2006 Subject: [Lxml-checkins] r26703 - lxml/trunk/src/lxml Message-ID: <20060503071257.EF3F9100BB@code0.codespeak.net> Author: scoder Date: Wed May 3 09:12:56 2006 New Revision: 26703 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xslt.pxd lxml/trunk/src/lxml/xslt.pxi Log: provide versions of lxml/libxml2/libxslt at module level Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 3 09:12:56 2006 @@ -56,6 +56,32 @@ class C14NError(LxmlError): pass +# version information +cdef __unpackDottedVersion(version): + version_list = [] + l = (version.replace('-', '.').split('.') + [0]*4)[:4] + for item in l: + try: + version_list.append(int(item)) + except ValueError: + version_list.append(item) + return tuple(version_list) + +cdef __unpackIntVersion(int c_version): + return ( + ((c_version / (100*100)) % 100), + ((c_version / 100) % 100), + (c_version % 100) + ) + +LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION) +try: + LIBXML_VERSION = __unpackIntVersion( + int((tree.xmlParserVersion).split('-')[0])) +except Exception: + LIBXML_VERSION = (0,0,0) +LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING) + # class for temporary storage of Python references cdef class _TempStore: Modified: lxml/trunk/src/lxml/tree.pxd 
============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Wed May 3 09:12:56 2006 @@ -5,7 +5,14 @@ cdef int strlen(char* s) cdef int strcmp(char* s1, char* s2) cdef int strncmp(char* s1, char* s2, int len) - + +cdef extern from "lxml-version.h": + cdef char* LXML_VERSION_STRING + +cdef extern from "libxml/xmlversion.h": + cdef char* xmlParserVersion + cdef int LIBXML_VERSION + cdef extern from "libxml/encoding.h": ctypedef struct xmlCharEncodingHandler cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) Modified: lxml/trunk/src/lxml/xslt.pxd ============================================================================== --- lxml/trunk/src/lxml/xslt.pxd (original) +++ lxml/trunk/src/lxml/xslt.pxd Wed May 3 09:12:56 2006 @@ -1,6 +1,12 @@ from tree cimport xmlDoc, xmlDict from xpath cimport xmlXPathContext, xmlXPathFunction +cdef extern from "libxslt/xslt.h": + cdef int xsltLibxsltVersion + +cdef extern from "libxslt/xsltconfig.h": + cdef int LIBXSLT_VERSION + cdef extern from "libxslt/xsltInternals.h": ctypedef struct xsltDocument: xmlDoc* doc Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 3 09:12:56 2006 @@ -17,6 +17,9 @@ class XSLTExtensionError(XSLTError): pass +# version information +LIBXSLT_COMPILED_VERSION = __unpackIntVersion(xslt.LIBXSLT_VERSION) +LIBXSLT_VERSION = __unpackIntVersion(xslt.xsltLibxsltVersion) cdef void _logLibxsltErrors(): xslt.xsltSetGenericErrorFunc(NULL, _receiveGenericError) From scoder at codespeak.net Wed May 3 09:21:49 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 09:21:51 2006 Subject: [Lxml-checkins] r26705 - lxml/trunk/src/lxml Message-ID: <20060503072149.73146100BB@code0.codespeak.net> Author: scoder Date: Wed May 3 09:21:47 2006 New 
Revision: 26705 Modified: lxml/trunk/src/lxml/parser.pxi Log: pass 'recover' option to _handleParseResult explicitly to override libxml2 parse options (relying on them fails on certain libxml2 versions) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Wed May 3 09:21:47 2006 @@ -147,9 +147,9 @@ c_ctxt._private = self._context cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result, - char* c_filename) except NULL: + char* c_filename, int recover) except NULL: cdef _ResolverContext context - if ctxt.wellFormed or (ctxt.options & xmlparser.XML_PARSE_RECOVER): + if ctxt.wellFormed or recover: __GLOBAL_PARSER_CONTEXT._initDocDict(result) elif result is not NULL: # free broken document @@ -254,7 +254,7 @@ """ cdef xmlDoc* result cdef xmlParserCtxt* pctxt - cdef int parse_error + cdef int recover self._error_log.connect() pctxt = self._memory_parser_ctxt if pctxt is NULL: @@ -264,11 +264,13 @@ result = xmlparser.xmlCtxtReadDoc( pctxt, c_text, NULL, NULL, self._parse_options) self._error_log.disconnect() - return _handleParseResult(pctxt, result, NULL) + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, NULL, recover) cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: cdef xmlDoc* result cdef xmlParserCtxt* pctxt + cdef int recover self._error_log.connect() pctxt = self._file_parser_ctxt if pctxt is NULL: @@ -278,13 +280,15 @@ result = xmlparser.xmlCtxtReadFile( pctxt, c_filename, NULL, self._parse_options) self._error_log.disconnect() - return _handleParseResult(pctxt, result, c_filename) + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, c_filename, recover) cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for 
XSLT cdef xmlParserCtxt* pctxt cdef xmlDoc* c_doc + cdef int recover pctxt = xmlparser.xmlNewParserCtxt() if pctxt is NULL: return NULL @@ -293,7 +297,8 @@ c_doc = xmlparser.xmlCtxtReadDoc( pctxt, c_text, NULL, NULL, options) try: - c_doc = _handleParseResult(pctxt, c_doc, NULL) + recover = options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(pctxt, c_doc, NULL, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -303,6 +308,7 @@ # internal parser function for XSLT cdef xmlParserCtxt* pctxt cdef xmlDoc* c_doc + cdef int recover pctxt = xmlparser.xmlNewParserCtxt() if pctxt is NULL: return NULL @@ -311,7 +317,8 @@ c_doc = xmlparser.xmlCtxtReadFile( pctxt, c_filename, NULL, options) try: - c_doc = _handleParseResult(pctxt, c_doc, c_filename) + recover = options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(pctxt, c_doc, c_filename, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -400,6 +407,7 @@ cdef xmlDoc* result cdef xmlParserCtxt* pctxt cdef int c_len + cdef int recover self._error_log.connect() pctxt = self._memory_parser_ctxt if pctxt is NULL: @@ -412,12 +420,13 @@ result = htmlparser.htmlCtxtReadDoc( pctxt, c_text, NULL, NULL, self._parse_options) self._error_log.disconnect() - return _handleParseResult(pctxt, result, NULL) + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, NULL, recover) cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: cdef xmlDoc* result cdef xmlParserCtxt* pctxt - cdef int parser_error + cdef int recover self._error_log.connect() pctxt = self._file_parser_ctxt if pctxt is NULL: @@ -433,7 +442,8 @@ result = htmlparser.htmlCtxtReadFile( pctxt, c_filename, NULL, self._parse_options) self._error_log.disconnect() - return _handleParseResult(pctxt, result, c_filename) + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, c_filename, recover) cdef HTMLParser 
__DEFAULT_HTML_PARSER __DEFAULT_HTML_PARSER = HTMLParser() From scoder at codespeak.net Wed May 3 09:48:41 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 09:48:42 2006 Subject: [Lxml-checkins] r26707 - lxml/trunk/src/lxml Message-ID: <20060503074841.EA90A100C6@code0.codespeak.net> Author: scoder Date: Wed May 3 09:48:40 2006 New Revision: 26707 Modified: lxml/trunk/src/lxml/parser.pxi Log: allow setting 'recover' option in XMLParser() Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Wed May 3 09:48:40 2006 @@ -208,7 +208,8 @@ cdef xmlParserCtxt* _file_parser_ctxt cdef xmlParserCtxt* _memory_parser_ctxt def __init__(self, attribute_defaults=False, dtd_validation=False, - load_dtd=False, no_network=False, ns_clean=False): + load_dtd=False, no_network=False, ns_clean=False, + recover=False): cdef int parse_options self._file_parser_ctxt = NULL BaseParser.__init__(self) @@ -226,6 +227,8 @@ parse_options = parse_options | xmlparser.XML_PARSE_NONET if ns_clean: parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN + if recover: + parse_options = parse_options | xmlparser.XML_PARSE_RECOVER self._parse_options = parse_options From scoder at codespeak.net Wed May 3 09:59:07 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 09:59:07 2006 Subject: [Lxml-checkins] r26708 - lxml/trunk/src/lxml/tests Message-ID: <20060503075907.26E2A100C6@code0.codespeak.net> Author: scoder Date: Wed May 3 09:59:06 2006 New Revision: 26708 Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py Log: updated test cases for broken HTML Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Wed May 3 09:59:06 
2006 @@ -15,7 +15,7 @@ etree = etree html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>
" - broken_html_str = "test<body><h1>page title</body></html>" + broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>" def tearDown(self): self.etree.set_default_parser() @@ -32,6 +32,13 @@ self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) + def test_module_parse_html_norecover(self): + parser = self.etree.HTMLParser(recover=False) + parse = self.etree.parse + f = StringIO(self.broken_html_str) + self.assertRaises(self.etree.XMLSyntaxError, + parse, f, parser) + def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) self.assertEqual(self.etree.tostring(element), @@ -39,8 +46,7 @@ def test_module_HTML_access(self): element = self.etree.HTML(self.html_str) - element = element[0][0] - self.assertEqual(element.tag, 'title') + self.assertEqual(element[0][0].tag, 'title') def test_module_parse_html(self): parser = self.etree.HTMLParser() From scoder at codespeak.net Wed May 3 10:37:45 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 10:37:46 2006 Subject: [Lxml-checkins] r26710 - in lxml/trunk: . doc Message-ID: <20060503083745.47A0D100C6@code0.codespeak.net> Author: scoder Date: Wed May 3 10:37:43 2006 New Revision: 26710 Modified: lxml/trunk/INSTALL.txt lxml/trunk/doc/api.txt Log: updated doctest for parsing broken HTML, mention that libxml2 2.6.21 is needed for this to work well Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Wed May 3 10:37:43 2006 @@ -8,12 +8,16 @@ You need libxml2 and libxslt, in particular: -* libxml 2.6.16 (newer versions are recommended). It can be found here: +* libxml 2.6.16 or later. It can be found here: http://xmlsoft.org/downloads.html -* libxslt 1.1.12 (newer versions are recommended). It can be found here: +* libxslt 1.1.12 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html +Newer versions generally contain less bugs and are therefore recommended. The +HTML parser benefits from libxml2 version 2.6.21 or later, which support +parsing horribly broken HTML. + For Windows, there is a `binary distribution`_ of libxml2 and libxslt. Note that you need both libxml2 and libxslt, as well as iconv and zlib. You can then install the `binary egg distribution`_ of lxml (see below). Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 3 10:37:43 2006 @@ -41,9 +41,11 @@ HTML parsing is similarly simple. The parsers have a ``recover`` keyword argument that the HTMLParser sets by default. It lets libxml2 try its best to -return something usable without raising an exception:: +return something usable without raising an exception. Note that this +functionality depends entirely on libxml2. 
You should use libxml2 version +2.6.21 or newer to take advantage of this feature:: - >>> broken_html = "<html><head><title>test<body><h1>page title" + >>> broken_html = "<html><head><title>test<body><h1>page title</h3>" >>> parser = lxml.etree.HTMLParser() >>> et = lxml.etree.parse(StringIO(broken_html), parser) From scoder at codespeak.net Wed May 3 11:00:31 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 11:00:32 2006 Subject: [Lxml-checkins] r26713 - lxml/trunk/doc Message-ID: <20060503090031.5400C100BF@code0.codespeak.net> Author: scoder Date: Wed May 3 11:00:30 2006 New Revision: 26713 Modified: lxml/trunk/doc/api.txt Log: doc updates: getting lxml version through API Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 3 11:00:30 2006 @@ -2,22 +2,29 @@ APIs specific to lxml ===================== -lxml tries to follow established APIs wherever possible. Sometimes -however the need to expose a feature in an easy way led to the -invention of a new API. +lxml tries to follow established APIs wherever possible. Sometimes, however, +the need to expose a feature in an easy way led to the invention of a new API. lxml.etree ========== -lxml.etree tries to follow the etree API wherever it can. There are -however some incompatibilities (see compatibility.txt). There are also -some extensions. +lxml.etree tries to follow the `ElementTree API`_ wherever it can. There are +however some incompatibilities (see compatibility.txt). The extensions are +documented here. + +.. _`ElementTree API`: http://effbot.org/zone/element-index.htm The following examples usually assume this to be executed first:: >>> import lxml.etree >>> from StringIO import StringIO +If you need to know which version of lxml is installed, you can access the +``lxml.etree.LXML_VERSION`` attribute to retrieve a version tuple. 
Note, +however, that it did not exist before version 1.0, so you will get an +AttributeError in older versions. The versions of libxml2 and libxslt are +available through the attributes ``LIBXML_VERSION`` and ``LIBXSLT_VERSION``. + Parsers ------- From scoder at codespeak.net Wed May 3 15:05:21 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 15:05:23 2006 Subject: [Lxml-checkins] r26722 - lxml/trunk/src/lxml/tests Message-ID: <20060503130521.31DED10088@code0.codespeak.net> Author: scoder Date: Wed May 3 15:05:19 2006 New Revision: 26722 Modified: lxml/trunk/src/lxml/tests/test_xslt.py Log: fix broken XSLT test case: should test for broken XSLT, not broken XML Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Wed May 3 15:05:19 2006 @@ -94,10 +94,10 @@ def test_xslt_broken(self): tree = self.parse('<a/>') style = self.parse('''\ -<xslt:stylesheet version="1.0" +<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:foo /> -</xslt:stylesheet>''') +</xsl:stylesheet>''') self.assertRaises(etree.XSLTParseError, etree.XSLT, style) From scoder at codespeak.net Wed May 3 17:06:35 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 17:06:37 2006 Subject: [Lxml-checkins] r26725 - lxml/trunk/src/lxml Message-ID: <20060503150635.E0C0B100A8@code0.codespeak.net> Author: scoder Date: Wed May 3 17:06:32 2006 New Revision: 26725 Modified: lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlerror.pxi lxml/trunk/src/lxml/xmlparser.pxd Log: check error message arguments in _receiveGenericError() Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Wed May 3 17:06:32 2006 @@ -3,6 +3,7 @@ cdef 
extern from "stdio.h": ctypedef struct FILE cdef int strlen(char* s) + cdef char* strstr(char* haystack, char* needle) cdef int strcmp(char* s1, char* s2) cdef int strncmp(char* s1, char* s2, int len) Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Wed May 3 17:06:32 2006 @@ -267,27 +267,45 @@ log_handler = __GLOBAL_ERROR_LOG cstd.va_start(args, msg) - c_text = cstd.va_charptr(args) - c_filename = cstd.va_charptr(args) - c_line = cstd.va_int(args) - c_element = cstd.va_charptr(args) - cstd.va_end(args) - - if c_text is NULL: - message = None - elif c_element is NULL: - message = funicode(c_text) + if tree.strncmp(msg, '%s:', 3) == 0: + c_text = cstd.va_charptr(args) + else: + c_text = NULL + if tree.strstr(msg, 'file %s') is not NULL: + c_filename = cstd.va_charptr(args) + else: + c_filename = NULL + if tree.strstr(msg, 'line %d') is not NULL: + c_line = cstd.va_int(args) + else: + c_line = -1 + if tree.strstr(msg, 'element %s') is not NULL: + c_element = cstd.va_charptr(args) else: - message = "%s (element '%s')" % ( - funicode(c_text), funicode(c_element)) + c_element = NULL + cstd.va_end(args) - if c_filename is not NULL and tree.strlen(c_filename) > 0: - if tree.strncmp(c_filename, 'XSLT:', 5) == 0: - filename = '<xslt>' + try: + if c_text is NULL: + message = None + elif c_element is NULL: + message = funicode(c_text) else: - filename = funicode(c_filename) - else: - filename = None + message = "%s (element '%s')" % ( + funicode(c_text), funicode(c_element)) + except UnicodeDecodeError: + message = "<undecodable message>" + + try: + if c_filename is not NULL and tree.strlen(c_filename) > 0: + if tree.strncmp(c_filename, 'XSLT:', 5) == 0: + filename = '<xslt>' + else: + filename = funicode(c_filename) + else: + filename = None + except UnicodeDecodeError: + filename = "<undecodable filename>" 
log_handler._receiveGeneric(xmlerror.XML_FROM_XSLT, xmlerror.XML_ERR_OK, @@ -306,6 +324,9 @@ # init global logging initThreadLogging() +# switch on line number reporting +xmlparser.xmlLineNumbersDefault(1) + ################################################################################ ## CONSTANTS FROM "xmlerror.pxd" ################################################################################ Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Wed May 3 17:06:32 2006 @@ -39,9 +39,15 @@ #XML_PARSE_COMPACT = 65536 # compact small text nodes cdef void xmlInitParser() + cdef int xmlLineNumbersDefault(int onoff) cdef xmlParserCtxt* xmlNewParserCtxt() + cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt) - + cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt, + char* chunk, int size, + char* filename, char* encoding) + cdef int xmlParseChunk(xmlParserCtxt* ctxt, + char* chunk, int size, int terminate) cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt, char* cur, char* URL, char* encoding, int options) From scoder at codespeak.net Wed May 3 17:40:12 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 17:40:14 2006 Subject: [Lxml-checkins] r26726 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20060503154012.02F811008F@code0.codespeak.net> Author: scoder Date: Wed May 3 17:40:10 2006 New Revision: 26726 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/common_imports.py lxml/trunk/src/lxml/tests/test_io.py Log: rewrite of file-like object reading in XML parser * use StringIO.getvalue() for StingIO's iff we read from the start * use libxml2 chunk parser for any other file-like object to avoid reading the whole string into memory * test with LargeFileLike object * try to provide file URLs to parser wherever possible Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 3 17:40:10 2006 @@ -1357,12 +1357,12 @@ def HTML(text): cdef _Document doc - doc = _parseMemoryDocument(text, __DEFAULT_HTML_PARSER) + doc = _parseMemoryDocument(text, None, __DEFAULT_HTML_PARSER) return doc.getroot() def XML(text): cdef _Document doc - doc = _parseMemoryDocument(text, __DEFAULT_XML_PARSER) + doc = _parseMemoryDocument(text, None, __DEFAULT_XML_PARSER) return doc.getroot() fromstring = XML Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Wed May 3 17:40:10 2006 @@ -13,14 +13,14 @@ # HTML_PARSE_RECOVER # Relaxed parsing # HTML_PARSE_COMPACT # compact small text nodes - xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size) - xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, char* encoding) - void htmlFreeParserCtxt(xmlParserCtxt* ctxt) - int htmlParseDocument(xmlParserCtxt* ctxt) + cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size) + cdef xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, char* encoding) + cdef void 
htmlFreeParserCtxt(xmlParserCtxt* ctxt) + cdef int htmlParseDocument(xmlParserCtxt* ctxt) - xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, - char* filename, char* encoding, - int options) - xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt, - char* buffer, char* URL, char* encoding, - int options) + cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, + char* filename, char* encoding, + int options) + cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt, + char* buffer, char* URL, char* encoding, + int options) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Wed May 3 17:40:10 2006 @@ -191,6 +191,9 @@ xmlparser.XML_PARSE_NOERROR ) +cdef object __FILE_READ_CHUNK_SIZE +__FILE_READ_CHUNK_SIZE = 32768 + cdef class XMLParser(BaseParser): """The XML parser. Parsers can be supplied as additional argument to various parse functions of the lxml API. A default parser is always @@ -207,6 +210,7 @@ cdef int _parse_options cdef xmlParserCtxt* _file_parser_ctxt cdef xmlParserCtxt* _memory_parser_ctxt + cdef xmlParserCtxt* _push_parser_ctxt def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, recover=False): @@ -237,6 +241,8 @@ xmlparser.xmlFreeParserCtxt(self._file_parser_ctxt) if self._memory_parser_ctxt != NULL: xmlparser.xmlFreeParserCtxt(self._memory_parser_ctxt) + if self._push_parser_ctxt != NULL: + xmlparser.xmlFreeParserCtxt(self._push_parser_ctxt) def copy(self): cdef XMLParser parser @@ -252,7 +258,7 @@ raise ParserError, "Failed to create parser context" return pctxt - cdef xmlDoc* _parseDoc(self, char* c_text) except NULL: + cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: """Parse document, share dictionary if possible. 
""" cdef xmlDoc* result @@ -265,7 +271,7 @@ self._memory_parser_ctxt = pctxt self._initContext(pctxt) result = xmlparser.xmlCtxtReadDoc( - pctxt, c_text, NULL, NULL, self._parse_options) + pctxt, c_text, c_filename, NULL, self._parse_options) self._error_log.disconnect() recover = self._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(pctxt, result, NULL, recover) @@ -286,6 +292,53 @@ recover = self._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(pctxt, result, c_filename, recover) + cdef xmlDoc* _parseDocFromFilelike(self, filelike, + char* c_filename) except NULL: + cdef xmlDoc* result + cdef xmlParserCtxt* pctxt + cdef int recover + cdef int success + self._error_log.connect() + pctxt = self._push_parser_ctxt + if pctxt is NULL: + pctxt = self._createContext() + self._push_parser_ctxt = pctxt + self._initContext(pctxt) + result = NULL + success = xmlparser.xmlCtxtResetPush(pctxt, NULL, 0, c_filename, NULL) + if success != 0: + self._error_log.disconnect() + raise ParserError, "Failed to setup parser context" + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + + try: + read = filelike.read + data = read(__FILE_READ_CHUNK_SIZE) + if python.PyUnicode_Check(data): + data = _stripDeclaration(_utf8(data)) + while data: + if python.PyUnicode_Check(data): + data = _utf8(data) + elif not python.PyString_Check(data): + raise TypeError, "File-like objects must return string or unicode" + success = xmlparser.xmlParseChunk(pctxt, _cstr(data), len(data), 0) + if success != 0: + return _handleParseResult(pctxt, NULL, c_filename, 0) + data = read(__FILE_READ_CHUNK_SIZE) + xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + except Exception: + if pctxt.myDoc is not NULL: + tree.xmlFreeDoc(pctxt.myDoc) + pctxt.myDoc = NULL + self._error_log.disconnect() + raise + + self._error_log.disconnect() + result = pctxt.myDoc + pctxt.myDoc = NULL + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, 
result, c_filename, recover) + cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for XSLT @@ -404,7 +457,7 @@ parser._parse_options = self._parse_options return parser - cdef xmlDoc* _parseDoc(self, char* c_text) except NULL: + cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: """Parse HTML document, share dictionary if possible. """ cdef xmlDoc* result @@ -421,7 +474,7 @@ self._memory_parser_ctxt = pctxt self._initContext(pctxt) result = htmlparser.htmlCtxtReadDoc( - pctxt, c_text, NULL, NULL, self._parse_options) + pctxt, c_text, c_filename, NULL, self._parse_options) self._error_log.disconnect() recover = self._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(pctxt, result, NULL, recover) @@ -455,14 +508,19 @@ ## helper functions for document creation ############################################################ -cdef xmlDoc* _parseDoc(text_utf, parser) except NULL: +cdef xmlDoc* _parseDoc(text_utf, filename, parser) except NULL: + cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER __GLOBAL_PARSER_CONTEXT._initParser() + if not filename: + c_filename = NULL + else: + c_filename = _cstr(filename) if isinstance(parser, XMLParser): - return (<XMLParser>parser)._parseDoc(_cstr(text_utf)) + return (<XMLParser>parser)._parseDoc(_cstr(text_utf), c_filename) elif isinstance(parser, HTMLParser): - return (<HTMLParser>parser)._parseDoc(_cstr(text_utf)) + return (<HTMLParser>parser)._parseDoc(_cstr(text_utf), c_filename) else: raise TypeError, "invalid parser" @@ -477,6 +535,23 @@ else: raise TypeError, "invalid parser" +cdef xmlDoc* _parseDocFromFilelike(source, filename, parser) except NULL: + cdef char* c_filename + if parser is None: + parser = __DEFAULT_PARSER + __GLOBAL_PARSER_CONTEXT._initParser() + if not filename: + c_filename = NULL + else: + c_filename = _cstr(filename) + if isinstance(parser, XMLParser): + return 
(<XMLParser>parser)._parseDocFromFilelike(source, c_filename) + elif isinstance(parser, HTMLParser): + data = source.read() + return (<HTMLParser>parser)._parseDoc(_cstr(data), c_filename) + else: + raise TypeError, "invalid parser" + cdef xmlDoc* _newDoc(): cdef xmlDoc* result result = tree.xmlNewDoc("1.0") @@ -490,9 +565,14 @@ cdef _Document _parseDocument(source, parser): cdef xmlDoc* c_doc filename = _getFilenameForFile(source) + if hasattr(source, 'getvalue') and hasattr(source, 'tell'): + # StringIO - reading from start? + if source.tell() == 0: + return _parseMemoryDocument(source.getvalue(), filename, parser) + # Support for unamed file-like object (StringIO, urlgrabber.urlopen, ...) if not filename and hasattr(source, 'read'): - return _parseMemoryDocument(source.read(), parser) + return _parseFilelikeDocument(source, filename, parser) # Otherwise parse the file directly from the filesystem if filename is None: @@ -501,10 +581,14 @@ c_doc = _parseDocFromFile(_utf8(filename), parser) return _documentFactory(c_doc, parser) -cdef _Document _parseMemoryDocument(text, parser): +cdef _Document _parseMemoryDocument(text, url, parser): cdef xmlDoc* c_doc if python.PyUnicode_Check(text): text = _stripDeclaration(_utf8(text)) - c_doc = _parseDoc(text, parser) + c_doc = _parseDoc(text, url, parser) return _documentFactory(c_doc, parser) +cdef _Document _parseFilelikeDocument(source, filename, parser): + cdef xmlDoc* c_doc + c_doc = _parseDocFromFilelike(source, filename, parser) + return _documentFactory(c_doc, parser) Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Wed May 3 17:40:10 2006 @@ -20,14 +20,57 @@ class SillyFileLike: def __init__(self, xml_data='<foo><bar/></foo>'): self.xml_data = xml_data - self.done = False def read(self, amount=None): - if not self.done: - 
self.done = True - return self.xml_data + if self.xml_data: + if amount: + data = self.xml_data[:amount] + self.xml_data = self.xml_data[amount:] + else: + data = self.xml_data + self.xml_data = '' + return data return '' +class LargeFileLike: + def __init__(self, charlen=100, depth=4, children=10): + self.data = StringIO() + self.chars = 'a' * charlen + self.children = range(children) + self.more = self.iterelements(depth) + + def iterelements(self, depth): + yield '<root>' + depth -= 1 + if depth > 0: + for child in self.children: + for element in self.iterelements(depth): + yield element + yield self.chars + else: + yield self.chars + yield '</root>' + + def read(self, amount=None): + data = self.data + append = data.write + if amount: + for element in self.more: + append(element) + if data.tell() >= amount: + break + else: + for element in self.more: + append(element) + result = data.getvalue() + if amount: + self.data = StringIO(result[amount:]) + result = result[:amount] + else: + data.seek(0) + data.truncate() + return result + def fileInTestDir(name): _testdir = os.path.split(__file__)[0] return os.path.join(_testdir, name) Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Wed May 3 17:40:10 2006 @@ -7,7 +7,8 @@ import unittest import tempfile, gzip -from common_imports import etree, ElementTree, fileInTestDir, SillyFileLike +from common_imports import etree, ElementTree, fileInTestDir +from common_imports import SillyFileLike, LargeFileLike class IOTestCaseBase(unittest.TestCase): """(c)ElementTree compatibility for IO functions/methods @@ -84,6 +85,29 @@ root = self.etree.ElementTree().parse(f) self.assert_(root.tag.endswith('foo')) + def test_module_parse_large_fileobject(self): + # parse from unamed file object + f = LargeFileLike() + tree = self.etree.parse(f) + root = tree.getroot() + 
self.assert_(root.tag.endswith('root')) + + def test_module_parse_fileobject_error(self): + class LocalError(Exception): + pass + class TestFile: + def read(*args): + raise LocalError + f = TestFile() + self.assertRaises(LocalError, self.etree.parse, f) + + def test_module_parse_fileobject_type_error(self): + class TestFile: + def read(*args): + return 1 + f = TestFile() + self.assertRaises(TypeError, self.etree.parse, f) + class ETreeIOTestCase(IOTestCaseBase): etree = etree From scoder at codespeak.net Wed May 3 17:47:34 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 17:47:36 2006 Subject: [Lxml-checkins] r26727 - lxml/trunk Message-ID: <20060503154734.B81871008F@code0.codespeak.net> Author: scoder Date: Wed May 3 17:47:33 2006 New Revision: 26727 Modified: lxml/trunk/CHANGES.txt Log: updated CHANGES.txt Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 3 17:47:33 2006 @@ -7,6 +7,10 @@ Features added -------------- +* Parsing file-like objects now reads chunks rather than the whole file + +* Parsing StringIO objects from the start avoid copying the string + * Read-only 'docinfo' attribute in ElementTree class holds DOCTYPE information, original encoding and XML version as seen by the parser From scoder at codespeak.net Wed May 3 17:53:03 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 17:53:05 2006 Subject: [Lxml-checkins] r26728 - lxml/trunk/src/lxml/tests Message-ID: <20060503155303.E66451008F@code0.codespeak.net> Author: scoder Date: Wed May 3 17:53:03 2006 New Revision: 26728 Modified: lxml/trunk/src/lxml/tests/test_io.py Log: new test case for exception after parsing has started Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ 
lxml/trunk/src/lxml/tests/test_io.py Wed May 3 17:53:03 2006 @@ -101,6 +101,23 @@ f = TestFile() self.assertRaises(LocalError, self.etree.parse, f) + def test_module_parse_fileobject_late_error(self): + class LocalError(Exception): + pass + class TestFile: + data = '<root>test</' + next_char = iter(data).next + counter = 0 + def read(self, *args): + try: + self.counter += 1 + return self.next_char() + except StopIteration: + raise LocalError + f = TestFile() + self.assertRaises(LocalError, self.etree.parse, f) + self.assertEquals(f.counter, len(f.data)+1) + def test_module_parse_fileobject_type_error(self): class TestFile: def read(*args): From scoder at codespeak.net Wed May 3 17:56:03 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 3 17:56:04 2006 Subject: [Lxml-checkins] r26729 - lxml/trunk Message-ID: <20060503155603.5A8C71008F@code0.codespeak.net> Author: scoder Date: Wed May 3 17:56:00 2006 New Revision: 26729 Modified: lxml/trunk/CHANGES.txt Log: typo Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 3 17:56:00 2006 @@ -9,7 +9,7 @@ * Parsing file-like objects now reads chunks rather than the whole file -* Parsing StringIO objects from the start avoid copying the string +* Parsing StringIO objects from the start avoids copying the string * Read-only 'docinfo' attribute in ElementTree class holds DOCTYPE information, original encoding and XML version as seen by the parser From scoder at codespeak.net Thu May 4 08:19:33 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 4 08:19:36 2006 Subject: [Lxml-checkins] r26748 - lxml/trunk Message-ID: <20060504061933.DE359100A0@code0.codespeak.net> Author: scoder Date: Thu May 4 08:19:31 2006 New Revision: 26748 Modified: lxml/trunk/CHANGES.txt Log: doc updates Modified: lxml/trunk/CHANGES.txt 
============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 4 08:19:31 2006 @@ -15,15 +15,15 @@ information, original encoding and XML version as seen by the parser * etree module can be compiled without libxslt by commenting out the line - 'include "xslt.pxi"' at the end of the etree.pyx source file + 'include "xslt.pxi"' near the end of the etree.pyx source file * Error reporting now also works in XSLT * Support for custom document loaders (URI resolvers) in parsers and XSLT, resolvers are registered at parser level -* Exslt:regexp implementation for XSLT based on the Python 're' module - on by default, can be switched off with 'regexp=False' keyword argument +* Implementation of exslt:regexp for XSLT based on the Python 're' module, + enabled by default, can be switched off with 'regexp=False' keyword argument * Support for exslt extensions (libexslt) and node-set function From scoder at codespeak.net Thu May 4 08:27:00 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 4 08:27:03 2006 Subject: [Lxml-checkins] r26749 - lxml/trunk/src/lxml Message-ID: <20060504062700.13A5E10090@code0.codespeak.net> Author: scoder Date: Thu May 4 08:26:58 2006 New Revision: 26749 Modified: lxml/trunk/src/lxml/parser.pxi Log: parse as file-like object even if filename is known (there might be a reason the user gave us a file-like) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu May 4 08:26:58 2006 @@ -570,8 +570,8 @@ if source.tell() == 0: return _parseMemoryDocument(source.getvalue(), filename, parser) - # Support for unamed file-like object (StringIO, urlgrabber.urlopen, ...) - if not filename and hasattr(source, 'read'): + # Support for file-like objects (urlgrabber.urlopen, ...) 
+ if hasattr(source, 'read'): return _parseFilelikeDocument(source, filename, parser) # Otherwise parse the file directly from the filesystem From scoder at codespeak.net Thu May 4 08:56:44 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 4 08:56:47 2006 Subject: [Lxml-checkins] r26750 - lxml/trunk/src/lxml Message-ID: <20060504065644.DA45E10092@code0.codespeak.net> Author: scoder Date: Thu May 4 08:56:41 2006 New Revision: 26750 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: support finding file-like URL for urllib2 handlers Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Thu May 4 08:56:41 2006 @@ -348,6 +348,9 @@ # gzip file instances have a filename attribute if hasattr(source, 'filename'): return source.filename + # urllib2 + if hasattr(source, 'geturl'): + return source.geturl() return None cdef void changeDocumentBelow(_NodeBase node, _Document doc, int recursive): From scoder at codespeak.net Thu May 4 08:58:09 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 4 08:58:12 2006 Subject: [Lxml-checkins] r26751 - lxml/trunk/src/lxml Message-ID: <20060504065809.2CC2610092@code0.codespeak.net> Author: scoder Date: Thu May 4 08:58:04 2006 New Revision: 26751 Modified: lxml/trunk/src/lxml/parser.pxi Log: convert URLs/filenames to UTF-8 in document parser functions Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu May 4 08:58:04 2006 @@ -585,10 +585,14 @@ cdef xmlDoc* c_doc if python.PyUnicode_Check(text): text = _stripDeclaration(_utf8(text)) + if url is not None: + url = _utf8(url) c_doc = _parseDoc(text, url, parser) return _documentFactory(c_doc, parser) -cdef _Document _parseFilelikeDocument(source, 
filename, parser): +cdef _Document _parseFilelikeDocument(source, url, parser): cdef xmlDoc* c_doc - c_doc = _parseDocFromFilelike(source, filename, parser) + if url is not None: + url = _utf8(url) + c_doc = _parseDocFromFilelike(source, url, parser) return _documentFactory(c_doc, parser) From scoder at codespeak.net Thu May 4 12:11:04 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 4 12:11:06 2006 Subject: [Lxml-checkins] r26755 - lxml/trunk/doc Message-ID: <20060504101104.E129410092@code0.codespeak.net> Author: scoder Date: Thu May 4 12:11:03 2006 New Revision: 26755 Modified: lxml/trunk/doc/extensions.txt Log: loads of restructuring and clarifications in doc/extensions.txt Modified: lxml/trunk/doc/extensions.txt ============================================================================== --- lxml/trunk/doc/extensions.txt (original) +++ lxml/trunk/doc/extensions.txt Thu May 4 12:11:03 2006 @@ -9,13 +9,15 @@ Here is how such a function looks like. As the first argument, it always receives a dummy object. It is currently None, but do not rely on this as it may become meaningful in later versions of lxml. The other arguments are -provided by the respective call in the XPath expression. Any number of -arguments is allowed:: +provided by the respective call in the XPath expression, one in the following +examples. Any number of arguments is allowed:: >>> def hello(dummy, a): ... return "Hello %s" % a >>> def ola(dummy, a): ... return "Ola %s" % a + >>> def loadsofargs(dummy, *args): + ... return "Got %d arguments." % len(args) The FunctionNamespace @@ -29,10 +31,12 @@ >>> from lxml import etree >>> ns = etree.FunctionNamespace(None) >>> ns['hello'] = hello + >>> ns['countargs'] = loadsofargs -This registers the function 'foo' with the name 'myfunction' in the default -namespace. 
Now we're going to create a document that we can run XPath -expressions against:: +This registers the function `hello` with the name `hello` in the default +namespace (None), and the function `loadsofargs` with the name `countargs`. +Now we're going to create a document that we can run XPath expressions +against:: >>> from lxml import etree >>> from StringIO import StringIO @@ -48,15 +52,18 @@ Hello b >>> print root.xpath('hello(string(b))') Hello Haegar + >>> print root.xpath('countargs(., b, ./*)') + Got 3 arguments. -Note how we call both a Python function (hello) and an XPath built-in function -(local-name) in exactly the same way. Normally, however, you would want to -separate the two in different namespaces. The FunctionNamespace class allows -you to do this:: +Note how we call both a Python function (`hello`) and an XPath built-in +function (`string`) in exactly the same way. Normally, however, you would +want to separate the two in different namespaces. The FunctionNamespace class +allows you to do this:: >>> ns = etree.FunctionNamespace('http://mydomain.org/myfunctions') >>> ns['hello'] = hello - >>> print root.xpath('f:hello(local-name(*))', {'f' : 'http://mydomain.org/myfunctions'}) + >>> prefixmap = {'f' : 'http://mydomain.org/myfunctions'} + >>> print root.xpath('f:hello(local-name(*))', prefixmap) Hello b @@ -84,39 +91,11 @@ would rather complicate things than be of any help. -What to return from a function ------------------------------- - -Extension functions can return any data type for which there is an XPath -equivalent. This includes numbers, boolean values, elements and lists of -elements:: - - >>> def returnsFloat(_): - ... return 1.7 - >>> def returnsBool(_): - ... return True - >>> def returnFirstNode(_, nodes): - ... 
return nodes[0] - - >>> ns = etree.FunctionNamespace(None) - >>> ns['float'] = returnsFloat - >>> ns['bool'] = returnsBool - >>> ns['first'] = returnFirstNode - - >>> e = etree.XPathEvaluator(doc) - >>> e.evaluate("float()") - 1.7 - >>> e.evaluate("bool()") - True - >>> e.evaluate("count(first(//b))") - 1.0 - - Evaluators and XSLT ------------------- Extension functions work for all ways of evaluating XPath expressions and for -XSLT execution:: +XSL transformations:: >>> e = etree.XPathEvaluator(doc) >>> print e.evaluate('es:hello(local-name(/a))') @@ -142,64 +121,120 @@ It is also possible to register namespaces with a single evaluator. While the following example involves no functions, the idea should still be clear:: - >>> f = StringIO('<hey:a xmlns:hey="http://mydomain.org/myfunctions" />') + >>> f = StringIO('<a xmlns="http://mydomain.org/myfunctions" />') >>> ns_doc = etree.parse(f) >>> e = etree.XPathEvaluator(ns_doc) >>> e.evaluate('/a') [] -This obviously returns nothing, but when we register the namespace with the -evaluator, we can access it via a prefix. Note that this prefix mapping is -only known to this evaluator, as opposed to the global mapping of the -FunctionNamespace objects:: +This returns nothing, as we did not ask for the right namespace. When we +register the namespace with the evaluator, we can access it via a prefix:: >>> e.registerNamespace('foo', 'http://mydomain.org/myfunctions') >>> e.evaluate('/foo:a')[0].tag '{http://mydomain.org/myfunctions}a' +Note that this prefix mapping is only known to this evaluator, as opposed to +the global mapping of the FunctionNamespace objects:: + + >>> e2 = etree.XPathEvaluator(ns_doc) + >>> e2.evaluate('/foo:a') + Traceback (most recent call last): + ... + XPathSyntaxError: Error in xpath expression. -BETA Features -------------- -Note: the following features are still in beta state. They may not work as -expected. 
+What to return from a function +------------------------------ + +Extension functions can return any data type for which there is an XPath +equivalent. This includes numbers, boolean values, elements and lists of +elements. Note that integers will also be returned as floats:: + + >>> def returnsFloat(_): + ... return 1.7 + >>> def returnsInteger(_): + ... return 1 + >>> def returnsBool(_): + ... return True + >>> def returnFirstNode(_, nodes): + ... return nodes[0] -It is possible to return lists of newly created nodes as XML structures:: + >>> ns = etree.FunctionNamespace(None) + >>> ns['float'] = returnsFloat + >>> ns['int'] = returnsInteger + >>> ns['bool'] = returnsBool + >>> ns['first'] = returnFirstNode - >>> def returnsNodeSet(evaluator): - ... results = etree.Element('results') - ... result = etree.SubElement(results, 'result') - ... result.text = "Alpha" - ... result2 = etree.SubElement(results, 'result') - ... result2.text = "Beta" - ... result3 = etree.SubElement(results, 'result') - ... result3.text = "Gamma" - ... return [results] - >>> extension4 = { (None, 'returnsNodeSet') : returnsNodeSet } - >>> e = etree.XPathEvaluator(doc, None, extensions=[extension4]) - >>> r = e.evaluate("returnsNodeSet()") - >>> len(r) + >>> e = etree.XPathEvaluator(doc) + >>> e.evaluate("float()") + 1.7 + >>> e.evaluate("int()") + 1.0 + >>> int( e.evaluate("int()") ) 1 - >>> t = r[0] - >>> t.tag - 'results' - >>> len(t) - 3 - >>> t[0].tag - 'result' - >>> t[0].text + >>> e.evaluate("bool()") + True + >>> e.evaluate("count(first(//b))") + 1.0 + +As the last example shows, you can pass the results of functions back into +the XPath expression. Elements and sequences of elements are treated as +XPath node-sets:: + + >>> def returnsNodeSet(_): + ... results1 = etree.Element('results1') + ... etree.SubElement(results1, 'result').text = "Alpha" + ... etree.SubElement(results1, 'result').text = "Beta" + ... + ... results2 = etree.Element('results2') + ... 
etree.SubElement(results2, 'result').text = "Gamma" + ... etree.SubElement(results2, 'result').text = "Delta" + ... + ... results3 = etree.SubElement(results2, 'subresult') + ... return [results1, results2, results3] + + >>> ns['new-node-set'] = returnsNodeSet + + >>> e = etree.XPathEvaluator(doc, None) + + >>> r = e.evaluate("new-node-set()/result") + >>> print [ t.text for t in r ] + ['Alpha', 'Beta', 'Gamma', 'Delta'] + + >>> r = e.evaluate("new-node-set()") + >>> print [ t.tag for t in r ] + ['results1', 'results2', 'subresult'] + >>> print [ len(t) for t in r ] + [2, 3, 0] + >>> r[0][0].text 'Alpha' - >>> t[1].text - 'Beta' -It's even possible to filter that result set with another XPath expression:: + >>> print etree.tostring(r[0]) + <results1><result>Alpha</result><result>Beta</result></results1> - >>> r = e.evaluate("returnsNodeSet()/result") - >>> len(r) - 3 - >>> r[0].tag - 'result' - >>> r[1].tag - 'result' - >>> r[0].text - 'Alpha' + >>> print etree.tostring(r[1]) + <results2><result>Gamma</result><result>Delta</result><subresult/></results2> + + >>> print etree.tostring(r[2]) + <subresult/> + +The current implementation deep-copies newly created elements in node-sets. +Only the elements and their children are passed on, no outlying parents or +tail texts will be available in the result. This also means that in the above +example, the `subresult` elements in `results2` and `results3` are no longer +identical within the node-set, they belong to independent trees:: + + >>> print r[1][-1].tag, r[2].tag + subresult subresult + >>> print r[1][-1] == r[2] + False + >>> print r[1][-1].getparent().tag + results2 + >>> print r[2].getparent() + None + +This is an implementation detail that you should be aware of, but you should +avoid relying on it in your code. Note that elements taken from the source +document (the most common case) do not suffer from this restriction. They +will always be passed unchanged. 
From scoder at codespeak.net Fri May 5 08:49:58 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 08:50:02 2006 Subject: [Lxml-checkins] r26788 - lxml/trunk/src/lxml/tests Message-ID: <20060505064958.7CE8A10088@code0.codespeak.net> Author: scoder Date: Fri May 5 08:49:55 2006 New Revision: 26788 Modified: lxml/trunk/src/lxml/tests/test_unicode.py Log: fix file encoding to use UTF-8 Modified: lxml/trunk/src/lxml/tests/test_unicode.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_unicode.py (original) +++ lxml/trunk/src/lxml/tests/test_unicode.py Fri May 5 08:49:55 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- import unittest, doctest from StringIO import StringIO @@ -6,7 +6,7 @@ ascii_uni = u'a' -uni = u'?\uF8D2' # klingon etc. +uni = u'??\uF8D2' # klingon etc. class UnicodeTestCase(unittest.TestCase): def test_unicode_xml(self): From scoder at codespeak.net Fri May 5 09:06:26 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 09:06:30 2006 Subject: [Lxml-checkins] r26789 - lxml/trunk/src/lxml/tests Message-ID: <20060505070626.9103B10086@code0.codespeak.net> Author: scoder Date: Fri May 5 09:06:19 2006 New Revision: 26789 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_errors.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_io.py lxml/trunk/src/lxml/tests/test_nsclasses.py lxml/trunk/src/lxml/tests/test_relaxng.py lxml/trunk/src/lxml/tests/test_sax.py lxml/trunk/src/lxml/tests/test_xmlschema.py lxml/trunk/src/lxml/tests/test_xpathevaluator.py lxml/trunk/src/lxml/tests/test_xslt.py Log: fix file encoding header for Emacs Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri 
May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Tests for the ElementTree API Modified: lxml/trunk/src/lxml/tests/test_errors.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_errors.py (original) +++ lxml/trunk/src/lxml/tests/test_errors.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -?# -*- coding: UTF-8 -*- +?# -*- coding: utf-8 -*- import unittest, doctest # These tests check that error handling in the Pyrex code is Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Tests specific to the extended etree API Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ IO test cases that apply to both etree and ElementTree Modified: lxml/trunk/src/lxml/tests/test_nsclasses.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_nsclasses.py (original) +++ lxml/trunk/src/lxml/tests/test_nsclasses.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases related to namespace implementation classes and the Modified: lxml/trunk/src/lxml/tests/test_relaxng.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_relaxng.py (original) +++ lxml/trunk/src/lxml/tests/test_relaxng.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases 
related to RelaxNG parsing and validation Modified: lxml/trunk/src/lxml/tests/test_sax.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_sax.py (original) +++ lxml/trunk/src/lxml/tests/test_sax.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases related to SAX I/O Modified: lxml/trunk/src/lxml/tests/test_xmlschema.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xmlschema.py (original) +++ lxml/trunk/src/lxml/tests/test_xmlschema.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases related to XML Schema parsing and validation Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases related to XPath evaluation and the XPath class Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Fri May 5 09:06:19 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ Test cases related to XSLT processing From scoder at codespeak.net Fri May 5 09:15:16 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 09:15:18 2006 Subject: [Lxml-checkins] r26790 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060505071516.DF0AE10088@code0.codespeak.net> Author: scoder Date: Fri May 5 09:15:08 2006 New Revision: 26790 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_htmlparser.py Log: only parse file-like objects on libxml2 >= 2.6.24 due to CRLF bug, fix UTF-8 conversion in HTML file-like parser Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 5 09:15:08 2006 @@ -7,7 +7,9 @@ Features added -------------- -* Parsing file-like objects now reads chunks rather than the whole file +* Parsing file-like objects now reads chunks rather than the whole file at + once (only on libxml2 >= 2.6.24, older versions have a bug with CRLF line + endings) * Parsing StringIO objects from the start avoids copying the string Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 5 09:15:08 2006 @@ -74,12 +74,14 @@ (c_version % 100) ) -LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION) +cdef int _LIBXML_VERSION_INT try: - LIBXML_VERSION = __unpackIntVersion( - int((tree.xmlParserVersion).split('-')[0])) + _LIBXML_VERSION_INT = int((tree.xmlParserVersion).split('-')[0]) except Exception: - LIBXML_VERSION = (0,0,0) + _LIBXML_VERSION_INT = 0 + +LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT) +LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION) LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 09:15:08 2006 @@ -547,8 +547,8 @@ if 
isinstance(parser, XMLParser): return (<XMLParser>parser)._parseDocFromFilelike(source, c_filename) elif isinstance(parser, HTMLParser): - data = source.read() - return (<HTMLParser>parser)._parseDoc(_cstr(data), c_filename) + data_utf = _utf8(source.read()) + return (<HTMLParser>parser)._parseDoc(_cstr(data_utf), c_filename) else: raise TypeError, "invalid parser" @@ -594,5 +594,9 @@ cdef xmlDoc* c_doc if url is not None: url = _utf8(url) - c_doc = _parseDocFromFilelike(source, url, parser) - return _documentFactory(c_doc, parser) + # CRLF reading bug in libxml2 <= 2.6.23 + if _LIBXML_VERSION_INT >= 20624: + c_doc = _parseDocFromFilelike(source, url, parser) + return _documentFactory(c_doc, parser) + else: + return _parseMemoryDocument(source.read(), url, parser) Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Fri May 5 09:15:08 2006 @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- +# -*- coding: utf-8 -*- """ HTML parser test cases for etree @@ -6,6 +6,7 @@ import unittest import tempfile +import re from common_imports import StringIO, etree, fileInTestDir, SillyFileLike, HelperTestCase @@ -16,6 +17,7 @@ html_str = "<html><head><title>test

page title

" broken_html_str = "test<body><h1>page title</h3></p></html>" + uhtml_str = u"<html><head><title>test ??\uF8D2

page ??\uF8D2 title

" def tearDown(self): self.etree.set_default_parser() @@ -56,6 +58,15 @@ tree = self.etree.parse(f, parser) self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + def test_module_parse_html_filelike(self): + parser = self.etree.HTMLParser() + f = SillyFileLike(self.uhtml_str) + tree = self.etree.parse(f, parser) + html = self.etree.tostring(tree.getroot()) + for entity_name, value in re.findall("(&#([0-9]+);)", html): + html = html.replace(entity_name, unichr(int(value))) + self.assertEqual(html, self.uhtml_str) + def test_html_file_error(self): parser = self.etree.HTMLParser() parse = self.etree.parse From scoder at codespeak.net Fri May 5 10:21:13 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 10:21:15 2006 Subject: [Lxml-checkins] r26792 - lxml/trunk/src/lxml/tests Message-ID: <20060505082113.715A110088@code0.codespeak.net> Author: scoder Date: Fri May 5 10:21:02 2006 New Revision: 26792 Modified: lxml/trunk/src/lxml/tests/common_imports.py lxml/trunk/src/lxml/tests/test_htmlparser.py Log: tests: move unentitify utility function to common_imports Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Fri May 5 10:21:02 2006 @@ -1,6 +1,7 @@ import unittest import os.path from StringIO import StringIO +import re from lxml import etree @@ -81,3 +82,8 @@ f = StringIO() tree.write_c14n(f) return f.getvalue() + +def unentitify(xml): + for entity_name, value in re.findall("(&#([0-9]+);)", xml): + xml = xml.replace(entity_name, unichr(int(value))) + return xml Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Fri May 5 10:21:02 2006 @@ -6,9 +6,9 @@ import 
unittest import tempfile -import re -from common_imports import StringIO, etree, fileInTestDir, SillyFileLike, HelperTestCase +from common_imports import StringIO, etree, fileInTestDir +from common_imports import SillyFileLike, HelperTestCase, unentitify class HtmlParserTestCaseBase(HelperTestCase): """HTML parser test cases @@ -63,9 +63,7 @@ f = SillyFileLike(self.uhtml_str) tree = self.etree.parse(f, parser) html = self.etree.tostring(tree.getroot()) - for entity_name, value in re.findall("(&#([0-9]+);)", html): - html = html.replace(entity_name, unichr(int(value))) - self.assertEqual(html, self.uhtml_str) + self.assertEqual(unentitify(html), self.uhtml_str) def test_html_file_error(self): parser = self.etree.HTMLParser() From scoder at codespeak.net Fri May 5 10:36:02 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 10:36:09 2006 Subject: [Lxml-checkins] r26793 - lxml/trunk/src/lxml/tests Message-ID: <20060505083602.E3D0510088@code0.codespeak.net> Author: scoder Date: Fri May 5 10:35:51 2006 New Revision: 26793 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: fix test_elementtree: when writing encoded XML, be sure to recode it to unicode before canonicalizing (API can't handle UTF-8 strings) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri May 5 10:35:51 2006 @@ -1816,7 +1816,7 @@ f = StringIO() tree = ElementTree(element=element) tree.write(f, encoding) - data = f.getvalue() + data = unicode(f.getvalue(), encoding) return canonicalize(data) def _writeElementFile(self, element, encoding='us-ascii'): @@ -1829,7 +1829,7 @@ tree.write(f, encoding) f.close() f = open(filename, 'rb') - data = f.read() + data = unicode(f.read(), encoding) f.close() os.close(handle) os.remove(filename) From scoder at codespeak.net Fri May 5 10:43:21 
2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 10:43:28 2006 Subject: [Lxml-checkins] r26794 - lxml/trunk/src/lxml/tests Message-ID: <20060505084321.12ADF10088@code0.codespeak.net> Author: scoder Date: Fri May 5 10:43:20 2006 New Revision: 26794 Modified: lxml/trunk/src/lxml/tests/test_unicode.py Log: new test case for parsing unicode from file-like object Modified: lxml/trunk/src/lxml/tests/test_unicode.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_unicode.py (original) +++ lxml/trunk/src/lxml/tests/test_unicode.py Fri May 5 10:43:20 2006 @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- import unittest, doctest -from StringIO import StringIO -from lxml import etree +from common_imports import StringIO, etree, SillyFileLike, unentitify ascii_uni = u'a' uni = u'??\uF8D2' # klingon etc. +uxml = u"test ??\uF8D2

page ??\uF8D2 title

" + class UnicodeTestCase(unittest.TestCase): def test_unicode_xml(self): tree = etree.XML(u'

%s

' % uni) @@ -40,6 +41,12 @@ el = etree.parse(StringIO(u'

%s

' % uni)).getroot() self.assertEquals(uni, el.text) + def test_parse_fileobject_unicode(self): + # parse unicode from unamed file object (not support by ElementTree) + f = SillyFileLike(uxml) + root = etree.parse(f).getroot() + self.assertEquals(unentitify(etree.tostring(root)), uxml) + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(UnicodeTestCase)]) From scoder at codespeak.net Fri May 5 10:46:47 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 10:46:53 2006 Subject: [Lxml-checkins] r26795 - lxml/trunk/src/lxml/tests Message-ID: <20060505084647.6C0C710088@code0.codespeak.net> Author: scoder Date: Fri May 5 10:46:45 2006 New Revision: 26795 Modified: lxml/trunk/src/lxml/tests/test_io.py Log: new test case for libxml2 <= 2.6.23 CRLF bug, fix late_error test case Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Fri May 5 10:46:45 2006 @@ -85,6 +85,14 @@ root = self.etree.ElementTree().parse(f) self.assert_(root.tag.endswith('foo')) + def test_class_parse_fileobject_crlf(self): + # libxml2 <= 2.6.23 has a bug reading CRLF files in chunks + xml = '' + 'test\r\n' * 100 + '' + f = SillyFileLike(xml) + root = self.etree.ElementTree().parse(f) + self.assertEquals(self.etree.tostring(root).replace('\r', ''), + xml.replace('\r', '')) + def test_module_parse_large_fileobject(self): # parse from unamed file object f = LargeFileLike() @@ -108,12 +116,16 @@ data = 'test Author: scoder Date: Fri May 5 10:49:00 2006 New Revision: 26796 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx Log: fix memory leak in write_c14n if it fails to write the file after conversion Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt 
Fri May 5 10:49:00 2006 @@ -48,6 +48,8 @@ Bugs fixed ---------- +* Memory leak if write_c14n fails to write the file after conversion + * ElementTree.xpath() and XPathDocumentEvaluator were not using the ElementTree root node as reference point Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 5 10:49:00 2006 @@ -479,10 +479,12 @@ if bytes < 0: raise C14NError, "C14N failed" - if not hasattr(file, 'write'): - file = open(file, 'wb') - file.write(data) - tree.xmlFree(data) + try: + if not hasattr(file, 'write'): + file = open(file, 'wb') + file.write(data) + finally: + tree.xmlFree(data) cdef _ElementTree _elementTreeFactory(_Document doc, _NodeBase context_node): From scoder at codespeak.net Fri May 5 10:54:39 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 10:54:42 2006 Subject: [Lxml-checkins] r26798 - lxml/trunk/src/lxml Message-ID: <20060505085439.D700A10090@code0.codespeak.net> Author: scoder Date: Fri May 5 10:54:33 2006 New Revision: 26798 Modified: lxml/trunk/src/lxml/parser.pxi Log: let parse functions always check the input type is string or unicode Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 10:54:33 2006 @@ -560,6 +560,7 @@ ############################################################ ## API level helper functions for _Document creation +## (here we convert to UTF-8) ############################################################ cdef _Document _parseDocument(source, parser): @@ -583,11 +584,12 @@ cdef _Document _parseMemoryDocument(text, url, parser): cdef xmlDoc* c_doc + text_utf = _utf8(text) if python.PyUnicode_Check(text): - text = _stripDeclaration(_utf8(text)) + text_utf = _stripDeclaration(text_utf) if url 
is not None: url = _utf8(url) - c_doc = _parseDoc(text, url, parser) + c_doc = _parseDoc(text_utf, url, parser) return _documentFactory(c_doc, parser) cdef _Document _parseFilelikeDocument(source, url, parser): From scoder at codespeak.net Fri May 5 11:00:00 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 11:00:03 2006 Subject: [Lxml-checkins] r26800 - lxml/trunk/src/lxml/tests Message-ID: <20060505090000.2A78B10088@code0.codespeak.net> Author: scoder Date: Fri May 5 10:59:58 2006 New Revision: 26800 Modified: lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_io.py Log: moved CRLF test case to test_etree since it's only for libxml2 anyway Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri May 5 10:59:58 2006 @@ -10,9 +10,8 @@ import unittest, doctest -from StringIO import StringIO - -from common_imports import etree, HelperTestCase, fileInTestDir, canonicalize +from common_imports import etree, StringIO, HelperTestCase, fileInTestDir +from common_imports import SillyFileLike, canonicalize class ETreeOnlyTestCase(HelperTestCase): """Tests only for etree, not ElementTree""" @@ -426,6 +425,14 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_parse_fileobject_crlf(self): + # libxml2 <= 2.6.23 has a bug reading CRLF files in chunks + xml = '' + 'test\r\n\r\n' * 10000 + '' + f = SillyFileLike(xml) + root = self.etree.parse(f).getroot() + self.assertEquals(self.etree.tostring(root).replace('\r', ''), + xml.replace('\r', '')) + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. 
""" Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Fri May 5 10:59:58 2006 @@ -85,14 +85,6 @@ root = self.etree.ElementTree().parse(f) self.assert_(root.tag.endswith('foo')) - def test_class_parse_fileobject_crlf(self): - # libxml2 <= 2.6.23 has a bug reading CRLF files in chunks - xml = '' + 'test\r\n' * 100 + '' - f = SillyFileLike(xml) - root = self.etree.ElementTree().parse(f) - self.assertEquals(self.etree.tostring(root).replace('\r', ''), - xml.replace('\r', '')) - def test_module_parse_large_fileobject(self): # parse from unamed file object f = LargeFileLike() From scoder at codespeak.net Fri May 5 11:13:26 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 11:13:27 2006 Subject: [Lxml-checkins] r26801 - lxml/trunk/src/lxml Message-ID: <20060505091326.09F0F10088@code0.codespeak.net> Author: scoder Date: Fri May 5 11:13:25 2006 New Revision: 26801 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: make _stripDeclaration a little more tolerant Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 5 11:13:25 2006 @@ -122,13 +122,15 @@ tree.xmlNodeDumpOutput(c_buffer, c_doc, c_next, 0, 0, encoding) cdef object _stripDeclaration(object xml_string): + # this is a hack to remove the XML declaration when we encode to UTF-8 xml_string = xml_string.strip() if xml_string[:5] == '') if i != -1: - if xml_string[i+2:i+3] == '\n': + i = i + 2 + while xml_string[i:i+1] in '\n\r ': i = i+1 - xml_string = xml_string[i + 2:] + xml_string = xml_string[i:] return xml_string cdef _collectText(xmlNode* c_node): From scoder at codespeak.net Fri May 5 11:42:31 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri 
May 5 11:42:42 2006 Subject: [Lxml-checkins] r26802 - lxml/trunk/src/lxml/tests Message-ID: <20060505094231.2AA8210088@code0.codespeak.net> Author: scoder Date: Fri May 5 11:42:29 2006 New Revision: 26802 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: fix LargeFileLike in common_imports (was loosing content) Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Fri May 5 11:42:29 2006 @@ -64,12 +64,11 @@ for element in self.more: append(element) result = data.getvalue() + data.seek(0) + data.truncate() if amount: - self.data = StringIO(result[amount:]) + self.data.write(result[amount:]) result = result[:amount] - else: - data.seek(0) - data.truncate() return result def fileInTestDir(name): From scoder at codespeak.net Fri May 5 12:21:41 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 12:21:43 2006 Subject: [Lxml-checkins] r26803 - lxml/trunk/src/lxml/tests Message-ID: <20060505102141.519E510090@code0.codespeak.net> Author: scoder Date: Fri May 5 12:21:40 2006 New Revision: 26803 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: reduce size of large_file test case Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Fri May 5 12:21:40 2006 @@ -34,7 +34,7 @@ return '' class LargeFileLike: - def __init__(self, charlen=100, depth=4, children=10): + def __init__(self, charlen=100, depth=4, children=5): self.data = StringIO() self.chars = 'a' * charlen self.children = range(children) From scoder at codespeak.net Fri May 5 12:37:14 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 12:37:16 2006 Subject: [Lxml-checkins] r26805 - in 
lxml/trunk: . src/lxml Message-ID: <20060505103714.8869310090@code0.codespeak.net> Author: scoder Date: Fri May 5 12:37:12 2006 New Revision: 26805 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi Log: re-enable chunk parsing on older libxml2 versions: new CRLF bug work-around in read loop, new chunk_size keyword for XMLParser Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 5 12:37:12 2006 @@ -7,9 +7,7 @@ Features added -------------- -* Parsing file-like objects now reads chunks rather than the whole file at - once (only on libxml2 >= 2.6.24, older versions have a bug with CRLF line - endings) +* Parsing file-like objects now reads chunks rather than the whole file * Parsing StringIO objects from the start avoids copying the string Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 12:37:12 2006 @@ -208,14 +208,14 @@ Note that you must not share parsers between threads. 
""" cdef int _parse_options + cdef object _chunk_size cdef xmlParserCtxt* _file_parser_ctxt cdef xmlParserCtxt* _memory_parser_ctxt cdef xmlParserCtxt* _push_parser_ctxt def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, - recover=False): + recover=False, chunk_size=__FILE_READ_CHUNK_SIZE): cdef int parse_options - self._file_parser_ctxt = NULL BaseParser.__init__(self) parse_options = _XML_DEFAULT_PARSE_OPTIONS @@ -235,6 +235,7 @@ parse_options = parse_options | xmlparser.XML_PARSE_RECOVER self._parse_options = parse_options + self._chunk_size = int(chunk_size) def __dealloc__(self): if self._file_parser_ctxt != NULL: @@ -313,18 +314,18 @@ try: read = filelike.read - data = read(__FILE_READ_CHUNK_SIZE) + data = read(self._chunk_size) if python.PyUnicode_Check(data): - data = _stripDeclaration(_utf8(data)) + data = _stripDeclaration(data) + data = _utf8(data) while data: - if python.PyUnicode_Check(data): - data = _utf8(data) - elif not python.PyString_Check(data): - raise TypeError, "File-like objects must return string or unicode" + if _LIBXML_VERSION_INT < 20624: + # CRLF reading bug in libxml2 <= 2.6.23 + data = data.replace('\r\n', '\n') success = xmlparser.xmlParseChunk(pctxt, _cstr(data), len(data), 0) if success != 0: return _handleParseResult(pctxt, NULL, c_filename, 0) - data = read(__FILE_READ_CHUNK_SIZE) + data = _utf8( read(self._chunk_size) ) xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) except Exception: if pctxt.myDoc is not NULL: @@ -596,9 +597,5 @@ cdef xmlDoc* c_doc if url is not None: url = _utf8(url) - # CRLF reading bug in libxml2 <= 2.6.23 - if _LIBXML_VERSION_INT >= 20624: - c_doc = _parseDocFromFilelike(source, url, parser) - return _documentFactory(c_doc, parser) - else: - return _parseMemoryDocument(source.read(), url, parser) + c_doc = _parseDocFromFilelike(source, url, parser) + return _documentFactory(c_doc, parser) From scoder at codespeak.net Fri May 5 12:41:31 2006 From: 
scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 12:41:33 2006 Subject: [Lxml-checkins] r26806 - lxml/trunk/src/lxml/tests Message-ID: <20060505104131.006A510090@code0.codespeak.net> Author: scoder Date: Fri May 5 12:41:30 2006 New Revision: 26806 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: increase chance that fileobject_crlf test case captures the bug Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri May 5 12:41:30 2006 @@ -426,11 +426,13 @@ self.assertEquals(docinfo.doctype, '') def test_parse_fileobject_crlf(self): - # libxml2 <= 2.6.23 has a bug reading CRLF files in chunks - xml = '' + 'test\r\n\r\n' * 10000 + '' + # libxml2 < 2.6.23 has a bug reading CRLF files in chunks + etree = self.etree + parser = etree.XMLParser(chunk_size=3) + xml = '' + '\r\ntest\r\n\r\n' * 10 + '' f = SillyFileLike(xml) - root = self.etree.parse(f).getroot() - self.assertEquals(self.etree.tostring(root).replace('\r', ''), + root = etree.parse(f, parser).getroot() + self.assertEquals(etree.tostring(root).replace('\r', ''), xml.replace('\r', '')) def _writeElement(self, element, encoding='us-ascii'): From scoder at codespeak.net Fri May 5 12:43:16 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 12:43:18 2006 Subject: [Lxml-checkins] r26807 - lxml/trunk/src/lxml Message-ID: <20060505104316.DDE9E10090@code0.codespeak.net> Author: scoder Date: Fri May 5 12:43:15 2006 New Revision: 26807 Modified: lxml/trunk/src/lxml/parser.pxi Log: do not apply CRLF bug fix to libxml2 2.6.23, only older versions Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 12:43:15 2006 @@ -319,8 +319,8 @@ data = 
_stripDeclaration(data) data = _utf8(data) while data: - if _LIBXML_VERSION_INT < 20624: - # CRLF reading bug in libxml2 <= 2.6.23 + if _LIBXML_VERSION_INT <= 20622: + # CRLF reading bug in libxml2 <= 2.6.22 data = data.replace('\r\n', '\n') success = xmlparser.xmlParseChunk(pctxt, _cstr(data), len(data), 0) if success != 0: From scoder at codespeak.net Fri May 5 12:47:24 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 12:47:25 2006 Subject: [Lxml-checkins] r26808 - lxml/trunk/src/lxml Message-ID: <20060505104724.7868710090@code0.codespeak.net> Author: scoder Date: Fri May 5 12:47:15 2006 New Revision: 26808 Modified: lxml/trunk/src/lxml/parser.pxi Log: clean up in XMLParser.__init__ Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 12:47:15 2006 @@ -202,7 +202,7 @@ major run-time overhead. The keyword arguments in the constructor are mainly based on the libxml2 - parser configuration. A DTD will only be loaded if validation or + parser configuration. A DTD will also be loaded if validation or attribute default values are requested. Note that you must not share parsers between threads. 
@@ -222,11 +222,11 @@ if load_dtd: parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD if dtd_validation: - parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD | \ - xmlparser.XML_PARSE_DTDVALID + parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \ + xmlparser.XML_PARSE_DTDLOAD if attribute_defaults: - parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD | \ - xmlparser.XML_PARSE_DTDATTR + parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR | \ + xmlparser.XML_PARSE_DTDLOAD if no_network: parse_options = parse_options | xmlparser.XML_PARSE_NONET if ns_clean: From scoder at codespeak.net Fri May 5 13:33:48 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 13:33:49 2006 Subject: [Lxml-checkins] r26810 - lxml/trunk/src/lxml Message-ID: <20060505113348.E638E10092@code0.codespeak.net> Author: scoder Date: Fri May 5 13:33:48 2006 New Revision: 26810 Modified: lxml/trunk/src/lxml/parser.pxi Log: doc updates Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 5 13:33:48 2006 @@ -205,7 +205,17 @@ parser configuration. A DTD will also be loaded if validation or attribute default values are requested. - Note that you must not share parsers between threads. + Available keyword arguments: + * attribute_defaults - read default attributes from DTD + * dtd_validation - validate (if DTD is available) + * load_dtd - use DTD for parsing + * no_network - prevent network access + * ns_clean - clean up redundant namespace declarations + * recover - try hard to parse through broken XML + * chunk_size - read this many bytes from file-like objects + + Note that you must not share parsers between threads. This applies also + to the default parser. 
""" cdef int _parse_options cdef object _chunk_size @@ -216,6 +226,9 @@ load_dtd=False, no_network=False, ns_clean=False, recover=False, chunk_size=__FILE_READ_CHUNK_SIZE): cdef int parse_options + self._memory_parser_ctxt = NULL + self._file_parser_ctxt = NULL + self._push_parser_ctxt = NULL BaseParser.__init__(self) parse_options = _XML_DEFAULT_PARSE_OPTIONS @@ -246,6 +259,7 @@ xmlparser.xmlFreeParserCtxt(self._push_parser_ctxt) def copy(self): + "Create a new parser with the same configuration." cdef XMLParser parser parser = self._copy() parser._parse_options = self._parse_options @@ -423,6 +437,11 @@ tree. By default, it can read broken (non well-formed) HTML, depending on the capabilities of libxml2. Use the 'recover' option to switch this off. + Available keyword arguments: + * recover - try hard to parse through broken HTML (default: True) + * no_network - prevent network access + * remove_blank_text - clean up empty text nodes + Note that you must not share parsers between threads. """ cdef int _parse_options @@ -453,6 +472,7 @@ htmlparser.htmlFreeParserCtxt(self._memory_parser_ctxt) def copy(self): + "Create a new parser with the same configuration." 
cdef HTMLParser parser parser = self._copy() parser._parse_options = self._parse_options From scoder at codespeak.net Fri May 5 13:36:02 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 13:36:03 2006 Subject: [Lxml-checkins] r26811 - lxml/trunk/src/lxml Message-ID: <20060505113602.92B8310092@code0.codespeak.net> Author: scoder Date: Fri May 5 13:36:01 2006 New Revision: 26811 Modified: lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/xmlschema.pxi Log: valgrind fixes Modified: lxml/trunk/src/lxml/relaxng.pxi ============================================================================== --- lxml/trunk/src/lxml/relaxng.pxi (original) +++ lxml/trunk/src/lxml/relaxng.pxi Fri May 5 13:36:01 2006 @@ -24,7 +24,7 @@ cdef xmlNode* c_node cdef xmlDoc* fake_c_doc cdef relaxng.xmlRelaxNGParserCtxt* parser_ctxt - + self._c_schema = NULL fake_c_doc = NULL if etree is not None: doc = _documentOrRaise(etree) Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Fri May 5 13:36:01 2006 @@ -23,7 +23,7 @@ cdef xmlDoc* fake_c_doc cdef xmlNode* c_node cdef xmlschema.xmlSchemaParserCtxt* parser_ctxt - + self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOf(etree) From scoder at codespeak.net Fri May 5 15:56:08 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 15:56:10 2006 Subject: [Lxml-checkins] r26814 - lxml/trunk/src/lxml Message-ID: <20060505135608.B875E10092@code0.codespeak.net> Author: scoder Date: Fri May 5 15:56:07 2006 New Revision: 26814 Modified: lxml/trunk/src/lxml/extensions.pxi Log: cleanup in _createNodeSetResult Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi 
Fri May 5 15:56:07 2006 @@ -248,7 +248,6 @@ cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc): cdef xmlNode* c_node cdef char* s - cdef _NodeBase element cdef int i result = [] if xpathObj.nodesetval is NULL: @@ -262,18 +261,17 @@ # -> we store Python refs to these, so that is OK # XSLT: can it leak when merging trees from multiple sources? c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) - element = _elementFactory(doc, c_node) - result.append(element) + value = _elementFactory(doc, c_node) elif c_node.type == tree.XML_TEXT_NODE: - result.append(funicode(c_node.content)) + value = funicode(c_node.content) elif c_node.type == tree.XML_ATTRIBUTE_NODE: s = tree.xmlNodeGetContent(c_node) - attr_value = funicode(s) + value = funicode(s) tree.xmlFree(s) - result.append(attr_value) else: print "Not yet implemented result node type:", c_node.type raise NotImplementedError + python.PyList_Append(result, value) return result cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj): From scoder at codespeak.net Fri May 5 15:56:40 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 15:56:41 2006 Subject: [Lxml-checkins] r26815 - lxml/trunk/src/lxml/tests Message-ID: <20060505135640.7CFB310092@code0.codespeak.net> Author: scoder Date: Fri May 5 15:56:39 2006 New Revision: 26815 Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: new test case for node sets as XPath variable values Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 5 15:56:39 2006 @@ -182,6 +182,18 @@ self.assertEquals(1, len(r)) self.assertEquals("true", r[0].get('attr')) + def test_xpath_variables_nodeset(self): + x = self.parse('') + e = etree.XPathEvaluator(x) + + element = etree.Element("test-el") + etree.SubElement(element, 
"test-sub") + expr = "$value" + r = e.evaluate(expr, value=element) + self.assertEquals(1, len(r)) + self.assertEquals(element.tag, r[0].tag) + self.assertEquals(element[0].tag, r[0][0].tag) + def test_xpath_extensions_mix(self): x = self.parse('') From scoder at codespeak.net Fri May 5 15:59:50 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 15:59:51 2006 Subject: [Lxml-checkins] r26816 - lxml/trunk/src/lxml Message-ID: <20060505135950.848AC10092@code0.codespeak.net> Author: scoder Date: Fri May 5 15:59:49 2006 New Revision: 26816 Modified: lxml/trunk/src/lxml/xpath.pxd lxml/trunk/src/lxml/xpath.pxi Log: cleanup in XPathContext, free variables with xmlXPathRegisteredVariablesCleanup, exceptions consistently raised in base class XPathEvaluatorBase Modified: lxml/trunk/src/lxml/xpath.pxd ============================================================================== --- lxml/trunk/src/lxml/xpath.pxd (original) +++ lxml/trunk/src/lxml/xpath.pxd Fri May 5 15:59:49 2006 @@ -109,8 +109,7 @@ char* name, char* ns_uri, xmlXPathObject* value) - cdef xmlXPathObject* xmlXPathVariableLookup(xmlXPathContext* ctxt, - char* name) + cdef void xmlXPathRegisteredVariablesCleanup(xmlXPathContext *ctxt) cdef xmlXPathObject* valuePop (xmlXPathParserContext *ctxt) cdef int valuePush(xmlXPathParserContext* ctxt, xmlXPathObject *value) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 5 15:59:49 2006 @@ -11,11 +11,9 @@ cdef class _XPathContext(_BaseContext): cdef object _variables - cdef object _registered_variables def __init__(self, namespaces, extensions, variables): self._ext_lookup_function = _function_check self._variables = variables - self._registered_variables = [] _BaseContext.__init__(self, namespaces, extensions) cdef register_context(self, xpath.xmlXPathContext* xpathCtxt, _Document doc): @@ -32,32 
+30,15 @@ xpathCtxt = self._xpathCtxt if xpathCtxt is NULL: return - self._unregisterVariables() - del self._registered_variables[:] + xpath.xmlXPathRegisteredVariablesCleanup(xpathCtxt) self._unregister_context() - cdef void _unregisterVariables(self): - cdef xpath.xmlXPathContext* xpathCtxt - cdef xpath.xmlXPathObject* xpathVarValue - cdef char* c_name - xpathCtxt = self._xpathCtxt - for name_utf in self._registered_variables: - c_name = _cstr(name_utf) - xpathVarValue = xpath.xmlXPathVariableLookup(xpathCtxt, c_name) - if xpathVarValue is not NULL: - xpath.xmlXPathRegisterVariable(xpathCtxt, c_name, NULL) - _freeXPathObject(xpathVarValue) - def registerVariables(self, variable_dict): for name, value in variable_dict.items(): - name_utf = self._to_utf(name) - self._registerVariable(name_utf, value) - python.PyList_Append(self._registered_variables, name_utf) + self._registerVariable(self._to_utf(name), value) def registerVariable(self, name, value): - name_utf = self._to_utf(name) - self._registerVariable(name_utf, value) - python.PyList_Append(self._registered_variables, name_utf) + self._registerVariable(self._to_utf(name), value) cdef void _registerVariable(self, name_utf, value): xpath.xmlXPathRegisterVariable( @@ -65,11 +46,24 @@ cdef class XPathEvaluatorBase: + cdef xpath.xmlXPathContext* _xpathCtxt cdef _XPathContext _context def __init__(self, namespaces, extensions, variables=None): self._context = _XPathContext(namespaces, extensions, variables) + def __dealloc__(self): + if self._xpathCtxt is not NULL: + xpath.xmlXPathFreeContext(self._xpathCtxt) + + cdef _raise_parse_error(self): + if self._xpathCtxt is not NULL and \ + self._xpathCtxt.lastError.message is not NULL: + message = funicode(self._xpathCtxt.lastError.message) + else: + message = "Error in xpath expression." 
+ raise XPathSyntaxError, message + cdef object _handle_result(self, xpath.xmlXPathObject* xpathObj, _Document doc): if self._context._exc._has_raised(): if xpathObj is not NULL: @@ -80,17 +74,14 @@ if xpathObj is NULL: self._context._release_temp_refs() - raise XPathSyntaxError, "Error in xpath expression." + self._raise_parse_error() try: result = _unwrapXPathObject(xpathObj, doc) - except XPathResultError: + finally: _freeXPathObject(xpathObj) self._context._release_temp_refs() - raise - _freeXPathObject(xpathObj) - self._context._release_temp_refs() return result @@ -99,7 +90,6 @@ XPath evaluators must not be shared between threads. """ - cdef xpath.xmlXPathContext* _c_ctxt cdef _Element _element def __init__(self, _NodeBase element not None, namespaces=None, extensions=None): cdef xpath.xmlXPathContext* xpathCtxt @@ -107,16 +97,12 @@ cdef _Document doc doc = element._doc xpathCtxt = xpath.xmlXPathNewContext(doc._c_doc) + self._xpathCtxt = xpathCtxt if xpathCtxt is NULL: raise XPathContextError, "Unable to create new XPath context" self._element = element - self._c_ctxt = xpathCtxt XPathEvaluatorBase.__init__(self, namespaces, extensions) - def __dealloc__(self): - if self._c_ctxt is not NULL: - xpath.xmlXPathFreeContext(self._c_ctxt) - def registerNamespace(self, prefix, uri): """Register a namespace with the XPath context. 
""" @@ -137,7 +123,7 @@ cdef xpath.xmlXPathObject* xpathObj cdef xmlNode* c_node cdef _Document doc - xpathCtxt = self._c_ctxt + xpathCtxt = self._xpathCtxt xpathCtxt.node = self._element._c_node doc = self._element._doc @@ -173,9 +159,7 @@ cdef class XPath(XPathEvaluatorBase): - cdef xpath.xmlXPathContext* _xpathCtxt cdef xpath.xmlXPathCompExpr* _xpath - cdef object _prefix_map cdef readonly object path def __init__(self, path, namespaces=None, extensions=None): @@ -184,7 +168,7 @@ path = _utf8(path) self._xpath = xpath.xmlXPathCompile(_cstr(path)) if self._xpath is NULL: - raise XPathSyntaxError, "Error in XPath expression" + self._raise_parse_error() self._xpathCtxt = xpath.xmlXPathNewContext(NULL) def __call__(self, _etree_or_element, **_variables): @@ -214,8 +198,6 @@ return self(_tree, **_variables) def __dealloc__(self): - if self._xpathCtxt is not NULL: - xpath.xmlXPathFreeContext(self._xpathCtxt) if self._xpath is not NULL: xpath.xmlXPathFreeCompExpr(self._xpath) From scoder at codespeak.net Fri May 5 19:27:15 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 19:27:16 2006 Subject: [Lxml-checkins] r26821 - in lxml/trunk: doc src/lxml src/lxml/tests Message-ID: <20060505172715.42D7A10092@code0.codespeak.net> Author: scoder Date: Fri May 5 19:27:10 2006 New Revision: 26821 Modified: lxml/trunk/doc/extensions.txt lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xpath.pxd lxml/trunk/src/lxml/xslt.pxi Log: fix and describe (doctests) API for evaluator-local extension functions Modified: lxml/trunk/doc/extensions.txt ============================================================================== --- lxml/trunk/doc/extensions.txt (original) +++ lxml/trunk/doc/extensions.txt Fri May 5 19:27:10 2006 @@ -144,6 +144,85 @@ XPathSyntaxError: Error in xpath expression. 
+Evaluator-local extensions +-------------------------- + +Apart from the global registration of extension functions, there is also a way +of making extensions known to a single Evaluator or XSLT. All evaluators and +the XSLT object accept a keyword argument ``extensions`` in their constructor. +The value is a dictionary mapping (namespace, name) tuples to functions:: + + >>> extensions = {('local-ns', 'local-hello') : hello} + >>> namespaces = {'l' : 'local-ns'} + + >>> e = etree.XPathEvaluator(doc, namespaces=namespaces, extensions=extensions) + >>> print e.evaluate('l:local-hello(string(b))') + Hello Haegar + +For larger numbers of extension functions, you can define classes or modules +and use the ``Extension`` helper:: + + >>> class MyExt: + ... def function1(self, _, arg): + ... return '1'+arg + ... def function2(self, _, arg): + ... return '2'+arg + ... def function3(self, _, arg): + ... return '3'+arg + + >>> ext_module = MyExt() + >>> functions = ('function1', 'function2') + >>> extensions = etree.Extension( ext_module, functions, 'local-ns' ) + + >>> e = etree.XPathEvaluator(doc, namespaces=namespaces, extensions=extensions) + >>> print e.evaluate('l:function1(string(b))') + 1Haegar + +The second argument to ``Extension`` can either be a sequence of names to +select from the module, a dictionary that explicitly maps function names to +their XPath alter-ego or ``None`` (explicitly passed) to take all available +functions under their original name (if their name does not start with '_'). + +The third argument takes a namespace URI or ``None`` (also if left out) for +the default namespace.
The following examples will therefore all do the same +thing:: + + >>> functions = ('function1', 'function2', 'function3') + >>> extensions = etree.Extension( ext_module, functions ) + >>> e = etree.XPathEvaluator(doc, extensions=extensions) + >>> print e.evaluate('function1(function2(function3(string(b))))') + 123Haegar + + >>> extensions = etree.Extension( ext_module, functions, None ) + >>> e = etree.XPathEvaluator(doc, extensions=extensions) + >>> print e.evaluate('function1(function2(function3(string(b))))') + 123Haegar + + >>> extensions = etree.Extension( ext_module, None ) + >>> e = etree.XPathEvaluator(doc, extensions=extensions) + >>> print e.evaluate('function1(function2(function3(string(b))))') + 123Haegar + + >>> functions = { + ... 'function1' : 'function1', + ... 'function2' : 'function2', + ... 'function3' : 'function3' + ... } + >>> extensions = etree.Extension( ext_module, functions ) + >>> e = etree.XPathEvaluator(doc, extensions=extensions) + >>> print e.evaluate('function1(function2(function3(string(b))))') + 123Haegar + +For convenience, you can also pass a sequence of extensions:: + + >>> extensions1 = etree.Extension( ext_module, None ) + >>> extensions2 = etree.Extension( ext_module, None, 'local-ns' ) + >>> e = etree.XPathEvaluator(doc, extensions=[extensions1, extensions2], + ... 
namespaces=namespaces) + >>> print e.evaluate('function1(l:function2(function3(string(b))))') + 123Haegar + + What to return from a function ------------------------------ Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri May 5 19:27:10 2006 @@ -34,17 +34,18 @@ self._function_cache = {} self._called_function = None - # convert old format extensions to UTF-8 - if isinstance(extensions, (list, tuple)): + if extensions is not None: + # convert extensions to UTF-8 + if python.PyDict_Check(extensions): + extensions = (extensions,) + # format: [ {(ns,name):function} ] -> {(ns_utf,name_utf):function} new_extensions = {} for extension in extensions: for (ns_uri, name), function in extension.items(): ns_utf = self._to_utf(ns_uri) name_utf = self._to_utf(name) - try: - new_extensions[ns_utf][name_utf] = function - except KeyError: - new_extensions[ns_utf] = {name_utf : function} + python.PyDict_SetItem( + new_extensions, (ns_utf, name_utf), function) extensions = new_extensions or None self._doc = None @@ -81,7 +82,7 @@ self._xpathCtxt, self._ext_lookup_function, self) cdef _unregister_context(self): - self._unregisterNamespaces() + xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt) self._free_context() cdef _free_context(self): @@ -107,13 +108,6 @@ prefix_utf = self._to_utf(prefix) ns_uri_utf = self._to_utf(ns_uri) xpath.xmlXPathRegisterNs(self._xpathCtxt, prefix_utf, ns_uri_utf) - python.PyList_Append(self._registered_namespaces, prefix_utf) - - cdef _unregisterNamespaces(self): - cdef xpath.xmlXPathContext* xpathCtxt - xpathCtxt = self._xpathCtxt - for prefix_utf in self._registered_namespaces: - xpath.xmlXPathRegisterNs(xpathCtxt, prefix_utf, NULL) # extension functions @@ -126,9 +120,8 @@ self._called_function = function return function is not None - dict_result = python.PyDict_GetItem(self._extensions, 
ns_uri_utf) - if dict_result is not NULL: - dict_result = python.PyDict_GetItem(dict_result, name_utf) + if self._extensions is not None: + dict_result = python.PyDict_GetItem(self._extensions, key) if dict_result is not NULL: function = dict_result else: @@ -165,11 +158,22 @@ self._temp_refs.add(element._doc) -def Extension(module, function_mapping, ns_uri=None): - functions = [] - for function_name, xpath_name in function_mapping.items(): - functions[xpath_name] = getattr(module, function_name) - return {ns_uri : functions} +def Extension(module, function_mapping, ns=None): + functions = {} + if python.PyDict_Check(function_mapping): + for function_name, xpath_name in function_mapping.items(): + python.PyDict_SetItem(functions, (ns, xpath_name), + getattr(module, function_name)) + else: + if function_mapping is None: + function_mapping = [] + for name in dir(module): + if not name.startswith('_'): + python.PyList_Append(function_mapping, name) + for function_name in function_mapping: + python.PyDict_SetItem(functions, (ns, function_name), + getattr(module, function_name)) + return functions ################################################################################ Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri May 5 19:27:10 2006 @@ -33,6 +33,7 @@ cdef object PySequence_Tuple(object o) cdef object PyTuple_GET_ITEM(object o, int pos) + cdef int PyDict_Check(object instance) cdef int PyNumber_Check(object instance) cdef int PyBool_Check(object instance) cdef int PySequence_Check(object instance) Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Fri May 5 19:27:10 2006 @@ -341,7 +341,7 @@ def mytext(ctxt, values): return 'X' * 
len(values) - result = tree.xslt(style, {'testns' : {'mytext' : mytext}}) + result = tree.xslt(style, {('testns', 'mytext') : mytext}) self.assertEquals(self._rootstring(result), 'X') Modified: lxml/trunk/src/lxml/xpath.pxd ============================================================================== --- lxml/trunk/src/lxml/xpath.pxd (original) +++ lxml/trunk/src/lxml/xpath.pxd Fri May 5 19:27:10 2006 @@ -110,6 +110,7 @@ char* ns_uri, xmlXPathObject* value) cdef void xmlXPathRegisteredVariablesCleanup(xmlXPathContext *ctxt) + cdef void xmlXPathRegisteredNsCleanup(xmlXPathContext *ctxt) cdef xmlXPathObject* valuePop (xmlXPathParserContext *ctxt) cdef int valuePush(xmlXPathParserContext* ctxt, xmlXPathObject *value) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri May 5 19:27:10 2006 @@ -153,15 +153,9 @@ self._release_temp_refs() cdef _registerLocalExtensionFunction(self, ns_utf, name_utf, function): - extensions = self._extensions - if extensions is None: - self._extensions = {ns_utf:{name_utf:function}} - else: - if ns_utf in extensions: - ns_extensions = extensions[ns_utf] - else: - ns_extensions = extensions[ns_utf] = {} - python.PyDict_SetItem(ns_extensions, name_utf, function) + if self._extensions is None: + self._extensions = {} + python.PyDict_SetItem(self._extensions, (ns_utf, name_utf), function) cdef class _ExsltRegExp # forward declaration From scoder at codespeak.net Fri May 5 19:48:26 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 5 19:48:27 2006 Subject: [Lxml-checkins] r26824 - lxml/trunk/src/lxml Message-ID: <20060505174826.86B9010094@code0.codespeak.net> Author: scoder Date: Fri May 5 19:48:23 2006 New Revision: 26824 Modified: lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/python.pxd 
lxml/trunk/src/lxml/xpath.pxi lxml/trunk/src/lxml/xslt.pxi Log: clean up, use Python memory management instead of plain C-malloc Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Fri May 5 19:48:23 2006 @@ -1,8 +1,4 @@ -cdef extern from "stdlib.h": - cdef void* malloc(int size) - void free(void* ptr) - cdef extern from "stdarg.h": ctypedef void *va_list void va_start(va_list ap, void *last) @@ -11,4 +7,3 @@ cdef extern from "etree.h": cdef int va_int(va_list ap) cdef char *va_charptr(va_list ap) - Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri May 5 19:48:23 2006 @@ -20,7 +20,6 @@ cdef _Document _doc cdef object _extensions cdef object _namespaces - cdef object _registered_namespaces cdef object _utf_refs cdef object _function_cache cdef object _called_function @@ -52,7 +51,6 @@ self._exc = _ExceptionContext() self._extensions = extensions self._namespaces = namespaces - self._registered_namespaces = [] self._temp_refs = _TempStore() cdef object _to_utf(self, s): @@ -71,7 +69,7 @@ self._xpathCtxt = xpathCtxt xpathCtxt.userData = self - cdef _register_context(self, _Document doc, int allow_none_namespace): + cdef _register_context(self, _Document doc): self._doc = doc self._exc.clear() python.PyDict_Clear(self._function_cache) @@ -86,7 +84,6 @@ self._free_context() cdef _free_context(self): - del self._registered_namespaces[:] python.PyDict_Clear(self._utf_refs) self._doc = None if self._xpathCtxt is not NULL: Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri May 5 19:48:23 2006 @@ -41,7 +41,7 @@ return # 
XXX should we check whether we ran into proxy_type before? #print "registering for:", proxy._c_node - ref = cstd.malloc(sizeof(ProxyRef)) + ref = python.PyMem_Malloc(sizeof(ProxyRef)) ref.proxy = proxy ref.type = proxy_type ref.next = c_node._private @@ -59,7 +59,7 @@ ref = c_node._private if ref.proxy == proxy_ref: c_node._private = ref.next - cstd.free(ref) + python.PyMem_Free(ref) return prev_ref = ref #print "First registered is:", ref.type @@ -68,7 +68,7 @@ #print "Registered is:", ref.type if ref.proxy == proxy_ref: prev_ref.next = ref.next - cstd.free(ref) + python.PyMem_Free(ref) return prev_ref = ref ref = ref.next Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri May 5 19:48:23 2006 @@ -2,6 +2,7 @@ cdef extern from "Python.h": ctypedef struct PyObject + ctypedef int size_t cdef FILE* PyFile_AsFile(PyObject* p) cdef int PyFile_Check(object p) @@ -39,6 +40,9 @@ cdef int PySequence_Check(object instance) cdef int PyType_Check(object instance) + cdef void* PyMem_Malloc(size_t size) + cdef void PyMem_Free(void* p) + cdef extern from "etree.h": # redefines some functions as macros cdef int isinstance(object instance, object classes) cdef int issubclass(object derived, object superclasses) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 5 19:48:23 2006 @@ -21,7 +21,7 @@ ns_prefixes = _find_all_extension_prefixes() if ns_prefixes: self.registerNamespaces(ns_prefixes) - self._register_context(doc, 1) + self._register_context(doc) if self._variables is not None: self.registerVariables(self._variables) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi 
(original) +++ lxml/trunk/src/lxml/xslt.pxi Fri May 5 19:48:23 2006 @@ -139,7 +139,7 @@ _Document doc): self._xsltCtxt = xsltCtxt self._set_xpath_context(xsltCtxt.xpathCtxt) - self._register_context(doc, 0) + self._register_context(doc) xsltCtxt.xpathCtxt.userData = self cdef free_context(self): @@ -260,10 +260,10 @@ # allocate space for parameters # * 2 as we want an entry for both key and value, # and + 1 as array is NULL terminated - params = cstd.malloc(sizeof(char*) * (len(_kw) * 2 + 1)) + params = python.PyMem_Malloc(sizeof(char*) * (len(_kw) * 2 + 1)) i = 0 keep_ref = [] - for key, value in _kw.items(): + for key, value in _kw.iteritems(): k = _utf8(key) python.PyList_Append(keep_ref, k) v = _utf8(value) @@ -285,7 +285,7 @@ if params is not NULL: # deallocate space for parameters - cstd.free(params) + python.PyMem_Free(params) self._context.free_context() c_doc._private = ptemp # restore _private before _destroyFakeDoc! From scoder at codespeak.net Sat May 6 10:07:33 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 10:07:34 2006 Subject: [Lxml-checkins] r26844 - lxml/trunk/src/lxml Message-ID: <20060506080733.BE7C01007E@code0.codespeak.net> Author: scoder Date: Sat May 6 10:07:32 2006 New Revision: 26844 Modified: lxml/trunk/src/lxml/xslt.pxi Log: cleanup Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sat May 6 10:07:32 2006 @@ -234,7 +234,7 @@ cdef xmlDoc* c_doc cdef char** params cdef void* ptemp - cdef int i + cdef int i, kw_count input_doc = _documentOrRaise(_input) root_node = _rootNodeOf(_input) @@ -256,11 +256,13 @@ ptemp = c_doc._private c_doc._private = resolver_context - if _kw: + kw_count = python.PyDict_Size(_kw) + if kw_count > 0: # allocate space for parameters # * 2 as we want an entry for both key and value, # and + 1 as array is NULL terminated - params = 
python.PyMem_Malloc(sizeof(char*) * (len(_kw) * 2 + 1)) + params = python.PyMem_Malloc( + sizeof(char*) * (kw_count * 2 + 1)) i = 0 keep_ref = [] for key, value in _kw.iteritems(): From scoder at codespeak.net Sat May 6 10:54:27 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 10:54:28 2006 Subject: [Lxml-checkins] r26848 - lxml/trunk/src/lxml Message-ID: <20060506085427.7AE0310082@code0.codespeak.net> Author: scoder Date: Sat May 6 10:54:25 2006 New Revision: 26848 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.h lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/xslt.pxi Log: support Py_ssize_t in Python 2.5 (compiling under Py2.5/64bit needs patched Pyrex) Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat May 6 10:54:25 2006 @@ -140,7 +140,7 @@ If there was no text to collect, return None """ - cdef int scount + cdef Py_ssize_t scount cdef char* text cdef xmlNode* c_node_cur # check for multiple text nodes @@ -179,17 +179,17 @@ tree.xmlFreeNode(c_node) c_node = c_next -cdef xmlNode* _findChild(xmlNode* c_node, int index): +cdef xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): if index < 0: return _findChildBackwards(c_node, -index - 1) else: return _findChildForwards(c_node, index) -cdef xmlNode* _findChildForwards(xmlNode* c_node, int index): +cdef xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index): """Return child element of c_node with index, or return NULL if not found. 
""" cdef xmlNode* c_child - cdef int c + cdef Py_ssize_t c c_child = c_node.children c = 0 while c_child is not NULL: @@ -201,12 +201,12 @@ else: return NULL -cdef xmlNode* _findChildBackwards(xmlNode* c_node, int index): +cdef xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index): """Return child element of c_node with index, or return NULL if not found. Search from the end. """ cdef xmlNode* c_child - cdef int c + cdef Py_ssize_t c c_child = c_node.last c = 0 while c_child is not NULL: @@ -255,16 +255,11 @@ c_target = c_tail c_tail = c_next -### see etree.h: -## cdef int _isElement(xmlNode* c_node): -## return (c_node.type == tree.XML_ELEMENT_NODE or -## c_node.type == tree.XML_COMMENT_NODE) - -cdef xmlNode* _deleteSlice(xmlNode* c_node, int start, int stop): +cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): """Delete slice, starting with c_node, start counting at start, end at stop. """ cdef xmlNode* c_next - cdef int c + cdef Py_ssize_t c if c_node is NULL: return NULL # now start deleting nodes Modified: lxml/trunk/src/lxml/etree.h ============================================================================== --- lxml/trunk/src/lxml/etree.h (original) +++ lxml/trunk/src/lxml/etree.h Sat May 6 10:54:25 2006 @@ -1,6 +1,17 @@ #ifndef HAS_ETREE_H #define HAS_ETREE_H +/* Py_ssize_t support was added in Python 2.5 */ +#if PY_VERSION_HEX < 0x02050000 +#ifndef PY_SSIZE_T_MAX /* patched Pyrex? 
*/ + typedef int Py_ssize_t; + #define PY_SSIZE_T_MAX INT_MAX + #define PY_SSIZE_T_MIN INT_MIN + #define PyInt_FromSsize_t(z) PyInt_FromLong(z) + #define PyInt_AsSsize_t(o) PyInt_AsLong(o) +#endif +#endif + #define isinstance(o,c) PyObject_IsInstance(o,c) #define issubclass(c,csuper) PyObject_IsSubclass(c,csuper) #define hasattr(o,a) PyObject_HasAttr(o,a) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat May 6 10:54:25 2006 @@ -1,7 +1,7 @@ cimport tree, python from tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement from python cimport isinstance, issubclass, hasattr, callable -from python cimport iter, str, _cstr +from python cimport iter, str, _cstr, Py_ssize_t cimport xinclude cimport c14n cimport cstd @@ -505,7 +505,7 @@ # MANIPULATORS - def __setitem__(self, index, _NodeBase element): + def __setitem__(self, Py_ssize_t index, _NodeBase element): cdef xmlNode* c_node cdef xmlNode* c_next cdef int foreign @@ -519,7 +519,7 @@ _moveTail(c_next, element._c_node) changeDocumentBelow(element, self._doc, foreign) - def __delitem__(self, index): + def __delitem__(self, Py_ssize_t index): cdef xmlNode* c_node c_node = _findChild(self._c_node, index) if c_node is NULL: @@ -527,12 +527,12 @@ _removeText(c_node.next) _removeNode(c_node) - def __delslice__(self, start, stop): + def __delslice__(self, Py_ssize_t start, Py_ssize_t stop): cdef xmlNode* c_node c_node = _findChild(self._c_node, start) _deleteSlice(c_node, start, stop) - def __setslice__(self, start, stop, value): + def __setslice__(self, Py_ssize_t start, Py_ssize_t stop, value): cdef xmlNode* c_node cdef xmlNode* c_next cdef _Element mynode @@ -713,17 +713,17 @@ def __repr__(self): return "" % (self.tag, id(self)) - def __getitem__(self, index): + def __getitem__(self, Py_ssize_t index): cdef xmlNode* c_node c_node = _findChild(self._c_node, index) if 
c_node is NULL: raise IndexError, "list index out of range" return _elementFactory(self._doc, c_node) - def __getslice__(self, start, stop): + def __getslice__(self, Py_ssize_t start, Py_ssize_t stop): cdef xmlNode* c_node cdef _Document doc - cdef int c, c_stop + cdef Py_ssize_t c # this does not work for negative start, stop, however, # python seems to convert these to positive start, stop before # calling, so this all works perfectly (at the cost of a len() call) @@ -731,10 +731,9 @@ if c_node is NULL: return [] c = start - c_stop = stop result = [] doc = self._doc - while c_node is not NULL and c < c_stop: + while c_node is not NULL and c < stop: if _isElement(c_node): ret = python.PyList_Append(result, _elementFactory(doc, c_node)) if ret: @@ -744,7 +743,7 @@ return result def __len__(self): - cdef int c + cdef Py_ssize_t c cdef xmlNode* c_node c = 0 c_node = self._c_node.children @@ -766,10 +765,8 @@ return ElementChildIterator(self, reversed=True) def index(self, _Element x not None, start=None, stop=None): - cdef int k - cdef int l - cdef int c_stop - cdef int c_start + cdef Py_ssize_t k, l + cdef Py_ssize_t c_start, c_stop cdef xmlNode* c_child cdef xmlNode* c_start_node c_child = x._c_node @@ -830,7 +827,7 @@ return k else: return k - if c_start or c_stop: + if c_start != 0 or c_stop != 0: raise ValueError, "list.index(x): x not in slice" else: raise ValueError, "list.index(x): x not in list" @@ -1053,7 +1050,7 @@ return result def __len__(self): - cdef int c + cdef Py_ssize_t c cdef xmlNode* c_node c = 0 c_node = (self._c_node.properties) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat May 6 10:54:25 2006 @@ -483,7 +483,6 @@ """ cdef xmlDoc* result cdef xmlParserCtxt* pctxt - cdef int c_len cdef int recover self._error_log.connect() pctxt = self._memory_parser_ctxt Modified: 
lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sat May 6 10:54:25 2006 @@ -3,6 +3,7 @@ cdef extern from "Python.h": ctypedef struct PyObject ctypedef int size_t + ctypedef int Py_ssize_t cdef FILE* PyFile_AsFile(PyObject* p) cdef int PyFile_Check(object p) @@ -13,14 +14,14 @@ cdef object PyUnicode_FromEncodedObject(object s, char* encoding, char* errors) - cdef object PyUnicode_DecodeUTF8(char* s, int size, char* errors) + cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors) cdef object PyUnicode_AsUTF8String(object ustring) - cdef object PyString_FromStringAndSize(char* s, int size) + cdef object PyString_FromStringAndSize(char* s, Py_ssize_t size) cdef object PyString_FromString(char* s) cdef object PyString_FromFormat(char* format, ...) cdef object PyBool_FromLong(long value) - cdef int PyList_GET_SIZE(object l) + cdef Py_ssize_t PyList_GET_SIZE(object l) cdef int PyList_Append(object l, object obj) cdef int PyList_Reverse(object l) cdef int PyDict_SetItemString(object d, char* key, object value) @@ -29,10 +30,11 @@ cdef PyObject* PyDict_GetItem(object d, object key) cdef int PyDict_DelItem(object d, object key) cdef int PyDict_Clear(object d) + cdef Py_ssize_t PyDict_Size(object d) cdef object PyList_AsTuple(object o) cdef object PySequence_List(object o) cdef object PySequence_Tuple(object o) - cdef object PyTuple_GET_ITEM(object o, int pos) + cdef object PyTuple_GET_ITEM(object o, Py_ssize_t pos) cdef int PyDict_Check(object instance) cdef int PyNumber_Check(object instance) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sat May 6 10:54:25 2006 @@ -234,7 +234,7 @@ cdef xmlDoc* c_doc cdef char** params cdef void* ptemp - cdef int i, kw_count + cdef 
Py_ssize_t i, kw_count input_doc = _documentOrRaise(_input) root_node = _rootNodeOf(_input) From scoder at codespeak.net Sat May 6 14:57:47 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 14:57:48 2006 Subject: [Lxml-checkins] r26870 - lxml/trunk/src/lxml Message-ID: <20060506125747.05F211007E@code0.codespeak.net> Author: scoder Date: Sat May 6 14:57:46 2006 New Revision: 26870 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlerror.pxi lxml/trunk/src/lxml/xmlschema.pxi lxml/trunk/src/lxml/xslt.pxi Log: cleanup: moved strstr/strcmp/etc. from tree.pxd to cstd.pxd Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat May 6 14:57:46 2006 @@ -286,7 +286,7 @@ cdef object funicode(char* s): if isutf8(s): - return python.PyUnicode_DecodeUTF8(s, tree.strlen(s), NULL) + return python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) return python.PyString_FromString(s) cdef object _utf8(object s): Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Sat May 6 14:57:46 2006 @@ -1,4 +1,11 @@ +cdef extern from "stdio.h": + ctypedef struct FILE + cdef int strlen(char* s) + cdef char* strstr(char* haystack, char* needle) + cdef int strcmp(char* s1, char* s2) + cdef int strncmp(char* s1, char* s2, int len) + cdef extern from "stdarg.h": ctypedef void *va_list void va_start(va_list ap, void *last) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat May 6 14:57:46 2006 @@ 
-1259,11 +1259,11 @@ return node cdef int _tagMatches(self, xmlNode* c_node): - if tree.strcmp(c_node.name, self._name) == 0: + if cstd.strcmp(c_node.name, self._name) == 0: if c_node.ns == NULL or c_node.ns.href == NULL: return self._href == NULL else: - return tree.strcmp(c_node.ns.href, self._href) == 0 + return cstd.strcmp(c_node.ns.href, self._href) == 0 return 0 cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf, Modified: lxml/trunk/src/lxml/relaxng.pxi ============================================================================== --- lxml/trunk/src/lxml/relaxng.pxi (original) +++ lxml/trunk/src/lxml/relaxng.pxi Sat May 6 14:57:46 2006 @@ -32,7 +32,7 @@ c_node = root_node._c_node # work around for libxml2 bug if document is not RNG at all if c_node.ns is NULL or c_node.ns.href is NULL or \ - tree.strcmp(c_node.ns.href, + cstd.strcmp(c_node.ns.href, 'http://relaxng.org/ns/structure/1.0') != 0: raise RelaxNGParseError, "Document is not Relax NG" fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat May 6 14:57:46 2006 @@ -1,11 +1,4 @@ -#from xmlparser cimport xmlDict - -cdef extern from "stdio.h": - ctypedef struct FILE - cdef int strlen(char* s) - cdef char* strstr(char* haystack, char* needle) - cdef int strcmp(char* s1, char* s2) - cdef int strncmp(char* s1, char* s2, int len) +from cstd cimport FILE cdef extern from "lxml-version.h": cdef char* LXML_VERSION_STRING Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sat May 6 14:57:46 2006 @@ -34,7 +34,7 @@ self.level = error.level self.line = error.line self.message = python.PyString_FromStringAndSize( - error.message, tree.strlen(error.message) - 
1) # strip EOL + error.message, cstd.strlen(error.message) - 1) # strip EOL if error.file is NULL: self.filename = '' else: @@ -259,7 +259,7 @@ cdef char* c_filename cdef char* c_element cdef int c_line - if __DEBUG == 0 or msg == NULL or tree.strlen(msg) < 10: + if __DEBUG == 0 or msg == NULL or cstd.strlen(msg) < 10: return if c_log_handler is not NULL: log_handler = <_ErrorLog>c_log_handler @@ -267,19 +267,19 @@ log_handler = __GLOBAL_ERROR_LOG cstd.va_start(args, msg) - if tree.strncmp(msg, '%s:', 3) == 0: + if cstd.strncmp(msg, '%s:', 3) == 0: c_text = cstd.va_charptr(args) else: c_text = NULL - if tree.strstr(msg, 'file %s') is not NULL: + if cstd.strstr(msg, 'file %s') is not NULL: c_filename = cstd.va_charptr(args) else: c_filename = NULL - if tree.strstr(msg, 'line %d') is not NULL: + if cstd.strstr(msg, 'line %d') is not NULL: c_line = cstd.va_int(args) else: c_line = -1 - if tree.strstr(msg, 'element %s') is not NULL: + if cstd.strstr(msg, 'element %s') is not NULL: c_element = cstd.va_charptr(args) else: c_element = NULL @@ -297,8 +297,8 @@ message = "" try: - if c_filename is not NULL and tree.strlen(c_filename) > 0: - if tree.strncmp(c_filename, 'XSLT:', 5) == 0: + if c_filename is not NULL and cstd.strlen(c_filename) > 0: + if cstd.strncmp(c_filename, 'XSLT:', 5) == 0: filename = '' else: filename = funicode(c_filename) Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Sat May 6 14:57:46 2006 @@ -31,7 +31,7 @@ # work around for libxml2 bug if document is not XML schema at all c_node = root_node._c_node if c_node.ns is NULL or c_node.ns.href is NULL or \ - tree.strcmp(c_node.ns.href, 'http://www.w3.org/2001/XMLSchema') != 0: + cstd.strcmp(c_node.ns.href, 'http://www.w3.org/2001/XMLSchema') != 0: raise XMLSchemaParseError, "Document is not XML Schema" fake_c_doc = _fakeRootDoc(doc._c_doc, 
root_node._c_node) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sat May 6 14:57:46 2006 @@ -68,7 +68,7 @@ # quick check if we are looking for the current stylesheet c_doc = xslt_resolver_context._c_style_doc if c_doc is not NULL and c_doc.URL is not NULL: - if tree.strcmp(c_uri, c_doc.URL) == 0: + if cstd.strcmp(c_uri, c_doc.URL) == 0: return tree.xmlCopyDoc(c_doc, 1) # call the Python document loaders From scoder at codespeak.net Sat May 6 18:54:59 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 18:55:00 2006 Subject: [Lxml-checkins] r26878 - in lxml/trunk/src/lxml: . tests Message-ID: <20060506165459.F233710083@code0.codespeak.net> Author: scoder Date: Sat May 6 18:54:58 2006 New Revision: 26878 Modified: lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: XMLParser: support chunk_size < 0 for read-at-once, some cleanup in file-like error handling Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat May 6 18:54:58 2006 @@ -146,9 +146,28 @@ __GLOBAL_PARSER_CONTEXT._initParserDict(c_ctxt) c_ctxt._private = self._context +cdef _raiseParseError(xmlParserCtxt* ctxt, char* c_filename): + if c_filename is not NULL and \ + ctxt.lastError.domain == xmlerror.XML_FROM_IO: + if ctxt.lastError.message is not NULL: + message = "Error reading file %s: %s" % ( + funicode(c_filename), funicode(ctxt.lastError.message)) + else: + message = "Error reading file %s" % funicode(c_filename) + raise IOError, message + elif ctxt.lastError.message is not NULL: + raise XMLSyntaxError, funicode(ctxt.lastError.message) + else: + raise XMLSyntaxError + cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result, char* c_filename, int 
recover) except NULL: cdef _ResolverContext context + if ctxt.myDoc is not NULL: + if ctxt.myDoc != result: + tree.xmlFreeDoc(ctxt.myDoc) + ctxt.myDoc = NULL + if ctxt.wellFormed or recover: __GLOBAL_PARSER_CONTEXT._initDocDict(result) elif result is not NULL: @@ -165,18 +184,7 @@ context._raise_if_stored() if result is NULL: - if c_filename is not NULL and \ - ctxt.lastError.domain == xmlerror.XML_FROM_IO: - if ctxt.lastError.message is not NULL: - message = "Error reading file %s: %s" % ( - funicode(c_filename), funicode(ctxt.lastError.message)) - else: - message = "Error reading file %s" % funicode(c_filename) - raise IOError, message - elif ctxt.lastError.message is not NULL: - raise XMLSyntaxError, funicode(ctxt.lastError.message) - else: - raise XMLSyntaxError + _raiseParseError(ctxt, c_filename) return result ############################################################ @@ -191,7 +199,7 @@ xmlparser.XML_PARSE_NOERROR ) -cdef object __FILE_READ_CHUNK_SIZE +cdef int __FILE_READ_CHUNK_SIZE __FILE_READ_CHUNK_SIZE = 32768 cdef class XMLParser(BaseParser): @@ -213,6 +221,7 @@ * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * chunk_size - read this many bytes from file-like objects + (< 0 means: read everything in one step) Note that you must not share parsers between threads. This applies also to the default parser. 
@@ -229,6 +238,11 @@ self._memory_parser_ctxt = NULL self._file_parser_ctxt = NULL self._push_parser_ctxt = NULL + + self._chunk_size = int(chunk_size) + if self._chunk_size == 0: + raise ValueError, "Chunk size must not be 0" + BaseParser.__init__(self) parse_options = _XML_DEFAULT_PARSE_OPTIONS @@ -248,7 +262,6 @@ parse_options = parse_options | xmlparser.XML_PARSE_RECOVER self._parse_options = parse_options - self._chunk_size = int(chunk_size) def __dealloc__(self): if self._file_parser_ctxt != NULL: @@ -309,10 +322,15 @@ cdef xmlDoc* _parseDocFromFilelike(self, filelike, char* c_filename) except NULL: + # we read Python string, so we must convert to UTF-8 cdef xmlDoc* result cdef xmlParserCtxt* pctxt cdef int recover cdef int success + if self._chunk_size < 0: + # read whole file at once + data = _utf8(filelike.read()) + return self._parseDoc(data, c_filename) self._error_log.connect() pctxt = self._push_parser_ctxt if pctxt is NULL: @@ -338,7 +356,7 @@ data = data.replace('\r\n', '\n') success = xmlparser.xmlParseChunk(pctxt, _cstr(data), len(data), 0) if success != 0: - return _handleParseResult(pctxt, NULL, c_filename, 0) + _raiseParseError(pctxt, c_filename) data = _utf8( read(self._chunk_size) ) xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) except Exception: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat May 6 18:54:58 2006 @@ -435,6 +435,27 @@ self.assertEquals(etree.tostring(root).replace('\r', ''), xml.replace('\r', '')) + def test_parse_fileobject_chunk_size(self): + etree = self.etree + xml = '' + 'test' * 10 + '' + + self.assertRaises(ValueError, etree.XMLParser, chunk_size=0) + + parser = etree.XMLParser(chunk_size=-1) + f = SillyFileLike(xml) + root = etree.parse(f, parser).getroot() + self.assertEquals(etree.tostring(root), xml) + + parser = 
etree.XMLParser(chunk_size=3) + f = SillyFileLike(xml) + root = etree.parse(f, parser).getroot() + self.assertEquals(etree.tostring(root), xml) + + parser = etree.XMLParser(chunk_size=21) + f = SillyFileLike(xml) + root = etree.parse(f, parser).getroot() + self.assertEquals(etree.tostring(root), xml) + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ From scoder at codespeak.net Sat May 6 19:02:32 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 19:02:33 2006 Subject: [Lxml-checkins] r26880 - lxml/trunk/src/lxml Message-ID: <20060506170232.59C8810083@code0.codespeak.net> Author: scoder Date: Sat May 6 19:02:31 2006 New Revision: 26880 Modified: lxml/trunk/src/lxml/parser.pxi Log: fix for potential bug in XMLParser._parseDocFromFilelike: could stop reading prematurely on unicode files Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat May 6 19:02:31 2006 @@ -346,10 +346,7 @@ try: read = filelike.read - data = read(self._chunk_size) - if python.PyUnicode_Check(data): - data = _stripDeclaration(data) - data = _utf8(data) + data = _utf8( read(self._chunk_size) ) while data: if _LIBXML_VERSION_INT <= 20622: # CRLF reading bug in libxml2 <= 2.6.22 From scoder at codespeak.net Sat May 6 20:42:00 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sat May 6 20:42:02 2006 Subject: [Lxml-checkins] r26892 - lxml/trunk/src/lxml/tests Message-ID: <20060506184200.C642910083@code0.codespeak.net> Author: scoder Date: Sat May 6 20:41:59 2006 New Revision: 26892 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: updated test case Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ 
lxml/trunk/src/lxml/tests/test_etree.py Sat May 6 20:41:59 2006 @@ -437,7 +437,7 @@ def test_parse_fileobject_chunk_size(self): etree = self.etree - xml = '' + 'test' * 10 + '' + xml = '' + 'test' * 20 + '' self.assertRaises(ValueError, etree.XMLParser, chunk_size=0) @@ -451,7 +451,7 @@ root = etree.parse(f, parser).getroot() self.assertEquals(etree.tostring(root), xml) - parser = etree.XMLParser(chunk_size=21) + parser = etree.XMLParser(chunk_size=13) f = SillyFileLike(xml) root = etree.parse(f, parser).getroot() self.assertEquals(etree.tostring(root), xml) From scoder at codespeak.net Sun May 7 21:17:55 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sun May 7 21:17:56 2006 Subject: [Lxml-checkins] r26950 - lxml/trunk/src/lxml/tests Message-ID: <20060507191755.1871110076@code0.codespeak.net> Author: scoder Date: Sun May 7 21:17:54 2006 New Revision: 26950 Modified: lxml/trunk/src/lxml/tests/test_io.py Log: extended test case Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Sun May 7 21:17:54 2006 @@ -57,9 +57,12 @@ # and now do it again; previous content should still be there root2 = tree.parse(filename) self.assertEquals('a', root.tag) + self.assertEquals('a', root2.tag) # now remove all references to root2, and parse again del root2 root3 = tree.parse(filename) + self.assertEquals('a', root.tag) + self.assertEquals('a', root3.tag) # root2's memory should've been freed here # XXX how to check? From scoder at codespeak.net Sun May 7 21:20:25 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sun May 7 21:20:28 2006 Subject: [Lxml-checkins] r26952 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20060507192025.4DC1510080@code0.codespeak.net> Author: scoder Date: Sun May 7 21:20:22 2006 New Revision: 26952 Modified: lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlparser.pxd Log: another rewrite of file-like parsing: let libxml2 pull the data by using IOInputStream and ReadIO => more generic Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Sun May 7 21:20:22 2006 @@ -1,10 +1,14 @@ cdef extern from "stdio.h": ctypedef struct FILE + +cdef extern from "string.h": + ctypedef int size_t cdef int strlen(char* s) cdef char* strstr(char* haystack, char* needle) cdef int strcmp(char* s1, char* s2) - cdef int strncmp(char* s1, char* s2, int len) + cdef int strncmp(char* s1, char* s2, size_t len) + cdef void* memcpy(void* dest, void* src, size_t len) cdef extern from "stdarg.h": ctypedef void *va_list Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun May 7 21:20:22 2006 @@ -59,6 +59,69 @@ ############################################################ +## support for file-like objects +############################################################ + +cdef class _FileParserContext: + cdef object _filelike + cdef object _url + cdef object _bytes_utf + cdef _ExceptionContext _exc_context + cdef cstd.size_t _bytes_read + cdef char* _c_url + def __init__(self, filelike, exc_context, url=None): + self._exc_context = exc_context + self._filelike = filelike + self._url = url + if url is None: + self._c_url = NULL + else: + self._c_url = _cstr(url) + self._bytes_utf = '' + self._bytes_read = 0 + + cdef xmlparser.xmlParserInput* 
_createParserInput(self, xmlParserCtxt* ctxt): + cdef xmlparser.xmlParserInputBuffer* c_buffer + c_buffer = xmlparser.xmlAllocParserInputBuffer(0) + c_buffer.context = self + c_buffer.readcallback = _copyFilelike + return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) + + cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options): + return xmlparser.xmlCtxtReadIO( + ctxt, _copyFilelike, NULL, self, + self._c_url, NULL, options) + + cdef int write(self, char* c_buffer, int c_size): + cdef char* c_start + cdef Py_ssize_t byte_count, remaining + if self._bytes_read < 0: + return 0 + try: + byte_count = python.PyString_GET_SIZE(self._bytes_utf) + remaining = byte_count - self._bytes_read + if remaining <= 0: + self._bytes_utf = _utf8( self._filelike.read(c_size) ) + self._bytes_read = 0 + remaining = python.PyString_GET_SIZE(self._bytes_utf) + if remaining == 0: + self._bytes_read = -1 + return 0 + if c_size > remaining: + c_size = remaining + c_start = _cstr(self._bytes_utf) + self._bytes_read + self._bytes_read = self._bytes_read + c_size + cstd.memcpy(c_buffer, c_start, c_size) + return c_size + except Exception: + self._exc_context._store_raised() + return -1 + +cdef int _copyFilelike(void* ctxt, char* c_buffer, int c_size): + return (<_FileParserContext>ctxt).write(c_buffer, c_size) + + +############################################################ ## support for custom document loaders ############################################################ @@ -66,6 +129,7 @@ xmlParserCtxt* c_context): cdef _ResolverContext context cdef _InputDocument doc_ref + cdef _FileParserContext file_context cdef xmlparser.xmlParserInput* c_input if c_context._private is NULL or \ not isinstance(c_context._private, _ResolverContext): @@ -104,9 +168,8 @@ c_input = xmlparser.xmlNewInputFromFile( c_context, _cstr(doc_ref._data_utf)) elif doc_ref._type == PARSER_DATA_FILE: - data = doc_ref._file.read() - c_input = xmlparser.xmlNewStringInputStream( - c_context, _cstr(data)) + 
file_context = _FileParserContext(doc_ref._file, context) + c_input = file_context._createParserInput(c_context) if data is not None: context._storage.add(data) @@ -194,14 +257,9 @@ cdef int _XML_DEFAULT_PARSE_OPTIONS _XML_DEFAULT_PARSE_OPTIONS = ( xmlparser.XML_PARSE_NOENT | - xmlparser.XML_PARSE_NOCDATA | - xmlparser.XML_PARSE_NOWARNING | - xmlparser.XML_PARSE_NOERROR + xmlparser.XML_PARSE_NOCDATA ) -cdef int __FILE_READ_CHUNK_SIZE -__FILE_READ_CHUNK_SIZE = 32768 - cdef class XMLParser(BaseParser): """The XML parser. Parsers can be supplied as additional argument to various parse functions of the lxml API. A default parser is always @@ -220,28 +278,21 @@ * no_network - prevent network access * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML - * chunk_size - read this many bytes from file-like objects - (< 0 means: read everything in one step) Note that you must not share parsers between threads. This applies also to the default parser. 
""" cdef int _parse_options - cdef object _chunk_size cdef xmlParserCtxt* _file_parser_ctxt cdef xmlParserCtxt* _memory_parser_ctxt - cdef xmlParserCtxt* _push_parser_ctxt + cdef xmlParserCtxt* _filelike_parser_ctxt def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, - recover=False, chunk_size=__FILE_READ_CHUNK_SIZE): + recover=False): cdef int parse_options - self._memory_parser_ctxt = NULL - self._file_parser_ctxt = NULL - self._push_parser_ctxt = NULL - - self._chunk_size = int(chunk_size) - if self._chunk_size == 0: - raise ValueError, "Chunk size must not be 0" + self._memory_parser_ctxt = NULL + self._file_parser_ctxt = NULL + self._filelike_parser_ctxt = NULL BaseParser.__init__(self) @@ -268,8 +319,8 @@ xmlparser.xmlFreeParserCtxt(self._file_parser_ctxt) if self._memory_parser_ctxt != NULL: xmlparser.xmlFreeParserCtxt(self._memory_parser_ctxt) - if self._push_parser_ctxt != NULL: - xmlparser.xmlFreeParserCtxt(self._push_parser_ctxt) + if self._filelike_parser_ctxt != NULL: + xmlparser.xmlFreeParserCtxt(self._filelike_parser_ctxt) def copy(self): "Create a new parser with the same configuration." 
@@ -323,51 +374,22 @@ cdef xmlDoc* _parseDocFromFilelike(self, filelike, char* c_filename) except NULL: # we read Python string, so we must convert to UTF-8 + cdef _FileParserContext file_context cdef xmlDoc* result cdef xmlParserCtxt* pctxt cdef int recover - cdef int success - if self._chunk_size < 0: - # read whole file at once - data = _utf8(filelike.read()) - return self._parseDoc(data, c_filename) self._error_log.connect() - pctxt = self._push_parser_ctxt + pctxt = self._filelike_parser_ctxt if pctxt is NULL: pctxt = self._createContext() - self._push_parser_ctxt = pctxt + self._filelike_parser_ctxt = pctxt self._initContext(pctxt) - result = NULL - success = xmlparser.xmlCtxtResetPush(pctxt, NULL, 0, c_filename, NULL) - if success != 0: - self._error_log.disconnect() - raise ParserError, "Failed to setup parser context" - xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) - - try: - read = filelike.read - data = _utf8( read(self._chunk_size) ) - while data: - if _LIBXML_VERSION_INT <= 20622: - # CRLF reading bug in libxml2 <= 2.6.22 - data = data.replace('\r\n', '\n') - success = xmlparser.xmlParseChunk(pctxt, _cstr(data), len(data), 0) - if success != 0: - _raiseParseError(pctxt, c_filename) - data = _utf8( read(self._chunk_size) ) - xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) - except Exception: - if pctxt.myDoc is not NULL: - tree.xmlFreeDoc(pctxt.myDoc) - pctxt.myDoc = NULL - self._error_log.disconnect() - raise - + file_context = _FileParserContext(filelike, self._context) + result = file_context._readDoc(pctxt, self._parse_options) self._error_log.disconnect() - result = pctxt.myDoc - pctxt.myDoc = NULL recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, c_filename, recover) + result = _handleParseResult(pctxt, result, c_filename, recover) + return result cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: @@ -442,10 +464,7 @@ 
############################################################ cdef int _HTML_DEFAULT_PARSE_OPTIONS -_HTML_DEFAULT_PARSE_OPTIONS = ( - htmlparser.HTML_PARSE_NOWARNING | - htmlparser.HTML_PARSE_NOERROR - ) +_HTML_DEFAULT_PARSE_OPTIONS = 0 cdef class HTMLParser(BaseParser): """The HTML parser. This parser allows reading HTML into a normal XML Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sun May 7 21:20:22 2006 @@ -19,6 +19,7 @@ cdef object PyString_FromStringAndSize(char* s, Py_ssize_t size) cdef object PyString_FromString(char* s) cdef object PyString_FromFormat(char* format, ...) + cdef Py_ssize_t PyString_GET_SIZE(object s) cdef object PyBool_FromLong(long value) cdef Py_ssize_t PyList_GET_SIZE(object l) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sun May 7 21:20:22 2006 @@ -425,37 +425,6 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') - def test_parse_fileobject_crlf(self): - # libxml2 < 2.6.23 has a bug reading CRLF files in chunks - etree = self.etree - parser = etree.XMLParser(chunk_size=3) - xml = '' + '\r\ntest\r\n\r\n' * 10 + '' - f = SillyFileLike(xml) - root = etree.parse(f, parser).getroot() - self.assertEquals(etree.tostring(root).replace('\r', ''), - xml.replace('\r', '')) - - def test_parse_fileobject_chunk_size(self): - etree = self.etree - xml = '' + 'test' * 20 + '' - - self.assertRaises(ValueError, etree.XMLParser, chunk_size=0) - - parser = etree.XMLParser(chunk_size=-1) - f = SillyFileLike(xml) - root = etree.parse(f, parser).getroot() - self.assertEquals(etree.tostring(root), xml) - - parser = etree.XMLParser(chunk_size=3) - f = SillyFileLike(xml) - root = etree.parse(f, 
parser).getroot() - self.assertEquals(etree.tostring(root), xml) - - parser = etree.XMLParser(chunk_size=13) - f = SillyFileLike(xml) - root = etree.parse(f, parser).getroot() - self.assertEquals(etree.tostring(root), xml) - def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sun May 7 21:20:22 2006 @@ -172,7 +172,6 @@ cdef char* xmlBufferContent(xmlBuffer* buf) cdef extern from "libxml/xmlIO.h": - cdef xmlOutputBuffer* xmlAllocOutputBuffer(xmlCharEncodingHandler* encoder) cdef xmlOutputBuffer* xmlOutputBufferCreateFile( FILE* file, @@ -181,6 +180,9 @@ cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) cdef int xmlOutputBufferClose(xmlOutputBuffer* out) + ctypedef int (*xmlInputReadCallback)(void* context, char* buffer, int len) + ctypedef int (*xmlInputCloseCallback)(void * context) + cdef extern from "libxml/xmlsave.h": ctypedef struct xmlSaveCtxt: pass Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Sun May 7 21:20:22 2006 @@ -1,8 +1,16 @@ from tree cimport xmlDoc, xmlDict +from tree cimport xmlInputReadCallback, xmlInputCloseCallback from xmlerror cimport xmlError cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput + ctypedef struct xmlParserInputBuffer: + void* context + xmlInputReadCallback readcallback + xmlInputCloseCallback closecallback + +cdef extern from "libxml/xmlIO.h": + cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) cdef extern from "libxml/parser.h": @@ -37,15 +45,15 @@ XML_PARSE_NOXINCNODE = 32768 # do not generate XINCLUDE START/END nodes # libxml2 2.6.21+ only: #XML_PARSE_COMPACT = 65536 # compact small text nodes - + cdef 
void xmlInitParser() cdef int xmlLineNumbersDefault(int onoff) cdef xmlParserCtxt* xmlNewParserCtxt() + cdef xmlParserInput* xmlNewIOInputStream(xmlParserCtxt* ctxt, + xmlParserInputBuffer* input, + int enc) cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt) - cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt, - char* chunk, int size, - char* filename, char* encoding) cdef int xmlParseChunk(xmlParserCtxt* ctxt, char* chunk, int size, int terminate) cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt, @@ -53,6 +61,11 @@ int options) cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt, char* filename, char* encoding, int options) + cdef xmlDoc* xmlCtxtReadIO(xmlParserCtxt* ctxt, + xmlInputReadCallback ioread, + xmlInputCloseCallback ioclose, + void* ioctx, + char* URL, char* encoding, int options) # entity loaders: From scoder at codespeak.net Sun May 7 22:31:27 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Sun May 7 22:31:29 2006 Subject: [Lxml-checkins] r26957 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20060507203127.88C0A10083@code0.codespeak.net> Author: scoder Date: Sun May 7 22:31:25 2006 New Revision: 26957 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/xslt.pxi Log: major restructuring and cleanup in parser.pxi * merge parse functions of XMLParser and HTMLParser back into base class * use same method for file-like parsing in both => reduced code duplication, more readable, less error prone, simpler to test Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sun May 7 22:31:25 2006 @@ -129,7 +129,7 @@ raise type, value, traceback -cdef class BaseParser # forward declaration +cdef class _BaseParser # forward declaration cdef class _Document: """Internal base class to reference a libxml document. @@ -139,7 +139,7 @@ """ cdef int _ns_counter cdef xmlDoc* _c_doc - cdef BaseParser _parser + cdef _BaseParser _parser def __dealloc__(self): # if there are no more references to the document, it is safe Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Sun May 7 22:31:25 2006 @@ -1,4 +1,5 @@ from tree cimport xmlDoc, xmlDict +from tree cimport xmlInputReadCallback, xmlInputCloseCallback from xmlparser cimport xmlParserCtxt from xmlerror cimport xmlError @@ -24,3 +25,8 @@ cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt, char* buffer, char* URL, char* encoding, int options) + cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt, + xmlInputReadCallback ioread, + xmlInputCloseCallback ioclose, + void* ioctx, + char* URL, char* encoding, int options) Modified: lxml/trunk/src/lxml/parser.pxi 
============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun May 7 22:31:25 2006 @@ -10,6 +10,10 @@ class ParserError(LxmlError): pass +ctypedef enum LxmlParserType: + LXML_XML_PARSER + LXML_HTML_PARSER + cdef class _ParserContext: """Global parser context to share the string dictionary. """ @@ -87,10 +91,16 @@ c_buffer.readcallback = _copyFilelike return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) - cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options): - return xmlparser.xmlCtxtReadIO( - ctxt, _copyFilelike, NULL, self, - self._c_url, NULL, options) + cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options, + LxmlParserType parser_type): + if parser_type == LXML_XML_PARSER: + return xmlparser.xmlCtxtReadIO( + ctxt, _copyFilelike, NULL, self, + self._c_url, NULL, options) + else: + return htmlparser.htmlCtxtReadIO( + ctxt, _copyFilelike, NULL, self, + self._c_url, NULL, options) cdef int write(self, char* c_buffer, int c_size): cdef char* c_start @@ -184,30 +194,107 @@ ## Parsers ############################################################ -cdef class BaseParser: +cdef class _BaseParser: + cdef int _parse_options cdef _ErrorLog _error_log cdef readonly object resolvers cdef _ResolverContext _context + cdef LxmlParserType _parser_type + cdef xmlParserCtxt* _parser_ctxt + def __init__(self): - cdef _ResolverContext context + cdef xmlParserCtxt* pctxt + if isinstance(self, HTMLParser): + self._parser_type = LXML_HTML_PARSER + pctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) + elif isinstance(self, XMLParser): + self._parser_type = LXML_XML_PARSER + pctxt = xmlparser.xmlNewParserCtxt() + else: + raise TypeError, "This class cannot be instantiated" + self._parser_ctxt = pctxt + if pctxt is NULL: + raise ParserError, "Failed to create parser context" self._error_log = _ErrorLog() - self.resolvers = _ResolverRegistry() - self._context = 
_ResolverContext(self.resolvers) + self.resolvers = _ResolverRegistry() + self._context = _ResolverContext(self.resolvers) + pctxt._private = self._context + + def __dealloc__(self): + if self._parser_ctxt != NULL: + xmlparser.xmlFreeParserCtxt(self._parser_ctxt) property error_log: def __get__(self): return self._error_log.copy() - cdef _copy(self): - cdef BaseParser parser + def copy(self): + "Create a new parser with the same configuration." + cdef _BaseParser parser parser = self.__class__() + parser._parse_options = self._parse_options parser.resolvers = self.resolvers.copy() parser._context = _ResolverContext(parser.resolvers) return parser - cdef _initContext(self, xmlParserCtxt* c_ctxt): - __GLOBAL_PARSER_CONTEXT._initParserDict(c_ctxt) - c_ctxt._private = self._context + cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: + """Parse document, share dictionary if possible. + """ + cdef xmlDoc* result + cdef xmlParserCtxt* pctxt + cdef int recover + self._error_log.connect() + pctxt = self._parser_ctxt + __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt) + + if self._parser_type == LXML_HTML_PARSER: + result = htmlparser.htmlCtxtReadDoc( + pctxt, c_text, c_filename, NULL, self._parse_options) + else: + result = xmlparser.xmlCtxtReadDoc( + pctxt, c_text, c_filename, NULL, self._parse_options) + + self._error_log.disconnect() + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, NULL, recover) + + cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: + cdef xmlDoc* result + cdef xmlParserCtxt* pctxt + cdef int recover + self._error_log.connect() + pctxt = self._parser_ctxt + __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt) + + if self._parser_type == LXML_HTML_PARSER: + result = htmlparser.htmlCtxtReadFile( + pctxt, c_filename, NULL, self._parse_options) + else: + result = xmlparser.xmlCtxtReadFile( + pctxt, c_filename, NULL, self._parse_options) + + self._error_log.disconnect() + 
recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, c_filename, recover) + + cdef xmlDoc* _parseDocFromFilelike(self, filelike, + char* c_filename) except NULL: + # we read Python string, so we must convert to UTF-8 + cdef _FileParserContext file_context + cdef xmlDoc* result + cdef xmlParserCtxt* pctxt + cdef int recover + self._error_log.connect() + pctxt = self._parser_ctxt + __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt) + + file_context = _FileParserContext(filelike, self._context) + result = file_context._readDoc( + pctxt, self._parse_options, self._parser_type) + + self._error_log.disconnect() + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, c_filename, recover) cdef _raiseParseError(xmlParserCtxt* ctxt, char* c_filename): if c_filename is not NULL and \ @@ -260,7 +347,7 @@ xmlparser.XML_PARSE_NOCDATA ) -cdef class XMLParser(BaseParser): +cdef class XMLParser(_BaseParser): """The XML parser. Parsers can be supplied as additional argument to various parse functions of the lxml API. A default parser is always available and can be replaced by a call to the global function @@ -282,19 +369,11 @@ Note that you must not share parsers between threads. This applies also to the default parser. 
""" - cdef int _parse_options - cdef xmlParserCtxt* _file_parser_ctxt - cdef xmlParserCtxt* _memory_parser_ctxt - cdef xmlParserCtxt* _filelike_parser_ctxt def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, recover=False): cdef int parse_options - self._memory_parser_ctxt = NULL - self._file_parser_ctxt = NULL - self._filelike_parser_ctxt = NULL - - BaseParser.__init__(self) + _BaseParser.__init__(self) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -314,83 +393,6 @@ self._parse_options = parse_options - def __dealloc__(self): - if self._file_parser_ctxt != NULL: - xmlparser.xmlFreeParserCtxt(self._file_parser_ctxt) - if self._memory_parser_ctxt != NULL: - xmlparser.xmlFreeParserCtxt(self._memory_parser_ctxt) - if self._filelike_parser_ctxt != NULL: - xmlparser.xmlFreeParserCtxt(self._filelike_parser_ctxt) - - def copy(self): - "Create a new parser with the same configuration." - cdef XMLParser parser - parser = self._copy() - parser._parse_options = self._parse_options - return parser - - cdef xmlParserCtxt* _createContext(self) except NULL: - cdef xmlParserCtxt* pctxt - pctxt = xmlparser.xmlNewParserCtxt() - if pctxt is NULL: - self._error_log.disconnect() - raise ParserError, "Failed to create parser context" - return pctxt - - cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: - """Parse document, share dictionary if possible. 
- """ - cdef xmlDoc* result - cdef xmlParserCtxt* pctxt - cdef int recover - self._error_log.connect() - pctxt = self._memory_parser_ctxt - if pctxt is NULL: - pctxt = self._createContext() - self._memory_parser_ctxt = pctxt - self._initContext(pctxt) - result = xmlparser.xmlCtxtReadDoc( - pctxt, c_text, c_filename, NULL, self._parse_options) - self._error_log.disconnect() - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, NULL, recover) - - cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: - cdef xmlDoc* result - cdef xmlParserCtxt* pctxt - cdef int recover - self._error_log.connect() - pctxt = self._file_parser_ctxt - if pctxt is NULL: - pctxt = self._createContext() - self._file_parser_ctxt = pctxt - self._initContext(pctxt) - result = xmlparser.xmlCtxtReadFile( - pctxt, c_filename, NULL, self._parse_options) - self._error_log.disconnect() - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, c_filename, recover) - - cdef xmlDoc* _parseDocFromFilelike(self, filelike, - char* c_filename) except NULL: - # we read Python string, so we must convert to UTF-8 - cdef _FileParserContext file_context - cdef xmlDoc* result - cdef xmlParserCtxt* pctxt - cdef int recover - self._error_log.connect() - pctxt = self._filelike_parser_ctxt - if pctxt is NULL: - pctxt = self._createContext() - self._filelike_parser_ctxt = pctxt - self._initContext(pctxt) - file_context = _FileParserContext(filelike, self._context) - result = file_context._readDoc(pctxt, self._parse_options) - self._error_log.disconnect() - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - result = _handleParseResult(pctxt, result, c_filename, recover) - return result - cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for XSLT @@ -435,7 +437,7 @@ cdef XMLParser __DEFAULT_XML_PARSER __DEFAULT_XML_PARSER = 
XMLParser() -cdef BaseParser __DEFAULT_PARSER +cdef _BaseParser __DEFAULT_PARSER __DEFAULT_PARSER = __DEFAULT_XML_PARSER def set_default_parser(parser=None): @@ -451,7 +453,7 @@ global __DEFAULT_PARSER if parser is None: __DEFAULT_PARSER = __DEFAULT_XML_PARSER - elif isinstance(parser, (HTMLParser, XMLParser)): + elif isinstance(parser, _BaseParser): __DEFAULT_PARSER = parser else: raise TypeError, "Invalid parser" @@ -466,7 +468,7 @@ cdef int _HTML_DEFAULT_PARSE_OPTIONS _HTML_DEFAULT_PARSE_OPTIONS = 0 -cdef class HTMLParser(BaseParser): +cdef class HTMLParser(_BaseParser): """The HTML parser. This parser allows reading HTML into a normal XML tree. By default, it can read broken (non well-formed) HTML, depending on the capabilities of libxml2. Use the 'recover' option to switch this off. @@ -478,14 +480,9 @@ Note that you must not share parsers between threads. """ - cdef int _parse_options - cdef xmlParserCtxt* _memory_parser_ctxt - cdef xmlParserCtxt* _file_parser_ctxt def __init__(self, recover=True, no_network=False, remove_blank_text=False): cdef int parse_options - self._memory_parser_ctxt = NULL - self._file_parser_ctxt = NULL - BaseParser.__init__(self) + _BaseParser.__init__(self) parse_options = _HTML_DEFAULT_PARSE_OPTIONS if recover: @@ -499,62 +496,6 @@ self._parse_options = parse_options - def __dealloc__(self): - if self._file_parser_ctxt != NULL: - htmlparser.htmlFreeParserCtxt(self._file_parser_ctxt) - if self._memory_parser_ctxt != NULL: - htmlparser.htmlFreeParserCtxt(self._memory_parser_ctxt) - - def copy(self): - "Create a new parser with the same configuration." - cdef HTMLParser parser - parser = self._copy() - parser._parse_options = self._parse_options - return parser - - cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: - """Parse HTML document, share dictionary if possible. 
- """ - cdef xmlDoc* result - cdef xmlParserCtxt* pctxt - cdef int recover - self._error_log.connect() - pctxt = self._memory_parser_ctxt - if pctxt is NULL: - pctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) - if pctxt is NULL: - self._error_log.disconnect() - raise ParserError, "Failed to create parser context" - self._memory_parser_ctxt = pctxt - self._initContext(pctxt) - result = htmlparser.htmlCtxtReadDoc( - pctxt, c_text, c_filename, NULL, self._parse_options) - self._error_log.disconnect() - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, NULL, recover) - - cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: - cdef xmlDoc* result - cdef xmlParserCtxt* pctxt - cdef int recover - self._error_log.connect() - pctxt = self._file_parser_ctxt - if pctxt is NULL: - pctxt = htmlparser.htmlCreateFileParserCtxt(c_filename, NULL) - if pctxt is NULL: - self._error_log.disconnect() - warnings = self._error_log.filter_from_warnings() - if warnings and warnings[-1].domain == xmlerror.XML_FROM_IO: - raise IOError, "Could not open file %s" % c_filename - raise ParserError, "Failed to create parser context" - self._file_parser_ctxt = pctxt - self._initContext(pctxt) - result = htmlparser.htmlCtxtReadFile( - pctxt, c_filename, NULL, self._parse_options) - self._error_log.disconnect() - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, c_filename, recover) - cdef HTMLParser __DEFAULT_HTML_PARSER __DEFAULT_HTML_PARSER = HTMLParser() @@ -566,45 +507,35 @@ cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER + elif not isinstance(parser, _BaseParser): + raise TypeError, "invalid parser" __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL else: c_filename = _cstr(filename) - if isinstance(parser, XMLParser): - return (parser)._parseDoc(_cstr(text_utf), c_filename) - elif isinstance(parser, HTMLParser): - return 
(parser)._parseDoc(_cstr(text_utf), c_filename) - else: - raise TypeError, "invalid parser" + return (<_BaseParser>parser)._parseDoc(_cstr(text_utf), c_filename) cdef xmlDoc* _parseDocFromFile(filename, parser) except NULL: if parser is None: parser = __DEFAULT_PARSER - __GLOBAL_PARSER_CONTEXT._initParser() - if isinstance(parser, XMLParser): - return (parser)._parseDocFromFile(_cstr(filename)) - elif isinstance(parser, HTMLParser): - return (parser)._parseDocFromFile(_cstr(filename)) - else: + elif not isinstance(parser, _BaseParser): raise TypeError, "invalid parser" + __GLOBAL_PARSER_CONTEXT._initParser() + return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename)) cdef xmlDoc* _parseDocFromFilelike(source, filename, parser) except NULL: cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER + elif not isinstance(parser, _BaseParser): + raise TypeError, "invalid parser" __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL else: c_filename = _cstr(filename) - if isinstance(parser, XMLParser): - return (parser)._parseDocFromFilelike(source, c_filename) - elif isinstance(parser, HTMLParser): - data_utf = _utf8(source.read()) - return (parser)._parseDoc(_cstr(data_utf), c_filename) - else: - raise TypeError, "invalid parser" + return (<_BaseParser>parser)._parseDocFromFilelike(source, c_filename) cdef xmlDoc* _newDoc(): cdef xmlDoc* result Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Sun May 7 22:31:25 2006 @@ -60,6 +60,13 @@ def test_module_parse_html_filelike(self): parser = self.etree.HTMLParser() + f = SillyFileLike(self.html_str) + tree = self.etree.parse(f, parser) + html = self.etree.tostring(tree.getroot()) + self.assertEqual(unentitify(html), self.html_str) + + def test_module_parse_html_filelike_unicode(self): + parser = 
self.etree.HTMLParser() f = SillyFileLike(self.uhtml_str) tree = self.etree.parse(f, parser) html = self.etree.tostring(tree.getroot()) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sun May 7 22:31:25 2006 @@ -30,8 +30,8 @@ cdef class _XSLTResolverContext(_ResolverContext): cdef xmlDoc* _c_style_doc - cdef BaseParser _parser - def __init__(self, BaseParser parser not None): + cdef _BaseParser _parser + def __init__(self, _BaseParser parser not None): _ResolverContext.__init__(self, parser.resolvers) self._parser = parser self._c_style_doc = NULL From scoder at codespeak.net Mon May 8 06:44:28 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 06:44:31 2006 Subject: [Lxml-checkins] r26959 - lxml/trunk/src/lxml Message-ID: <20060508044428.5B4CB10076@code0.codespeak.net> Author: scoder Date: Mon May 8 06:44:26 2006 New Revision: 26959 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 8 06:44:26 2006 @@ -318,12 +318,13 @@ tree.xmlFreeDoc(ctxt.myDoc) ctxt.myDoc = NULL - if ctxt.wellFormed or recover: - __GLOBAL_PARSER_CONTEXT._initDocDict(result) - elif result is not NULL: - # free broken document - tree.xmlFreeDoc(result) - result = NULL + if result is not NULL: + if ctxt.wellFormed or recover: + __GLOBAL_PARSER_CONTEXT._initDocDict(result) + else: + # free broken document + tree.xmlFreeDoc(result) + result = NULL if ctxt._private is not NULL: context = <_ResolverContext>ctxt._private From scoder at codespeak.net Mon May 8 15:53:29 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 15:53:30 2006 Subject: [Lxml-checkins] r26965 - lxml/trunk Message-ID: 
<20060508135329.508E510070@code0.codespeak.net> Author: scoder Date: Mon May 8 15:53:28 2006 New Revision: 26965 Modified: lxml/trunk/bench.py Log: IO benchmarks Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Mon May 8 15:53:28 2006 @@ -1,5 +1,6 @@ import sys, string, time, copy, gc from itertools import * +from StringIO import StringIO _TEXT = "some ASCII text" _UTEXT = u"some klingon: \F8D2" @@ -248,6 +249,26 @@ for child in reversed(root): pass + @with_text(text=True, utext=True) + def bench_tostring_utf8(self, root): + self.etree.tostring(root, 'UTF-8') + + @with_text(text=True, utext=True) + def bench_tostring_utf16(self, root): + self.etree.tostring(root, 'UTF-16') + + @with_text(text=True, utext=True) + def bench_tostring_utf8_unicode_XML(self, root): + xml = unicode(self.etree.tostring(root, 'UTF-8'), 'UTF-8') + self.etree.XML(xml) + + @with_text(text=True, utext=True) + def bench_write_utf8_parse_stringIO(self, root): + f = StringIO() + self.etree.ElementTree(root).write(f, 'UTF-8') + f.seek(0) + self.etree.parse(f) + def bench_append_from_document(self, root1, root2): # == "1,2 2,3 1,3 3,1 3,2 2,1" # trees 1 and 2, or 2 and 3, or ... for el in root2: @@ -588,7 +609,7 @@ for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)): bench_name, method_call = benchmark_setup[:2] tree_set_name = build_treeset_name(*benchmark_setup[-3:]) - print "%-3s: %-23s" % (bench.lib_name, bench_name[6:29]), + print "%-3s: %-28s" % (bench.lib_name, bench_name[6:34]), if method_call is None: print "skipped" continue From scoder at codespeak.net Mon May 8 16:45:50 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 16:45:51 2006 Subject: [Lxml-checkins] r26966 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060508144550.42B6410070@code0.codespeak.net> Author: scoder Date: Mon May 8 16:45:47 2006 New Revision: 26966 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlparser.pxd Log: support parsing straight from Python unicode strings Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 8 16:45:47 2006 @@ -7,9 +7,14 @@ Features added -------------- +* Parsing a unicode string no longer copies the string (reduced memory + footprint) + * Parsing file-like objects now reads chunks rather than the whole file + (reduced memory footprint) -* Parsing StringIO objects from the start avoids copying the string +* Parsing StringIO objects from the start avoids copying the string (reduced + memory footprint) * Read-only 'docinfo' attribute in ElementTree class holds DOCTYPE information, original encoding and XML version as seen by the parser Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Mon May 8 16:45:47 2006 @@ -30,3 +30,6 @@ xmlInputCloseCallback ioclose, void* ioctx, char* URL, char* encoding, int options) + cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt, + char* buffer, int size, + char* filename, char* encoding, int options) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 8 16:45:47 2006 @@ -61,6 +61,46 @@ cdef _ParserContext __GLOBAL_PARSER_CONTEXT __GLOBAL_PARSER_CONTEXT = _ParserContext() 
+############################################################ +## support for reading Python unicode +############################################################ + +# can libxml2 read plain Python unicode data? +cdef char* _UNICODE_ENCODING +_UNICODE_ENCODING = NULL + +cdef void _setupUnicodeParser(): + """Sets _READ_UNICODE to 1 if libxml2 supports reading native Python + unicode. This depends on iconv, so we simply check if we find a matching + encoding handler. + """ + cdef Py_ssize_t l + cdef char* buffer + cdef char* enc + utext = unicode("") + l = python.PyUnicode_GET_DATA_SIZE(utext) + buffer = python.PyUnicode_AS_DATA(utext) + enc = _findEncodingName(buffer, l) + if tree.xmlFindCharEncodingHandler(enc) is not NULL: + global _UNICODE_ENCODING + _UNICODE_ENCODING = enc + +cdef char* _findEncodingName(char* buffer, int size): + "Work around bug in libxml2: find iconv name of encoding on our own." + cdef int enc + enc = tree.xmlDetectCharEncoding(buffer, size) + if enc == tree.XML_CHAR_ENCODING_UTF16LE: + return "UTF16LE" + elif enc == tree.XML_CHAR_ENCODING_UTF16BE: + return "UTF16BE" + elif enc == tree.XML_CHAR_ENCODING_UCS4LE: + return "UCS-4LE" + elif enc == tree.XML_CHAR_ENCODING_UCS4BE: + return "UCS-4BE" + else: + return tree.xmlGetCharEncodingName(enc) + +_setupUnicodeParser() ############################################################ ## support for file-like objects @@ -237,6 +277,41 @@ parser._context = _ResolverContext(parser.resolvers) return parser + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: + """Parse unicode document, share dictionary if possible. 
+ """ + cdef xmlDoc* result + cdef xmlParserCtxt* pctxt + cdef int recover + cdef Py_ssize_t py_buffer_len + cdef int buffer_len + cdef char* c_text + cdef char* c_encoding + cdef int enc + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext) + if py_buffer_len > python.INT_MAX: + text_utf = _utf8(utext) + return self._parseDoc(text_utf, c_filename) + buffer_len = py_buffer_len + + self._error_log.connect() + pctxt = self._parser_ctxt + __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt) + + c_text = python.PyUnicode_AS_DATA(utext) + if self._parser_type == LXML_HTML_PARSER: + result = htmlparser.htmlCtxtReadMemory( + pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING, + self._parse_options) + else: + result = xmlparser.xmlCtxtReadMemory( + pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING, + self._parse_options) + + self._error_log.disconnect() + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(pctxt, result, NULL, recover) + cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: """Parse document, share dictionary if possible. 
""" @@ -504,7 +579,7 @@ ## helper functions for document creation ############################################################ -cdef xmlDoc* _parseDoc(text_utf, filename, parser) except NULL: +cdef xmlDoc* _parseDoc(text, filename, parser) except NULL: cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER @@ -515,7 +590,10 @@ c_filename = NULL else: c_filename = _cstr(filename) - return (<_BaseParser>parser)._parseDoc(_cstr(text_utf), c_filename) + if python.PyUnicode_Check(text): + return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename) + else: + return (<_BaseParser>parser)._parseDoc(_cstr(text), c_filename) cdef xmlDoc* _parseDocFromFile(filename, parser) except NULL: if parser is None: @@ -570,12 +648,15 @@ cdef _Document _parseMemoryDocument(text, url, parser): cdef xmlDoc* c_doc - text_utf = _utf8(text) if python.PyUnicode_Check(text): - text_utf = _stripDeclaration(text_utf) + # pass native unicode only if libxml2 can handle it + if _UNICODE_ENCODING is NULL: + text = _stripDeclaration(_utf8(text)) + else: + text = _utf8(text) if url is not None: url = _utf8(url) - c_doc = _parseDoc(text_utf, url, parser) + c_doc = _parseDoc(text, url, parser) return _documentFactory(c_doc, parser) cdef _Document _parseFilelikeDocument(source, url, parser): Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Mon May 8 16:45:47 2006 @@ -4,6 +4,7 @@ ctypedef struct PyObject ctypedef int size_t ctypedef int Py_ssize_t + cdef int INT_MAX cdef FILE* PyFile_AsFile(PyObject* p) cdef int PyFile_Check(object p) @@ -16,6 +17,8 @@ char* errors) cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors) cdef object PyUnicode_AsUTF8String(object ustring) + cdef char* PyUnicode_AS_DATA(object ustring) + cdef Py_ssize_t PyUnicode_GET_DATA_SIZE(object ustring) cdef object PyString_FromStringAndSize(char* s, 
Py_ssize_t size) cdef object PyString_FromString(char* s) cdef object PyString_FromFormat(char* format, ...) Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Mon May 8 16:45:47 2006 @@ -27,6 +27,11 @@ self.assertEqual(self.etree.tostring(element), self.html_str) + def test_module_HTML_unicode(self): + element = self.etree.HTML(self.uhtml_str) + self.assertEqual(unentitify(self.etree.tostring(element)), + self.uhtml_str) + def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) parse = self.etree.parse Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 8 16:45:47 2006 @@ -8,8 +8,37 @@ cdef int LIBXML_VERSION cdef extern from "libxml/encoding.h": + ctypedef enum xmlCharEncoding: + XML_CHAR_ENCODING_ERROR = -1 # No char encoding detected + XML_CHAR_ENCODING_NONE = 0 # No char encoding detected + XML_CHAR_ENCODING_UTF8 = 1 # UTF-8 + XML_CHAR_ENCODING_UTF16LE = 2 # UTF-16 little endian + XML_CHAR_ENCODING_UTF16BE = 3 # UTF-16 big endian + XML_CHAR_ENCODING_UCS4LE = 4 # UCS-4 little endian + XML_CHAR_ENCODING_UCS4BE = 5 # UCS-4 big endian + XML_CHAR_ENCODING_EBCDIC = 6 # EBCDIC uh! 
+ XML_CHAR_ENCODING_UCS4_2143 = 7 # UCS-4 unusual ordering + XML_CHAR_ENCODING_UCS4_3412 = 8 # UCS-4 unusual ordering + XML_CHAR_ENCODING_UCS2 = 9 # UCS-2 + XML_CHAR_ENCODING_8859_1 = 10 # ISO-8859-1 ISO Latin 1 + XML_CHAR_ENCODING_8859_2 = 11 # ISO-8859-2 ISO Latin 2 + XML_CHAR_ENCODING_8859_3 = 12 # ISO-8859-3 + XML_CHAR_ENCODING_8859_4 = 13 # ISO-8859-4 + XML_CHAR_ENCODING_8859_5 = 14 # ISO-8859-5 + XML_CHAR_ENCODING_8859_6 = 15 # ISO-8859-6 + XML_CHAR_ENCODING_8859_7 = 16 # ISO-8859-7 + XML_CHAR_ENCODING_8859_8 = 17 # ISO-8859-8 + XML_CHAR_ENCODING_8859_9 = 18 # ISO-8859-9 + XML_CHAR_ENCODING_2022_JP = 19 # ISO-2022-JP + XML_CHAR_ENCODING_SHIFT_JIS = 20 # Shift_JIS + XML_CHAR_ENCODING_EUC_JP = 21 # EUC-JP + XML_CHAR_ENCODING_ASCII = 22 # pure ASCII + ctypedef struct xmlCharEncodingHandler cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) + cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(int enc) + cdef int xmlDetectCharEncoding(char* text, int len) + cdef char* xmlGetCharEncodingName(int enc) cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Mon May 8 16:45:47 2006 @@ -23,6 +23,7 @@ xmlDict* dict void* _private int wellFormed + int recovery int options xmlError lastError @@ -66,6 +67,9 @@ xmlInputCloseCallback ioclose, void* ioctx, char* URL, char* encoding, int options) + cdef xmlDoc* xmlCtxtReadMemory(xmlParserCtxt* ctxt, + char* buffer, int size, + char* filename, char* encoding, int options) # entity loaders: From scoder at codespeak.net Mon May 8 17:46:30 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 17:46:31 2006 Subject: [Lxml-checkins] r26970 - lxml/trunk Message-ID: <20060508154630.41CFF1006D@code0.codespeak.net> Author: scoder Date: Mon May 8 17:46:29 2006 New Revision: 26970 
Modified: lxml/trunk/bench.py Log: make benchmark output more readable Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Mon May 8 17:46:29 2006 @@ -619,9 +619,10 @@ result = run_bench(bench, *benchmark_setup) + print "%9.4f msec/pass, best of (" % min(result), for t in result: print "%9.4f" % t, - print "msec/pass, best: %9.4f" % min(result) + print ")" if len(benchmark_suites) > 1: print # empty line between different benchmarks From scoder at codespeak.net Mon May 8 18:15:10 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 18:15:12 2006 Subject: [Lxml-checkins] r26973 - lxml/trunk/src/lxml Message-ID: <20060508161510.015C11006E@code0.codespeak.net> Author: scoder Date: Mon May 8 18:15:09 2006 New Revision: 26973 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 8 18:15:09 2006 @@ -62,14 +62,14 @@ __GLOBAL_PARSER_CONTEXT = _ParserContext() ############################################################ -## support for reading Python unicode +## support for Python unicode I/O ############################################################ -# can libxml2 read plain Python unicode data? +# name of Python unicode encoding as known to libxml2 cdef char* _UNICODE_ENCODING _UNICODE_ENCODING = NULL -cdef void _setupUnicodeParser(): +cdef void _setupPythonUnicode(): """Sets _READ_UNICODE to 1 if libxml2 supports reading native Python unicode. This depends on iconv, so we simply check if we find a matching encoding handler. 
@@ -100,7 +100,7 @@ else: return tree.xmlGetCharEncodingName(enc) -_setupUnicodeParser() +_setupPythonUnicode() ############################################################ ## support for file-like objects From scoder at codespeak.net Mon May 8 18:49:00 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 18:49:01 2006 Subject: [Lxml-checkins] r26974 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20060508164900.1DD881006E@code0.codespeak.net> Author: scoder Date: Mon May 8 18:48:58 2006 New Revision: 26974 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: module level 'tounicode' function to return Python unicode string Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 8 18:48:58 2006 @@ -7,6 +7,9 @@ Features added -------------- +* Module level `tounicode` function to return XML serialization as Python + unicode string (equavalent to `tostring` function) + * Parsing a unicode string no longer copies the string (reduced memory footprint) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 8 18:48:58 2006 @@ -1409,15 +1409,42 @@ # encoding during output enchandler = tree.xmlFindCharEncodingHandler(enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) - tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, - enc) - _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - result = tree.xmlBufferContent(c_buffer.conv) - else: - result = tree.xmlBufferContent(c_buffer.buffer) - tree.xmlOutputBufferClose(c_buffer) + try: + tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 
0, enc) + _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) + tree.xmlOutputBufferFlush(c_buffer) + if c_buffer.conv is not NULL: + result = tree.xmlBufferContent(c_buffer.conv) + else: + result = tree.xmlBufferContent(c_buffer.buffer) + finally: + tree.xmlOutputBufferClose(c_buffer) + return result + +def tounicode(_NodeBase element): + cdef _Document doc + cdef tree.xmlOutputBuffer* c_buffer + cdef tree.xmlBuffer* c_result_buffer + + assert element is not None + # better, but not ET compatible : "_NodeBase element not None" + + doc = element._doc + c_buffer = tree.xmlAllocOutputBuffer(NULL) + try: + tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, NULL) + _dumpNextNode(c_buffer, doc._c_doc, element._c_node, NULL) + tree.xmlOutputBufferFlush(c_buffer) + if c_buffer.conv is not NULL: + c_result_buffer = c_buffer.conv + else: + c_result_buffer = c_buffer.buffer + result = python.PyUnicode_DecodeUTF8( + tree.xmlBufferContent(c_result_buffer), + tree.xmlBufferLength(c_result_buffer), + 'strict') + finally: + tree.xmlOutputBufferClose(c_buffer) return result def parse(source, parser=None): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon May 8 18:48:58 2006 @@ -425,6 +425,54 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_tounicode(self): + tounicode = self.etree.tounicode + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + self.assert_(isinstance(tounicode(a), unicode)) + self.assertEquals('', + canonicalize(tounicode(a))) + + def test_tounicode_element(self): + tounicode = self.etree.tounicode + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = 
SubElement(a, 'c') + d = SubElement(c, 'd') + self.assert_(isinstance(tounicode(b), unicode)) + self.assert_(isinstance(tounicode(c), unicode)) + self.assertEquals('', + canonicalize(tounicode(b))) + self.assertEquals('', + canonicalize(tounicode(c))) + + def test_tounicode_none(self): + tounicode = self.etree.tounicode + self.assertRaises(AssertionError, self.etree.tounicode, None) + + def test_tounicode_element_tail(self): + tounicode = self.etree.tounicode + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(c, 'd') + b.tail = 'Foo' + + self.assert_(isinstance(tounicode(b), unicode)) + self.assert_(tounicode(b) == 'Foo' or + tounicode(b) == 'Foo') + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 8 18:48:58 2006 @@ -199,6 +199,7 @@ cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) cdef xmlBuffer* xmlBufferCreate() cdef char* xmlBufferContent(xmlBuffer* buf) + cdef int xmlBufferLength(xmlBuffer* buf) cdef extern from "libxml/xmlIO.h": cdef xmlOutputBuffer* xmlAllocOutputBuffer(xmlCharEncodingHandler* encoder) From scoder at codespeak.net Mon May 8 19:05:02 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 19:05:03 2006 Subject: [Lxml-checkins] r26975 - lxml/trunk Message-ID: <20060508170502.3FCEF1006E@code0.codespeak.net> Author: scoder Date: Mon May 8 19:05:01 2006 New Revision: 26975 Modified: lxml/trunk/CHANGES.txt Log: typo Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 8 19:05:01 2006 @@ -8,7 +8,7 @@ -------------- * Module level `tounicode` 
function to return XML serialization as Python - unicode string (equavalent to `tostring` function) + unicode string (equivalent to `tostring` function) * Parsing a unicode string no longer copies the string (reduced memory footprint) From scoder at codespeak.net Mon May 8 19:35:05 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 19:35:06 2006 Subject: [Lxml-checkins] r26977 - lxml/trunk/src/lxml Message-ID: <20060508173505.6C0B910070@code0.codespeak.net> Author: scoder Date: Mon May 8 19:35:03 2006 New Revision: 26977 Modified: lxml/trunk/src/lxml/etree.pyx Log: potential bug in string conversion (if it's not a string) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 8 19:35:03 2006 @@ -1404,7 +1404,7 @@ if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' doc = element._doc - enc = _cstr(encoding) + enc = encoding # it is necessary to *and* find the encoding handler *and* use # encoding during output enchandler = tree.xmlFindCharEncodingHandler(enc) From scoder at codespeak.net Mon May 8 21:22:58 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Mon May 8 21:23:00 2006 Subject: [Lxml-checkins] r26978 - lxml/trunk/doc Message-ID: <20060508192258.B98051006E@code0.codespeak.net> Author: scoder Date: Mon May 8 21:22:57 2006 New Revision: 26978 Modified: lxml/trunk/doc/compatibility.txt Log: note on differences in unicode string parsing between etree and ElementTree Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Mon May 8 21:22:57 2006 @@ -32,6 +32,22 @@ XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree does not offer. 
+* etree has a different idea about Python unicode strings than ElementTree. + In most parts of the API, ElementTree uses plain strings and unicode strings + as what they are. This includes Element.text, Element.tail and many other + places. However, the ElementTree parsers assume by default that any string + (`str` or `unicode`) contains ASCII data and raise an exception if strings + do not match the expected encoding. + + etree has the same idea about plain strings (`str`) as ElementTree. For + unicode strings, however, etree assumes throughout the API that they are + Python unicode encoded strings rather than byte data. This includes the + parsers. It is therefore perfectly correct to pass XML unicode data into + the etree parsers in form of Python unicode strings. It is an error, on the + other hand, if unicode strings specify an encoding in their XML declaration. + Note also that Python unicode strings are platform specific. Such an + encoding specifier would not be portable. + * ElementTree allows you to place an Element in two different trees as the same time. Thus, this:: From scoder at codespeak.net Tue May 9 11:15:32 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 9 11:15:33 2006 Subject: [Lxml-checkins] r26990 - lxml/trunk/src/lxml Message-ID: <20060509091532.4863610071@code0.codespeak.net> Author: scoder Date: Tue May 9 11:15:30 2006 New Revision: 26990 Modified: lxml/trunk/src/lxml/parser.pxi Log: doc updates Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 9 11:15:30 2006 @@ -70,9 +70,10 @@ _UNICODE_ENCODING = NULL cdef void _setupPythonUnicode(): - """Sets _READ_UNICODE to 1 if libxml2 supports reading native Python - unicode. This depends on iconv, so we simply check if we find a matching - encoding handler. 
+ """Sets _UNICODE_ENCODING to the internal encoding name of Python unicode + strings if libxmls supports reading native Python unicode. This depends + on iconv and the local Python installation, so we simply check if we find + a matching encoding handler. """ cdef Py_ssize_t l cdef char* buffer From scoder at codespeak.net Tue May 9 12:13:22 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Tue May 9 12:13:24 2006 Subject: [Lxml-checkins] r26992 - lxml/trunk Message-ID: <20060509101322.71B6010071@code0.codespeak.net> Author: scoder Date: Tue May 9 12:13:21 2006 New Revision: 26992 Modified: lxml/trunk/CHANGES.txt Log: doc updates Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue May 9 12:13:21 2006 @@ -25,6 +25,8 @@ * etree module can be compiled without libxslt by commenting out the line 'include "xslt.pxi"' near the end of the etree.pyx source file +* Better error messages in parser exceptions + * Error reporting now also works in XSLT * Support for custom document loaders (URI resolvers) in parsers and XSLT, From scoder at codespeak.net Wed May 10 09:30:55 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 09:30:57 2006 Subject: [Lxml-checkins] r27024 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20060510073055.ECBAF10089@code0.codespeak.net> Author: scoder Date: Wed May 10 09:30:52 2006 New Revision: 27024 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xslt.pxd lxml/trunk/src/lxml/xslt.pxi Log: fix str() decoding bug in _XSLTResultTree.__str__ for non-UTF8 encodings, make unicode() work on _XSLTResultTree Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 09:30:52 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Support for writing XSLT results to Python unicode strings via `unicode()` + * Module level `tounicode` function to return XML serialization as Python unicode string (equivalent to `tostring` function) @@ -56,6 +58,8 @@ Bugs fixed ---------- +* str(xslt_result) was broken for output other than UTF-8 + * Memory leak if write_c14n fails to write the file after conversion * ElementTree.xpath() and XPathDocumentEvaluator were not using the Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 09:30:52 2006 @@ -187,7 +187,6 @@ >>> f = StringIO('''\ ... - ... ... ... ... @@ -213,6 +212,33 @@ >>> str(result) '\nText\n' +The result is always a plain string, encoded as requested by the `xsl:output` +element in the stylesheet. If you want a Python unicode string instead, you +should set this encoding to `UTF-8` (or leave it as the `ASCII` default). +This allows you to call the `unicode()` function on the result:: + + >>> unicode(result) + u'\nText\n' + +However, encodings that are not supported by Python will result in an error:: + + >>> f = StringIO('''\ + ... + ... + ... + ... + ... + ... 
''') + >>> xslt_doc = lxml.etree.parse(f) + >>> transform = lxml.etree.XSLT(xslt_doc) + + >>> result = transform(doc) + >>> unicode(result) + Traceback (most recent call last): + [...] + LookupError: unknown encoding: UCS4 + It is possible to pass parameters, in the form of XPath expressions, to the XSLT template:: Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 10 09:30:52 2006 @@ -121,6 +121,14 @@ if c_next is not NULL and c_next.type == tree.XML_TEXT_NODE: tree.xmlNodeDumpOutput(c_buffer, c_doc, c_next, 0, 0, encoding) +cdef object __REPLACE_XML_ENCODING +__REPLACE_XML_ENCODING = re.compile( + r'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U).sub + +cdef object _stripEncodingDeclaration(object xml_string): + # this is a hack to remove the XML encoding declaration from unicode + return __REPLACE_XML_ENCODING(r'\g<1>', xml_string) + cdef object _stripDeclaration(object xml_string): # this is a hack to remove the XML declaration when we encode to UTF-8 xml_string = xml_string.strip() Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Wed May 10 09:30:52 2006 @@ -15,6 +15,8 @@ cdef object PyUnicode_FromEncodedObject(object s, char* encoding, char* errors) + cdef object PyUnicode_Decode(char* s, Py_ssize_t size, + char* encoding, char* errors) cdef object PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors) cdef object PyUnicode_AsUTF8String(object ustring) cdef char* PyUnicode_AS_DATA(object ustring) Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Wed May 10 
09:30:52 2006 @@ -30,6 +30,66 @@ ''', st.tostring(res)) + def test_xslt_utf8(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u'''\ + +\uF8D2 +''' + self.assertEquals(expected, + unicode(str(res), 'UTF-8')) + + def test_xslt_encoding(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u'''\ + +\uF8D2 +''' + self.assertEquals(expected, + unicode(str(res), 'UTF-16')) + + def test_xslt_unicode(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u'''\ + +\uF8D2 +''' + self.assertEquals(expected, + unicode(res)) + def test_exslt(self): tree = self.parse('BC') style = self.parse('''\ @@ -250,10 +310,11 @@ etree.tostring(result.getroot()) result = transform.apply(source) etree.tostring(result.getroot()) - - result = transform(source) - result = transform(source) str(result) + + result1 = transform(source) + result2 = transform(source) + self.assertEquals(str(result1), str(result2)) result = transform(source) str(result) Modified: lxml/trunk/src/lxml/xslt.pxd ============================================================================== --- lxml/trunk/src/lxml/xslt.pxd (original) +++ lxml/trunk/src/lxml/xslt.pxd Wed May 10 09:30:52 2006 @@ -12,6 +12,7 @@ xmlDoc* doc ctypedef struct xsltStylesheet: + char* encoding xmlDoc* doc ctypedef struct xsltTransformContext: Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 10 09:30:52 2006 @@ -315,20 +315,38 @@ cdef class _XSLTResultTree(_ElementTree): cdef XSLT _xslt - def __str__(self): - cdef char* s - cdef int l + cdef _saveToStringAndSize(self, char** s, int* l): cdef int r - r = 
xslt.xsltSaveResultToString(&s, &l, self._doc._c_doc, + r = xslt.xsltSaveResultToString(s, l, self._doc._c_doc, self._xslt._c_style) if r == -1: raise XSLTSaveError, "Error saving XSLT result to string" + + def __str__(self): + cdef char* s + cdef int l + self._saveToStringAndSize(&s, &l) if s is NULL: return '' - result = funicode(s) + # we must not use 'funicode' here as this is not always UTF-8 + result = python.PyString_FromStringAndSize(s, l) tree.xmlFree(s) return result + def __unicode__(self): + cdef char* encoding + cdef char* s + cdef int l + self._saveToStringAndSize(&s, &l) + if s is NULL: + return unicode() + encoding = self._xslt._c_style.encoding + if encoding is NULL: + encoding = 'ascii' + result = python.PyUnicode_Decode(s, l, encoding, 'strict') + tree.xmlFree(s) + return _stripEncodingDeclaration(result) + cdef _xsltResultTreeFactory(_Document doc, XSLT xslt): cdef _XSLTResultTree result result = <_XSLTResultTree>_newElementTree(doc, None, _XSLTResultTree) From scoder at codespeak.net Wed May 10 10:38:25 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 10:38:27 2006 Subject: [Lxml-checkins] r27028 - lxml/trunk/src/lxml Message-ID: <20060510083825.BB09710085@code0.codespeak.net> Author: scoder Date: Wed May 10 10:38:24 2006 New Revision: 27028 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Wed May 10 10:38:24 2006 @@ -377,12 +377,13 @@ ctxt.lastError.domain == xmlerror.XML_FROM_IO: if ctxt.lastError.message is not NULL: message = "Error reading file %s: %s" % ( - funicode(c_filename), funicode(ctxt.lastError.message)) + funicode(c_filename), + funicode(ctxt.lastError.message).strip()) else: message = "Error reading file %s" % funicode(c_filename) raise IOError, message elif ctxt.lastError.message is not NULL: - raise 
XMLSyntaxError, funicode(ctxt.lastError.message) + raise XMLSyntaxError, funicode(ctxt.lastError.message).strip() else: raise XMLSyntaxError @@ -652,7 +653,7 @@ if python.PyUnicode_Check(text): # pass native unicode only if libxml2 can handle it if _UNICODE_ENCODING is NULL: - text = _stripDeclaration(_utf8(text)) + text = _stripEncodingDeclaration(_utf8(text)) else: text = _utf8(text) if url is not None: From scoder at codespeak.net Wed May 10 10:49:18 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 10:49:19 2006 Subject: [Lxml-checkins] r27029 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20060510084918.032FE10087@code0.codespeak.net> Author: scoder Date: Wed May 10 10:49:16 2006 New Revision: 27029 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: add __unicode__ method to _NodeBase and _ElementTree to let them return a XML unicode string of their tree on unicode() calls; cleanup and new doctests in api.txt Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 10:49:16 2006 @@ -7,10 +7,13 @@ Features added -------------- -* Support for writing XSLT results to Python unicode strings via `unicode()` +* Support for writing the XML representation of Elements and ElementTrees to + Python unicode strings via ``unicode()`` + +* Support for writing XSLT results to Python unicode strings via ``unicode()`` * Module level `tounicode` function to return XML serialization as Python - unicode string (equivalent to `tostring` function) + unicode string (equivalent to ``tostring`` function) * Parsing a unicode string no longer copies the string (reduced memory footprint) @@ -25,7 +28,7 @@ information, original encoding and XML version as seen by the parser * etree module can be compiled without libxslt by commenting out the 
line - 'include "xslt.pxi"' near the end of the etree.pyx source file + ``include "xslt.pxi"`` near the end of the etree.pyx source file * Better error messages in parser exceptions Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 10:49:16 2006 @@ -16,7 +16,7 @@ The following examples usually assume this to be executed first:: - >>> import lxml.etree + >>> from lxml import etree >>> from StringIO import StringIO If you need to know which version of lxml is installed, you can access the @@ -37,13 +37,13 @@ >>> xml = '' - >>> et = lxml.etree.parse(StringIO(xml)) - >>> print lxml.etree.tostring(et.getroot()) + >>> et = etree.parse(StringIO(xml)) + >>> print etree.tostring(et.getroot()) - >>> parser = lxml.etree.XMLParser(ns_clean=True) - >>> et = lxml.etree.parse(StringIO(xml), parser) - >>> print lxml.etree.tostring(et.getroot()) + >>> parser = etree.XMLParser(ns_clean=True) + >>> et = etree.parse(StringIO(xml), parser) + >>> print etree.tostring(et.getroot()) HTML parsing is similarly simple. The parsers have a ``recover`` keyword @@ -54,17 +54,17 @@ >>> broken_html = "test<body><h1>page title</h3>" - >>> parser = lxml.etree.HTMLParser() - >>> et = lxml.etree.parse(StringIO(broken_html), parser) + >>> parser = etree.HTMLParser() + >>> et = etree.parse(StringIO(broken_html), parser) - >>> print lxml.etree.tostring(et.getroot()) + >>> print etree.tostring(et.getroot()) <html><head><title>test

page title

Lxml has an HTML function, similar to the XML shortcut known from ElementTree:: - >>> html = lxml.etree.HTML(broken_html) - >>> print lxml.etree.tostring(html) + >>> html = etree.HTML(broken_html) + >>> print etree.tostring(html) test

page title

The use of the libxml2 parsers makes some additional information available at @@ -78,7 +78,7 @@ >>> xml_header = '' >>> xhtml = xml_header + doctype_string + '' - >>> et = lxml.etree.parse(StringIO(xhtml)) + >>> et = etree.parse(StringIO(xhtml)) >>> docinfo = et.docinfo >>> print docinfo.public_id -//W3C//DTD XHTML 1.0 Transitional//EN @@ -100,13 +100,13 @@ evaluation or schema validation. Whenever an exception is raised, you can retrieve the errors that occured and "might have" lead to the problem:: - >>> lxml.etree.clearErrorLog() + >>> etree.clearErrorLog() >>> broken_xml = '' >>> try: - ... lxml.etree.parse(StringIO(broken_xml)) - ... except lxml.etree.XMLSyntaxError, e: + ... etree.parse(StringIO(broken_xml)) + ... except etree.XMLSyntaxError, e: ... pass # just put the exception into e - >>> log = e.error_log.filter_levels(lxml.etree.ErrorLevels.FATAL) + >>> log = e.error_log.filter_levels(etree.ErrorLevels.FATAL) >>> print log :1:FATAL:PARSER:ERR_TAG_NOT_FINISHED: Premature end of data in tag a line 1 @@ -122,6 +122,55 @@ PARSER ERR_TAG_NOT_FINISHED +Python unicode strings +---------------------- + +lxml.etree has broader support for Python unicode strings than the ElementTree +library. First of all, its parsers can handle unicode strings straight away:: + + >>> uxml = u' \uf8d1 + \uf8d2 ' + >>> uxml + u' \uf8d1 + \uf8d2 ' + >>> root = etree.XML(uxml) + +This requires, however, that unicode strings do not specify a conflicting +encoding themselves and thus lie about their real encoding:: + + >>> try: + ... broken = etree.XML(u'' + uxml) + ... except etree.XMLSyntaxError: + ... print "This is not well-formed XML!" + This is not well-formed XML! 
+ +To serialize the result, you can either use the normal ``tostring`` module +function or the new ``tounicode`` function, which is only available in +lxml.etree and always returns a Python unicode string:: + + >>> etree.tostring(root) + '  +  ' + + >>> etree.tounicode(root) + u' \uf8d1 + \uf8d2 ' + +On the output side, lxml.etree supports calling ``unicode()`` on XML tree +objects to retrieve a Python unicode representation:: + + >>> el = etree.Element("test") + >>> unicode(el) + u'' + + >>> subel = etree.SubElement(el, "subtest") + >>> et = etree.ElementTree(el) + >>> unicode(et) + u'' + +Note, however, that the ``str()`` function behaves as in the ElementTree +library and returns something like ````. This +is due to the fact that this function implies no clear encoding semantics. +The ``unicode()`` function, on the other hand, is specified to always returns +a Python unicode string. + + xpath method on ElementTree, Element ------------------------------------ @@ -152,7 +201,7 @@ Example:: >>> f = StringIO('') - >>> doc = lxml.etree.parse(f) + >>> doc = etree.parse(f) >>> r = doc.xpath('/foo/bar') >>> len(r) 1 @@ -167,7 +216,7 @@ ... Text ... ... ''') - >>> doc = lxml.etree.parse(f) + >>> doc = etree.parse(f) >>> r = doc.xpath('/t:foo/b:bar', {'t': 'http://codespeak.net/ns/test1', ... 'b': 'http://codespeak.net/ns/test2'}) >>> len(r) @@ -191,14 +240,14 @@ ... ...
...
''') - >>> xslt_doc = lxml.etree.parse(f) - >>> transform = lxml.etree.XSLT(xslt_doc) + >>> xslt_doc = etree.parse(f) + >>> transform = etree.XSLT(xslt_doc) You can then run the transformation on an ElementTree document by simply calling it, and this results in another ElementTree object:: >>> f = StringIO('Text') - >>> doc = lxml.etree.parse(f) + >>> doc = etree.parse(f) >>> result = transform(doc) The result object can be accessed like a normal ElementTree document:: @@ -230,8 +279,8 @@ ... ... ... ''') - >>> xslt_doc = lxml.etree.parse(f) - >>> transform = lxml.etree.XSLT(xslt_doc) + >>> xslt_doc = etree.parse(f) + >>> transform = etree.XSLT(xslt_doc) >>> result = transform(doc) >>> unicode(result) @@ -250,10 +299,10 @@ ... ... ... ''') - >>> xslt_doc = lxml.etree.parse(f) - >>> transform = lxml.etree.XSLT(xslt_doc) + >>> xslt_doc = etree.parse(f) + >>> transform = etree.XSLT(xslt_doc) >>> f = StringIO('Text') - >>> doc = lxml.etree.parse(f) + >>> doc = etree.parse(f) The parameters are passed as keyword parameters to the transform call. First let's try passing in a simple string expression:: @@ -293,20 +342,20 @@ ... ... ... ''') - >>> relaxng_doc = lxml.etree.parse(f) - >>> relaxng = lxml.etree.RelaxNG(relaxng_doc) + >>> relaxng_doc = etree.parse(f) + >>> relaxng = etree.RelaxNG(relaxng_doc) You can then validate some ElementTree document against the schema. You'll get back True if the document is valid against the Relax NG schema, and False if not:: >>> valid = StringIO('') - >>> doc = lxml.etree.parse(valid) + >>> doc = etree.parse(valid) >>> relaxng.validate(doc) 1 >>> invalid = StringIO('') - >>> doc2 = lxml.etree.parse(invalid) + >>> doc2 = etree.parse(invalid) >>> relaxng.validate(doc2) 0 @@ -314,7 +363,7 @@ method. This is sometimes used in conditional statements:: >>> invalid = StringIO('') - >>> doc2 = lxml.etree.parse(invalid) + >>> doc2 = etree.parse(invalid) >>> if not relaxng(doc2): ... print "invalid!" invalid! @@ -375,20 +424,20 @@ ... ... ... 
''') - >>> xmlschema_doc = lxml.etree.parse(f) - >>> xmlschema = lxml.etree.XMLSchema(xmlschema_doc) + >>> xmlschema_doc = etree.parse(f) + >>> xmlschema = etree.XMLSchema(xmlschema_doc) You can then validate some ElementTree document with this. Like with RelaxNG, you'll get back true if the document is valid against the XML schema, and false if not:: >>> valid = StringIO('') - >>> doc = lxml.etree.parse(valid) + >>> doc = etree.parse(valid) >>> xmlschema.validate(doc) 1 >>> invalid = StringIO('') - >>> doc2 = lxml.etree.parse(invalid) + >>> doc2 = etree.parse(invalid) >>> xmlschema.validate(doc2) 0 @@ -396,7 +445,7 @@ method. This is sometimes used in conditional statements:: >>> invalid = StringIO('') - >>> doc2 = lxml.etree.parse(invalid) + >>> doc2 = etree.parse(invalid) >>> if not xmlschema(doc2): ... print "invalid!" invalid! @@ -448,9 +497,9 @@ ... ... ''') - >>> tree = lxml.etree.parse(data) + >>> tree = etree.parse(data) >>> tree.xinclude() - >>> lxml.etree.tostring(tree.getroot()) + >>> etree.tostring(tree.getroot()) '\n\n\n' @@ -463,7 +512,7 @@ C14N recommendation. For example:: >>> f = StringIO('') - >>> tree = lxml.etree.parse(f) + >>> tree = etree.parse(f) >>> f2 = StringIO() >>> tree.write_c14n(f2) >>> f2.getvalue() Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 10:49:16 2006 @@ -319,6 +319,9 @@ unregisterProxy(self) attemptDeallocation(self._c_node) + def __unicode__(self): + return tounicode(self) + def _init(self): """Called after object initialisation. Subclasses may override this if they recursively call _init() in the superclasses. @@ -387,6 +390,9 @@ return root.findall(path) # extensions to ElementTree API + def __unicode__(self): + return tounicode(self._context_node) + def xpath(self, _path, namespaces=None, **_variables): """XPath evaluate in context of document. 
Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 10 10:49:16 2006 @@ -457,7 +457,7 @@ def test_tounicode_none(self): tounicode = self.etree.tounicode self.assertRaises(AssertionError, self.etree.tounicode, None) - + def test_tounicode_element_tail(self): tounicode = self.etree.tounicode Element = self.etree.Element @@ -472,7 +472,68 @@ self.assert_(isinstance(tounicode(b), unicode)) self.assert_(tounicode(b) == 'Foo' or tounicode(b) == 'Foo') + + def test_unicode(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + self.assert_(isinstance(unicode(a), unicode)) + self.assertEquals('', + canonicalize(unicode(a))) + + def test_unicode_element(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(c, 'd') + self.assert_(isinstance(unicode(b), unicode)) + self.assert_(isinstance(unicode(c), unicode)) + self.assertEquals('', + canonicalize(unicode(b))) + self.assertEquals('', + canonicalize(unicode(c))) + + def test_unicode_elementtree(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(c, 'd') + + t = ElementTree(b) + self.assert_(isinstance(unicode(t), unicode)) + self.assertEquals('', + canonicalize(unicode(t))) + + t = ElementTree(c) + self.assert_(isinstance(unicode(t), unicode)) + self.assertEquals('', + canonicalize(unicode(t))) + + def test_tounicode_element_tail(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(c, 'd') + 
b.tail = 'Foo' + + self.assert_(isinstance(unicode(b), unicode)) + self.assert_(unicode(b) == 'Foo' or + unicode(b) == 'Foo') + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ From scoder at codespeak.net Wed May 10 10:52:40 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 10:52:40 2006 Subject: [Lxml-checkins] r27030 - lxml/trunk/doc Message-ID: <20060510085240.224591008B@code0.codespeak.net> Author: scoder Date: Wed May 10 10:52:38 2006 New Revision: 27030 Modified: lxml/trunk/doc/api.txt Log: api.txt: be more specific on unicode parser difference between etree and ElementTree Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 10:52:38 2006 @@ -126,7 +126,8 @@ ---------------------- lxml.etree has broader support for Python unicode strings than the ElementTree -library. First of all, its parsers can handle unicode strings straight away:: +library. 
First of all, where ElementTree would raise an exception, the +parsers in lxml.etree can handle unicode strings straight away:: >>> uxml = u' \uf8d1 + \uf8d2 ' >>> uxml From scoder at codespeak.net Wed May 10 11:04:01 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 11:04:02 2006 Subject: [Lxml-checkins] r27031 - lxml/trunk/doc Message-ID: <20060510090401.3148D1008F@code0.codespeak.net> Author: scoder Date: Wed May 10 11:04:00 2006 New Revision: 27031 Modified: lxml/trunk/doc/api.txt Log: new doctest Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 11:04:00 2006 @@ -156,6 +156,9 @@ On the output side, lxml.etree supports calling ``unicode()`` on XML tree objects to retrieve a Python unicode representation:: + >>> unicode(root) + u' \uf8d1 + \uf8d2 ' + >>> el = etree.Element("test") >>> unicode(el) u'' From scoder at codespeak.net Wed May 10 11:30:09 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 11:30:10 2006 Subject: [Lxml-checkins] r27034 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20060510093009.4632310092@code0.codespeak.net> Author: scoder Date: Wed May 10 11:30:06 2006 New Revision: 27034 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: discarded new 'tounicode()' function again, replaced by standard unicode() call Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 11:30:06 2006 @@ -12,9 +12,6 @@ * Support for writing XSLT results to Python unicode strings via ``unicode()`` -* Module level `tounicode` function to return XML serialization as Python - unicode string (equivalent to ``tostring`` function) - * Parsing a unicode string no longer copies the string (reduced memory footprint) @@ -61,7 +58,7 @@ Bugs fixed ---------- -* str(xslt_result) was broken for output other than UTF-8 +* str(xslt_result) was broken for XSLT output other than UTF-8 * Memory leak if write_c14n fails to write the file after conversion Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 11:30:06 2006 @@ -143,18 +143,18 @@ ... print "This is not well-formed XML!" This is not well-formed XML! 
-To serialize the result, you can either use the normal ``tostring`` module -function or the new ``tounicode`` function, which is only available in -lxml.etree and always returns a Python unicode string:: +To serialize the result, you would normally use the ``tostring`` module +function, which serializes to plain ASCII by default or a number of other +encodings if asked for:: >>> etree.tostring(root) '  +  ' - >>> etree.tounicode(root) - u' \uf8d1 + \uf8d2 ' + >>> etree.tostring(root, 'UTF-8') + ' \xef\xa3\x91 + \xef\xa3\x92 ' -On the output side, lxml.etree supports calling ``unicode()`` on XML tree -objects to retrieve a Python unicode representation:: +As an extension, lxml.etree supports calling the builtin ``unicode()`` +function on XML tree objects to retrieve a Python unicode representation:: >>> unicode(root) u' \uf8d1 + \uf8d2 ' @@ -164,6 +164,9 @@ u'' >>> subel = etree.SubElement(el, "subtest") + >>> unicode(el) + u'' + >>> et = etree.ElementTree(el) >>> unicode(et) u'' Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 10 11:30:06 2006 @@ -1,5 +1,60 @@ # Private helper functions +cdef _tostring(_NodeBase element, encoding): + "Serialize an element to an encoded string representation of its XML tree." 
+ cdef _Document doc + cdef tree.xmlOutputBuffer* c_buffer + cdef tree.xmlCharEncodingHandler* enchandler + cdef char* enc + if element is None: + return None + #if encoding is None: + # encoding = 'UTF-8' + if encoding in ('utf8', 'UTF8', 'utf-8'): + encoding = 'UTF-8' + doc = element._doc + enc = encoding + # it is necessary to *and* find the encoding handler *and* use + # encoding during output + enchandler = tree.xmlFindCharEncodingHandler(enc) + c_buffer = tree.xmlAllocOutputBuffer(enchandler) + try: + tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, enc) + _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) + tree.xmlOutputBufferFlush(c_buffer) + if c_buffer.conv is not NULL: + result = tree.xmlBufferContent(c_buffer.conv) + else: + result = tree.xmlBufferContent(c_buffer.buffer) + finally: + tree.xmlOutputBufferClose(c_buffer) + return result + +cdef _tounicode(_NodeBase element): + "Serialize an element to the Python unicode representation of its XML tree." 
+ cdef _Document doc + cdef tree.xmlOutputBuffer* c_buffer + cdef tree.xmlBuffer* c_result_buffer + if element is None: + return None + doc = element._doc + c_buffer = tree.xmlAllocOutputBuffer(NULL) + try: + tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, NULL) + _dumpNextNode(c_buffer, doc._c_doc, element._c_node, NULL) + tree.xmlOutputBufferFlush(c_buffer) + if c_buffer.conv is not NULL: + c_result_buffer = c_buffer.conv + else: + c_result_buffer = c_buffer.buffer + result = python.PyUnicode_DecodeUTF8( + tree.xmlBufferContent(c_result_buffer), + tree.xmlBufferLength(c_result_buffer), + 'strict') + finally: + tree.xmlOutputBufferClose(c_buffer) + return result + cdef void displayNode(xmlNode* c_node, indent): # to help with debugging cdef xmlNode* c_child Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 11:30:06 2006 @@ -320,7 +320,7 @@ attemptDeallocation(self._c_node) def __unicode__(self): - return tounicode(self) + return _tounicode(self) def _init(self): """Called after object initialisation. Subclasses may override @@ -391,7 +391,7 @@ # extensions to ElementTree API def __unicode__(self): - return tounicode(self._context_node) + return _tounicode(self._context_node) def xpath(self, _path, namespaces=None, **_variables): """XPath evaluate in context of document. @@ -1396,62 +1396,16 @@ # better, but not ET compatible : "_NodeBase elem not None" _dumpToFile(sys.stdout, elem._doc._c_doc, elem._c_node) -def tostring(_NodeBase element, encoding='us-ascii'): - cdef _Document doc - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlCharEncodingHandler* enchandler - cdef char* enc - - assert element is not None +def tostring(element_or_tree, encoding='us-ascii'): + "Serialize an element to an encoded string representation of its XML tree." 
+ assert element_or_tree is not None # better, but not ET compatible : "_NodeBase element not None" - - #if encoding is None: - # encoding = 'UTF-8' - if encoding in ('utf8', 'UTF8', 'utf-8'): - encoding = 'UTF-8' - doc = element._doc - enc = encoding - # it is necessary to *and* find the encoding handler *and* use - # encoding during output - enchandler = tree.xmlFindCharEncodingHandler(enc) - c_buffer = tree.xmlAllocOutputBuffer(enchandler) - try: - tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, enc) - _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - result = tree.xmlBufferContent(c_buffer.conv) - else: - result = tree.xmlBufferContent(c_buffer.buffer) - finally: - tree.xmlOutputBufferClose(c_buffer) - return result - -def tounicode(_NodeBase element): - cdef _Document doc - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlBuffer* c_result_buffer - - assert element is not None - # better, but not ET compatible : "_NodeBase element not None" - - doc = element._doc - c_buffer = tree.xmlAllocOutputBuffer(NULL) - try: - tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, NULL) - _dumpNextNode(c_buffer, doc._c_doc, element._c_node, NULL) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - c_result_buffer = c_buffer.conv - else: - c_result_buffer = c_buffer.buffer - result = python.PyUnicode_DecodeUTF8( - tree.xmlBufferContent(c_result_buffer), - tree.xmlBufferLength(c_result_buffer), - 'strict') - finally: - tree.xmlOutputBufferClose(c_buffer) - return result + if isinstance(element_or_tree, _NodeBase): + return _tostring(<_NodeBase>element_or_tree, encoding) + elif isinstance(element_or_tree, _ElementTree): + return _tostring((<_ElementTree>element_or_tree)._context_node, encoding) + else: + raise TypeError, "Type '%s' cannot be serialized." 
% type(element_or_tree) def parse(source, parser=None): """Return an ElementTree object loaded with source elements. If no parser Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 10 11:30:06 2006 @@ -425,54 +425,6 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') - def test_tounicode(self): - tounicode = self.etree.tounicode - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - - self.assert_(isinstance(tounicode(a), unicode)) - self.assertEquals('', - canonicalize(tounicode(a))) - - def test_tounicode_element(self): - tounicode = self.etree.tounicode - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - d = SubElement(c, 'd') - self.assert_(isinstance(tounicode(b), unicode)) - self.assert_(isinstance(tounicode(c), unicode)) - self.assertEquals('', - canonicalize(tounicode(b))) - self.assertEquals('', - canonicalize(tounicode(c))) - - def test_tounicode_none(self): - tounicode = self.etree.tounicode - self.assertRaises(AssertionError, self.etree.tounicode, None) - - def test_tounicode_element_tail(self): - tounicode = self.etree.tounicode - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - d = SubElement(c, 'd') - b.tail = 'Foo' - - self.assert_(isinstance(tounicode(b), unicode)) - self.assert_(tounicode(b) == 'Foo' or - tounicode(b) == 'Foo') - def test_unicode(self): Element = self.etree.Element SubElement = self.etree.SubElement From scoder at codespeak.net Wed May 10 11:37:48 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 11:37:50 2006 Subject: 
[Lxml-checkins] r27035 - lxml/trunk/src/lxml Message-ID: <20060510093748.0E98210093@code0.codespeak.net> Author: scoder Date: Wed May 10 11:37:46 2006 New Revision: 27035 Modified: lxml/trunk/src/lxml/etree.pyx Log: doc fixes Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 11:37:46 2006 @@ -1398,8 +1398,7 @@ def tostring(element_or_tree, encoding='us-ascii'): "Serialize an element to an encoded string representation of its XML tree." - assert element_or_tree is not None - # better, but not ET compatible : "_NodeBase element not None" + assert element_or_tree is not None # for ElementTree compatibility only if isinstance(element_or_tree, _NodeBase): return _tostring(<_NodeBase>element_or_tree, encoding) elif isinstance(element_or_tree, _ElementTree): From scoder at codespeak.net Wed May 10 11:50:14 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 11:50:15 2006 Subject: [Lxml-checkins] r27036 - in lxml/branch/lxml-0.9.x: . 
src/lxml src/lxml/tests Message-ID: <20060510095014.A1D891009D@code0.codespeak.net> Author: scoder Date: Wed May 10 11:50:13 2006 New Revision: 27036 Modified: lxml/branch/lxml-0.9.x/CHANGES.txt lxml/branch/lxml-0.9.x/src/lxml/etree.pyx lxml/branch/lxml-0.9.x/src/lxml/tests/test_xslt.py lxml/branch/lxml-0.9.x/src/lxml/xslt.pxi Log: merged in bug fixes from trunk: C14N memory leak, str() on encoded XSLT results Modified: lxml/branch/lxml-0.9.x/CHANGES.txt ============================================================================== --- lxml/branch/lxml-0.9.x/CHANGES.txt (original) +++ lxml/branch/lxml-0.9.x/CHANGES.txt Wed May 10 11:50:13 2006 @@ -19,6 +19,10 @@ Bugs fixed ---------- +* str(xslt_result) was broken for output other than UTF-8 + +* Memory leak if write_c14n fails to write the file after conversion + * Crash in XMLSchema and RelaxNG when passing non-schema documents * Memory leak in RelaxNG() when RelaxNGParseError is raised Modified: lxml/branch/lxml-0.9.x/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-0.9.x/src/lxml/etree.pyx Wed May 10 11:50:13 2006 @@ -348,10 +348,12 @@ if bytes < 0: raise C14NError, "C14N failed" - if not hasattr(file, 'write'): - file = open(file, 'wb') - file.write(data) - tree.xmlFree(data) + try: + if not hasattr(file, 'write'): + file = open(file, 'wb') + file.write(data) + finally: + tree.xmlFree(data) cdef _ElementTree _elementTreeFactory(_Document doc, _NodeBase context_node): Modified: lxml/branch/lxml-0.9.x/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/tests/test_xslt.py (original) +++ lxml/branch/lxml-0.9.x/src/lxml/tests/test_xslt.py Wed May 10 11:50:13 2006 @@ -29,6 +29,47 @@ B ''', st.tostring(res)) + + def test_xslt_utf8(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + 
+ + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u'''\ + +\uF8D2 +''' + self.assertEquals(expected, + unicode(str(res), 'UTF-8')) + + def test_xslt_encoding(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u'''\ + +\uF8D2 +''' + self.assertEquals(expected, + unicode(str(res), 'UTF-16')) + def test_xslt_input(self): tree = self.parse('BC') style = self.parse('''\ Modified: lxml/branch/lxml-0.9.x/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/xslt.pxi (original) +++ lxml/branch/lxml-0.9.x/src/lxml/xslt.pxi Wed May 10 11:50:13 2006 @@ -361,7 +361,8 @@ raise XSLTSaveError, "Error saving XSLT result to string" if s is NULL: return '' - result = funicode(s) + # we must not use 'funicode' here as this is not always UTF-8 + result = python.PyString_FromStringAndSize(s, l) tree.xmlFree(s) return result From scoder at codespeak.net Wed May 10 11:54:59 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 11:55:01 2006 Subject: [Lxml-checkins] r27037 - lxml/branch/lxml-0.9.x Message-ID: <20060510095459.EF8C4100A2@code0.codespeak.net> Author: scoder Date: Wed May 10 11:54:58 2006 New Revision: 27037 Modified: lxml/branch/lxml-0.9.x/CHANGES.txt lxml/branch/lxml-0.9.x/version.txt Log: 0.9.2 Modified: lxml/branch/lxml-0.9.x/CHANGES.txt ============================================================================== --- lxml/branch/lxml-0.9.x/CHANGES.txt (original) +++ lxml/branch/lxml-0.9.x/CHANGES.txt Wed May 10 11:54:58 2006 @@ -1,8 +1,8 @@ lxml changelog ============== -current -======= +0.9.2 (2006-05-10) +================== Features added -------------- @@ -19,7 +19,7 @@ Bugs fixed ---------- -* str(xslt_result) was broken for output other than UTF-8 +* str(xslt_result) was broken for XSLT output other than UTF-8 * Memory leak if write_c14n fails 
to write the file after conversion @@ -72,7 +72,7 @@ * XPath class for compiled XPath expressions -* XMLID module level function +* XMLID module level function (ElementTree compatible) * XMLParser API for customized libxml2 parser configuration Modified: lxml/branch/lxml-0.9.x/version.txt ============================================================================== --- lxml/branch/lxml-0.9.x/version.txt (original) +++ lxml/branch/lxml-0.9.x/version.txt Wed May 10 11:54:58 2006 @@ -1 +1 @@ -0.9.1 +0.9.2 From scoder at codespeak.net Wed May 10 12:25:45 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 12:25:46 2006 Subject: [Lxml-checkins] r27038 - lxml/trunk Message-ID: <20060510102545.39F8D100AD@code0.codespeak.net> Author: scoder Date: Wed May 10 12:25:43 2006 New Revision: 27038 Modified: lxml/trunk/CHANGES.txt lxml/trunk/version.txt Log: cleanup and version bump after releasing 0.9.2 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 12:25:43 2006 @@ -46,6 +46,20 @@ * XMLDTDID function parses XML into tuple (root node, ID dict) based on xml:id implementation of libxml2 (as opposed to ET compatible XMLID) +Bugs fixed +---------- + +* ElementTree.xpath() and XPathDocumentEvaluator were not using the + ElementTree root node as reference point + +* Calling document('') in XSLT failed to return the stylesheet + +0.9.2 (2006-05-10) +================== + +Features added +-------------- + * Speedup for Element.makeelement(): the new element now reuses the original libxml2 document instead of creating a new empty one @@ -62,11 +76,6 @@ * Memory leak if write_c14n fails to write the file after conversion -* ElementTree.xpath() and XPathDocumentEvaluator were not using the - ElementTree root node as reference point - -* Calling document('') in XSLT failed to return the stylesheet - * Crash in XMLSchema and 
RelaxNG when passing non-schema documents * Memory leak in RelaxNG() when RelaxNGParseError is raised Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Wed May 10 12:25:43 2006 @@ -1 +1 @@ -0.9.1 +0.9.2 From scoder at codespeak.net Wed May 10 13:03:02 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 13:03:03 2006 Subject: [Lxml-checkins] r27043 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20060510110302.93A6310092@code0.codespeak.net> Author: scoder Date: Wed May 10 13:03:00 2006 New Revision: 27043 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: reverted 'unicode()' calls on Element/ElementTree to 'tounicode()' module function Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 13:03:00 2006 @@ -8,7 +8,7 @@ -------------- * Support for writing the XML representation of Elements and ElementTrees to - Python unicode strings via ``unicode()`` + Python unicode strings via ``etree.tounicode()`` * Support for writing XSLT results to Python unicode strings via ``unicode()`` Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 13:03:00 2006 @@ -153,30 +153,24 @@ >>> etree.tostring(root, 'UTF-8') ' \xef\xa3\x91 + \xef\xa3\x92 ' -As an extension, lxml.etree supports calling the builtin ``unicode()`` -function on XML tree objects to retrieve a Python unicode representation:: +As an extension, lxml.etree has a new ``lxml.etree.tounicode()`` function that +you can call on XML tree objects to retrieve a Python unicode representation:: - >>> unicode(root) + >>> 
etree.tounicode(root) u' \uf8d1 + \uf8d2 ' >>> el = etree.Element("test") - >>> unicode(el) + >>> etree.tounicode(el) u'' >>> subel = etree.SubElement(el, "subtest") - >>> unicode(el) + >>> etree.tounicode(el) u'' >>> et = etree.ElementTree(el) - >>> unicode(et) + >>> etree.tounicode(et) u'' -Note, however, that the ``str()`` function behaves as in the ElementTree -library and returns something like ````. This -is due to the fact that this function implies no clear encoding semantics. -The ``unicode()`` function, on the other hand, is specified to always returns -a Python unicode string. - xpath method on ElementTree, Element ------------------------------------ Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 13:03:00 2006 @@ -319,9 +319,6 @@ unregisterProxy(self) attemptDeallocation(self._c_node) - def __unicode__(self): - return _tounicode(self) - def _init(self): """Called after object initialisation. Subclasses may override this if they recursively call _init() in the superclasses. @@ -390,9 +387,6 @@ return root.findall(path) # extensions to ElementTree API - def __unicode__(self): - return _tounicode(self._context_node) - def xpath(self, _path, namespaces=None, **_variables): """XPath evaluate in context of document. @@ -1406,6 +1400,16 @@ else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) +def tounicode(element_or_tree): + "Serialize an element to the Python unicode representation of its XML tree." + assert element_or_tree is not None # for ElementTree compatibility only + if isinstance(element_or_tree, _NodeBase): + return _tounicode(<_NodeBase>element_or_tree) + elif isinstance(element_or_tree, _ElementTree): + return _tounicode((<_ElementTree>element_or_tree)._context_node) + else: + raise TypeError, "Type '%s' cannot be serialized." 
% type(element_or_tree) + def parse(source, parser=None): """Return an ElementTree object loaded with source elements. If no parser is provided as second argument, the default parser is used. Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 10 13:03:00 2006 @@ -425,7 +425,8 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') - def test_unicode(self): + def test_tounicode(self): + tounicode = self.etree.tounicode Element = self.etree.Element SubElement = self.etree.SubElement @@ -433,11 +434,12 @@ b = SubElement(a, 'b') c = SubElement(a, 'c') - self.assert_(isinstance(unicode(a), unicode)) + self.assert_(isinstance(tounicode(a), unicode)) self.assertEquals('', - canonicalize(unicode(a))) + canonicalize(tounicode(a))) - def test_unicode_element(self): + def test_tounicode_element(self): + tounicode = self.etree.tounicode Element = self.etree.Element SubElement = self.etree.SubElement @@ -445,34 +447,19 @@ b = SubElement(a, 'b') c = SubElement(a, 'c') d = SubElement(c, 'd') - self.assert_(isinstance(unicode(b), unicode)) - self.assert_(isinstance(unicode(c), unicode)) + self.assert_(isinstance(tounicode(b), unicode)) + self.assert_(isinstance(tounicode(c), unicode)) self.assertEquals('', - canonicalize(unicode(b))) + canonicalize(tounicode(b))) self.assertEquals('', - canonicalize(unicode(c))) + canonicalize(tounicode(c))) - def test_unicode_elementtree(self): - ElementTree = self.etree.ElementTree - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - d = SubElement(c, 'd') - - t = ElementTree(b) - self.assert_(isinstance(unicode(t), unicode)) - self.assertEquals('', - canonicalize(unicode(t))) - - t = ElementTree(c) - self.assert_(isinstance(unicode(t), unicode)) 
- self.assertEquals('', - canonicalize(unicode(t))) + def test_tounicode_none(self): + tounicode = self.etree.tounicode + self.assertRaises(AssertionError, self.etree.tounicode, None) def test_tounicode_element_tail(self): + tounicode = self.etree.tounicode Element = self.etree.Element SubElement = self.etree.SubElement @@ -482,9 +469,9 @@ d = SubElement(c, 'd') b.tail = 'Foo' - self.assert_(isinstance(unicode(b), unicode)) - self.assert_(unicode(b) == 'Foo' or - unicode(b) == 'Foo') + self.assert_(isinstance(tounicode(b), unicode)) + self.assert_(tounicode(b) == 'Foo' or + tounicode(b) == 'Foo') def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. From scoder at codespeak.net Wed May 10 13:13:27 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 13:13:29 2006 Subject: [Lxml-checkins] r27044 - lxml/trunk/doc Message-ID: <20060510111327.9A3A910092@code0.codespeak.net> Author: scoder Date: Wed May 10 13:13:26 2006 New Revision: 27044 Modified: lxml/trunk/doc/api.txt Log: doc cleanup in api.txt->XSLT Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 13:13:26 2006 @@ -272,7 +272,7 @@ However, encodings that are not supported by Python will result in an error:: - >>> f = StringIO('''\ + >>> xslt_tree = etree.XML('''\ ... ... @@ -280,8 +280,7 @@ ... ... ... ''') - >>> xslt_doc = etree.parse(f) - >>> transform = etree.XSLT(xslt_doc) + >>> transform = etree.XSLT(xslt_tree) >>> result = transform(doc) >>> unicode(result) @@ -292,16 +291,14 @@ It is possible to pass parameters, in the form of XPath expressions, to the XSLT template:: - >>> f = StringIO('''\ + >>> xslt_tree = etree.XML('''\ ... - ... ... ... ... ... 
''') - >>> xslt_doc = etree.parse(f) - >>> transform = etree.XSLT(xslt_doc) + >>> transform = etree.XSLT(xslt_tree) >>> f = StringIO('Text') >>> doc = etree.parse(f) @@ -323,7 +320,7 @@ transformation to multiple documents, but is shorter to write for one-shot operations, as you do not have to instantiate a stylesheet yourself:: - >>> result = doc.xslt(xslt_doc, a="'A'") + >>> result = doc.xslt(xslt_tree, a="'A'") >>> str(result) '\nA\n' From scoder at codespeak.net Wed May 10 21:47:00 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 21:47:01 2006 Subject: [Lxml-checkins] r27056 - lxml/trunk/src/lxml Message-ID: <20060510194700.AE07110089@code0.codespeak.net> Author: scoder Date: Wed May 10 21:46:59 2006 New Revision: 27056 Modified: lxml/trunk/src/lxml/etree.pyx Log: doc: state that the result of tounicode() does not carry an encoding declaration which might be a problem Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 21:46:59 2006 @@ -1401,7 +1401,12 @@ raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) def tounicode(element_or_tree): - "Serialize an element to the Python unicode representation of its XML tree." + """Serialize an element to the Python unicode representation of its XML + tree. + + Note that the result does not carry an XML encoding declaration and is + therefore not necessarily suited for serialization without further + treatment.""" assert element_or_tree is not None # for ElementTree compatibility only if isinstance(element_or_tree, _NodeBase): return _tounicode(<_NodeBase>element_or_tree) From scoder at codespeak.net Wed May 10 21:47:56 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 21:47:58 2006 Subject: [Lxml-checkins] r27057 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060510194756.5846210088@code0.codespeak.net> Author: scoder Date: Wed May 10 21:47:54 2006 New Revision: 27057 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_elementtree.py Log: fix: tostring() failed to serialize encodings that contain 0-bytes Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 21:47:54 2006 @@ -49,10 +49,12 @@ Bugs fixed ---------- +* ``tostring()`` failed to serialize encodings that contain 0-bytes + * ElementTree.xpath() and XPathDocumentEvaluator were not using the ElementTree root node as reference point -* Calling document('') in XSLT failed to return the stylesheet +* Calling ``document('')`` in XSLT failed to return the stylesheet 0.9.2 (2006-05-10) ================== Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 10 21:47:54 2006 @@ -4,6 +4,7 @@ "Serialize an element to an encoded string representation of its XML tree." 
cdef _Document doc cdef tree.xmlOutputBuffer* c_buffer + cdef tree.xmlBuffer* c_result_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* enc if element is None: @@ -22,10 +23,13 @@ tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, enc) _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - result = tree.xmlBufferContent(c_buffer.conv) + if c_buffer.conv is not NULL: + c_result_buffer = c_buffer.conv else: - result = tree.xmlBufferContent(c_buffer.buffer) + c_result_buffer = c_buffer.buffer + result = python.PyString_FromStringAndSize( + tree.xmlBufferContent(c_result_buffer), + tree.xmlBufferLength(c_result_buffer)) finally: tree.xmlOutputBufferClose(c_buffer) return result Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed May 10 21:47:54 2006 @@ -1546,6 +1546,19 @@ self.assertEquals('', canonicalize(tostring(a))) + + def test_tostring_encoding(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + result = unicode(tostring(a, 'UTF-16'), 'UTF-16') + self.assertEquals('', + canonicalize(result)) def test_tostring_element(self): tostring = self.etree.tostring From scoder at codespeak.net Wed May 10 22:29:40 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 22:29:42 2006 Subject: [Lxml-checkins] r27058 - lxml/trunk/src/lxml/tests Message-ID: <20060510202940.042ED1007E@code0.codespeak.net> Author: scoder Date: Wed May 10 22:29:40 2006 New Revision: 27058 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: moved test case to similar test cases Modified: lxml/trunk/src/lxml/tests/test_elementtree.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed May 10 22:29:40 2006 @@ -1546,19 +1546,6 @@ self.assertEquals('', canonicalize(tostring(a))) - - def test_tostring_encoding(self): - tostring = self.etree.tostring - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - - result = unicode(tostring(a, 'UTF-16'), 'UTF-16') - self.assertEquals('', - canonicalize(result)) def test_tostring_element(self): tostring = self.etree.tostring @@ -1698,7 +1685,20 @@ a = Element('a') a.text = u'S?k p? nettet' self.assert_(tostring(a, 'UTF-8') in [xml, prologue + xml]) + + def test_encoding_tostring_utf16(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + result = unicode(tostring(a, 'UTF-16'), 'UTF-16') + self.assertEquals('', + canonicalize(result)) + def test_encoding_tostring_sub(self): Element = self.etree.Element SubElement = self.etree.SubElement From scoder at codespeak.net Wed May 10 22:35:47 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Wed May 10 22:35:50 2006 Subject: [Lxml-checkins] r27059 - in lxml/trunk: . 
doc src/lxml Message-ID: <20060510203547.2324F1007E@code0.codespeak.net> Author: scoder Date: Wed May 10 22:35:44 2006 New Revision: 27059 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx Log: fix tostring() to raise exception on buffer alloc errors; support writing XML declaration in tostring(), can be suppressed by xml_declaration keyword Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 10 22:35:44 2006 @@ -49,6 +49,8 @@ Bugs fixed ---------- +* ``tostring()`` now adds an XML declaration for non-ASCII encodings + * ``tostring()`` failed to serialize encodings that contain 0-bytes * ElementTree.xpath() and XPathDocumentEvaluator were not using the Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 10 22:35:44 2006 @@ -150,7 +150,7 @@ >>> etree.tostring(root) '  +  ' - >>> etree.tostring(root, 'UTF-8') + >>> etree.tostring(root, 'UTF-8', xml_declaration=False) ' \xef\xa3\x91 + \xef\xa3\x92 ' As an extension, lxml.etree has a new ``lxml.etree.tounicode()`` function that Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 10 22:35:44 2006 @@ -1,6 +1,6 @@ # Private helper functions -cdef _tostring(_NodeBase element, encoding): +cdef _tostring(_NodeBase element, encoding, int xml_declaration): "Serialize an element to an encoded string representation of its XML tree." 
cdef _Document doc cdef tree.xmlOutputBuffer* c_buffer @@ -9,8 +9,6 @@ cdef char* enc if element is None: return None - #if encoding is None: - # encoding = 'UTF-8' if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' doc = element._doc @@ -19,6 +17,22 @@ # encoding during output enchandler = tree.xmlFindCharEncodingHandler(enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) + if c_buffer is NULL: + raise LxmlError, "Failed to create output buffer" + + if xml_declaration: + if doc._c_doc.version is NULL: + version = "1.0" + else: + version = doc._c_doc.version + xml_decl = "" % ( + version, encoding) + tree.xmlOutputBufferWriteString(c_buffer, "\n") + try: tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, enc) _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) @@ -43,6 +57,8 @@ return None doc = element._doc c_buffer = tree.xmlAllocOutputBuffer(NULL) + if c_buffer is NULL: + raise LxmlError, "Failed to create output buffer" try: tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, NULL) _dumpNextNode(c_buffer, doc._c_doc, element._c_node, NULL) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 10 22:35:44 2006 @@ -1390,13 +1390,24 @@ # better, but not ET compatible : "_NodeBase elem not None" _dumpToFile(sys.stdout, elem._doc._c_doc, elem._c_node) -def tostring(element_or_tree, encoding='us-ascii'): +def tostring(element_or_tree, encoding='us-ascii', xml_declaration=None): "Serialize an element to an encoded string representation of its XML tree." 
+ cdef int write_declaration assert element_or_tree is not None # for ElementTree compatibility only + + encoding = str(encoding) + if xml_declaration is None: + # by default, write an XML declaration only for non-standard encodings + write_declaration = (encoding != 'us-ascii') + else: + write_declaration = bool(xml_declaration) + if isinstance(element_or_tree, _NodeBase): - return _tostring(<_NodeBase>element_or_tree, encoding) + return _tostring(<_NodeBase>element_or_tree, + encoding, write_declaration) elif isinstance(element_or_tree, _ElementTree): - return _tostring((<_ElementTree>element_or_tree)._context_node, encoding) + return _tostring((<_ElementTree>element_or_tree)._context_node, + encoding, write_declaration) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) From scoder at codespeak.net Thu May 11 08:15:33 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 08:15:35 2006 Subject: [Lxml-checkins] r27061 - lxml/trunk/doc Message-ID: <20060511061533.8F2A410089@code0.codespeak.net> Author: scoder Date: Thu May 11 08:15:31 2006 New Revision: 27061 Modified: lxml/trunk/doc/api.txt Log: api.txt: note on missing XML declarations in result of tounicode(), compare to tostring() Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Thu May 11 08:15:31 2006 @@ -138,7 +138,7 @@ encoding themselves and thus lie about their real encoding:: >>> try: - ... broken = etree.XML(u'' + uxml) + ... broken = etree.XML(u'\n' + uxml) ... except etree.XMLSyntaxError: ... print "This is not well-formed XML!" This is not well-formed XML! @@ -171,6 +171,13 @@ >>> etree.tounicode(et) u'' +Note that the unicode string returned by ``tounicode()`` never has an XML +declaration. This means, it does not specify an encoding nor an XML version. 
+This makes it possible to pass the unicode string back into the lxml parsers. +However, you may have to add a declaration yourself if you want to serialize +the unicode string to a byte stream later. In contrast, the ``tostring()`` +function automatically adds a declaration as needed. + xpath method on ElementTree, Element ------------------------------------ From scoder at codespeak.net Thu May 11 08:37:35 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 08:37:37 2006 Subject: [Lxml-checkins] r27062 - lxml/trunk/doc Message-ID: <20060511063735.E8BF010088@code0.codespeak.net> Author: scoder Date: Thu May 11 08:37:34 2006 New Revision: 27062 Modified: lxml/trunk/doc/api.txt Log: clarification on tostring/tounicode Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Thu May 11 08:37:34 2006 @@ -171,12 +171,12 @@ >>> etree.tounicode(et) u'' -Note that the unicode string returned by ``tounicode()`` never has an XML -declaration. This means, it does not specify an encoding nor an XML version. -This makes it possible to pass the unicode string back into the lxml parsers. -However, you may have to add a declaration yourself if you want to serialize -the unicode string to a byte stream later. In contrast, the ``tostring()`` -function automatically adds a declaration as needed. +Note that the unicode strings returned by ``tounicode()`` never have an XML +declaration and therefore do not specify an encoding. This makes it possible +to pass them back into the lxml parsers. However, you may have to add a +declaration yourself if you want to serialize such a unicode string to a byte +stream later. In contrast, the ``tostring()`` function automatically adds a +declaration as needed that reflects the encoding of the returned byte string. 
xpath method on ElementTree, Element From scoder at codespeak.net Thu May 11 08:50:06 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 08:50:07 2006 Subject: [Lxml-checkins] r27063 - lxml/trunk/doc Message-ID: <20060511065006.3B68010088@code0.codespeak.net> Author: scoder Date: Thu May 11 08:50:04 2006 New Revision: 27063 Modified: lxml/trunk/doc/api.txt Log: api.txt: small cleanup in XSLT docs Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Thu May 11 08:50:04 2006 @@ -269,10 +269,11 @@ >>> str(result) '\nText\n' -The result is always a plain string, encoded as requested by the `xsl:output` -element in the stylesheet. If you want a Python unicode string instead, you -should set this encoding to `UTF-8` (or leave it as the `ASCII` default). -This allows you to call the `unicode()` function on the result:: +The result is always a plain string, encoded as requested by the +``xsl:output`` element in the stylesheet. If you want a Python unicode string +instead, you should set this encoding to ``UTF-8`` (unless the `ASCII` default +is sufficient). 
This allows you to call the builtin ``unicode()`` function on +the result:: >>> unicode(result) u'\nText\n' From scoder at codespeak.net Thu May 11 09:01:48 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 09:01:50 2006 Subject: [Lxml-checkins] r27064 - lxml/trunk/doc Message-ID: <20060511070148.BF72C10088@code0.codespeak.net> Author: scoder Date: Thu May 11 09:01:47 2006 New Revision: 27064 Modified: lxml/trunk/doc/api.txt lxml/trunk/doc/main.txt Log: api.txt: clarification on output encoding in XSLT Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Thu May 11 09:01:47 2006 @@ -278,7 +278,8 @@ >>> unicode(result) u'\nText\n' -However, encodings that are not supported by Python will result in an error:: +You can use other encodings at the cost of multiple recoding. Encodings that +are not supported by Python will result in an error:: >>> xslt_tree = etree.XML('''\ ... 
Author: scoder Date: Thu May 11 14:44:09 2006 New Revision: 27073 Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: test case for handling unicode in namespaces of ETXPath expressions Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Thu May 11 14:44:09 2006 @@ -315,6 +315,19 @@ self.assertEquals(1, len(r)) self.assertEquals('{nsb}b', r[0].tag) + def test_xpath_compile_unicode(self): + x = self.parse(u'') + + expr = etree.ETXPath(u"/a/{nsa\uf8d2}b") + r = expr.evaluate(x) + self.assertEquals(1, len(r)) + self.assertEquals(u'{nsa\uf8d2}b', r[0].tag) + + expr = etree.ETXPath(u"/a/{nsb\uf8d1}b") + r = expr.evaluate(x) + self.assertEquals(1, len(r)) + self.assertEquals(u'{nsb\uf8d1}b', r[0].tag) + SAMPLE_XML = etree.parse(StringIO(""" text From scoder at codespeak.net Thu May 11 14:45:27 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 14:45:28 2006 Subject: [Lxml-checkins] r27074 - lxml/trunk/src/lxml Message-ID: <20060511124527.6978A1008D@code0.codespeak.net> Author: scoder Date: Thu May 11 14:45:25 2006 New Revision: 27074 Modified: lxml/trunk/src/lxml/xpath.pxi Log: some restructuring in ETXPath._nsextract_path() to fix path encoding Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Thu May 11 14:45:25 2006 @@ -211,14 +211,15 @@ """Special XPath class that supports the ElementTree {uri} notation for namespaces.""" def __init__(self, path, extensions=None): - path_utf, namespaces = self._nsextract_path(_utf8(path)) - XPath.__init__(self, funicode(path_utf), namespaces, extensions) + path, namespaces = self._nsextract_path(path) + XPath.__init__(self, path, namespaces, extensions) 
- cdef _nsextract_path(self, path_utf): + cdef _nsextract_path(self, path): # replace {namespaces} by new prefixes cdef int i - namespaces = {} + path_utf = path.encode('UTF-8') stripped_path = _replace_strings('', path_utf) # remove string literals + namespaces = {} namespace_defs = [] i = 1 for namespace_def in _find_namespaces(stripped_path): @@ -227,8 +228,11 @@ i = i+1 python.PyList_Append(namespace_defs, namespace_def) namespace = namespace_def[1:-1] # remove '{}' + namespace = python.PyUnicode_FromEncodedObject( + namespace, 'UTF-8', 'strict') python.PyDict_SetItem(namespaces, prefix, namespace) prefix_str = prefix + ':' # FIXME: this also replaces {namespaces} within strings! - path_utf = path_utf.replace(namespace_def, prefix_str) - return path_utf, namespaces + path_utf = path_utf.replace(namespace_def, prefix_str) + path = python.PyUnicode_FromEncodedObject(path_utf, 'UTF-8', 'strict') + return path, namespaces From scoder at codespeak.net Thu May 11 15:08:27 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 15:08:29 2006 Subject: [Lxml-checkins] r27076 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060511130827.D6B2110082@code0.codespeak.net> Author: scoder Date: Thu May 11 15:08:26 2006 New Revision: 27076 Modified: lxml/branch/lxml-0.9.x/doc/main.txt Log: main.txt: 0.9.2, link to latest CHANGES.txt in SVN Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Thu May 11 15:08:26 2006 @@ -16,6 +16,8 @@ News ---- +* 2006-05-10: `lxml 0.9.2`_ released (`changes for 0.9.2`_) + * 2006-03-30: `lxml 0.9.1`_ released (`changes for 0.9.1`_) * 2006-03-20: `lxml 0.9`_ released (`changes for 0.9`_) @@ -30,6 +32,8 @@ * 2005-04-08: `lxml 0.5`_ released! +.. _`lxml 0.9.2`: lxml-0.9.2.tgz + .. _`lxml 0.9.1`: lxml-0.9.1.tgz .. _`lxml 0.9`: lxml-0.9.tgz @@ -44,6 +48,8 @@ .. 
_`lxml 0.5`: lxml-0.5.tgz +.. _`CHANGES for 0.9.2`: changes-0.9.2.html + .. _`CHANGES for 0.9.1`: changes-0.9.1.html .. _`CHANGES for 0.9`: changes-0.9.html @@ -109,6 +115,8 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ +* `lxml 0.9.2`_ (2006-05-10) + * `lxml 0.9.1`_ (2006-03-30) * `lxml 0.9`_ (2006-03-20) @@ -136,9 +144,13 @@ svn co http://codespeak.net/svn/lxml/trunk lxml -You can also `browse it through the web`_. +You can also `browse it through the web`_. The `latest CHANGES`_ of the +developer version are also accessible. You can check there if a bug you found +has been fixed or a feature you want has been implemented in the latest trunk +version. .. _`browse it through the web`: http://codespeak.net/svn/lxml +.. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt License ------- From scoder at codespeak.net Thu May 11 19:24:06 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 19:24:08 2006 Subject: [Lxml-checkins] r27090 - lxml/trunk/src/lxml Message-ID: <20060511172406.3BF9B10087@code0.codespeak.net> Author: scoder Date: Thu May 11 19:24:04 2006 New Revision: 27090 Modified: lxml/trunk/src/lxml/extensions.pxi Log: C-ification in extension function lookup, fast-path for main dictionary check Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Thu May 11 19:24:04 2006 @@ -22,6 +22,7 @@ cdef object _namespaces cdef object _utf_refs cdef object _function_cache + cdef object _function_cache_ns cdef object _called_function # for exception handling and temporary reference keeping: cdef _TempStore _temp_refs @@ -31,6 +32,7 @@ self._xpathCtxt = NULL self._utf_refs = {} self._function_cache = {} + self._function_cache_ns = {} self._called_function = None if extensions is not None: @@ -73,6 +75,7 @@ self._doc = doc self._exc.clear() 
python.PyDict_Clear(self._function_cache) + python.PyDict_Clear(self._function_cache_ns) namespaces = self._namespaces if namespaces is not None: self.registerNamespaces(namespaces) @@ -108,23 +111,48 @@ # extension functions - cdef int _prepare_function_call(self, ns_uri_utf, name_utf): + cdef int _prepare_function_call(self, char* c_ns_uri, char* c_name): + """Find an extension function and store it in 'self._called_function'. + This is absolutely performance-critical for XPath/XSLT! + Return 1 if it was found, 0 otherwise. + Parameters: c_ns_uri may be NULL, c_name must not be NULL + """ + cdef python.PyObject* c_dict cdef python.PyObject* dict_result - key = (ns_uri_utf, name_utf) - dict_result = python.PyDict_GetItem(self._function_cache, key) - if dict_result is not NULL: - function = dict_result - self._called_function = function - return function is not None + if c_ns_uri is NULL: + c_dict = self._function_cache + else: + c_dict = python.PyDict_GetItemString( + self._function_cache_ns, c_ns_uri) + + if c_dict is not NULL: + d = c_dict + dict_result = python.PyDict_GetItemString(d, c_name) + if dict_result is not NULL: + function = dict_result + self._called_function = function + return function is not None + else: + d = {} + python.PyDict_SetItem(self._function_cache_ns, ns_uri_utf, d) + + # first time we look up this function, so the rest is less critical + if c_ns_uri is not NULL: + ns_uri_utf = c_ns_uri + name_utf = c_name if self._extensions is not None: - dict_result = python.PyDict_GetItem(self._extensions, key) + dict_result = python.PyDict_GetItem( + self._extensions, (ns_uri_utf, name_utf)) + else: + dict_result = NULL if dict_result is not NULL: function = dict_result else: function = _find_extension(ns_uri_utf, name_utf) - python.PyDict_SetItem(self._function_cache, key, function) + # we also store None values here to make sure we remember + python.PyDict_SetItem(d, name_utf, function) self._called_function = function return function is not None 
@@ -180,14 +208,8 @@ char* c_name, char* c_ns_uri): "Module level lookup function for XPath/XSLT functions" cdef _BaseContext context - if c_name is NULL: - return NULL - if c_ns_uri is NULL: - ns_uri = None - else: - ns_uri = c_ns_uri context = <_BaseContext>ctxt - if context._prepare_function_call(ns_uri, c_name): + if context._prepare_function_call(c_ns_uri, c_name): return _call_prepared_function else: return NULL @@ -289,16 +311,15 @@ cdef _BaseContext context rctxt = ctxt.context context = <_BaseContext>(rctxt.userData) - name = rctxt.function - if rctxt.functionURI is not NULL: - uri = rctxt.functionURI - else: - uri = None - if context._prepare_function_call(uri, name): + if context._prepare_function_call(rctxt.functionURI, rctxt.function): _extension_function_call(context, ctxt, nargs) else: + if rctxt.functionURI is not NULL: + fref = "{%s}%s" % (rctxt.functionURI, rctxt.function) + else: + fref = rctxt.function xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR) - exception = XPathFunctionError("XPath function {%s}%s not found" % (uri, name)) + exception = XPathFunctionError("XPath function '%s' not found" % fref) context._exc._store_exception(exception) cdef void _call_prepared_function(xpath.xmlXPathParserContext* ctxt, int nargs): From scoder at codespeak.net Thu May 11 21:34:55 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 21:34:57 2006 Subject: [Lxml-checkins] r27097 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060511193455.6A80010098@code0.codespeak.net> Author: scoder Date: Thu May 11 21:34:50 2006 New Revision: 27097 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_errors.py lxml/trunk/src/lxml/tree.pxd Log: fix: Element/SubElement failed to set attribute namespaces from passed attrib dictionary: namespaces were not even parsed and had to be set /after/ node namespace setup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 11 21:34:50 2006 @@ -49,6 +49,9 @@ Bugs fixed ---------- +* Element/SubElement failed to set attribute namespaces from passed ``attrib`` + dictionary + * ``tostring()`` now adds an XML declaration for non-ASCII encodings * ``tostring()`` failed to serialize encodings that contain 0-bytes Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 11 21:34:50 2006 @@ -894,9 +894,10 @@ ns_utf, name_utf = _getNsTag(_tag) doc = self._doc c_doc = doc._c_doc - c_node = _createElement(c_doc, name_utf, attrib, _extra) + c_node = _createElement(c_doc, name_utf) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + _setNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def find(self, path): @@ -1266,26 +1267,36 @@ return cstd.strcmp(c_node.ns.href, self._href) == 0 return 0 -cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf, - object attrib, object extra) except NULL: +cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node + c_node = tree.xmlNewDocNode(c_doc, NULL, _cstr(name_utf), NULL) + return c_node + +cdef xmlNode* _createComment(xmlDoc* 
c_doc, char* text): + cdef xmlNode* c_node + c_node = tree.xmlNewDocComment(c_doc, text) + return c_node + +cdef _setNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra): + cdef xmlNs* c_ns + # 'extra' is not checked here (expected to be a keyword dict) + if attrib is not None and not hasattr(attrib, 'items'): + raise TypeError, "Invalid attribute dictionary: %s" % type(attrib) if extra: if attrib is None: attrib = extra else: attrib.update(extra) - c_node = tree.xmlNewDocNode(c_doc, NULL, _cstr(name_utf), NULL) if attrib: for name, value in attrib.items(): - attr_name_utf = _utf8(name) + attr_ns_utf, attr_name_utf = _getNsTag(name) value_utf = _utf8(value) - tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf)) - return c_node - -cdef xmlNode* _createComment(xmlDoc* c_doc, char* text): - cdef xmlNode* c_node - c_node = tree.xmlNewDocComment(c_doc, text) - return c_node + if attr_ns_utf is None: + tree.xmlNewProp(c_node, _cstr(attr_name_utf), _cstr(value_utf)) + else: + c_ns = doc._findOrBuildNodeNs(c_node, _cstr(attr_ns_utf)) + tree.xmlNewNsProp(c_node, c_ns, + _cstr(attr_name_utf), _cstr(value_utf)) # module-level API for ElementTree @@ -1296,11 +1307,12 @@ cdef _Document doc ns_utf, name_utf = _getNsTag(_tag) c_doc = _newDoc() - c_node = _createElement(c_doc, name_utf, attrib, _extra) + c_node = _createElement(c_doc, name_utf) tree.xmlDocSetRootElement(c_doc, c_node) doc = _documentFactory(c_doc, None) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + _setNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def Comment(text=None): @@ -1323,10 +1335,11 @@ cdef _Document doc ns_utf, name_utf = _getNsTag(_tag) doc = _parent._doc - c_node = _createElement(doc._c_doc, name_utf, attrib, _extra) + c_node = _createElement(doc._c_doc, name_utf) tree.xmlAddChild(_parent._c_node, c_node) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + 
_setNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def ElementTree(_Element element=None, file=None, parser=None): Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu May 11 21:34:50 2006 @@ -476,6 +476,13 @@ result.sort() self.assertEquals(['alpha', 'beta', 'gamma'], result) + def test_element_with_attributes_keywords(self): + Element = self.etree.Element + + el = Element('tag', foo='Foo', bar='Bar') + self.assertEquals('Foo', el.attrib['foo']) + self.assertEquals('Bar', el.attrib['bar']) + def test_element_with_attributes(self): Element = self.etree.Element @@ -483,13 +490,30 @@ self.assertEquals('Foo', el.attrib['foo']) self.assertEquals('Bar', el.attrib['bar']) + def test_element_with_attributes_ns(self): + Element = self.etree.Element + + el = Element('tag', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'}) + self.assertEquals('Foo', el.attrib['{ns1}foo']) + self.assertEquals('Bar', el.attrib['{ns2}bar']) + def test_subelement_with_attributes(self): Element = self.etree.Element SubElement = self.etree.SubElement el = Element('tag') - SubElement(el, 'foo', baz="Baz") + SubElement(el, 'foo', attrib={'foo':'Foo'}, baz="Baz") self.assertEquals("Baz", el[0].attrib['baz']) + self.assertEquals('Foo', el[0].attrib['foo']) + + def test_subelement_with_attributes_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + el = Element('tag') + SubElement(el, 'foo', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'}) + self.assertEquals('Foo', el[0].attrib['{ns1}foo']) + self.assertEquals('Bar', el[0].attrib['{ns2}bar']) def test_write(self): ElementTree = self.etree.ElementTree Modified: lxml/trunk/src/lxml/tests/test_errors.py ============================================================================== --- 
lxml/trunk/src/lxml/tests/test_errors.py (original) +++ lxml/trunk/src/lxml/tests/test_errors.py Thu May 11 21:34:50 2006 @@ -14,7 +14,7 @@ def test_bad_element(self): # attrib argument of Element() should be a dictionary, so if # we pass a string we should get an error. - self.assertRaises(AttributeError, self.etree.Element, 'a', 'b') + self.assertRaises(TypeError, self.etree.Element, 'a', 'b') def test_empty_parse(self): self.assertRaises(etree.XMLSyntaxError, etree.fromstring, '') Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Thu May 11 21:34:50 2006 @@ -167,6 +167,8 @@ char* name, char* content) cdef xmlDoc* xmlNewDoc(char* version) cdef xmlAttr* xmlNewProp(xmlNode* node, char* name, char* value) + cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns, + char* name, char* value) cdef char* xmlGetNoNsProp(xmlNode* node, char* name) cdef char* xmlGetNsProp(xmlNode* node, char* name, char* nameSpace) cdef void xmlSetNs(xmlNode* node, xmlNs* ns) From scoder at codespeak.net Thu May 11 22:37:29 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Thu May 11 22:37:31 2006 Subject: [Lxml-checkins] r27098 - lxml/trunk Message-ID: <20060511203729.8904B10090@code0.codespeak.net> Author: scoder Date: Thu May 11 22:37:27 2006 New Revision: 27098 Modified: lxml/trunk/bench.py Log: allow benchmarks with LARGE trees (-l/-L), bench serialization with attributes Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Thu May 11 22:37:27 2006 @@ -2,11 +2,16 @@ from itertools import * from StringIO import StringIO -_TEXT = "some ASCII text" -_UTEXT = u"some klingon: \F8D2" +TREE_FACTOR = 1 # increase tree size with '-l / '-L' cmd option + +_TEXT = "some ASCII text" * 10 * TREE_FACTOR +_UTEXT = u"some klingon: 
\F8D2" * 10 * TREE_FACTOR _ATTRIBUTES = { - '{attr}test' : _UTEXT, - 'bla' : _TEXT + '{attr}test1' : _UTEXT, + '{attr}test2' : _UTEXT, + 'bla1' : _TEXT, + 'bla2' : _TEXT, + 'bla3' : _TEXT } def with_attributes(use_attributes): @@ -125,45 +130,45 @@ return all_trees def _setup_tree1(self, text, attributes): - "tree with 26 2nd level and 520 3rd level children" + "tree with 26 2nd level and 520 * TREE_FACTOR 3rd level children" atoz = self.atoz SubElement = self.etree.SubElement current_time = time.time t = current_time() - root = self.etree.Element('{a}root') + root = self.etree.Element('{abc}rootnode') for ch1 in atoz: - el = SubElement(root, "{b}"+ch1, attributes) + el = SubElement(root, "{bcd}"+ch1*5, attributes) for ch2 in atoz: - for i in range(20): - SubElement(el, "{c}%s%03d" % (ch2, i)) + for i in range(20 * TREE_FACTOR): + SubElement(el, "{cdefg}%s%05d" % (ch2, i)) t = current_time() - t return (root, t) def _setup_tree2(self, text, attributes): - "tree with 520 2nd level and 26 3rd level children" + "tree with 520 * TREE_FACTOR 2nd level and 26 3rd level children" atoz = self.atoz SubElement = self.etree.SubElement current_time = time.time t = current_time() - root = self.etree.Element('{a}root') + root = self.etree.Element('{abc}rootnode') for ch1 in atoz: - for i in range(20): - el = SubElement(root, "{b}"+ch1, attributes) + for i in range(20 * TREE_FACTOR): + el = SubElement(root, "{bcd}"+ch1*5, attributes) for ch2 in atoz: - SubElement(el, "{c}%s%03d" % (ch2, i)) + SubElement(el, "{cdefg}%s%05d" % (ch2, i)) t = current_time() - t return (root, t) def _setup_tree3(self, text, attributes): - "tree of depth 8 with 3 children per node" + "tree of depth 8 + TREE_FACTOR with 3 children per node" SubElement = self.etree.SubElement current_time = time.time t = current_time() - root = self.etree.Element('{a}root') + root = self.etree.Element('{abc}rootnode') children = [root] - for i in range(7): + for i in range(6 + TREE_FACTOR): tag_no = count().next - 
children = [ SubElement(c, "{b}a%d" % i, attributes) + children = [ SubElement(c, "{bcd}a%05d" % i, attributes) for i,c in enumerate(chain(children, children, children)) ] t = current_time() - t return (root, t) @@ -174,12 +179,12 @@ SubElement = self.etree.SubElement current_time = time.time t = current_time() - root = self.etree.Element('{a}root') + root = self.etree.Element('{abc}rootnode') children = [root] for ch1 in atoz: - el = SubElement(root, "{b}"+ch1, attributes) - SubElement(el, "{c}a", attributes) - SubElement(el, "{c}b", attributes) + el = SubElement(root, "{bcd}"+ch1*5, attributes) + SubElement(el, "{cdefg}abcde", attributes) + SubElement(el, "{cdefg}bcdef", attributes) t = current_time() - t return (root, t) @@ -249,19 +254,28 @@ for child in reversed(root): pass + @with_attributes(True) + @with_attributes(False) @with_text(text=True, utext=True) def bench_tostring_utf8(self, root): self.etree.tostring(root, 'UTF-8') + @with_attributes(True) + @with_attributes(False) @with_text(text=True, utext=True) def bench_tostring_utf16(self, root): self.etree.tostring(root, 'UTF-16') + @with_attributes(True) + @with_attributes(False) @with_text(text=True, utext=True) def bench_tostring_utf8_unicode_XML(self, root): xml = unicode(self.etree.tostring(root, 'UTF-8'), 'UTF-8') + open("test%03d.txt" % len(root), 'w').write(xml.encode('UTF-8')) self.etree.XML(xml) + @with_attributes(True) + @with_attributes(False) @with_text(text=True, utext=True) def bench_write_utf8_parse_stringIO(self, root): f = StringIO() @@ -495,42 +509,58 @@ if len(sys.argv) > 1: try: sys.argv.remove('-i') + # run benchmark 'inplace' sys.path.insert(0, 'src') except ValueError: pass try: sys.argv.remove('-nolxml') + # run without lxml import_lxml = False except ValueError: pass try: - sys.argv.remove('-c') + sys.argv.remove('-z') + # reset callgrind after tree setup callgrind_zero = True except ValueError: pass + try: + sys.argv.remove('-l') + # use large trees + TREE_FACTOR *= 2 + except 
ValueError: + pass + + try: + sys.argv.remove('-L') + # use LARGE trees + TREE_FACTOR *= 2 + except ValueError: + pass + _etrees = [] if import_lxml: from lxml import etree _etrees.append(etree) if len(sys.argv) > 1: - try: - sys.argv.remove('-a') - except ValueError: - pass - else: + if '-a' in sys.argv or '-c' in sys.argv: + # 'all' or 'C-implementations' ? try: - from elementtree import ElementTree as ET - _etrees.append(ET) + import cElementTree as cET + _etrees.append(cET) except ImportError: pass + if '-a' in sys.argv: + # 'all' ? try: - import cElementTree as cET - _etrees.append(cET) + from elementtree import ElementTree as ET + _etrees.append(ET) except ImportError: pass @@ -551,7 +581,9 @@ if not name.startswith('bench_'): name = 'bench_' + name selected.append(name) - benchmarks = [ [ b for b in bs if b[0] in selected ] + benchmarks = [ [ b for b in bs + if [ contains for contains in selected + if contains in b[0] ] ] for bs in benchmarks ] import time From scoder at codespeak.net Fri May 12 06:06:56 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 06:06:58 2006 Subject: [Lxml-checkins] r27102 - lxml/trunk/src/lxml Message-ID: <20060512040656.9902810090@code0.codespeak.net> Author: scoder Date: Fri May 12 06:06:54 2006 New Revision: 27102 Modified: lxml/trunk/src/lxml/etree.pyx Log: _setNodeAttributes -> _initNodeAttributes - make clear what it is meant to do Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 12 06:06:54 2006 @@ -897,7 +897,7 @@ c_node = _createElement(c_doc, name_utf) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) - _setNodeAttributes(c_node, doc, attrib, _extra) + _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def find(self, path): @@ -1277,7 +1277,7 @@ c_node = 
tree.xmlNewDocComment(c_doc, text) return c_node -cdef _setNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra): +cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra): cdef xmlNs* c_ns # 'extra' is not checked here (expected to be a keyword dict) if attrib is not None and not hasattr(attrib, 'items'): @@ -1312,7 +1312,7 @@ doc = _documentFactory(c_doc, None) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) - _setNodeAttributes(c_node, doc, attrib, _extra) + _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def Comment(text=None): @@ -1339,7 +1339,7 @@ tree.xmlAddChild(_parent._c_node, c_node) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) - _setNodeAttributes(c_node, doc, attrib, _extra) + _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def ElementTree(_Element element=None, file=None, parser=None): From scoder at codespeak.net Fri May 12 07:45:07 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 07:45:09 2006 Subject: [Lxml-checkins] r27103 - in lxml/branch/lxml-0.9.x: . 
src/lxml src/lxml/tests Message-ID: <20060512054507.561A210090@code0.codespeak.net> Author: scoder Date: Fri May 12 07:45:05 2006 New Revision: 27103 Modified: lxml/branch/lxml-0.9.x/CHANGES.txt lxml/branch/lxml-0.9.x/src/lxml/etree.pyx lxml/branch/lxml-0.9.x/src/lxml/tests/test_elementtree.py lxml/branch/lxml-0.9.x/src/lxml/tests/test_errors.py lxml/branch/lxml-0.9.x/src/lxml/tree.pxd Log: merged in Element attribute initialization bugfix from trunk Modified: lxml/branch/lxml-0.9.x/CHANGES.txt ============================================================================== --- lxml/branch/lxml-0.9.x/CHANGES.txt (original) +++ lxml/branch/lxml-0.9.x/CHANGES.txt Fri May 12 07:45:05 2006 @@ -1,6 +1,18 @@ lxml changelog ============== +current +======= + +Features added +-------------- + +Bugs fixed +---------- + +* Element/SubElement failed to set attribute namespaces from passed ``attrib`` + dictionary + 0.9.2 (2006-05-10) ================== Modified: lxml/branch/lxml-0.9.x/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-0.9.x/src/lxml/etree.pyx Fri May 12 07:45:05 2006 @@ -763,9 +763,10 @@ ns_utf, name_utf = _getNsTag(_tag) doc = self._doc c_doc = doc._c_doc - c_node = _createElement(c_doc, name_utf, attrib, _extra) + c_node = _createElement(c_doc, name_utf) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def find(self, path): @@ -1114,26 +1115,35 @@ return tree.strcmp(c_node.ns.href, self._href) == 0 return 0 -cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf, - object attrib, object extra) except NULL: +cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node + c_node = tree.xmlNewDocNode(c_doc, NULL, name_utf, NULL) + return c_node + +cdef xmlNode* _createComment(xmlDoc* 
c_doc, char* text): + cdef xmlNode* c_node + c_node = tree.xmlNewDocComment(c_doc, text) + return c_node + +cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, extra): + cdef xmlNs* c_ns + # 'extra' is not checked here (expected to be a keyword dict) + if attrib is not None and not hasattr(attrib, 'items'): + raise TypeError, "Invalid attribute dictionary: %s" % type(attrib) if extra: if attrib is None: attrib = extra else: attrib.update(extra) - c_node = tree.xmlNewDocNode(c_doc, NULL, name_utf, NULL) if attrib: for name, value in attrib.items(): - attr_name_utf = _utf8(name) + attr_ns_utf, attr_name_utf = _getNsTag(name) value_utf = _utf8(value) - tree.xmlNewProp(c_node, attr_name_utf, value_utf) - return c_node - -cdef xmlNode* _createComment(xmlDoc* c_doc, char* text): - cdef xmlNode* c_node - c_node = tree.xmlNewDocComment(c_doc, text) - return c_node + if attr_ns_utf is None: + tree.xmlNewProp(c_node, attr_name_utf, value_utf) + else: + c_ns = doc._findOrBuildNodeNs(c_node, attr_ns_utf) + tree.xmlNewNsProp(c_node, c_ns, attr_name_utf, value_utf) # module-level API for ElementTree @@ -1144,11 +1154,12 @@ cdef _Document doc ns_utf, name_utf = _getNsTag(_tag) c_doc = theParser.newDoc() - c_node = _createElement(c_doc, name_utf, attrib, _extra) + c_node = _createElement(c_doc, name_utf) tree.xmlDocSetRootElement(c_doc, c_node) doc = _documentFactory(c_doc) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) def Comment(text=None): @@ -1169,10 +1180,11 @@ _raiseIfNone(_parent) ns_utf, name_utf = _getNsTag(_tag) doc = _parent._doc - c_node = _createElement(doc._c_doc, name_utf, attrib, _extra) + c_node = _createElement(doc._c_doc, name_utf) tree.xmlAddChild(_parent._c_node, c_node) # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) + _initNodeAttributes(c_node, doc, attrib, _extra) return 
_elementFactory(doc, c_node) def ElementTree(_Element element=None, file=None, parser=None): Modified: lxml/branch/lxml-0.9.x/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-0.9.x/src/lxml/tests/test_elementtree.py Fri May 12 07:45:05 2006 @@ -475,6 +475,13 @@ result.sort() self.assertEquals(['alpha', 'beta', 'gamma'], result) + def test_element_with_attributes_keywords(self): + Element = self.etree.Element + + el = Element('tag', foo='Foo', bar='Bar') + self.assertEquals('Foo', el.attrib['foo']) + self.assertEquals('Bar', el.attrib['bar']) + def test_element_with_attributes(self): Element = self.etree.Element @@ -482,13 +489,30 @@ self.assertEquals('Foo', el.attrib['foo']) self.assertEquals('Bar', el.attrib['bar']) + def test_element_with_attributes_ns(self): + Element = self.etree.Element + + el = Element('tag', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'}) + self.assertEquals('Foo', el.attrib['{ns1}foo']) + self.assertEquals('Bar', el.attrib['{ns2}bar']) + def test_subelement_with_attributes(self): Element = self.etree.Element SubElement = self.etree.SubElement el = Element('tag') - SubElement(el, 'foo', baz="Baz") + SubElement(el, 'foo', attrib={'foo':'Foo'}, baz="Baz") self.assertEquals("Baz", el[0].attrib['baz']) + self.assertEquals('Foo', el[0].attrib['foo']) + + def test_subelement_with_attributes_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + el = Element('tag') + SubElement(el, 'foo', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'}) + self.assertEquals('Foo', el[0].attrib['{ns1}foo']) + self.assertEquals('Bar', el[0].attrib['{ns2}bar']) def test_write(self): ElementTree = self.etree.ElementTree Modified: lxml/branch/lxml-0.9.x/src/lxml/tests/test_errors.py ============================================================================== --- 
lxml/branch/lxml-0.9.x/src/lxml/tests/test_errors.py (original) +++ lxml/branch/lxml-0.9.x/src/lxml/tests/test_errors.py Fri May 12 07:45:05 2006 @@ -14,7 +14,7 @@ def test_bad_element(self): # attrib argument of Element() should be a dictionary, so if # we pass a string we should get an error. - self.assertRaises(AttributeError, self.etree.Element, 'a', 'b') + self.assertRaises(TypeError, self.etree.Element, 'a', 'b') def test_empty_parse(self): self.assertRaises(etree.XMLSyntaxError, etree.fromstring, '') Modified: lxml/branch/lxml-0.9.x/src/lxml/tree.pxd ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/tree.pxd (original) +++ lxml/branch/lxml-0.9.x/src/lxml/tree.pxd Fri May 12 07:45:05 2006 @@ -115,6 +115,8 @@ char* name, char* content) cdef xmlDoc* xmlNewDoc(char* version) cdef xmlAttr* xmlNewProp(xmlNode* node, char* name, char* value) + cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns, + char* name, char* value) cdef char* xmlGetNoNsProp(xmlNode* node, char* name) cdef char* xmlGetNsProp(xmlNode* node, char* name, char* nameSpace) cdef void xmlSetNs(xmlNode* node, xmlNs* ns) From scoder at codespeak.net Fri May 12 16:18:29 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 16:18:30 2006 Subject: [Lxml-checkins] r27132 - lxml/trunk/src/lxml/tests Message-ID: <20060512141829.7CD5E100B7@code0.codespeak.net> Author: scoder Date: Fri May 12 16:18:28 2006 New Revision: 27132 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test cases for Element.findall() Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri May 12 16:18:28 2006 @@ -476,6 +476,23 @@ result.sort() self.assertEquals(['alpha', 'beta', 'gamma'], result) + def test_findall(self): + XML = self.etree.XML + 
root = XML('<a><b><c/></b><b/><c><b/></c></a>') + self.assertEquals(len(root.findall("c")), 1) + self.assertEquals(len(root.findall(".//c")), 2) + self.assertEquals(len(root.findall(".//b")), 3) + self.assertEquals(len(root.findall(".//b")[0]), 1) + self.assertEquals(len(root.findall(".//b")[1]), 0) + self.assertEquals(len(root.findall(".//b")[2]), 0) + + def test_findall_ns(self): + XML = self.etree.XML + root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>') + self.assertEquals(len(root.findall(".//{X}b")), 2) + self.assertEquals(len(root.findall(".//b")), 3) + self.assertEquals(len(root.findall("b")), 2) + def test_element_with_attributes_keywords(self): Element = self.etree.Element @@ -1107,7 +1124,7 @@ list(a.getiterator('a'))) self.assertEquals( [a2], - list(e.getiterator('a'))) + list(c.getiterator('a'))) def test_getiterator_with_text(self): Element = self.etree.Element From scoder at codespeak.net Fri May 12 16:26:30 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 16:26:31 2006 Subject: [Lxml-checkins] r27133 - in lxml/trunk: . 
src/lxml Message-ID: <20060512142630.2BED0100B4@code0.codespeak.net> Author: scoder Date: Fri May 12 16:26:28 2006 New Revision: 27133 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/python.pxd Log: rewrite of ElementDepthFirstIterator to support tag selection: complete support for Element.getiterator() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 12 16:26:28 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Speedup of Element.findall(tag) and Element.getiterator(tag) + * Support for writing the XML representation of Elements and ElementTrees to Python unicode strings via ``etree.tounicode()`` Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 12 16:26:28 2006 @@ -281,8 +281,7 @@ return c_child c = c + 1 c_child = c_child.next - else: - return NULL + return NULL cdef xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index): """Return child element of c_node with index, or return NULL if not found. @@ -298,8 +297,7 @@ return c_child c = c + 1 c_child = c_child.prev - else: - return NULL + return NULL cdef xmlNode* _nextElement(xmlNode* c_node): """Given a node, find the next sibling that is an element. 
@@ -321,6 +319,59 @@ c_node = c_node.prev return NULL +cdef xmlNode* _findDepthFirstInDescendents(xmlNode* c_node, + char* c_href, char* c_name): + if c_node is NULL: + return NULL + c_node = c_node.children + if c_node is NULL: + return NULL + if not _isElement(c_node): + c_node = _nextElement(c_node) + return _findDepthFirstInFollowing(c_node, c_href, c_name) + +cdef xmlNode* _findDepthFirstInFollowingSiblings(xmlNode* c_node, + char* c_href, char* c_name): + if c_node is NULL: + return NULL + c_node = _nextElement(c_node) + return _findDepthFirstInFollowing(c_node, c_href, c_name) + +cdef xmlNode* _findDepthFirstInFollowing(xmlNode* c_node, + char* c_href, char* c_name): + """Find the next matching node by traversing: + 1) the node itself + 2) its descendents + 3) its following siblings. + """ + cdef xmlNode* c_child + if c_name is NULL: + # always match + return c_node + while c_node is not NULL: + if _tagMatches(c_node, c_href, c_name): + return c_node + if c_node.children is not NULL: + c_child = _findDepthFirstInFollowing(c_node.children, c_href, c_name) + if c_child is not NULL: + return c_child + c_node = _nextElement(c_node) + return NULL + +cdef int _tagMatches(xmlNode* c_node, char* c_href, char* c_name): + if c_name is NULL: + # always match + return 1 + if c_href is NULL: + if c_node.ns is not NULL and c_node.ns.href is not NULL: + return 0 + return cstd.strcmp(c_node.name, c_name) == 0 + elif c_node.ns is NULL or c_node.ns.href is NULL: + return 0 + else: + return cstd.strcmp(c_node.name, c_name) == 0 and \ + cstd.strcmp(c_node.ns.href, c_href) == 0 + cdef void _removeNode(xmlNode* c_node): """Unlink and free a node and subnodes if possible. 
""" Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 12 16:26:28 2006 @@ -879,11 +879,9 @@ return None def getiterator(self, tag=None): - iterator = ElementDepthFirstIterator(self) - if tag is None or tag == '*': - return iterator - else: - return ElementTagFilter(iterator, tag) + if tag == '*': + tag = None + return ElementDepthFirstIterator(self, tag) def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): "Creates a new element associated with the same document." @@ -1194,78 +1192,85 @@ cdef class ElementDepthFirstIterator: """Iterates over an element and its sub-elements in document order (depth - first pre-order).""" + first pre-order). + + If the 'tag' argument is not None, it returns only the elements that match + the respective name and namespace. + """ # we keep Python references here to control GC # keep next node to return and a stack of position state in the tree - cdef object _stack + cdef object _pystrings + cdef char* _href + cdef char* _name + cdef Py_ssize_t _depth cdef _NodeBase _next_node - def __init__(self, _NodeBase node not None): - cdef xmlNode* c_node + def __init__(self, _NodeBase node not None, tag=None): self._next_node = node - self._stack = [] - self._findAndPushNextNode(node) + self._depth = 0 + + if tag is None: + self._href = NULL + self._name = NULL + else: + self._pystrings = _getNsTag(tag) + if self._pystrings[0] is None: + self._href = NULL + else: + self._href = _cstr(self._pystrings[0]) + self._name = _cstr(self._pystrings[1]) + + if not _tagMatches(node._c_node, self._href, self._name): + # this cannot raise StopIteration, self._next_node != None + self.next() + def __iter__(self): return self + def __next__(self): - cdef xmlNode* c_node - cdef _NodeBase next_node + cdef _NodeBase current_node current_node = self._next_node if current_node is None: raise 
StopIteration - stack = self._stack - if python.PyList_GET_SIZE(stack) == 0: - self._next_node = None - return current_node - next_node = stack[-1] - self._next_node = next_node - self._findAndPushNextNode(next_node) + self._findAndPushNextNode() return current_node - cdef void _findAndPushNextNode(self, _NodeBase node): + cdef void _findAndPushNextNode(self): + cdef _NodeBase node cdef xmlNode* c_node - stack = self._stack - # try next child level until we hit a leaf - c_node = _findChildForwards(node._c_node, 0) + cdef xmlNode* c_parent + # find in descendants + node = self._next_node + c_parent = node._c_node + c_node = _findDepthFirstInDescendents(c_parent, self._href, self._name) if c_node is NULL: - pop = stack.pop - while c_node is NULL and python.PyList_GET_SIZE(stack): - # walk up the stack until we find a sibling - node = pop() - c_node = _nextElement(node._c_node) - if c_node is not NULL: - python.PyList_Append( - stack, _elementFactory(node._doc, c_node)) - -cdef class ElementTagFilter: - cdef object _iterator - cdef object _pystrings - cdef char* _href - cdef char* _name - def __init__(self, element_iterator, tag): - self._iterator = iter(element_iterator) - ns_href, name = _getNsTag(tag) - self._pystrings = (ns_href, name) # keep Python references - self._name = _cstr(name) - if ns_href is None: - self._href = NULL - else: - self._href = _cstr(ns_href) - def __iter__(self): - return self - def __next__(self): - cdef _NodeBase node - while 1: - node = self._iterator.next() - if self._tagMatches(node._c_node): - return node - - cdef int _tagMatches(self, xmlNode* c_node): - if cstd.strcmp(c_node.name, self._name) == 0: - if c_node.ns == NULL or c_node.ns.href == NULL: - return self._href == NULL - else: - return cstd.strcmp(c_node.ns.href, self._href) == 0 - return 0 + if self._depth < 1: + # nothing left to traverse + self._next_node = None + return + # try siblings + c_node = _findDepthFirstInFollowingSiblings( + c_parent, self._href, self._name) + + 
while c_node is NULL and self._depth > 1: + # walk up the parent pointers and continue with siblings + c_parent = c_parent.parent + self._depth = self._depth - 1 + if c_parent is NULL or not _isElement(c_parent): + break + c_node = _findDepthFirstInFollowingSiblings( + c_parent, self._href, self._name) + + if c_node is NULL: + self._next_node = None + return # all found, nothing left + # we are at a sibling, so set c_parent to our parent + c_parent = c_parent.parent + + self._next_node = _elementFactory(node._doc, c_node) + # fix depth counter by looking up path to original parent + while c_node is not c_parent: + self._depth = self._depth + 1 + c_node = c_node.parent cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri May 12 16:26:28 2006 @@ -30,6 +30,8 @@ cdef Py_ssize_t PyList_GET_SIZE(object l) cdef int PyList_Append(object l, object obj) cdef int PyList_Reverse(object l) + cdef int PyList_Insert(object l, Py_ssize_t index, object o) + cdef object PyList_GET_ITEM(object l, Py_ssize_t index) cdef int PyDict_SetItemString(object d, char* key, object value) cdef int PyDict_SetItem(object d, object key, object value) cdef PyObject* PyDict_GetItemString(object d, char* key) From scoder at codespeak.net Fri May 12 16:34:53 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 16:34:55 2006 Subject: [Lxml-checkins] r27135 - lxml/trunk/src/lxml Message-ID: <20060512143453.CF773100B4@code0.codespeak.net> Author: scoder Date: Fri May 12 16:34:52 2006 New Revision: 27135 Modified: lxml/trunk/src/lxml/etree.pyx Log: moved special case for (tag == '*') from getiterator() into iterator class Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== 
--- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 12 16:34:52 2006 @@ -879,8 +879,6 @@ return None def getiterator(self, tag=None): - if tag == '*': - tag = None return ElementDepthFirstIterator(self, tag) def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): @@ -1207,7 +1205,8 @@ def __init__(self, _NodeBase node not None, tag=None): self._next_node = node self._depth = 0 - + if tag == '*': + tag = None if tag is None: self._href = NULL self._name = NULL From scoder at codespeak.net Fri May 12 17:30:07 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 17:30:07 2006 Subject: [Lxml-checkins] r27141 - lxml/trunk/src/lxml Message-ID: <20060512153007.185A81007F@code0.codespeak.net> Author: scoder Date: Fri May 12 17:30:05 2006 New Revision: 27141 Modified: lxml/trunk/src/lxml/etree.pyx Log: doc clarifications, fixed name of _findAndPushNextNode to _prepareNextNode Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 12 17:30:05 2006 @@ -1192,8 +1192,11 @@ """Iterates over an element and its sub-elements in document order (depth first pre-order). - If the 'tag' argument is not None, it returns only the elements that match - the respective name and namespace. + If the optional 'tag' argument is not None, it returns only the elements + that match the respective name and namespace. + + Note that the behaviour of this iterator is completely undefined if the + tree it traverses is modified during iteration. 
""" # we keep Python references here to control GC # keep next node to return and a stack of position state in the tree @@ -1230,10 +1233,10 @@ current_node = self._next_node if current_node is None: raise StopIteration - self._findAndPushNextNode() + self._prepareNextNode() return current_node - cdef void _findAndPushNextNode(self): + cdef void _prepareNextNode(self): cdef _NodeBase node cdef xmlNode* c_node cdef xmlNode* c_parent @@ -1251,7 +1254,7 @@ c_parent, self._href, self._name) while c_node is NULL and self._depth > 1: - # walk up the parent pointers and continue with siblings + # walk up the parent pointers and continue with their siblings c_parent = c_parent.parent self._depth = self._depth - 1 if c_parent is NULL or not _isElement(c_parent): From scoder at codespeak.net Fri May 12 18:03:23 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 18:03:24 2006 Subject: [Lxml-checkins] r27143 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060512160323.1A3CA100BD@code0.codespeak.net> Author: scoder Date: Fri May 12 18:03:21 2006 New Revision: 27143 Modified: lxml/branch/lxml-0.9.x/doc/main.txt Log: doc updates as encouraged by David Sankel Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Fri May 12 18:03:21 2006 @@ -16,37 +16,21 @@ News ---- -* 2006-05-10: `lxml 0.9.2`_ released (`changes for 0.9.2`_) +* 2006-05-10: lxml 0.9.2 released (`changes for 0.9.2`_) -* 2006-03-30: `lxml 0.9.1`_ released (`changes for 0.9.1`_) +* 2006-03-30: lxml 0.9.1 released (`changes for 0.9.1`_) -* 2006-03-20: `lxml 0.9`_ released (`changes for 0.9`_) +* 2006-03-20: lxml 0.9 released (`changes for 0.9`_) -* 2005-11-03: `lxml 0.8`_ released (`changes for 0.8`_) +* 2005-11-03: lxml 0.8 released (`changes for 0.8`_) -* 2005-06-15: `lxml 0.7`_ released (`changes for 0.7`_) +* 2005-06-15: lxml 0.7 
released (`changes for 0.7`_) -* 2005-05-14: `lxml 0.6`_ released (`changes for 0.6`_) +* 2005-05-14: lxml 0.6 released (`changes for 0.6`_) -* 2005-04-09: `lxml 0.5.1`_ released (`changes for 0.5.1`_) +* 2005-04-09: lxml 0.5.1 released (`changes for 0.5.1`_) -* 2005-04-08: `lxml 0.5`_ released! - -.. _`lxml 0.9.2`: lxml-0.9.2.tgz - -.. _`lxml 0.9.1`: lxml-0.9.1.tgz - -.. _`lxml 0.9`: lxml-0.9.tgz - -.. _`lxml 0.8`: lxml-0.8.tgz - -.. _`lxml 0.7`: lxml-0.7.tgz - -.. _`lxml 0.6`: lxml-0.6.tgz - -.. _`lxml 0.5.1`: lxml-0.5.1.tgz - -.. _`lxml 0.5`: lxml-0.5.tgz +* 2005-04-08: lxml 0.5 released! .. _`CHANGES for 0.9.2`: changes-0.9.2.html @@ -113,8 +97,15 @@ Download -------- +The best way to download binary versions is to visit `lxml at the Python +cheeseshop`_. It has the source, eggs and installers for various platforms. + .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ +Please take a look at the `installation instructions`_! + +.. _`installation instructions`: installation.html + * `lxml 0.9.2`_ (2006-05-10) * `lxml 0.9.1`_ (2006-03-30) @@ -131,13 +122,21 @@ * `lxml 0.5`_ (2005-04-08) -Instead of downloading the source here, you can also find `lxml at the -Python cheeseshop`_ in source, egg and installer form for various -platforms. +.. _`lxml 0.9.2`: lxml-0.9.2.tgz -See also the `installation instructions`_. +.. _`lxml 0.9.1`: lxml-0.9.1.tgz -.. _`installation instructions`: installation.html +.. _`lxml 0.9`: lxml-0.9.tgz + +.. _`lxml 0.8`: lxml-0.8.tgz + +.. _`lxml 0.7`: lxml-0.7.tgz + +.. _`lxml 0.6`: lxml-0.6.tgz + +.. _`lxml 0.5.1`: lxml-0.5.1.tgz + +.. 
_`lxml 0.5`: lxml-0.5.tgz It's also possible to check out the latest development version of lxml from svn directly, using a command like this:: From scoder at codespeak.net Fri May 12 18:48:57 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 18:48:59 2006 Subject: [Lxml-checkins] r27145 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060512164857.1832E100BA@code0.codespeak.net> Author: scoder Date: Fri May 12 18:48:55 2006 New Revision: 27145 Modified: lxml/branch/lxml-0.9.x/doc/main.txt Log: merged news section into download section Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Fri May 12 18:48:55 2006 @@ -13,24 +13,49 @@ .. _introduction: intro.html -News ----- +Download +-------- + +The best way to download binary versions is to visit `lxml at the Python +cheeseshop`_. It has the source, eggs and installers for various platforms. + +.. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ + +Please take a look at the `installation instructions`_! + +.. 
_`installation instructions`: installation.html + +* `lxml 0.9.2`_, released 2006-05-10 (`changes for 0.9.2`_) -* 2006-05-10: lxml 0.9.2 released (`changes for 0.9.2`_) +* `lxml 0.9.1`_, released 2006-03-30 (`changes for 0.9.1`_) -* 2006-03-30: lxml 0.9.1 released (`changes for 0.9.1`_) +* `lxml 0.9`_, released 2006-03-20 (`changes for 0.9`_) -* 2006-03-20: lxml 0.9 released (`changes for 0.9`_) +* `lxml 0.8`_, released 2005-11-03 (`changes for 0.8`_) -* 2005-11-03: lxml 0.8 released (`changes for 0.8`_) +* `lxml 0.7`_, released 2005-06-15 (`changes for 0.7`_) -* 2005-06-15: lxml 0.7 released (`changes for 0.7`_) +* `lxml 0.6`_, released 2005-05-14 (`changes for 0.6`_) -* 2005-05-14: lxml 0.6 released (`changes for 0.6`_) +* `lxml 0.5.1`_, released 2005-04-09 (`changes for 0.5.1`_) -* 2005-04-09: lxml 0.5.1 released (`changes for 0.5.1`_) +* `lxml 0.5`_, released 2005-04-08 -* 2005-04-08: lxml 0.5 released! +.. _`lxml 0.9.2`: lxml-0.9.2.tgz + +.. _`lxml 0.9.1`: lxml-0.9.1.tgz + +.. _`lxml 0.9`: lxml-0.9.tgz + +.. _`lxml 0.8`: lxml-0.8.tgz + +.. _`lxml 0.7`: lxml-0.7.tgz + +.. _`lxml 0.6`: lxml-0.6.tgz + +.. _`lxml 0.5.1`: lxml-0.5.1.tgz + +.. _`lxml 0.5`: lxml-0.5.tgz .. _`CHANGES for 0.9.2`: changes-0.9.2.html @@ -46,6 +71,19 @@ .. _`CHANGES for 0.5.1`: changes-0.5.1.html +It's also possible to check out the latest development version of lxml +from svn directly, using a command like this:: + + svn co http://codespeak.net/svn/lxml/trunk lxml + +You can also `browse it through the web`_. The `latest CHANGES`_ of the +developer version are also accessible. You can check there if a bug you found +has been fixed or a feature you want has been implemented in the latest trunk +version. + +.. _`browse it through the web`: http://codespeak.net/svn/lxml +.. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + Documentation ------------- @@ -94,63 +132,6 @@ .. 
_`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev -Download --------- - -The best way to download binary versions is to visit `lxml at the Python -cheeseshop`_. It has the source, eggs and installers for various platforms. - -.. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ - -Please take a look at the `installation instructions`_! - -.. _`installation instructions`: installation.html - -* `lxml 0.9.2`_ (2006-05-10) - -* `lxml 0.9.1`_ (2006-03-30) - -* `lxml 0.9`_ (2006-03-20) - -* `lxml 0.8`_ (2005-11-03) - -* `lxml 0.7`_ (2005-06-15) - -* `lxml 0.6`_ (2005-05-14) - -* `lxml 0.5.1`_ (2005-04-09) - -* `lxml 0.5`_ (2005-04-08) - -.. _`lxml 0.9.2`: lxml-0.9.2.tgz - -.. _`lxml 0.9.1`: lxml-0.9.1.tgz - -.. _`lxml 0.9`: lxml-0.9.tgz - -.. _`lxml 0.8`: lxml-0.8.tgz - -.. _`lxml 0.7`: lxml-0.7.tgz - -.. _`lxml 0.6`: lxml-0.6.tgz - -.. _`lxml 0.5.1`: lxml-0.5.1.tgz - -.. _`lxml 0.5`: lxml-0.5.tgz - -It's also possible to check out the latest development version of lxml -from svn directly, using a command like this:: - - svn co http://codespeak.net/svn/lxml/trunk lxml - -You can also `browse it through the web`_. The `latest CHANGES`_ of the -developer version are also accessible. You can check there if a bug you found -has been fixed or a feature you want has been implemented in the latest trunk -version. - -.. _`browse it through the web`: http://codespeak.net/svn/lxml -.. 
_`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt - License ------- From scoder at codespeak.net Fri May 12 23:12:37 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 23:12:40 2006 Subject: [Lxml-checkins] r27155 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060512211237.8137F100B8@code0.codespeak.net> Author: scoder Date: Fri May 12 23:12:36 2006 New Revision: 27155 Modified: lxml/branch/lxml-0.9.x/doc/main.txt Log: removed blank lines between links in main.txt Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Fri May 12 23:12:36 2006 @@ -42,33 +42,20 @@ * `lxml 0.5`_, released 2005-04-08 .. _`lxml 0.9.2`: lxml-0.9.2.tgz - .. _`lxml 0.9.1`: lxml-0.9.1.tgz - .. _`lxml 0.9`: lxml-0.9.tgz - .. _`lxml 0.8`: lxml-0.8.tgz - .. _`lxml 0.7`: lxml-0.7.tgz - .. _`lxml 0.6`: lxml-0.6.tgz - .. _`lxml 0.5.1`: lxml-0.5.1.tgz - .. _`lxml 0.5`: lxml-0.5.tgz .. _`CHANGES for 0.9.2`: changes-0.9.2.html - .. _`CHANGES for 0.9.1`: changes-0.9.1.html - .. _`CHANGES for 0.9`: changes-0.9.html - .. _`CHANGES for 0.8`: changes-0.8.html - .. _`CHANGES for 0.7`: changes-0.7.html - .. _`CHANGES for 0.6`: changes-0.6.html - .. _`CHANGES for 0.5.1`: changes-0.5.1.html It's also possible to check out the latest development version of lxml @@ -104,25 +91,15 @@ in the standar dlibrary. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm - .. _`ElementTree compatibility overview`: compatibility.html - .. _`extends this API`: api.html - .. _`extension functions`: extensions.html - .. _XPath: http://www.w3.org/TR/xpath - .. _`Relax NG`: http://www.relaxng.org/ - .. _`XML Schema`: http://www.w3.org/XML/Schema - .. _`XSLT`: http://www.w3.org/TR/xslt - .. _`c14n`: http://www.w3.org/TR/2001/REC-xml-c14n-20010315 - .. _`implementing namespaces`: namespace_extensions.html - .. 
_`SAX compliant API`: sax.html Mailing list From scoder at codespeak.net Fri May 12 23:13:50 2006 From: scoder at codespeak.net (scoder@codespeak.net) Date: Fri May 12 23:13:52 2006 Subject: [Lxml-checkins] r27156 - lxml/trunk Message-ID: <20060512211350.9E49C100B8@code0.codespeak.net> Author: scoder Date: Fri May 12 23:13:49 2006 New Revision: 27156 Modified: lxml/trunk/bench.py Log: variable renamed Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 12 23:13:49 2006 @@ -582,8 +582,8 @@ name = 'bench_' + name selected.append(name) benchmarks = [ [ b for b in bs - if [ contains for contains in selected - if contains in b[0] ] ] + if [ match for match in selected + if match in b[0] ] ] for bs in benchmarks ] import time From ogrisel at codespeak.net Mon May 15 11:27:17 2006 From: ogrisel at codespeak.net (ogrisel@codespeak.net) Date: Mon May 15 11:27:18 2006 Subject: [Lxml-checkins] r27224 - lxml/www Message-ID: <20060515092717.505FD10093@code0.codespeak.net> Author: ogrisel Date: Mon May 15 11:27:16 2006 New Revision: 27224 Modified: lxml/www/style.css Log: new CSS style for the codespeak website (smaller fonts, centered text + various style improvements) Modified: lxml/www/style.css ============================================================================== --- lxml/www/style.css (original) +++ lxml/www/style.css Mon May 15 11:27:16 2006 @@ -1,31 +1,60 @@ body { + /* CSS Hack for IE that does not respect the "margin: auto" rule at the + * document level */ + text-align: center; + padding: 1em; +} + +div.document { + width: 45em; + font: 13px Arial, Verdana, Helvetica, sans-serif; + margin: 1em auto 1em auto; + background-color: white; + color: #222; + text-align: left; +} + +h1.title { background: url(http://codespeak.net/img/codespeak1b.png) no-repeat; - font: 120% Arial, Verdana, Helvetica, sans-serif; - border: 0; - margin: 0.5em 0em 
0.5em 0.5em; - padding: 0 0 0 145px; + padding: 20px 0 0 180px; + height: 60px; + font-size: 200%; } -a { - text-decoration: underline; - background-color: transparent; +h1, h2, h3 { + color: #333; + font-weight: bold; } -p { - /*margin: 0.5em 0em 1em 0em;*/ - text-align: left; - line-height: 1.5em; - margin: 0.5em 0em 0em 0em; +h1 { + font-size: 120%; } -p a { - text-decoration: underline; +h2 { + font-size: 110%; } +h3 { + font-size: 105%; +} -p a:active { - color: Red; +a, a:visited { background-color: transparent; + font-weight: bold; + color: Black; + text-decoration: none; +} + +a:active { + color: Red; + text-decoration: underline; +} + +p { + /*margin: 0.5em 0em 1em 0em;*/ + text-align: justify; + line-height: 1.5em; + margin: 0.5em 0em 0em 0em; } hr { @@ -35,10 +64,8 @@ background-color: transparent; } - -ul { +ul { line-height: 1.5em; - /*list-style-image: url("bullet.gif"); */ margin-left: 1em; } @@ -47,28 +74,21 @@ margin-left: 0em; } -ul a, ol a { - text-decoration: underline; -} - blockquote { font-family: Times, "Times New Roman", serif; font-style: italic; - font-size: 120%; } code { - font-size: 120%; color: Black; - /*background-color: #dee7ec;*/ background-color: #cccccc; + font-family: "Courier New", Courier, monospace; } pre { - font-size: 120%; - padding: 1em; + padding: 0.5em; border: 1px solid #8cacbb; color: Black; - background-color: #dee7ec; background-color: #cccccc; + font-family: "Courier New", Courier, monospace; } From ogrisel at codespeak.net Mon May 15 11:28:00 2006 From: ogrisel at codespeak.net (ogrisel@codespeak.net) Date: Mon May 15 11:28:01 2006 Subject: [Lxml-checkins] r27225 - lxml/www Message-ID: <20060515092800.80F8210093@code0.codespeak.net> Author: ogrisel Date: Mon May 15 11:27:59 2006 New Revision: 27225 Modified: lxml/www/publish.py Log: changed the publish.py script to remove harcoded reference to the style.css url Modified: lxml/www/publish.py 
============================================================================== --- lxml/www/publish.py (original) +++ lxml/www/publish.py Mon May 15 11:27:59 2006 @@ -1,9 +1,12 @@ -import os, sys +import os, shutil, sys def publish(dirname, lxml_path, release): if not os.path.exists(dirname): os.mkdir(dirname) - stylesheet_url = 'http://codespeak.net/lxml/style.css' + stylesheet_url = 'style.css' + + shutil.copy(stylesheet_url, dirname) + for name in ['main.txt', 'intro.txt', 'api.txt', 'compatibility.txt', 'extensions.txt', 'namespace_extensions.txt', 'sax.txt']: path = os.path.join(lxml_path, 'doc', name) @@ -22,10 +25,10 @@ os.path.join(dirname, 'index.html')) def rest2html(source_path, dest_path, stylesheet_url): - - command = ('rest2html --stylesheet=%s %s > %s' % + + command = ('rest2html --stylesheet=%s --link-stylesheet %s > %s' % (stylesheet_url, source_path, dest_path)) os.system(command) - + if __name__ == '__main__': publish(sys.argv[1], sys.argv[2], sys.argv[3]) From scoder at codespeak.net Tue May 16 19:42:09 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 16 May 2006 19:42:09 +0200 (CEST) Subject: [Lxml-checkins] r27294 - in lxml/trunk: . 
src/lxml Message-ID: <20060516174209.6F8A110077@code0.codespeak.net> Author: scoder Date: Tue May 16 19:42:03 2006 New Revision: 27294 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tree.pxd Log: rewrite of ElementTree.write() to write directly to file/file-like instead of serializing to memory Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue May 16 19:42:03 2006 @@ -7,6 +7,9 @@ Features added -------------- +* ElementTree.write() no longer serializes in memory (reduced memory + footprint) + * Speedup of Element.findall(tag) and Element.getiterator(tag) * Support for writing the XML representation of Elements and ElementTrees to Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue May 16 19:42:03 2006 @@ -1,41 +1,31 @@ -# Private helper functions +# Private helper functions for input/output and API functions + +# XML I/O helpers -cdef _tostring(_NodeBase element, encoding, int xml_declaration): +cdef _tostring(_NodeBase element, encoding, int write_xml_declaration): "Serialize an element to an encoded string representation of its XML tree." 
cdef _Document doc cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlBuffer* c_result_buffer cdef tree.xmlCharEncodingHandler* enchandler - cdef char* enc + cdef char* c_enc + cdef char* c_version if element is None: return None if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' doc = element._doc - enc = encoding + c_enc = encoding # it is necessary to *and* find the encoding handler *and* use # encoding during output - enchandler = tree.xmlFindCharEncodingHandler(enc) + enchandler = tree.xmlFindCharEncodingHandler(c_enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) if c_buffer is NULL: raise LxmlError, "Failed to create output buffer" - if xml_declaration: - if doc._c_doc.version is NULL: - version = "1.0" - else: - version = doc._c_doc.version - xml_decl = "" % ( - version, encoding) - tree.xmlOutputBufferWriteString(c_buffer, "\n") - try: - tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, enc) - _dumpNextNode(c_buffer, doc._c_doc, element._c_node, enc) + _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, + doc._c_doc.version, c_enc, write_xml_declaration) tree.xmlOutputBufferFlush(c_buffer) if c_buffer.conv is not NULL: c_result_buffer = c_buffer.conv @@ -60,8 +50,8 @@ if c_buffer is NULL: raise LxmlError, "Failed to create output buffer" try: - tree.xmlNodeDumpOutput(c_buffer, doc._c_doc, element._c_node, 0, 0, NULL) - _dumpNextNode(c_buffer, doc._c_doc, element._c_node, NULL) + _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, + NULL, NULL, 0) tree.xmlOutputBufferFlush(c_buffer) if c_buffer.conv is not NULL: c_result_buffer = c_buffer.conv @@ -75,6 +65,101 @@ tree.xmlOutputBufferClose(c_buffer) return result +cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, + xmlDoc* c_doc, xmlNode* c_node, + char* xml_version, char* encoding, + int write_xml_declaration): + if write_xml_declaration: + _writeDeclarationToBuffer(c_buffer, xml_version, encoding) + + tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, 0, 
encoding) + _dumpNextNode(c_buffer, c_doc, c_node, encoding) + +cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, + char* version, char* encoding): + if version is NULL: + version = "1.0" + tree.xmlOutputBufferWriteString(c_buffer, "\n") + +# output to file-like objects +cdef class _FileWriter: + cdef object _filelike + cdef _ExceptionContext _exc_context + def __init__(self, filelike, exc_context=None): + self._filelike = filelike + if exc_context is None: + self._exc_context = _ExceptionContext() + else: + self._exc_context = exc_context + + cdef tree.xmlOutputBuffer* _createOutputBuffer( + self, tree.xmlCharEncodingHandler* enchandler) except NULL: + cdef tree.xmlOutputBuffer* c_buffer + c_buffer = tree.xmlOutputBufferCreateIO( + _writeFilelikeWriter, _closeFilelikeWriter, + self, enchandler) + if c_buffer is NULL: + raise IOError, "Could not create I/O writer context." + return c_buffer + + cdef int write(self, char* c_buffer, int len): + try: + if self._filelike is None: + raise IOError, "File is already closed" + py_buffer = python.PyString_FromStringAndSize(c_buffer, len) + self._filelike.write(py_buffer) + return len + except Exception: + self._exc_context._store_raised() + return -1 + + cdef int close(self): + # we should not close the file here as we didn't open it + self._filelike = None + return 0 + +cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int len): + return (<_FileWriter>ctxt).write(c_buffer, len) + +cdef int _closeFilelikeWriter(void* ctxt): + return (<_FileWriter>ctxt).close() + +cdef _tofile(f, _NodeBase element, encoding, int write_declaration): + cdef _FileWriter writer + cdef tree.xmlOutputBuffer* c_buffer + cdef tree.xmlCharEncodingHandler* enchandler + cdef char* c_enc + if encoding is None: + c_enc = NULL + else: + c_enc = encoding + + enchandler = tree.xmlFindCharEncodingHandler(c_enc) + if python.PyString_Check(f) or python.PyUnicode_Check(f): + filename = _utf8(f) + c_buffer = 
tree.xmlOutputBufferCreateFilename( + _cstr(filename), enchandler, 0) + elif hasattr(f, 'write'): + writer = _FileWriter(f) + c_buffer = writer._createOutputBuffer(enchandler) + else: + raise TypeError, "File or filename expected, got '%s'" % type(f) + + _writeNodeToBuffer(c_buffer, + element._doc._c_doc, element._c_node, + element._doc._c_doc.version, c_enc, + write_declaration) + + tree.xmlOutputBufferClose(c_buffer) + if writer is not None: + writer._exc_context._raise_if_stored() + +# Private helper functions + cdef void displayNode(xmlNode* c_node, indent): # to help with debugging cdef xmlNode* c_child @@ -189,8 +274,8 @@ tree.xmlOutputBufferWriteString(c_buffer, '\n') tree.xmlOutputBufferFlush(c_buffer) -cdef _dumpNextNode(tree.xmlOutputBuffer* c_buffer, xmlDoc* c_doc, - xmlNode* c_node, char* encoding): +cdef void _dumpNextNode(tree.xmlOutputBuffer* c_buffer, xmlDoc* c_doc, + xmlNode* c_node, char* encoding): cdef xmlNode* c_next c_next = c_node.next if c_next is not NULL and c_next.type == tree.XML_TEXT_NODE: @@ -525,4 +610,3 @@ while c_attr_current is not NULL: changeDocumentBelowHelper(c_current, doc) c_attr_current = c_attr_current.next - Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 16 19:42:03 2006 @@ -347,17 +347,14 @@ return DocInfo(self._doc) def write(self, file, encoding='us-ascii'): - if not hasattr(file, 'write'): - # file is a filename, we want a file object - file = open(file, 'wb') - - m = tostring(self._context_node, encoding) - # XXX this is purely for ElementTree compatibility.. + if encoding in ('utf8', 'UTF8', 'utf-8'): + encoding = 'UTF-8' if encoding == 'UTF-8' or encoding == 'us-ascii': - m = _stripDeclaration(m) - if m[-1:] == '\n': - m = m[:-1] - file.write(m) + # XXX this is purely for ElementTree compatibility.. 
+ write_declaration = 0 + else: + write_declaration = 1 + _tofile(file, self._context_node, encoding, write_declaration) def getiterator(self, tag=None): root = self.getroot() Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 16 19:42:03 2006 @@ -129,21 +129,21 @@ cdef xmlparser.xmlParserInputBuffer* c_buffer c_buffer = xmlparser.xmlAllocParserInputBuffer(0) c_buffer.context = self - c_buffer.readcallback = _copyFilelike + c_buffer.readcallback = _readFilelikeParser return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options, LxmlParserType parser_type): if parser_type == LXML_XML_PARSER: return xmlparser.xmlCtxtReadIO( - ctxt, _copyFilelike, NULL, self, + ctxt, _readFilelikeParser, NULL, self, self._c_url, NULL, options) else: return htmlparser.htmlCtxtReadIO( - ctxt, _copyFilelike, NULL, self, + ctxt, _readFilelikeParser, NULL, self, self._c_url, NULL, options) - cdef int write(self, char* c_buffer, int c_size): + cdef int copyToBuffer(self, char* c_buffer, int c_size): cdef char* c_start cdef Py_ssize_t byte_count, remaining if self._bytes_read < 0: @@ -168,9 +168,8 @@ self._exc_context._store_raised() return -1 -cdef int _copyFilelike(void* ctxt, char* c_buffer, int c_size): - return (<_FileParserContext>ctxt).write(c_buffer, c_size) - +cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size): + return (<_FileParserContext>ctxt).copyToBuffer(c_buffer, c_size) ############################################################ ## support for custom document loaders Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Tue May 16 19:42:03 2006 @@ -204,17 +204,26 @@ cdef int xmlBufferLength(xmlBuffer* buf) 
cdef extern from "libxml/xmlIO.h": - cdef xmlOutputBuffer* xmlAllocOutputBuffer(xmlCharEncodingHandler* encoder) - cdef xmlOutputBuffer* xmlOutputBufferCreateFile( - FILE* file, - xmlCharEncodingHandler* encoder) cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, char* str) cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) cdef int xmlOutputBufferClose(xmlOutputBuffer* out) ctypedef int (*xmlInputReadCallback)(void* context, char* buffer, int len) - ctypedef int (*xmlInputCloseCallback)(void * context) + ctypedef int (*xmlInputCloseCallback)(void* context) + + ctypedef int (*xmlOutputWriteCallback)(void* context, char* buffer, int len) + ctypedef int (*xmlOutputCloseCallback)(void* context) + cdef xmlOutputBuffer* xmlAllocOutputBuffer(xmlCharEncodingHandler* encoder) + cdef xmlOutputBuffer* xmlOutputBufferCreateIO( + xmlOutputWriteCallback iowrite, + xmlOutputCloseCallback ioclose, + void * ioctx, + xmlCharEncodingHandler* encoder) + cdef xmlOutputBuffer* xmlOutputBufferCreateFile( + FILE* file, xmlCharEncodingHandler* encoder) + cdef xmlOutputBuffer* xmlOutputBufferCreateFilename( + char* URI, xmlCharEncodingHandler* encoder, int compression) cdef extern from "libxml/xmlsave.h": ctypedef struct xmlSaveCtxt: pass From ogrisel at codespeak.net Tue May 16 23:01:22 2006 From: ogrisel at codespeak.net (ogrisel at codespeak.net) Date: Tue, 16 May 2006 23:01:22 +0200 (CEST) Subject: [Lxml-checkins] r27305 - lxml/www Message-ID: <20060516210122.1442C10094@code0.codespeak.net> Author: ogrisel Date: Tue May 16 23:01:21 2006 New Revision: 27305 Modified: lxml/www/style.css Log: style improvement to better distinguish headlines from regular links Modified: lxml/www/style.css ============================================================================== --- lxml/www/style.css (original) +++ lxml/www/style.css Tue May 16 23:01:21 2006 @@ -21,21 +21,22 @@ font-size: 200%; } -h1, h2, h3 { - color: #333; +h1.title, h1 a, h2 a, h3 a { + color: #666; font-weight: bold; + 
font-family: Helvetica, sans-serif; } h1 { - font-size: 120%; + font-size: 150%; } h2 { - font-size: 110%; + font-size: 130%; } h3 { - font-size: 105%; + font-size: 110%; } a, a:visited { @@ -47,6 +48,9 @@ a:active { color: Red; +} + +a:hover { text-decoration: underline; } From ogrisel at codespeak.net Tue May 16 23:26:05 2006 From: ogrisel at codespeak.net (ogrisel at codespeak.net) Date: Tue, 16 May 2006 23:26:05 +0200 (CEST) Subject: [Lxml-checkins] r27308 - lxml/www Message-ID: <20060516212605.BE515100A2@code0.codespeak.net> Author: ogrisel Date: Tue May 16 23:26:05 2006 New Revision: 27308 Modified: lxml/www/style.css Log: reST headlines are links to nowhere by default thus hide there link style by specializing the a rules Modified: lxml/www/style.css ============================================================================== --- lxml/www/style.css (original) +++ lxml/www/style.css Tue May 16 23:26:05 2006 @@ -46,11 +46,11 @@ text-decoration: none; } -a:active { +p a:active, ul a:active { color: Red; } -a:hover { +p a:hover, ul a:hover { text-decoration: underline; } From scoder at codespeak.net Tue May 16 23:28:28 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 16 May 2006 23:28:28 +0200 (CEST) Subject: [Lxml-checkins] r27309 - lxml/trunk/doc Message-ID: <20060516212828.52154100A2@code0.codespeak.net> Author: scoder Date: Tue May 16 23:28:27 2006 New Revision: 27309 Modified: lxml/trunk/doc/extensions.txt Log: tiny cleanup in docs Modified: lxml/trunk/doc/extensions.txt ============================================================================== --- lxml/trunk/doc/extensions.txt (original) +++ lxml/trunk/doc/extensions.txt Tue May 16 23:28:27 2006 @@ -101,7 +101,8 @@ >>> print e.evaluate('es:hello(local-name(/a))') Ola a - >>> e = etree.XPathEvaluator(doc, namespaces={'f' : 'http://mydomain.org/myfunctions'}) + >>> namespaces = {'f' : 'http://mydomain.org/myfunctions'} + >>> e = etree.XPathEvaluator(doc, namespaces=namespaces) 
>>> print e.evaluate('f:hello(local-name(/a))') Hello a From scoder at codespeak.net Wed May 17 00:10:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 00:10:13 +0200 (CEST) Subject: [Lxml-checkins] r27310 - lxml/branch/xslt-access-control Message-ID: <20060516221013.107841009D@code0.codespeak.net> Author: scoder Date: Wed May 17 00:10:11 2006 New Revision: 27310 Added: lxml/branch/xslt-access-control/ - copied from r27309, lxml/trunk/ Log: new branch for implementing file/network access control in XSLT From scoder at codespeak.net Wed May 17 00:14:29 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 00:14:29 +0200 (CEST) Subject: [Lxml-checkins] r27311 - in lxml/branch/xslt-access-control: doc src/lxml Message-ID: <20060516221429.131731009D@code0.codespeak.net> Author: scoder Date: Wed May 17 00:14:20 2006 New Revision: 27311 Modified: lxml/branch/xslt-access-control/doc/resolvers.txt lxml/branch/xslt-access-control/src/lxml/xslt.pxd lxml/branch/xslt-access-control/src/lxml/xslt.pxi Log: first shot on XSLT access control using XSLTAccessControl wrapper class for libxslt:security API Modified: lxml/branch/xslt-access-control/doc/resolvers.txt ============================================================================== --- lxml/branch/xslt-access-control/doc/resolvers.txt (original) +++ lxml/branch/xslt-access-control/doc/resolvers.txt Wed May 17 00:14:20 2006 @@ -186,3 +186,41 @@ inherits. For XSLT, the output document inherits the resolvers of the input document and not those of the stylesheet. Therefore, the last result does not inherit any resolvers at all. + + +I/O access control in XSLT +-------------------------- + +XSLT has an additional mechanism to control the access certain I/O operations +during the transformation process. This is most interesting where XSL scripts +come from potentially insecure sources and must be prevented from modifying +the local file system. 
Note, however, that there is no way to keep them from +eating up your precious CPU time, so this should not stop you from thinking +about what XSLT you execute. + +Access control is configured using the XSLTAccessControl class. It can be +called with a number of keyword arguments that allow or deny specific +operations:: + + >>> transform = etree.XSLT(honk_doc) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Resolving url hoi:test as prefix honk ... failed + Resolving url hoi:test as prefix hoi ... done + + >>> ac = etree.XSLTAccessControl(read_network=False) + >>> transform = etree.XSLT(honk_doc, access_control=ac) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Traceback (most recent call last): + [...] + XSLTApplyError: runtime error (element 'value-of') + +There are a few things to keep in mind: + +* ``read_file=False`` does not imply ``write_file=False``, all controls are + independent. +* XSL parsing (``xsl:import``, etc.) is not affected by this mechanism +* ``read_file`` only applies to files in the file system. Any custom schemes + for URLs or URIs are controlled via the ``*_network`` keywords. 
+ Modified: lxml/branch/xslt-access-control/src/lxml/xslt.pxd ============================================================================== --- lxml/branch/xslt-access-control/src/lxml/xslt.pxd (original) +++ lxml/branch/xslt-access-control/src/lxml/xslt.pxd Wed May 17 00:14:20 2006 @@ -19,6 +19,7 @@ xsltStylesheet* style xmlXPathContext* xpathCtxt xsltDocument* document + void* _private cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) cdef void xsltFreeStylesheet(xsltStylesheet* sheet) @@ -69,6 +70,33 @@ void* ctxt, void (*handler)(void* ctxt, char* msg, ...)) +cdef extern from "libxslt/security.h": + ctypedef struct xsltSecurityPrefs + ctypedef enum xsltSecurityOption: + XSLT_SECPREF_READ_FILE = 1 + XSLT_SECPREF_WRITE_FILE = 2 + XSLT_SECPREF_CREATE_DIRECTORY = 3 + XSLT_SECPREF_READ_NETWORK = 4 + XSLT_SECPREF_WRITE_NETWORK = 5 + + ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + + cdef xsltSecurityPrefs* xsltNewSecurityPrefs() + cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec) + cdef int xsltSecurityForbid(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSecurityAllow(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec, + xsltSecurityOption option, + xsltSecurityCheck func) + cdef int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt) + cdef extern from "libxslt/extra.h": cdef char* XSLT_LIBXSLT_NAMESPACE cdef char* XSLT_XALAN_NAMESPACE Modified: lxml/branch/xslt-access-control/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/xslt-access-control/src/lxml/xslt.pxi (original) +++ lxml/branch/xslt-access-control/src/lxml/xslt.pxi Wed May 17 00:14:20 2006 @@ -17,6 +17,9 @@ class XSLTExtensionError(XSLTError): pass +class XSLTAccessDeniedError(XSLTError): + pass + # version information 
LIBXSLT_COMPILED_VERSION = __unpackIntVersion(xslt.LIBXSLT_VERSION) LIBXSLT_VERSION = __unpackIntVersion(xslt.xsltLibxsltVersion) @@ -122,6 +125,134 @@ xslt.xsltSetLoaderFunc(_doc_loader) +################################################################################ +# XSLT file/network access control + +cdef object __ACCESS_METHOD_MAP +__ACCESS_METHOD_MAP = { + xslt.XSLT_SECPREF_READ_FILE : "check_read_file", + xslt.XSLT_SECPREF_WRITE_FILE : "check_write_file", + xslt.XSLT_SECPREF_CREATE_DIRECTORY : "check_create_dir", + xslt.XSLT_SECPREF_READ_NETWORK : "check_read_network", + xslt.XSLT_SECPREF_WRITE_NETWORK : "check_write_network", + } + +cdef class XSLTAccessControl: + """Access control for XSLT: reading/writing files, directories and network + access. + + Access is granted to a type of resource via keyword arguments or for + specific URLs by subclassing and implementing filter methods 'check_*' + that return a truth value for their URL string argument: + + * read_file + * write_file + * create_dir + * read_network + * write_network + """ + cdef xslt.xsltSecurityPrefs* _prefs + def __init__(self, read_file=None, write_file=None, create_dir=None, + read_network=None, write_network=None): + self._prefs = xslt.xsltNewSecurityPrefs() + if self._prefs is NULL: + raise XSLTError, "Error preparing access control context" + self._setAccess(xslt.XSLT_SECPREF_READ_FILE, read_file) + self._setAccess(xslt.XSLT_SECPREF_WRITE_FILE, write_file) + self._setAccess(xslt.XSLT_SECPREF_CREATE_DIRECTORY, create_dir) + self._setAccess(xslt.XSLT_SECPREF_READ_NETWORK, read_network) + self._setAccess(xslt.XSLT_SECPREF_WRITE_NETWORK, write_network) + + def __dealloc__(self): + if self._prefs is not NULL: + xslt.xsltFreeSecurityPrefs(self._prefs) + + cdef _setAccess(self, xslt.xsltSecurityOption option, allow): + cdef xslt.xsltSecurityCheck function + if allow is None: + # check if the corresponding method is defined + method_name = __ACCESS_METHOD_MAP.get(option, None) + if method_name 
is None: + function = xslt.xsltSecurityAllow + elif hasattr(self, method_name): + if option == xslt.XSLT_SECPREF_READ_FILE: + function = _checkFileRead + elif option == xslt.XSLT_SECPREF_WRITE_FILE: + function = _checkFileWrite + elif option == xslt.XSLT_SECPREF_CREATE_DIRECTORY: + function = _checkDirCreate + elif option == xslt.XSLT_SECPREF_READ_NETWORK: + function = _checkNetworkRead + elif option == xslt.XSLT_SECPREF_WRITE_NETWORK: + function = _checkNetworkWrite + else: + function = xslt.xsltSecurityAllow + else: + function = xslt.xsltSecurityAllow + elif allow: + function = xslt.xsltSecurityAllow + else: + function = xslt.xsltSecurityForbid + xslt.xsltSetSecurityPrefs(self._prefs, option, function) + + cdef void _register_in_context(self, xslt.xsltTransformContext* ctxt): + ctxt._private = self + xslt.xsltSetCtxtSecurityPrefs(self._prefs, ctxt) + +cdef int _checkFileRead(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + cdef XSLTAccessControl access_control + if ctxt is NULL or ctxt._private is NULL: + return 1 # no access control => allow everything + access_control = ctxt._private + try: + return bool( access_control.check_read_file(value) ) + except Exception: + return 0 + +cdef int _checkFileWrite(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + cdef XSLTAccessControl access_control + if ctxt is NULL or ctxt._private is NULL: + return 1 # no access control => allow everything + access_control = ctxt._private + try: + return bool( access_control.check_write_file(value) ) + except Exception: + return 0 + +cdef int _checkDirCreate(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + cdef XSLTAccessControl access_control + if ctxt is NULL or ctxt._private is NULL: + return 1 # no access control => allow everything + access_control = ctxt._private + try: + return bool( access_control.check_create_dir(value) ) + except Exception: + return 0 + +cdef int 
_checkNetworkRead(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + cdef XSLTAccessControl access_control + if ctxt is NULL or ctxt._private is NULL: + return 1 # no access control => allow everything + access_control = ctxt._private + try: + return bool( access_control.check_read_network(value) ) + except Exception: + return 0 + +cdef int _checkNetworkWrite(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + cdef XSLTAccessControl access_control + if ctxt is NULL or ctxt._private is NULL: + return 1 # no access control => allow everything + access_control = ctxt._private + try: + return bool( access_control.check_write_network(value) ) + except Exception: + return 0 ################################################################################ # XSLT @@ -157,7 +288,7 @@ self._extensions = {} python.PyDict_SetItem(self._extensions, (ns_utf, name_utf), function) -cdef class _ExsltRegExp # forward declaration +cdef class _ExsltRegExp # forward declarations cdef class XSLT: """Turn a document into an XSLT object. 
@@ -165,10 +296,11 @@ cdef _XSLTContext _context cdef xslt.xsltStylesheet* _c_style cdef _XSLTResolverContext _xslt_resolver_context + cdef XSLTAccessControl _access_control cdef _ExsltRegExp _regexp cdef _ErrorLog _error_log - def __init__(self, xslt_input, extensions=None, regexp=True): + def __init__(self, xslt_input, extensions=None, regexp=True, access_control=None): cdef xslt.xsltStylesheet* c_style cdef xmlDoc* c_doc cdef xmlDoc* fake_c_doc @@ -178,6 +310,9 @@ doc = _documentOrRaise(xslt_input) root_node = _rootNodeOf(xslt_input) + # set access control or raise TypeError + self._access_control = access_control + # make a copy of the document as stylesheet parsing modifies it fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) c_doc = tree.xmlCopyDoc(fake_c_doc, 1) @@ -253,6 +388,9 @@ xslt.xsltSetTransformErrorFunc(transform_ctxt, self._error_log, _receiveGenericError) + if self._access_control is not None: + self._access_control._register_in_context(transform_ctxt) + ptemp = c_doc._private c_doc._private = resolver_context @@ -300,7 +438,13 @@ self._xslt_resolver_context._raise_if_stored() if c_result is NULL: - raise XSLTApplyError, "Error applying stylesheet" + message = "Error applying stylesheet" + errors = self._error_log.filter_from_errors() + if errors: + error = errors[-1] + if error.message: + message = error.message + raise XSLTApplyError, message result_doc = _documentFactory(c_result, input_doc._parser) return _xsltResultTreeFactory(result_doc, self) @@ -369,6 +513,7 @@ # enable EXSLT support for XSLT xslt.exsltRegisterAll() +# extension function lookup for XSLT cdef xpath.xmlXPathFunction _xslt_function_check(void* ctxt, char* c_name, char* c_ns_uri): "Find XSLT extension function from set of XPath and XSLT functions" From scoder at codespeak.net Wed May 17 01:25:17 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 01:25:17 +0200 (CEST) Subject: [Lxml-checkins] r27314 - 
lxml/branch/xslt-access-control/src/lxml Message-ID: <20060516232517.4749310061@code0.codespeak.net> Author: scoder Date: Wed May 17 01:25:15 2006 New Revision: 27314 Modified: lxml/branch/xslt-access-control/src/lxml/xslt.pxi Log: cleanup Modified: lxml/branch/xslt-access-control/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/xslt-access-control/src/lxml/xslt.pxi (original) +++ lxml/branch/xslt-access-control/src/lxml/xslt.pxi Wed May 17 01:25:15 2006 @@ -29,6 +29,18 @@ ################################################################################ +# Where do we store what? +# +# xsltStylesheet->doc->_private +# == _XSLTResolverContext for XSL stylesheet +# +# xsltTransformContext->document->doc->_private +# == _XSLTResolverContext for transformed document +# +################################################################################ + + +################################################################################ # XSLT document loaders cdef class _XSLTResolverContext(_ResolverContext): @@ -199,60 +211,51 @@ ctxt._private = self xslt.xsltSetCtxtSecurityPrefs(self._prefs, ctxt) -cdef int _checkFileRead(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): +cdef int _checkAccess(xslt.xsltTransformContext* ctxt, + char* c_value, method_name): + cdef xmlDoc* c_doc cdef XSLTAccessControl access_control if ctxt is NULL or ctxt._private is NULL: return 1 # no access control => allow everything access_control = ctxt._private try: - return bool( access_control.check_read_file(value) ) + if c_value is NULL: + value = None + else: + value = c_value + method = getattr(access_control, method_name, None) + if method is not None: + return bool( method(value) ) + else: + return 1 except Exception: + # try to store exception in current resolver context + c_doc = ctxt.style.doc + if c_doc is not NULL and c_doc._private is not NULL: + if isinstance(c_doc._private, 
_XSLTResolverContext): + resolver_context = <_XSLTResolverContext>c_doc._private + resolver_context._store_raised() return 0 +cdef int _checkFileRead(xslt.xsltSecurityPrefs* sec, + xslt.xsltTransformContext* ctxt, char* value): + return _checkAccess(ctxt, value, 'check_read_file') + cdef int _checkFileWrite(xslt.xsltSecurityPrefs* sec, xslt.xsltTransformContext* ctxt, char* value): - cdef XSLTAccessControl access_control - if ctxt is NULL or ctxt._private is NULL: - return 1 # no access control => allow everything - access_control = ctxt._private - try: - return bool( access_control.check_write_file(value) ) - except Exception: - return 0 + return _checkAccess(ctxt, value, 'check_write_file') cdef int _checkDirCreate(xslt.xsltSecurityPrefs* sec, xslt.xsltTransformContext* ctxt, char* value): - cdef XSLTAccessControl access_control - if ctxt is NULL or ctxt._private is NULL: - return 1 # no access control => allow everything - access_control = ctxt._private - try: - return bool( access_control.check_create_dir(value) ) - except Exception: - return 0 + return _checkAccess(ctxt, value, 'check_create_dir') cdef int _checkNetworkRead(xslt.xsltSecurityPrefs* sec, xslt.xsltTransformContext* ctxt, char* value): - cdef XSLTAccessControl access_control - if ctxt is NULL or ctxt._private is NULL: - return 1 # no access control => allow everything - access_control = ctxt._private - try: - return bool( access_control.check_read_network(value) ) - except Exception: - return 0 + return _checkAccess(ctxt, value, 'check_read_network') cdef int _checkNetworkWrite(xslt.xsltSecurityPrefs* sec, xslt.xsltTransformContext* ctxt, char* value): - cdef XSLTAccessControl access_control - if ctxt is NULL or ctxt._private is NULL: - return 1 # no access control => allow everything - access_control = ctxt._private - try: - return bool( access_control.check_write_network(value) ) - except Exception: - return 0 + return _checkAccess(ctxt, value, 'check_write_network') 
################################################################################ # XSLT From scoder at codespeak.net Wed May 17 01:26:48 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 01:26:48 +0200 (CEST) Subject: [Lxml-checkins] r27315 - lxml/branch/xslt-access-control/doc Message-ID: <20060516232648.3B43110061@code0.codespeak.net> Author: scoder Date: Wed May 17 01:26:46 2006 New Revision: 27315 Modified: lxml/branch/xslt-access-control/doc/resolvers.txt Log: added broken doctest for broken fine-grained access extension (problem seems to be libxslt here, see libxslt bug #342045) Modified: lxml/branch/xslt-access-control/doc/resolvers.txt ============================================================================== --- lxml/branch/xslt-access-control/doc/resolvers.txt (original) +++ lxml/branch/xslt-access-control/doc/resolvers.txt Wed May 17 01:26:46 2006 @@ -191,12 +191,12 @@ I/O access control in XSLT -------------------------- -XSLT has an additional mechanism to control the access certain I/O operations -during the transformation process. This is most interesting where XSL scripts -come from potentially insecure sources and must be prevented from modifying -the local file system. Note, however, that there is no way to keep them from -eating up your precious CPU time, so this should not stop you from thinking -about what XSLT you execute. +XSLT has an additional mechanism to control the access to certain I/O +operations during the transformation process. This is most interesting where +XSL scripts come from potentially insecure sources and must be prevented from +modifying the local file system. Note, however, that there is no way to keep +them from eating up your precious CPU time, so this should not stop you from +thinking about what XSLT you execute. Access control is configured using the XSLTAccessControl class. 
It can be called with a number of keyword arguments that allow or deny specific @@ -218,9 +218,71 @@ There are a few things to keep in mind: +* XSL parsing (``xsl:import``, etc.) is not affected by this mechanism * ``read_file=False`` does not imply ``write_file=False``, all controls are independent. -* XSL parsing (``xsl:import``, etc.) is not affected by this mechanism * ``read_file`` only applies to files in the file system. Any custom schemes - for URLs or URIs are controlled via the ``*_network`` keywords. + for URLs (not URIs) are controlled via the ``*_network`` keywords. + +################################################################################ +# BROKEN FROM HERE +################################################################################ + +If switching access on and off is not fine-grained enough for you purpose, you +can customize the XSLTAccessControl class by subclassing it and implementing +any of the special methods ``check_read_file``, ``check_write_file``, etc.:: + + >>> class NetReadAccessControl(etree.XSLTAccessControl): + ... prefix = 'hoi:' + ... def check_read_network(self, uri): + ... if not uri: + ... return 1 + ... return not uri.startswith(self.prefix) + + >>> ac = NetReadAccessControl() + + + >>> xml_text = """\ + ... + ... + ... + ... + ... + ... + ... + ... + ... """ + >>> xslt = etree.XML(xml_text) + + >>> transform = etree.XSLT(honk_doc, access_control=ac) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Resolving url hoi:test as prefix honk ... failed + Resolving url hoi:test as prefix hoi ... done + +# Traceback (most recent call last): +# [...] +# XSLTApplyError: runtime error (element 'value-of') + + >>> ac.prefix = 'honk:' + >>> transform = etree.XSLT(honk_doc, access_control=ac) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Resolving url hoi:test as prefix honk ... failed + Resolving url hoi:test as prefix hoi ... 
done + +# Traceback (most recent call last): +# [...] +# XSLTApplyError: runtime error (element 'value-of') + + >>> ac.prefix = 'IGNORE-ME:' + >>> transform = etree.XSLT(honk_doc, access_control=ac) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Resolving url hoi:test as prefix honk ... failed + Resolving url hoi:test as prefix hoi ... done +# Traceback (most recent call last): +# [...] +# XSLTApplyError: runtime error (element 'value-of') From scoder at codespeak.net Wed May 17 08:13:00 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:13:00 +0200 (CEST) Subject: [Lxml-checkins] r27317 - in lxml/branch/xslt-access-control: doc src/lxml Message-ID: <20060517061300.4E0F4100AB@code0.codespeak.net> Author: scoder Date: Wed May 17 08:12:57 2006 New Revision: 27317 Modified: lxml/branch/xslt-access-control/doc/resolvers.txt lxml/branch/xslt-access-control/src/lxml/xslt.pxi Log: removed broken support for more fine-grained access control (overlaps with document loaders anyway) Modified: lxml/branch/xslt-access-control/doc/resolvers.txt ============================================================================== --- lxml/branch/xslt-access-control/doc/resolvers.txt (original) +++ lxml/branch/xslt-access-control/doc/resolvers.txt Wed May 17 08:12:57 2006 @@ -221,68 +221,8 @@ * XSL parsing (``xsl:import``, etc.) is not affected by this mechanism * ``read_file=False`` does not imply ``write_file=False``, all controls are independent. -* ``read_file`` only applies to files in the file system. Any custom schemes - for URLs (not URIs) are controlled via the ``*_network`` keywords. 
- -################################################################################ -# BROKEN FROM HERE -################################################################################ - -If switching access on and off is not fine-grained enough for you purpose, you -can customize the XSLTAccessControl class by subclassing it and implementing -any of the special methods ``check_read_file``, ``check_write_file``, etc.:: - - >>> class NetReadAccessControl(etree.XSLTAccessControl): - ... prefix = 'hoi:' - ... def check_read_network(self, uri): - ... if not uri: - ... return 1 - ... return not uri.startswith(self.prefix) - - >>> ac = NetReadAccessControl() - - - >>> xml_text = """\ - ... - ... - ... - ... - ... - ... - ... - ... - ... """ - >>> xslt = etree.XML(xml_text) - - >>> transform = etree.XSLT(honk_doc, access_control=ac) - Resolving url honk:test as prefix honk ... done - >>> result = transform(normal_doc) - Resolving url hoi:test as prefix honk ... failed - Resolving url hoi:test as prefix hoi ... done - -# Traceback (most recent call last): -# [...] -# XSLTApplyError: runtime error (element 'value-of') - - >>> ac.prefix = 'honk:' - >>> transform = etree.XSLT(honk_doc, access_control=ac) - Resolving url honk:test as prefix honk ... done - >>> result = transform(normal_doc) - Resolving url hoi:test as prefix honk ... failed - Resolving url hoi:test as prefix hoi ... done - -# Traceback (most recent call last): -# [...] -# XSLTApplyError: runtime error (element 'value-of') - - >>> ac.prefix = 'IGNORE-ME:' - >>> transform = etree.XSLT(honk_doc, access_control=ac) - Resolving url honk:test as prefix honk ... done - >>> result = transform(normal_doc) - Resolving url hoi:test as prefix honk ... failed - Resolving url hoi:test as prefix hoi ... done - -# Traceback (most recent call last): -# [...] -# XSLTApplyError: runtime error (element 'value-of') +* ``read_file`` only applies to files in the file system. 
Any other scheme + for URLs is controlled by the ``*_network`` keywords. +* If you need more fine-grained control than switching access on and off, you + should consider writing a custom document loader that returns empty + documents or raises exceptions if access is denied. Modified: lxml/branch/xslt-access-control/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/xslt-access-control/src/lxml/xslt.pxi (original) +++ lxml/branch/xslt-access-control/src/lxml/xslt.pxi Wed May 17 08:12:57 2006 @@ -17,9 +17,6 @@ class XSLTExtensionError(XSLTError): pass -class XSLTAccessDeniedError(XSLTError): - pass - # version information LIBXSLT_COMPILED_VERSION = __unpackIntVersion(xslt.LIBXSLT_VERSION) LIBXSLT_VERSION = __unpackIntVersion(xslt.xsltLibxsltVersion) @@ -140,22 +137,10 @@ ################################################################################ # XSLT file/network access control -cdef object __ACCESS_METHOD_MAP -__ACCESS_METHOD_MAP = { - xslt.XSLT_SECPREF_READ_FILE : "check_read_file", - xslt.XSLT_SECPREF_WRITE_FILE : "check_write_file", - xslt.XSLT_SECPREF_CREATE_DIRECTORY : "check_create_dir", - xslt.XSLT_SECPREF_READ_NETWORK : "check_read_network", - xslt.XSLT_SECPREF_WRITE_NETWORK : "check_write_network", - } - cdef class XSLTAccessControl: """Access control for XSLT: reading/writing files, directories and network - access. - - Access is granted to a type of resource via keyword arguments or for - specific URLs by subclassing and implementing filter methods 'check_*' - that return a truth value for their URL string argument: + access. Access to a type of resource is granted or denied by passing the + following keyword arguments. All of them default to True. 
* read_file * write_file @@ -164,8 +149,8 @@ * write_network """ cdef xslt.xsltSecurityPrefs* _prefs - def __init__(self, read_file=None, write_file=None, create_dir=None, - read_network=None, write_network=None): + def __init__(self, read_file=True, write_file=True, create_dir=True, + read_network=True, write_network=True): self._prefs = xslt.xsltNewSecurityPrefs() if self._prefs is NULL: raise XSLTError, "Error preparing access control context" @@ -181,27 +166,7 @@ cdef _setAccess(self, xslt.xsltSecurityOption option, allow): cdef xslt.xsltSecurityCheck function - if allow is None: - # check if the corresponding method is defined - method_name = __ACCESS_METHOD_MAP.get(option, None) - if method_name is None: - function = xslt.xsltSecurityAllow - elif hasattr(self, method_name): - if option == xslt.XSLT_SECPREF_READ_FILE: - function = _checkFileRead - elif option == xslt.XSLT_SECPREF_WRITE_FILE: - function = _checkFileWrite - elif option == xslt.XSLT_SECPREF_CREATE_DIRECTORY: - function = _checkDirCreate - elif option == xslt.XSLT_SECPREF_READ_NETWORK: - function = _checkNetworkRead - elif option == xslt.XSLT_SECPREF_WRITE_NETWORK: - function = _checkNetworkWrite - else: - function = xslt.xsltSecurityAllow - else: - function = xslt.xsltSecurityAllow - elif allow: + if allow: function = xslt.xsltSecurityAllow else: function = xslt.xsltSecurityForbid @@ -211,52 +176,6 @@ ctxt._private = self xslt.xsltSetCtxtSecurityPrefs(self._prefs, ctxt) -cdef int _checkAccess(xslt.xsltTransformContext* ctxt, - char* c_value, method_name): - cdef xmlDoc* c_doc - cdef XSLTAccessControl access_control - if ctxt is NULL or ctxt._private is NULL: - return 1 # no access control => allow everything - access_control = ctxt._private - try: - if c_value is NULL: - value = None - else: - value = c_value - method = getattr(access_control, method_name, None) - if method is not None: - return bool( method(value) ) - else: - return 1 - except Exception: - # try to store exception in current 
resolver context - c_doc = ctxt.style.doc - if c_doc is not NULL and c_doc._private is not NULL: - if isinstance(c_doc._private, _XSLTResolverContext): - resolver_context = <_XSLTResolverContext>c_doc._private - resolver_context._store_raised() - return 0 - -cdef int _checkFileRead(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): - return _checkAccess(ctxt, value, 'check_read_file') - -cdef int _checkFileWrite(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): - return _checkAccess(ctxt, value, 'check_write_file') - -cdef int _checkDirCreate(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): - return _checkAccess(ctxt, value, 'check_create_dir') - -cdef int _checkNetworkRead(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): - return _checkAccess(ctxt, value, 'check_read_network') - -cdef int _checkNetworkWrite(xslt.xsltSecurityPrefs* sec, - xslt.xsltTransformContext* ctxt, char* value): - return _checkAccess(ctxt, value, 'check_write_network') - ################################################################################ # XSLT @@ -291,7 +210,7 @@ self._extensions = {} python.PyDict_SetItem(self._extensions, (ns_utf, name_utf), function) -cdef class _ExsltRegExp # forward declarations +cdef class _ExsltRegExp # forward declaration cdef class XSLT: """Turn a document into an XSLT object. From scoder at codespeak.net Wed May 17 08:32:43 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:32:43 +0200 (CEST) Subject: [Lxml-checkins] r27318 - in lxml/trunk: . 
doc src/lxml Message-ID: <20060517063243.252FC100AB@code0.codespeak.net> Author: scoder Date: Wed May 17 08:32:40 2006 New Revision: 27318 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/resolvers.txt lxml/trunk/src/lxml/xslt.pxd lxml/trunk/src/lxml/xslt.pxi Log: merged in XSLT access control from xslt-access-control branch Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 17 08:32:40 2006 @@ -7,6 +7,8 @@ Features added -------------- +* XSLT can block access to file system and network via ``XSLTAccessControl`` + * ElementTree.write() no longer serializes in memory (reduced memory footprint) Modified: lxml/trunk/doc/resolvers.txt ============================================================================== --- lxml/trunk/doc/resolvers.txt (original) +++ lxml/trunk/doc/resolvers.txt Wed May 17 08:32:40 2006 @@ -186,3 +186,43 @@ inherits. For XSLT, the output document inherits the resolvers of the input document and not those of the stylesheet. Therefore, the last result does not inherit any resolvers at all. + + +I/O access control in XSLT +-------------------------- + +XSLT has an additional mechanism to control the access to certain I/O +operations during the transformation process. This is most interesting where +XSL scripts come from potentially insecure sources and must be prevented from +modifying the local file system. Note, however, that there is no way to keep +them from eating up your precious CPU time, so this should not stop you from +thinking about what XSLT you execute. + +Access control is configured using the XSLTAccessControl class. It can be +called with a number of keyword arguments that allow or deny specific +operations:: + + >>> transform = etree.XSLT(honk_doc) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Resolving url hoi:test as prefix honk ... 
failed + Resolving url hoi:test as prefix hoi ... done + + >>> ac = etree.XSLTAccessControl(read_network=False) + >>> transform = etree.XSLT(honk_doc, access_control=ac) + Resolving url honk:test as prefix honk ... done + >>> result = transform(normal_doc) + Traceback (most recent call last): + [...] + XSLTApplyError: runtime error (element 'value-of') + +There are a few things to keep in mind: + +* XSL parsing (``xsl:import``, etc.) is not affected by this mechanism +* ``read_file=False`` does not imply ``write_file=False``, all controls are + independent. +* ``read_file`` only applies to files in the file system. Any other scheme + for URLs is controlled by the ``*_network`` keywords. +* If you need more fine-grained control than switching access on and off, you + should consider writing a custom document loader that returns empty + documents or raises exceptions if access is denied. Modified: lxml/trunk/src/lxml/xslt.pxd ============================================================================== --- lxml/trunk/src/lxml/xslt.pxd (original) +++ lxml/trunk/src/lxml/xslt.pxd Wed May 17 08:32:40 2006 @@ -69,6 +69,33 @@ void* ctxt, void (*handler)(void* ctxt, char* msg, ...)) +cdef extern from "libxslt/security.h": + ctypedef struct xsltSecurityPrefs + ctypedef enum xsltSecurityOption: + XSLT_SECPREF_READ_FILE = 1 + XSLT_SECPREF_WRITE_FILE = 2 + XSLT_SECPREF_CREATE_DIRECTORY = 3 + XSLT_SECPREF_READ_NETWORK = 4 + XSLT_SECPREF_WRITE_NETWORK = 5 + + ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + + cdef xsltSecurityPrefs* xsltNewSecurityPrefs() + cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec) + cdef int xsltSecurityForbid(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSecurityAllow(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt, + char* value) + cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec, + xsltSecurityOption option, + xsltSecurityCheck func) + cdef 
int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec, + xsltTransformContext* ctxt) + cdef extern from "libxslt/extra.h": cdef char* XSLT_LIBXSLT_NAMESPACE cdef char* XSLT_XALAN_NAMESPACE Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 17 08:32:40 2006 @@ -26,6 +26,18 @@ ################################################################################ +# Where do we store what? +# +# xsltStylesheet->doc->_private +# == _XSLTResolverContext for XSL stylesheet +# +# xsltTransformContext->document->doc->_private +# == _XSLTResolverContext for transformed document +# +################################################################################ + + +################################################################################ # XSLT document loaders cdef class _XSLTResolverContext(_ResolverContext): @@ -122,6 +134,46 @@ xslt.xsltSetLoaderFunc(_doc_loader) +################################################################################ +# XSLT file/network access control + +cdef class XSLTAccessControl: + """Access control for XSLT: reading/writing files, directories and network + access. Access to a type of resource is granted or denied by passing the + following keyword arguments. All of them default to True. 
+ + * read_file + * write_file + * create_dir + * read_network + * write_network + """ + cdef xslt.xsltSecurityPrefs* _prefs + def __init__(self, read_file=True, write_file=True, create_dir=True, + read_network=True, write_network=True): + self._prefs = xslt.xsltNewSecurityPrefs() + if self._prefs is NULL: + raise XSLTError, "Error preparing access control context" + self._setAccess(xslt.XSLT_SECPREF_READ_FILE, read_file) + self._setAccess(xslt.XSLT_SECPREF_WRITE_FILE, write_file) + self._setAccess(xslt.XSLT_SECPREF_CREATE_DIRECTORY, create_dir) + self._setAccess(xslt.XSLT_SECPREF_READ_NETWORK, read_network) + self._setAccess(xslt.XSLT_SECPREF_WRITE_NETWORK, write_network) + + def __dealloc__(self): + if self._prefs is not NULL: + xslt.xsltFreeSecurityPrefs(self._prefs) + + cdef _setAccess(self, xslt.xsltSecurityOption option, allow): + cdef xslt.xsltSecurityCheck function + if allow: + function = xslt.xsltSecurityAllow + else: + function = xslt.xsltSecurityForbid + xslt.xsltSetSecurityPrefs(self._prefs, option, function) + + cdef void _register_in_context(self, xslt.xsltTransformContext* ctxt): + xslt.xsltSetCtxtSecurityPrefs(self._prefs, ctxt) ################################################################################ # XSLT @@ -165,10 +217,11 @@ cdef _XSLTContext _context cdef xslt.xsltStylesheet* _c_style cdef _XSLTResolverContext _xslt_resolver_context + cdef XSLTAccessControl _access_control cdef _ExsltRegExp _regexp cdef _ErrorLog _error_log - def __init__(self, xslt_input, extensions=None, regexp=True): + def __init__(self, xslt_input, extensions=None, regexp=True, access_control=None): cdef xslt.xsltStylesheet* c_style cdef xmlDoc* c_doc cdef xmlDoc* fake_c_doc @@ -178,6 +231,9 @@ doc = _documentOrRaise(xslt_input) root_node = _rootNodeOf(xslt_input) + # set access control or raise TypeError + self._access_control = access_control + # make a copy of the document as stylesheet parsing modifies it fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) 
c_doc = tree.xmlCopyDoc(fake_c_doc, 1) @@ -253,6 +309,9 @@ xslt.xsltSetTransformErrorFunc(transform_ctxt, self._error_log, _receiveGenericError) + if self._access_control is not None: + self._access_control._register_in_context(transform_ctxt) + ptemp = c_doc._private c_doc._private = resolver_context @@ -300,7 +359,13 @@ self._xslt_resolver_context._raise_if_stored() if c_result is NULL: - raise XSLTApplyError, "Error applying stylesheet" + message = "Error applying stylesheet" + errors = self._error_log.filter_from_errors() + if errors: + error = errors[-1] + if error.message: + message = error.message + raise XSLTApplyError, message result_doc = _documentFactory(c_result, input_doc._parser) return _xsltResultTreeFactory(result_doc, self) @@ -369,6 +434,7 @@ # enable EXSLT support for XSLT xslt.exsltRegisterAll() +# extension function lookup for XSLT cdef xpath.xmlXPathFunction _xslt_function_check(void* ctxt, char* c_name, char* c_ns_uri): "Find XSLT extension function from set of XPath and XSLT functions" From scoder at codespeak.net Wed May 17 08:33:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:33:08 +0200 (CEST) Subject: [Lxml-checkins] r27319 - lxml/trunk/doc Message-ID: <20060517063308.8F394100AB@code0.codespeak.net> Author: scoder Date: Wed May 17 08:33:07 2006 New Revision: 27319 Modified: lxml/trunk/doc/api.txt Log: small cleanup in api.txt Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 17 08:33:07 2006 @@ -6,7 +6,7 @@ the need to expose a feature in an easy way led to the invention of a new API. lxml.etree -========== +---------- lxml.etree tries to follow the `ElementTree API`_ wherever it can. There are however some incompatibilities (see compatibility.txt). The extensions are @@ -14,17 +14,17 @@ .. 
_`ElementTree API`: http://effbot.org/zone/element-index.htm -The following examples usually assume this to be executed first:: - - >>> from lxml import etree - >>> from StringIO import StringIO - If you need to know which version of lxml is installed, you can access the ``lxml.etree.LXML_VERSION`` attribute to retrieve a version tuple. Note, however, that it did not exist before version 1.0, so you will get an AttributeError in older versions. The versions of libxml2 and libxslt are available through the attributes ``LIBXML_VERSION`` and ``LIBXSLT_VERSION``. +The following examples usually assume this to be executed first:: + + >>> from lxml import etree + >>> from StringIO import StringIO + Parsers ------- From scoder at codespeak.net Wed May 17 08:34:27 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:34:27 +0200 (CEST) Subject: [Lxml-checkins] r27320 - lxml/trunk/src/lxml Message-ID: <20060517063427.DB204100AB@code0.codespeak.net> Author: scoder Date: Wed May 17 08:34:26 2006 New Revision: 27320 Modified: lxml/trunk/src/lxml/xslt.pxi Log: doc updates Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 17 08:34:26 2006 @@ -140,7 +140,7 @@ cdef class XSLTAccessControl: """Access control for XSLT: reading/writing files, directories and network access. Access to a type of resource is granted or denied by passing the - following keyword arguments. All of them default to True. + following keyword arguments. All of them default to True to allow access. 
* read_file * write_file From scoder at codespeak.net Wed May 17 08:42:43 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:42:43 +0200 (CEST) Subject: [Lxml-checkins] r27321 - lxml/trunk/doc Message-ID: <20060517064243.7FD90100AC@code0.codespeak.net> Author: scoder Date: Wed May 17 08:42:41 2006 New Revision: 27321 Modified: lxml/trunk/doc/resolvers.txt Log: doc updates Modified: lxml/trunk/doc/resolvers.txt ============================================================================== --- lxml/trunk/doc/resolvers.txt (original) +++ lxml/trunk/doc/resolvers.txt Wed May 17 08:42:41 2006 @@ -198,7 +198,7 @@ them from eating up your precious CPU time, so this should not stop you from thinking about what XSLT you execute. -Access control is configured using the XSLTAccessControl class. It can be +Access control is configured using the ``XSLTAccessControl`` class. It can be called with a number of keyword arguments that allow or deny specific operations:: From scoder at codespeak.net Wed May 17 08:56:03 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 08:56:03 +0200 (CEST) Subject: [Lxml-checkins] r27322 - lxml/trunk/src/lxml Message-ID: <20060517065603.7C2721006F@code0.codespeak.net> Author: scoder Date: Wed May 17 08:56:02 2006 New Revision: 27322 Modified: lxml/trunk/src/lxml/xslt.pxi Log: doc updates Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 17 08:56:02 2006 @@ -139,8 +139,9 @@ cdef class XSLTAccessControl: """Access control for XSLT: reading/writing files, directories and network - access. Access to a type of resource is granted or denied by passing the - following keyword arguments. All of them default to True to allow access. + I/O. Access to a type of resource is granted or denied by passing any of + the following keyword arguments. 
All of them default to True to allow + access. * read_file * write_file From scoder at codespeak.net Wed May 17 09:56:17 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 09:56:17 +0200 (CEST) Subject: [Lxml-checkins] r27325 - in lxml/trunk: . doc Message-ID: <20060517075617.44196100B0@code0.codespeak.net> Author: scoder Date: Wed May 17 09:56:15 2006 New Revision: 27325 Added: lxml/trunk/doc/build.txt Modified: lxml/trunk/INSTALL.txt lxml/trunk/doc/main.txt Log: merged in main.txt updates from 0.9.x branch, new doc/build.txt that describes how to build lxml from sources (including static linking on Windows) Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Wed May 17 09:56:15 2006 @@ -30,11 +30,13 @@ than the required version above. While there were not any bug reports so far, you may still encounter certain differences in behaviour in rare cases. -If you want to build lxml from SVN, you also need Pyrex_. If you are using a -released version of lxml, it should come with the generated C file in the -source distribution, so no Pyrex is needed in that case. +If you want to build lxml from SVN, you also need Pyrex_. Please read `how to +build lxml from source`_ in this case. If you are using a released version of +lxml, it should come with the generated C file in the source distribution, so +no Pyrex is needed in that case. .. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ +.. _`how to build lxml from source`: build.html Note that Pyrex up to and including version 0.9.4 has known problems when compiling lxml with gcc 4.0 or Python 2.4. Do not use it. If you want to @@ -59,49 +61,5 @@ This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as libxml2 and libxslt are properly installed. 
To compile and install lxml -without easy_install, download the source tar-ball, unpack it and type:: - - python setup.py install - -If you do not want to install lxml right away, but first test it from the -source directory, you can build it in-place like this:: - - python setup.py build_ext -i - -or just:: - - make - -If you then place lxml's "src" directory on your PYTHONPATH somehow, you can -import lxml.etree and play with it. - - -Running the tests and reporting errors --------------------------------------- - -The source distribution (tgz) contains a test suite for lxml. You can run it -from the top-level directory:: - - python test.py - -Note that the test script only tests the in-place build (see "Installation" -above), as it searches the "src" directory. You can use the following -one-step command to trigger an in-place build and test it:: - - make clean test - -To run the ElementTree and cElementTree compatibility tests, make sure -you have lxml on your PYTHONPATH first, then run:: - - python selftest.py - -and:: - - python selftest2.py - -If the tests give failures, errors, or worse, segmentation faults, we'd really -like to know. Please contact us on the `mailing list`_, and please specify the -version of lxml, libxml2, libxslt and Python you were using, as well as your -operating system type (Linux, Windows, MacOs, ...). - -.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +without easy_install, please read `how to build lxml from source`_ (or the +file ``build.txt`` in the ``doc`` directory of the source tree). Added: lxml/trunk/doc/build.txt ============================================================================== --- (empty file) +++ lxml/trunk/doc/build.txt Wed May 17 09:56:15 2006 @@ -0,0 +1,170 @@ +How to build lxml from source +============================= + +To build lxml from source, you need libxml2 and libxslt properly installed. + +Pyrex +----- + +The lxml.etree module is written in Pyrex_. 
To build lxml from source, you +therefore need a working Pyrex installation. Pyrex now supports EasyInstall, +so you can install it by running the following command as super-user:: + + easy_install Pyrex + +.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ +.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall + +Note that Pyrex up to and including version 0.9.4 has known problems when +compiling lxml with gcc 4.0 or Python 2.4. Do not use it. If you want to +build lxml from non-release sources, please install Pyrex version 0.9.4.1 or +later. + + +Subversion +---------- + +The lxml package is developed in a Subversion repository. You can retrieve +the current developer version by calling:: + + svn co http://codespeak.net/svn/lxml/trunk lxml + +This will create a directory ``lxml`` and download the source into it. You +can also `browse the repository through the web`_ or use your favourite SVN +client to access it. + +.. _`browse the repository through the web`: http://codespeak.net/svn/lxml + + +The distutils approach +---------------------- + +Usually, building lxml is done through distutils. Do a Subversion checkout +(or download the source tar-ball and unpack it) and then type:: + + python setup.py build + +If you want to test lxml from the source directory, it is better to build it +in-place like this:: + + python setup.py build_ext -i + +or, in Unix-like environments:: + + make + +If you then place lxml's "src" directory on your PYTHONPATH somehow, you can +import ``lxml.etree`` and play with it. + + +Running the tests and reporting errors +-------------------------------------- + +The source distribution (tgz) contains a test suite for lxml. You can run it +from the top-level directory:: + + python test.py + +Note that the test script only tests the in-place build (see "Installation" +above), as it searches the "src" directory. 
You can use the following +one-step command to trigger an in-place build and test it:: + + make clean test + +To run the ElementTree and cElementTree compatibility tests, make sure +you have lxml on your PYTHONPATH first, then run:: + + python selftest.py + +and:: + + python selftest2.py + +If the tests give failures, errors, or worse, segmentation faults, we'd really +like to know. Please contact us on the `mailing list`_, and please specify the +version of lxml, libxml2, libxslt and Python you were using, as well as your +operating system type (Linux, Windows, MacOs, ...). + +.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev + + +Static linking on Windows +------------------------- + +Most operating systems have proper package mangement that makes installing +current versions of libxml2 and libxslt easy. However, Microsoft Windows +lacks these capabilities. It can therefore be interesting to statically link +the external libraries into lxml.etree to avoid having to install them +separately. `David Sankel`_ proposed the following approach. + +.. _`David Sankel`: http://codespeak.net/pipermail/lxml-dev/2006-May/001196.html + +Download lxml and all required libraries to the same directory. The iconv, +libxml2, libxslt, and zlib libraries are all availible from xmlsoft.org. The +place to go on the ftp site is ftp://xmlsoft.org/libxml2/win32. + +Your directory should now have something like the following files in it:: + +iconv-1.9.1.win32.zip +libxml2-2.6.23.win32.zip +libxslt-1.1.15.win32.zip +lxml-0.9.2.tgz +zlib-1.2.3.win32.zip + +Now extract each of those files in the _same_ directory. Now you should have +something like this:: + +iconv-1.9.1.win32/ +iconv-1.9.1.win32.zip +libxml2-2.6.23.win32/ +libxml2-2.6.23.win32.zip +libxslt-1.1.15.win32/ +libxslt-1.1.15.win32.zip +lxml-0.9.2/ +lxml-0.9.2.tgz +zlib-1.2.3.win32/ +zlib-1.2.3.win32.zip + +Go to the lxml-0.9.2 directory and edit the Makefile. 
There should be a +section that looks like this:: + + ext_modules = [ Extension( + "lxml.etree", + sources = sources, + extra_compile_args = ['-w'] + flags('xslt-config --cflags'), + extra_link_args = flags('xslt-config --libs') + )], + +The problem here is that the Windows version of libxslt does not install the +little program ``xslt-config``, which would normally auto-configure the build +process. + +Change this section to something like this, but take care to use the correct +version numbers:: + + ext_modules = [ Extension( + "lxml.etree", + sources = sources, + extra_compile_args = ['-w'] + [ + "-I..\\libxml2-2.6.23.win32\\include ", + "-I..\\libxslt-1.1.15.win32\\include", + "-I..\\zlib-1.2.3.win32\\include", + "-I..\\iconv-1.9.1.win32\\include" + ], + extra_link_args = [ + "..\\libxml2-2.6.23.win32\\lib\\libxml2_a.lib", + "..\\libxslt-1.1.15.win32\\lib\\libxslt_a.lib", + "..\\zlib-1.2.3.win32\\lib\\zlib.lib", + "..\\iconv- 1.9.1.win32\\lib\\iconv_a.lib" + ] + )], + +The ``_a`` part of the library names means that we are linking statically +against the named library files. If you want to use DLLs, you need to link +against the DLL version of the libraries. + +Now you should be able to use setup.py and everything should work well. Try calling:: + + python setup.py bdist_wininst + +This will create a windows installer in the ``pkg`` directory. Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Wed May 17 09:56:15 2006 @@ -13,49 +13,65 @@ .. _introduction: intro.html -News ----- +Download +-------- -* 2006-03-30: `lxml 0.9.1`_ released (`changes for 0.9.1`_) +The best way to download binary versions is to visit `lxml at the Python +cheeseshop`_. It has the source, eggs and installers for various platforms. -* 2006-03-20: `lxml 0.9`_ released (`changes for 0.9`_) +.. 
_`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ -* 2005-11-03: `lxml 0.8`_ released (`changes for 0.8`_) +Please take a look at the `installation instructions`_! -* 2005-06-15: `lxml 0.7`_ released (`changes for 0.7`_) +.. _`installation instructions`: installation.html -* 2005-05-14: `lxml 0.6`_ released (`changes for 0.6`_) +* `lxml 0.9.2`_, released 2006-05-10 (`changes for 0.9.2`_) -* 2005-04-09: `lxml 0.5.1`_ released (`changes for 0.5.1`_) +* `lxml 0.9.1`_, released 2006-03-30 (`changes for 0.9.1`_) -* 2005-04-08: `lxml 0.5`_ released! +* `lxml 0.9`_, released 2006-03-20 (`changes for 0.9`_) -.. _`lxml 0.9.1`: lxml-0.9.1.tgz +* `lxml 0.8`_, released 2005-11-03 (`changes for 0.8`_) -.. _`lxml 0.9`: lxml-0.9.tgz +* `lxml 0.7`_, released 2005-06-15 (`changes for 0.7`_) -.. _`lxml 0.8`: lxml-0.8.tgz +* `lxml 0.6`_, released 2005-05-14 (`changes for 0.6`_) -.. _`lxml 0.7`: lxml-0.7.tgz +* `lxml 0.5.1`_, released 2005-04-09 (`changes for 0.5.1`_) -.. _`lxml 0.6`: lxml-0.6.tgz +* `lxml 0.5`_, released 2005-04-08 +.. _`lxml 0.9.2`: lxml-0.9.2.tgz +.. _`lxml 0.9.1`: lxml-0.9.1.tgz +.. _`lxml 0.9`: lxml-0.9.tgz +.. _`lxml 0.8`: lxml-0.8.tgz +.. _`lxml 0.7`: lxml-0.7.tgz +.. _`lxml 0.6`: lxml-0.6.tgz .. _`lxml 0.5.1`: lxml-0.5.1.tgz - .. _`lxml 0.5`: lxml-0.5.tgz +.. _`CHANGES for 0.9.2`: changes-0.9.2.html .. _`CHANGES for 0.9.1`: changes-0.9.1.html - .. _`CHANGES for 0.9`: changes-0.9.html - .. _`CHANGES for 0.8`: changes-0.8.html - .. _`CHANGES for 0.7`: changes-0.7.html - .. _`CHANGES for 0.6`: changes-0.6.html - .. _`CHANGES for 0.5.1`: changes-0.5.1.html +It's also possible to check out the latest development version of lxml +from svn directly, using a command like this:: + + svn co http://codespeak.net/svn/lxml/trunk lxml + +You can also `browse it through the web`_. Please read `how to build lxml +from source`_ first. The `latest CHANGES`_ of the developer version are also +accessible. 
You can check there if a bug you found has been fixed or a +feature you want has been implemented in the latest trunk version. + +.. _`how to build lxml from source`: build.html +.. _`browse it through the web`: http://codespeak.net/svn/lxml +.. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + Documentation ------------- @@ -76,25 +92,15 @@ in the standard library. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm - .. _`ElementTree compatibility overview`: compatibility.html - .. _`extends this API`: api.html - .. _`extension functions`: extensions.html - .. _XPath: http://www.w3.org/TR/xpath - .. _`Relax NG`: http://www.relaxng.org/ - .. _`XML Schema`: http://www.w3.org/XML/Schema - .. _`XSLT`: http://www.w3.org/TR/xslt - .. _`c14n`: http://www.w3.org/TR/2001/REC-xml-c14n-20010315 - .. _`implementing namespaces`: namespace_extensions.html - .. _`SAX compliant API`: sax.html Mailing list @@ -104,46 +110,6 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev -Download --------- - -* `lxml 0.9.1`_ (2006-03-30) - -* `lxml 0.9`_ (2006-03-20) - -* `lxml 0.8`_ (2005-11-03) - -* `lxml 0.7`_ (2005-06-15) - -* `lxml 0.6`_ (2005-05-14) - -* `lxml 0.5.1`_ (2005-04-09) - -* `lxml 0.5`_ (2005-04-08) - -Instead of downloading the source here, you can also find `lxml at the -Python cheeseshop`_ in source, egg and installer form for various -platforms. - -.. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ - -See also the `installation instructions`_. - -.. _`installation instructions`: installation.html - -It's also possible to check out the latest development version of lxml -from svn directly, using a command like this:: - - svn co http://codespeak.net/svn/lxml/trunk lxml - -You can also `browse it through the web`_. The `latest CHANGES`_ of the -developer version are also accessible. 
You can check there if a bug you found -has been fixed or a feature you want has been implemented in the latest trunk -version. - -.. _`browse it through the web`: http://codespeak.net/svn/lxml -.. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt - License ------- From scoder at codespeak.net Wed May 17 10:00:48 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:00:48 +0200 (CEST) Subject: [Lxml-checkins] r27326 - in lxml/branch/lxml-0.9.x: . doc Message-ID: <20060517080048.2171B100B0@code0.codespeak.net> Author: scoder Date: Wed May 17 10:00:46 2006 New Revision: 27326 Added: lxml/branch/lxml-0.9.x/doc/build.txt - copied unchanged from r27325, lxml/trunk/doc/build.txt Modified: lxml/branch/lxml-0.9.x/INSTALL.txt lxml/branch/lxml-0.9.x/doc/main.txt Log: merged in build.txt from trunk Modified: lxml/branch/lxml-0.9.x/INSTALL.txt ============================================================================== --- lxml/branch/lxml-0.9.x/INSTALL.txt (original) +++ lxml/branch/lxml-0.9.x/INSTALL.txt Wed May 17 10:00:46 2006 @@ -26,11 +26,13 @@ than the required version above. While there were not any bug reports so far, you may still encounter certain differences in behaviour in rare cases. -If you want to build lxml from SVN, you also need Pyrex_. If you are using a -released version of lxml, it should come with the generated C file in the -source distribution, so no Pyrex is needed in that case. +If you want to build lxml from SVN, you also need Pyrex_. Please read `how to +build lxml from source`_ in this case. If you are using a released version of +lxml, it should come with the generated C file in the source distribution, so +no Pyrex is needed in that case. .. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ +.. _`how to build lxml from source`: build.html Note that Pyrex up to and including version 0.9.4 has known problems when compiling lxml with gcc 4.0 or Python 2.4. Do not use it. 
If you want to @@ -55,49 +57,5 @@ This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as libxml2 and libxslt are properly installed. To compile and install lxml -without easy_install, download the source tar-ball, unpack it and type:: - - python setup.py install - -If you do not want to install lxml right away, but first test it from the -source directory, you can build it in-place like this:: - - python setup.py build_ext -i - -or just:: - - make - -If you then place lxml's "src" directory on your PYTHONPATH somehow, you can -import lxml.etree and play with it. - - -Running the tests and reporting errors --------------------------------------- - -The source distribution (tgz) contains a test suite for lxml. You can run it -from the top-level directory:: - - python test.py - -Note that the test script only tests the in-place build (see "Installation" -above), as it searches the "src" directory. You can use the following -one-step command to trigger an in-place build and test it:: - - make clean test - -To run the ElementTree and cElementTree compatibility tests, make sure -you have lxml on your PYTHONPATH first, then run:: - - python selftest.py - -and:: - - python selftest2.py - -If the tests give failures, errors, or worse, segmentation faults, we'd really -like to know. Please contact us on the `mailing list`_, and please specify the -version of lxml, libxml2, libxslt and Python you were using, as well as your -operating system type (Linux, Windows, MacOs, ...). - -.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +without easy_install, please read `how to build lxml from source`_ (or the +file ``build.txt`` in the ``doc`` directory of the source tree). 
Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Wed May 17 10:00:46 2006 @@ -63,11 +63,12 @@ svn co http://codespeak.net/svn/lxml/trunk lxml -You can also `browse it through the web`_. The `latest CHANGES`_ of the -developer version are also accessible. You can check there if a bug you found -has been fixed or a feature you want has been implemented in the latest trunk -version. +You can also `browse it through the web`_. Please read `how to build lxml +from source`_ first. The `latest CHANGES`_ of the developer version are also +accessible. You can check there if a bug you found has been fixed or a +feature you want has been implemented in the latest trunk version. +.. _`how to build lxml from source`: build.html .. _`browse it through the web`: http://codespeak.net/svn/lxml .. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt From scoder at codespeak.net Wed May 17 10:06:33 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:06:33 +0200 (CEST) Subject: [Lxml-checkins] r27327 - lxml/trunk/doc Message-ID: <20060517080633.D6D9A100B0@code0.codespeak.net> Author: scoder Date: Wed May 17 10:06:32 2006 New Revision: 27327 Modified: lxml/trunk/doc/build.txt Log: typos etc. Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 10:06:32 2006 @@ -53,7 +53,7 @@ make -If you then place lxml's "src" directory on your PYTHONPATH somehow, you can +If you then place lxml's ``src`` directory on your PYTHONPATH somehow, you can import ``lxml.etree`` and play with it. @@ -66,7 +66,7 @@ python test.py Note that the test script only tests the in-place build (see "Installation" -above), as it searches the "src" directory. 
You can use the following +above), as it searches the ``src`` directory. You can use the following one-step command to trigger an in-place build and test it:: make clean test @@ -91,7 +91,7 @@ Static linking on Windows ------------------------- -Most operating systems have proper package mangement that makes installing +Most operating systems have proper package management that makes installing current versions of libxml2 and libxslt easy. However, Microsoft Windows lacks these capabilities. It can therefore be interesting to statically link the external libraries into lxml.etree to avoid having to install them @@ -100,7 +100,7 @@ .. _`David Sankel`: http://codespeak.net/pipermail/lxml-dev/2006-May/001196.html Download lxml and all required libraries to the same directory. The iconv, -libxml2, libxslt, and zlib libraries are all availible from xmlsoft.org. The +libxml2, libxslt, and zlib libraries are all available from xmlsoft.org. The place to go on the ftp site is ftp://xmlsoft.org/libxml2/win32. Your directory should now have something like the following files in it:: From scoder at codespeak.net Wed May 17 10:06:59 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:06:59 +0200 (CEST) Subject: [Lxml-checkins] r27328 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060517080659.B6706100B3@code0.codespeak.net> Author: scoder Date: Wed May 17 10:06:58 2006 New Revision: 27328 Modified: lxml/branch/lxml-0.9.x/doc/build.txt Log: merged in doc fixes from trunk Modified: lxml/branch/lxml-0.9.x/doc/build.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/build.txt (original) +++ lxml/branch/lxml-0.9.x/doc/build.txt Wed May 17 10:06:58 2006 @@ -53,7 +53,7 @@ make -If you then place lxml's "src" directory on your PYTHONPATH somehow, you can +If you then place lxml's ``src`` directory on your PYTHONPATH somehow, you can import ``lxml.etree`` and play with it. 
@@ -66,7 +66,7 @@ python test.py Note that the test script only tests the in-place build (see "Installation" -above), as it searches the "src" directory. You can use the following +above), as it searches the ``src`` directory. You can use the following one-step command to trigger an in-place build and test it:: make clean test @@ -91,7 +91,7 @@ Static linking on Windows ------------------------- -Most operating systems have proper package mangement that makes installing +Most operating systems have proper package management that makes installing current versions of libxml2 and libxslt easy. However, Microsoft Windows lacks these capabilities. It can therefore be interesting to statically link the external libraries into lxml.etree to avoid having to install them @@ -100,7 +100,7 @@ .. _`David Sankel`: http://codespeak.net/pipermail/lxml-dev/2006-May/001196.html Download lxml and all required libraries to the same directory. The iconv, -libxml2, libxslt, and zlib libraries are all availible from xmlsoft.org. The +libxml2, libxslt, and zlib libraries are all available from xmlsoft.org. The place to go on the ftp site is ftp://xmlsoft.org/libxml2/win32. Your directory should now have something like the following files in it:: From scoder at codespeak.net Wed May 17 10:12:07 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:12:07 +0200 (CEST) Subject: [Lxml-checkins] r27329 - lxml/trunk/doc Message-ID: <20060517081207.29787100B3@code0.codespeak.net> Author: scoder Date: Wed May 17 10:12:06 2006 New Revision: 27329 Modified: lxml/trunk/doc/build.txt Log: doc fixes Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 10:12:06 2006 @@ -125,8 +125,8 @@ zlib-1.2.3.win32/ zlib-1.2.3.win32.zip -Go to the lxml-0.9.2 directory and edit the Makefile. 
There should be a -section that looks like this:: +Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should be +a section that looks like this:: ext_modules = [ Extension( "lxml.etree", From scoder at codespeak.net Wed May 17 10:12:28 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:12:28 +0200 (CEST) Subject: [Lxml-checkins] r27330 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060517081228.DEBA2100B3@code0.codespeak.net> Author: scoder Date: Wed May 17 10:12:27 2006 New Revision: 27330 Modified: lxml/branch/lxml-0.9.x/doc/build.txt Log: merged in doc fixes from trunk Modified: lxml/branch/lxml-0.9.x/doc/build.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/build.txt (original) +++ lxml/branch/lxml-0.9.x/doc/build.txt Wed May 17 10:12:27 2006 @@ -125,8 +125,8 @@ zlib-1.2.3.win32/ zlib-1.2.3.win32.zip -Go to the lxml-0.9.2 directory and edit the Makefile. There should be a -section that looks like this:: +Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should be +a section that looks like this:: ext_modules = [ Extension( "lxml.etree", From scoder at codespeak.net Wed May 17 10:53:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:53:11 +0200 (CEST) Subject: [Lxml-checkins] r27331 - lxml/trunk/doc Message-ID: <20060517085311.8551010050@code0.codespeak.net> Author: scoder Date: Wed May 17 10:53:10 2006 New Revision: 27331 Modified: lxml/trunk/doc/build.txt Log: doc fixes Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 10:53:10 2006 @@ -65,9 +65,9 @@ python test.py -Note that the test script only tests the in-place build (see "Installation" -above), as it searches the ``src`` directory. 
You can use the following -one-step command to trigger an in-place build and test it:: +Note that the test script only tests the in-place build (see distutils +building above), as it searches the ``src`` directory. You can use the +following one-step command to trigger an in-place build and test it:: make clean test From scoder at codespeak.net Wed May 17 10:53:35 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 10:53:35 +0200 (CEST) Subject: [Lxml-checkins] r27332 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060517085335.8AAAB10050@code0.codespeak.net> Author: scoder Date: Wed May 17 10:53:34 2006 New Revision: 27332 Modified: lxml/branch/lxml-0.9.x/doc/build.txt Log: merged in doc fixes from trunk Modified: lxml/branch/lxml-0.9.x/doc/build.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/build.txt (original) +++ lxml/branch/lxml-0.9.x/doc/build.txt Wed May 17 10:53:34 2006 @@ -65,9 +65,9 @@ python test.py -Note that the test script only tests the in-place build (see "Installation" -above), as it searches the ``src`` directory. You can use the following -one-step command to trigger an in-place build and test it:: +Note that the test script only tests the in-place build (see distutils +building above), as it searches the ``src`` directory. 
You can use the +following one-step command to trigger an in-place build and test it:: make clean test From scoder at codespeak.net Wed May 17 11:13:48 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 11:13:48 +0200 (CEST) Subject: [Lxml-checkins] r27333 - lxml/trunk/src/lxml/tests Message-ID: <20060517091348.51C92100A8@code0.codespeak.net> Author: scoder Date: Wed May 17 11:13:47 2006 New Revision: 27333 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: test case for Element.append(None) - could crash in 0.9 Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 17 11:13:47 2006 @@ -119,7 +119,12 @@ self.assertRaises(TypeError, a.__setitem__, 0, 'foo') - + + def test_append_None(self): + # raises AssertionError in ElementTree + Element = self.etree.Element + self.assertRaises(TypeError, Element('a').append, None) + # gives error in ElementTree def test_comment_empty(self): Element = self.etree.Element From scoder at codespeak.net Wed May 17 11:19:38 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 11:19:38 +0200 (CEST) Subject: [Lxml-checkins] r27334 - in lxml/trunk: doc src/lxml src/lxml/tests Message-ID: <20060517091938.71F87100A8@code0.codespeak.net> Author: scoder Date: Wed May 17 11:19:36 2006 New Revision: 27334 Modified: lxml/trunk/doc/compatibility.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: prefer TypeError over AssertionError when passing None into API functions Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Wed May 17 11:19:36 2006 @@ -86,7 +86,9 @@ * When trying to 
set a subelement using __setitem__ that is in fact not an Element but some other object, etree raises a TypeError, and ElementTree - raises an AssertionError. + raises an AssertionError. This also applies to some other places of the + API. In general, etree tries to avoid AssertionErrors in favour of being + more specific about the reason for the exception. * ElementTree ignores comments when parsing XML, while etree will read them in and treat them as Comment elements. Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 17 11:19:36 2006 @@ -1402,16 +1402,12 @@ def iselement(element): return isinstance(element, _Element) -def dump(_NodeBase elem): - assert elem is not None, "Must supply element." - # better, but not ET compatible : "_NodeBase elem not None" +def dump(_NodeBase elem not None): _dumpToFile(sys.stdout, elem._doc._c_doc, elem._c_node) def tostring(element_or_tree, encoding='us-ascii', xml_declaration=None): "Serialize an element to an encoded string representation of its XML tree." 
cdef int write_declaration - assert element_or_tree is not None # for ElementTree compatibility only - encoding = str(encoding) if xml_declaration is None: # by default, write an XML declaration only for non-standard encodings @@ -1435,7 +1431,6 @@ Note that the result does not carry an XML encoding declaration and is therefore not necessarily suited for serialization without further treatment.""" - assert element_or_tree is not None # for ElementTree compatibility only if isinstance(element_or_tree, _NodeBase): return _tounicode(<_NodeBase>element_or_tree) elif isinstance(element_or_tree, _ElementTree): Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed May 17 11:19:36 2006 @@ -1601,10 +1601,6 @@ canonicalize(tostring(b))) self.assertEquals('', canonicalize(tostring(c))) - - def test_tostring_none(self): - tostring = self.etree.tostring - self.assertRaises(AssertionError, self.etree.tostring, None) def test_tostring_element_tail(self): tostring = self.etree.tostring Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 17 11:19:36 2006 @@ -155,7 +155,7 @@ # test passing 'None' to dump def test_dump_none(self): - self.assertRaises(AssertionError, etree.dump, None) + self.assertRaises(TypeError, etree.dump, None) def test_prefix(self): ElementTree = self.etree.ElementTree @@ -430,6 +430,11 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_tostring_none(self): + # ElementTree raises an AssertionError here + tostring = self.etree.tostring + self.assertRaises(TypeError, self.etree.tostring, None) + def test_tounicode(self): tounicode = 
self.etree.tounicode Element = self.etree.Element @@ -461,7 +466,7 @@ def test_tounicode_none(self): tounicode = self.etree.tounicode - self.assertRaises(AssertionError, self.etree.tounicode, None) + self.assertRaises(TypeError, self.etree.tounicode, None) def test_tounicode_element_tail(self): tounicode = self.etree.tounicode From scoder at codespeak.net Wed May 17 11:21:06 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 11:21:06 +0200 (CEST) Subject: [Lxml-checkins] r27335 - lxml/trunk/src/lxml Message-ID: <20060517092106.E3333100A8@code0.codespeak.net> Author: scoder Date: Wed May 17 11:21:05 2006 New Revision: 27335 Modified: lxml/trunk/src/lxml/etree.pyx Log: set lxml.etree.__version__ to LXML_VERSION_STRING Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 17 11:21:05 2006 @@ -84,6 +84,8 @@ LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION) LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING) +__version__ = tree.LXML_VERSION_STRING + # class for temporary storage of Python references cdef class _TempStore: From scoder at codespeak.net Wed May 17 11:32:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 11:32:39 +0200 (CEST) Subject: [Lxml-checkins] r27336 - lxml/branch/lxml-0.9.x/src/lxml Message-ID: <20060517093239.A7B4D10097@code0.codespeak.net> Author: scoder Date: Wed May 17 11:32:38 2006 New Revision: 27336 Modified: lxml/branch/lxml-0.9.x/src/lxml/etree.pyx Log: fix exception raising from _raiseIfNone Modified: lxml/branch/lxml-0.9.x/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-0.9.x/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-0.9.x/src/lxml/etree.pyx Wed May 17 11:32:38 2006 @@ -1300,7 +1300,7 @@ # Private helper functions 
-cdef void _raiseIfNone(el): +cdef _raiseIfNone(el): if el is None: raise TypeError, "Argument must not be None." From scoder at codespeak.net Wed May 17 11:36:06 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 11:36:06 +0200 (CEST) Subject: [Lxml-checkins] r27337 - lxml/trunk/src/lxml/tests Message-ID: <20060517093606.8A0291009C@code0.codespeak.net> Author: scoder Date: Wed May 17 11:36:04 2006 New Revision: 27337 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: moved tostring(UTF-16) test case to test_etree as it fails in ElementTree Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed May 17 11:36:04 2006 @@ -1723,19 +1723,6 @@ a.text = u'Søk på nettet' self.assert_(tostring(a, 'UTF-8') in [xml, prologue + xml]) - def test_encoding_tostring_utf16(self): - tostring = self.etree.tostring - Element = self.etree.Element - SubElement = self.etree.SubElement - - a = Element('a') - b = SubElement(a, 'b') - c = SubElement(a, 'c') - - result = unicode(tostring(a, 'UTF-16'), 'UTF-16') - self.assertEquals('', - canonicalize(result)) - def test_encoding_tostring_sub(self): Element = self.etree.Element SubElement = self.etree.SubElement Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 17 11:36:04 2006 @@ -430,6 +430,20 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_encoding_tostring_utf16(self): + # ElementTree fails to serialize this + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = 
SubElement(a, 'b') + c = SubElement(a, 'c') + + result = unicode(tostring(a, 'UTF-16'), 'UTF-16') + self.assertEquals('', + canonicalize(result)) + def test_tostring_none(self): # ElementTree raises an AssertionError here tostring = self.etree.tostring From scoder at codespeak.net Wed May 17 12:13:58 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 12:13:58 +0200 (CEST) Subject: [Lxml-checkins] r27338 - lxml/trunk Message-ID: <20060517101358.A3D2A10050@code0.codespeak.net> Author: scoder Date: Wed May 17 12:13:57 2006 New Revision: 27338 Modified: lxml/trunk/setup.py Log: cleanup in setup.py Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Wed May 17 12:13:57 2006 @@ -12,7 +12,7 @@ except IOError: svn_version = version else: - revision = re.search("]*name=\"\"[^>]*revision=\"([^\"]+)\"", + revision = re.search(']*name=""[^>]*revision="([^"]+)"', svn_entries).group(1) svn_version = version + '-' + revision @@ -47,22 +47,19 @@ sources = ["src/lxml/etree.c"] try: - changelog = open("CHANGES.txt", 'r') + changelog = open(os.path.join(src_dir, "CHANGES.txt"), 'r') except: print "*NOTE*: couldn't open CHANGES.txt !" 
else: - inside = 0 changelog_lines = [] for line in changelog: if line.startswith('====='): - inside += 1 - if inside > 3: + if len(changelog_lines) > 1: break - if inside > 1: + if changelog_lines: changelog_lines.append(line) elif version in line: changelog_lines.append(line) - inside += 1 if changelog_lines: changelog_text = ''.join(changelog_lines[:-1]) From scoder at codespeak.net Wed May 17 12:43:24 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 12:43:24 +0200 (CEST) Subject: [Lxml-checkins] r27339 - lxml/branch/lxml-0.9.x Message-ID: <20060517104324.75B0D10050@code0.codespeak.net> Author: scoder Date: Wed May 17 12:43:22 2006 New Revision: 27339 Modified: lxml/branch/lxml-0.9.x/CHANGES.txt Log: CHANGES.txt: possible crashes in 0.9.x when passing None arguments Modified: lxml/branch/lxml-0.9.x/CHANGES.txt ============================================================================== --- lxml/branch/lxml-0.9.x/CHANGES.txt (original) +++ lxml/branch/lxml-0.9.x/CHANGES.txt Wed May 17 12:43:22 2006 @@ -10,6 +10,8 @@ Bugs fixed ---------- +* Some API functions didn't handle invalid None arguments correctly + * Element/SubElement failed to set attribute namespaces from passed ``attrib`` dictionary From scoder at codespeak.net Wed May 17 12:58:41 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 12:58:41 +0200 (CEST) Subject: [Lxml-checkins] r27341 - lxml/trunk/src/lxml Message-ID: <20060517105841.1B60810064@code0.codespeak.net> Author: scoder Date: Wed May 17 12:58:39 2006 New Revision: 27341 Added: lxml/trunk/src/lxml/xmlwriter.pxi - copied unchanged from r27340, lxml/trunk/src/lxml/apihelpers.pxi Log: copied apihelpers.pxi to xmlwriter.pxi before split From scoder at codespeak.net Wed May 17 13:04:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 13:04:39 +0200 (CEST) Subject: [Lxml-checkins] r27342 - lxml/trunk/src/lxml Message-ID: 
<20060517110439.880CB10060@code0.codespeak.net> Author: scoder Date: Wed May 17 13:04:38 2006 New Revision: 27342 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/xmlwriter.pxi Log: moved XML output funtions from apihelpers.pxi to xmlwriter.pxi Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 17 13:04:38 2006 @@ -1,164 +1,4 @@ -# Private helper functions for input/output and API functions - -# XML I/O helpers - -cdef _tostring(_NodeBase element, encoding, int write_xml_declaration): - "Serialize an element to an encoded string representation of its XML tree." - cdef _Document doc - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlBuffer* c_result_buffer - cdef tree.xmlCharEncodingHandler* enchandler - cdef char* c_enc - cdef char* c_version - if element is None: - return None - if encoding in ('utf8', 'UTF8', 'utf-8'): - encoding = 'UTF-8' - doc = element._doc - c_enc = encoding - # it is necessary to *and* find the encoding handler *and* use - # encoding during output - enchandler = tree.xmlFindCharEncodingHandler(c_enc) - c_buffer = tree.xmlAllocOutputBuffer(enchandler) - if c_buffer is NULL: - raise LxmlError, "Failed to create output buffer" - - try: - _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, - doc._c_doc.version, c_enc, write_xml_declaration) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - c_result_buffer = c_buffer.conv - else: - c_result_buffer = c_buffer.buffer - result = python.PyString_FromStringAndSize( - tree.xmlBufferContent(c_result_buffer), - tree.xmlBufferLength(c_result_buffer)) - finally: - tree.xmlOutputBufferClose(c_buffer) - return result - -cdef _tounicode(_NodeBase element): - "Serialize an element to the Python unicode representation of its XML tree." 
- cdef _Document doc - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlBuffer* c_result_buffer - if element is None: - return None - doc = element._doc - c_buffer = tree.xmlAllocOutputBuffer(NULL) - if c_buffer is NULL: - raise LxmlError, "Failed to create output buffer" - try: - _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, - NULL, NULL, 0) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - c_result_buffer = c_buffer.conv - else: - c_result_buffer = c_buffer.buffer - result = python.PyUnicode_DecodeUTF8( - tree.xmlBufferContent(c_result_buffer), - tree.xmlBufferLength(c_result_buffer), - 'strict') - finally: - tree.xmlOutputBufferClose(c_buffer) - return result - -cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, - xmlDoc* c_doc, xmlNode* c_node, - char* xml_version, char* encoding, - int write_xml_declaration): - if write_xml_declaration: - _writeDeclarationToBuffer(c_buffer, xml_version, encoding) - - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, 0, encoding) - _dumpNextNode(c_buffer, c_doc, c_node, encoding) - -cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, - char* version, char* encoding): - if version is NULL: - version = "1.0" - tree.xmlOutputBufferWriteString(c_buffer, "\n") - -# output to file-like objects -cdef class _FileWriter: - cdef object _filelike - cdef _ExceptionContext _exc_context - def __init__(self, filelike, exc_context=None): - self._filelike = filelike - if exc_context is None: - self._exc_context = _ExceptionContext() - else: - self._exc_context = exc_context - - cdef tree.xmlOutputBuffer* _createOutputBuffer( - self, tree.xmlCharEncodingHandler* enchandler) except NULL: - cdef tree.xmlOutputBuffer* c_buffer - c_buffer = tree.xmlOutputBufferCreateIO( - _writeFilelikeWriter, _closeFilelikeWriter, - self, enchandler) - if c_buffer is NULL: - raise IOError, "Could not create I/O writer context." 
- return c_buffer - - cdef int write(self, char* c_buffer, int len): - try: - if self._filelike is None: - raise IOError, "File is already closed" - py_buffer = python.PyString_FromStringAndSize(c_buffer, len) - self._filelike.write(py_buffer) - return len - except Exception: - self._exc_context._store_raised() - return -1 - - cdef int close(self): - # we should not close the file here as we didn't open it - self._filelike = None - return 0 - -cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int len): - return (<_FileWriter>ctxt).write(c_buffer, len) - -cdef int _closeFilelikeWriter(void* ctxt): - return (<_FileWriter>ctxt).close() - -cdef _tofile(f, _NodeBase element, encoding, int write_declaration): - cdef _FileWriter writer - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlCharEncodingHandler* enchandler - cdef char* c_enc - if encoding is None: - c_enc = NULL - else: - c_enc = encoding - - enchandler = tree.xmlFindCharEncodingHandler(c_enc) - if python.PyString_Check(f) or python.PyUnicode_Check(f): - filename = _utf8(f) - c_buffer = tree.xmlOutputBufferCreateFilename( - _cstr(filename), enchandler, 0) - elif hasattr(f, 'write'): - writer = _FileWriter(f) - c_buffer = writer._createOutputBuffer(enchandler) - else: - raise TypeError, "File or filename expected, got '%s'" % type(f) - - _writeNodeToBuffer(c_buffer, - element._doc._c_doc, element._c_node, - element._doc._c_doc.version, c_enc, - write_declaration) - - tree.xmlOutputBufferClose(c_buffer) - if writer is not None: - writer._exc_context._raise_if_stored() - -# Private helper functions +# Private helper functions for API functions cdef void displayNode(xmlNode* c_node, indent): # to help with debugging @@ -260,27 +100,6 @@ c_attrib_node.ns.href) return funicode(value) -cdef _dumpToFile(f, xmlDoc* c_doc, xmlNode* c_node): - cdef python.PyObject* o - cdef tree.xmlOutputBuffer* c_buffer - - if not python.PyFile_Check(f): - raise ValueError, "Not a file" - o = f - c_buffer = 
tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(o), NULL) - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, 0, NULL) - # dump next node if it's a text node - _dumpNextNode(c_buffer, c_doc, c_node, NULL) - tree.xmlOutputBufferWriteString(c_buffer, '\n') - tree.xmlOutputBufferFlush(c_buffer) - -cdef void _dumpNextNode(tree.xmlOutputBuffer* c_buffer, xmlDoc* c_doc, - xmlNode* c_node, char* encoding): - cdef xmlNode* c_next - c_next = c_node.next - if c_next is not NULL and c_next.type == tree.XML_TEXT_NODE: - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_next, 0, 0, encoding) - cdef object __REPLACE_XML_ENCODING __REPLACE_XML_ENCODING = re.compile( r'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U).sub Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 17 13:04:38 2006 @@ -1456,6 +1456,7 @@ include "nsclasses.pxi" # Namespace implementation and registry include "docloader.pxi" # Support for custom document loaders include "parser.pxi" # XML Parser +include "xmlwriter.pxi" # XML output functions include "xmlid.pxi" # XMLID and IDDict include "extensions.pxi" # XPath/XSLT extension functions include "xpath.pxi" # XPath evaluation Modified: lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlwriter.pxi (original) +++ lxml/trunk/src/lxml/xmlwriter.pxi Wed May 17 13:04:38 2006 @@ -1,6 +1,4 @@ -# Private helper functions for input/output and API functions - -# XML I/O helpers +# XML serialization and output functions cdef _tostring(_NodeBase element, encoding, int write_xml_declaration): "Serialize an element to an encoded string representation of its XML tree." 
@@ -86,6 +84,7 @@ tree.xmlOutputBufferWriteString(c_buffer, "'?>\n") # output to file-like objects + cdef class _FileWriter: cdef object _filelike cdef _ExceptionContext _exc_context @@ -158,107 +157,7 @@ if writer is not None: writer._exc_context._raise_if_stored() -# Private helper functions - -cdef void displayNode(xmlNode* c_node, indent): - # to help with debugging - cdef xmlNode* c_child - print indent * ' ', c_node - c_child = c_node.children - while c_child is not NULL: - displayNode(c_child, indent + 1) - c_child = c_child.next - -cdef _Document _documentOrRaise(object input): - cdef _Document doc - doc = _documentOf(input) - if doc is None: - raise TypeError, "Invalid input object: %s" % type(input) - else: - return doc - -cdef _Document _documentOf(object input): - # call this to get the document of a - # _Document, _ElementTree or _NodeBase object - if isinstance(input, _ElementTree): - return (<_ElementTree>input)._doc - elif isinstance(input, _NodeBase): - return (<_NodeBase>input)._doc - elif isinstance(input, _Document): - return <_Document>input - else: - return None - -cdef _NodeBase _rootNodeOf(object input): - # call this to get the root node of a - # _Document, _ElementTree or _NodeBase object - if isinstance(input, _ElementTree): - return (<_ElementTree>input)._context_node - elif isinstance(input, _NodeBase): - return <_NodeBase>input - elif isinstance(input, _Document): - return (<_Document>input).getroot() - else: - return None - -cdef xmlDoc* _fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node): - # build a temporary document that has the given node as root node - # note that copy and original must not be modified during its lifetime!! - # always call _destroyFakeDoc() after use! - cdef xmlNode* c_child - cdef xmlNode* c_root - cdef xmlDoc* c_doc - c_root = tree.xmlDocGetRootElement(c_base_doc) - if c_root == c_node: - # already the root node - return c_base_doc - - c_doc = tree.xmlCopyDoc(c_base_doc, 0) # non recursive! 
- c_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! - - c_root.children = c_node.children - c_root.last = c_node.last - c_root.next = c_root.prev = c_root.parent = NULL - - # store original node - c_root._private = c_node - - # divert parent pointers of children - c_child = c_root.children - while c_child is not NULL: - c_child.parent = c_root - c_child = c_child.next - - c_doc.children = c_root - return c_doc - -cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc): - # delete a temporary document - cdef xmlNode* c_child - cdef xmlNode* c_parent - cdef xmlNode* c_root - if c_doc != c_base_doc: - c_root = tree.xmlDocGetRootElement(c_doc) - - # restore parent pointers of children - c_parent = c_root._private - c_child = c_root.children - while c_child is not NULL: - c_child.parent = c_parent - c_child = c_child.next - - # prevent recursive removal of children - c_root.children = c_root.last = c_root._private = NULL - tree.xmlFreeDoc(c_doc) - -cdef object _attributeValue(xmlNode* c_element, xmlNode* c_attrib_node): - cdef char* value - if c_attrib_node.ns is NULL or c_attrib_node.ns.href is NULL: - value = tree.xmlGetNoNsProp(c_element, c_attrib_node.name) - else: - value = tree.xmlGetNsProp(c_element, c_attrib_node.name, - c_attrib_node.ns.href) - return funicode(value) +# node dump functions (mainly for debug) cdef _dumpToFile(f, xmlDoc* c_doc, xmlNode* c_node): cdef python.PyObject* o @@ -280,333 +179,3 @@ c_next = c_node.next if c_next is not NULL and c_next.type == tree.XML_TEXT_NODE: tree.xmlNodeDumpOutput(c_buffer, c_doc, c_next, 0, 0, encoding) - -cdef object __REPLACE_XML_ENCODING -__REPLACE_XML_ENCODING = re.compile( - r'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U).sub - -cdef object _stripEncodingDeclaration(object xml_string): - # this is a hack to remove the XML encoding declaration from unicode - return __REPLACE_XML_ENCODING(r'\g<1>', xml_string) - -cdef object _stripDeclaration(object xml_string): - # this is a hack to 
remove the XML declaration when we encode to UTF-8 - xml_string = xml_string.strip() - if xml_string[:5] == '') - if i != -1: - i = i + 2 - while xml_string[i:i+1] in '\n\r ': - i = i+1 - xml_string = xml_string[i:] - return xml_string - -cdef _collectText(xmlNode* c_node): - """Collect all text nodes and return them as a unicode string. - - Start collecting at c_node. - - If there was no text to collect, return None - """ - cdef Py_ssize_t scount - cdef char* text - cdef xmlNode* c_node_cur - # check for multiple text nodes - scount = 0 - text = NULL - c_node_cur = c_node - while c_node_cur is not NULL and c_node_cur.type == tree.XML_TEXT_NODE: - if c_node_cur.content[0] != c'\0': - text = c_node_cur.content - scount = scount + 1 - c_node_cur = c_node_cur.next - - # handle two most common cases first - if text is NULL: - return None - if scount == 1: - return funicode(text) - - # the rest is not performance critical anymore - result = '' - while c_node is not NULL and c_node.type == tree.XML_TEXT_NODE: - result = result + c_node.content - c_node = c_node.next - return funicode(result) - -cdef _removeText(xmlNode* c_node): - """Remove all text nodes. - - Start removing at c_node. - """ - cdef xmlNode* c_next - while c_node is not NULL and c_node.type == tree.XML_TEXT_NODE: - c_next = c_node.next - tree.xmlUnlinkNode(c_node) - # XXX cannot safely free in case of direct text node proxies.. - tree.xmlFreeNode(c_node) - c_node = c_next - -cdef xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): - if index < 0: - return _findChildBackwards(c_node, -index - 1) - else: - return _findChildForwards(c_node, index) - -cdef xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index): - """Return child element of c_node with index, or return NULL if not found. 
- """ - cdef xmlNode* c_child - cdef Py_ssize_t c - c_child = c_node.children - c = 0 - while c_child is not NULL: - if _isElement(c_child): - if c == index: - return c_child - c = c + 1 - c_child = c_child.next - return NULL - -cdef xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index): - """Return child element of c_node with index, or return NULL if not found. - Search from the end. - """ - cdef xmlNode* c_child - cdef Py_ssize_t c - c_child = c_node.last - c = 0 - while c_child is not NULL: - if _isElement(c_child): - if c == index: - return c_child - c = c + 1 - c_child = c_child.prev - return NULL - -cdef xmlNode* _nextElement(xmlNode* c_node): - """Given a node, find the next sibling that is an element. - """ - c_node = c_node.next - while c_node is not NULL: - if _isElement(c_node): - return c_node - c_node = c_node.next - return NULL - -cdef xmlNode* _previousElement(xmlNode* c_node): - """Given a node, find the next sibling that is an element. - """ - c_node = c_node.prev - while c_node is not NULL: - if _isElement(c_node): - return c_node - c_node = c_node.prev - return NULL - -cdef xmlNode* _findDepthFirstInDescendents(xmlNode* c_node, - char* c_href, char* c_name): - if c_node is NULL: - return NULL - c_node = c_node.children - if c_node is NULL: - return NULL - if not _isElement(c_node): - c_node = _nextElement(c_node) - return _findDepthFirstInFollowing(c_node, c_href, c_name) - -cdef xmlNode* _findDepthFirstInFollowingSiblings(xmlNode* c_node, - char* c_href, char* c_name): - if c_node is NULL: - return NULL - c_node = _nextElement(c_node) - return _findDepthFirstInFollowing(c_node, c_href, c_name) - -cdef xmlNode* _findDepthFirstInFollowing(xmlNode* c_node, - char* c_href, char* c_name): - """Find the next matching node by traversing: - 1) the node itself - 2) its descendents - 3) its following siblings. 
- """ - cdef xmlNode* c_child - if c_name is NULL: - # always match - return c_node - while c_node is not NULL: - if _tagMatches(c_node, c_href, c_name): - return c_node - if c_node.children is not NULL: - c_child = _findDepthFirstInFollowing(c_node.children, c_href, c_name) - if c_child is not NULL: - return c_child - c_node = _nextElement(c_node) - return NULL - -cdef int _tagMatches(xmlNode* c_node, char* c_href, char* c_name): - if c_name is NULL: - # always match - return 1 - if c_href is NULL: - if c_node.ns is not NULL and c_node.ns.href is not NULL: - return 0 - return cstd.strcmp(c_node.name, c_name) == 0 - elif c_node.ns is NULL or c_node.ns.href is NULL: - return 0 - else: - return cstd.strcmp(c_node.name, c_name) == 0 and \ - cstd.strcmp(c_node.ns.href, c_href) == 0 - -cdef void _removeNode(xmlNode* c_node): - """Unlink and free a node and subnodes if possible. - """ - tree.xmlUnlinkNode(c_node) - attemptDeallocation(c_node) - -cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): - cdef xmlNode* c_next - # tail support: look for any text nodes trailing this node and - # move them too - while c_tail is not NULL and c_tail.type == tree.XML_TEXT_NODE: - c_next = c_tail.next - tree.xmlUnlinkNode(c_tail) - tree.xmlAddNextSibling(c_target, c_tail) - c_target = c_tail - c_tail = c_next - -cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): - """Delete slice, starting with c_node, start counting at start, end at stop. 
- """ - cdef xmlNode* c_next - cdef Py_ssize_t c - if c_node is NULL: - return NULL - # now start deleting nodes - c = start - while c_node is not NULL and c < stop: - c_next = c_node.next - if _isElement(c_node): - _removeText(c_node.next) - c_next = c_node.next - _removeNode(c_node) - c = c + 1 - c_node = c_next - return c_node - -cdef int isutf8(char* s): - cdef char c - c = s[0] - while c != c'\0': - if c & 0x80: - return 1 - s = s + 1 - c = s[0] - return 0 - -cdef object funicode(char* s): - if isutf8(s): - return python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) - return python.PyString_FromString(s) - -cdef object _utf8(object s): - if python.PyString_Check(s): - assert not isutf8(_cstr(s)), "All strings must be Unicode or ASCII" - return s - elif python.PyUnicode_Check(s): - return python.PyUnicode_AsUTF8String(s) - else: - raise TypeError, "Argument must be string or unicode." - -cdef _getNsTag(tag): - """Given a tag, find namespace URI and tag name. - Return None for NS uri if no namespace URI available. - """ - cdef char* c_tag - cdef char* c_pos - cdef int nslen - if isinstance(tag, QName): - tag = (tag).text - tag = _utf8(tag) - c_tag = _cstr(tag) - if c_tag[0] == c'{': - c_pos = tree.xmlStrchr(c_tag+1, c'}') - if c_pos is NULL: - raise ValueError, "Invalid tag name" - nslen = c_pos - c_tag - 1 - ns = python.PyString_FromStringAndSize(c_tag+1, nslen) - tag = python.PyString_FromString(c_pos+1) - else: - ns = None - return ns, tag - -cdef object _namespacedName(xmlNode* c_node): - cdef char* href - cdef char* name - name = c_node.name - if c_node.ns is NULL or c_node.ns.href is NULL: - return funicode(name) - else: - href = c_node.ns.href - s = python.PyString_FromFormat("{%s}%s", href, name) - if isutf8(href) or isutf8(name): - return python.PyUnicode_FromEncodedObject(s, 'UTF-8', NULL) - else: - return s - -cdef _getFilenameForFile(source): - """Given a Python File or Gzip object, give filename back. - - Returns None if not a file object. 
- """ - # file instances have a name attribute - if hasattr(source, 'name'): - return source.name - # gzip file instances have a filename attribute - if hasattr(source, 'filename'): - return source.filename - # urllib2 - if hasattr(source, 'geturl'): - return source.geturl() - return None - -cdef void changeDocumentBelow(_NodeBase node, _Document doc, int recursive): - """For a node and all nodes below, change document. - - A node can change document in certain operations as an XML - subtree can move. This updates all possible proxies in the - tree below (including the current node). It also reconciliates - namespaces so they're correct inside the new environment. - """ - if recursive: - changeDocumentBelowHelper(node._c_node, doc) - tree.xmlReconciliateNs(doc._c_doc, node._c_node) - -cdef void changeDocumentBelowHelper(xmlNode* c_node, _Document doc): - cdef ProxyRef* ref - cdef xmlNode* c_current - cdef xmlAttr* c_attr_current - cdef _NodeBase proxy - - if c_node is NULL: - return - # different _c_doc - c_node.doc = doc._c_doc - - if c_node._private is not NULL: - ref = c_node._private - while ref is not NULL: - proxy = <_NodeBase>ref.proxy - proxy._doc = doc - ref = ref.next - - # adjust all children - c_current = c_node.children - while c_current is not NULL: - changeDocumentBelowHelper(c_current, doc) - c_current = c_current.next - - # adjust all attributes - c_attr_current = c_node.properties - while c_attr_current is not NULL: - changeDocumentBelowHelper(c_current, doc) - c_attr_current = c_attr_current.next From scoder at codespeak.net Wed May 17 13:16:31 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 13:16:31 +0200 (CEST) Subject: [Lxml-checkins] r27343 - lxml/trunk/src/lxml Message-ID: <20060517111631.2FCDE10064@code0.codespeak.net> Author: scoder Date: Wed May 17 13:16:30 2006 New Revision: 27343 Modified: lxml/trunk/src/lxml/xslt.pxi Log: fixed potential memory leak on exception in _XSLTResultTree.__str__ Modified: 
lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 17 13:16:30 2006 @@ -395,8 +395,10 @@ if s is NULL: return '' # we must not use 'funicode' here as this is not always UTF-8 - result = python.PyString_FromStringAndSize(s, l) - tree.xmlFree(s) + try: + result = python.PyString_FromStringAndSize(s, l) + finally: + tree.xmlFree(s) return result def __unicode__(self): @@ -409,8 +411,10 @@ encoding = self._xslt._c_style.encoding if encoding is NULL: encoding = 'ascii' - result = python.PyUnicode_Decode(s, l, encoding, 'strict') - tree.xmlFree(s) + try: + result = python.PyUnicode_Decode(s, l, encoding, 'strict') + finally: + tree.xmlFree(s) return _stripEncodingDeclaration(result) cdef _xsltResultTreeFactory(_Document doc, XSLT xslt): From faassen at codespeak.net Wed May 17 13:28:33 2006 From: faassen at codespeak.net (faassen at codespeak.net) Date: Wed, 17 May 2006 13:28:33 +0200 (CEST) Subject: [Lxml-checkins] r27345 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060517112833.A996310063@code0.codespeak.net> Author: faassen Date: Wed May 17 13:28:31 2006 New Revision: 27345 Modified: lxml/branch/lxml-0.9.x/doc/build.txt Log: Fix ReST errors. Modified: lxml/branch/lxml-0.9.x/doc/build.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/build.txt (original) +++ lxml/branch/lxml-0.9.x/doc/build.txt Wed May 17 13:28:31 2006 @@ -105,25 +105,25 @@ Your directory should now have something like the following files in it:: -iconv-1.9.1.win32.zip -libxml2-2.6.23.win32.zip -libxslt-1.1.15.win32.zip -lxml-0.9.2.tgz -zlib-1.2.3.win32.zip + iconv-1.9.1.win32.zip + libxml2-2.6.23.win32.zip + libxslt-1.1.15.win32.zip + lxml-0.9.2.tgz + zlib-1.2.3.win32.zip Now extract each of those files in the _same_ directory. 
Now you should have something like this:: -iconv-1.9.1.win32/ -iconv-1.9.1.win32.zip -libxml2-2.6.23.win32/ -libxml2-2.6.23.win32.zip -libxslt-1.1.15.win32/ -libxslt-1.1.15.win32.zip -lxml-0.9.2/ -lxml-0.9.2.tgz -zlib-1.2.3.win32/ -zlib-1.2.3.win32.zip + iconv-1.9.1.win32/ + iconv-1.9.1.win32.zip + libxml2-2.6.23.win32/ + libxml2-2.6.23.win32.zip + libxslt-1.1.15.win32/ + libxslt-1.1.15.win32.zip + lxml-0.9.2/ + lxml-0.9.2.tgz + zlib-1.2.3.win32/ + zlib-1.2.3.win32.zip Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should be a section that looks like this:: From faassen at codespeak.net Wed May 17 13:28:41 2006 From: faassen at codespeak.net (faassen at codespeak.net) Date: Wed, 17 May 2006 13:28:41 +0200 (CEST) Subject: [Lxml-checkins] r27346 - lxml/branch/lxml-0.9.x Message-ID: <20060517112841.A58BD10063@code0.codespeak.net> Author: faassen Date: Wed May 17 13:28:40 2006 New Revision: 27346 Modified: lxml/branch/lxml-0.9.x/CREDITS.txt Log: Update credits. Modified: lxml/branch/lxml-0.9.x/CREDITS.txt ============================================================================== --- lxml/branch/lxml-0.9.x/CREDITS.txt (original) +++ lxml/branch/lxml-0.9.x/CREDITS.txt Wed May 17 13:28:40 2006 @@ -7,7 +7,8 @@ Stefan Behnel - core development work (SAX support, misc patches) -Olivier Grisel - improved (c)ElementTree compatibility patches +Olivier Grisel - improved (c)ElementTree compatibility patches, + website improvements. Florian Wagner - help with copy.deepcopy support, bug reporting @@ -41,6 +42,10 @@ Trent Mick - setup.py patch +Steve Howe - Windows builds + +David Sankel - building statically on Windows + Thanks also to: * the libxml2 project for a great XML library. @@ -53,4 +58,3 @@ Holger Krekel for hosting it on codespeak.net * Infrae for initiating the project. 
- From scoder at codespeak.net Wed May 17 13:44:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 13:44:40 +0200 (CEST) Subject: [Lxml-checkins] r27347 - in lxml/trunk: . src/lxml Message-ID: <20060517114440.BDDE210063@code0.codespeak.net> Author: scoder Date: Wed May 17 13:44:39 2006 New Revision: 27347 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/xslt.pxi Log: register all libxslt extra functions (document, write, debug, output) Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 17 13:44:39 2006 @@ -44,7 +44,8 @@ * Implementation of exslt:regexp for XSLT based on the Python 're' module, enabled by default, can be switched off with 'regexp=False' keyword argument -* Support for exslt extensions (libexslt) and node-set function +* Support for exslt extensions (libexslt) and libxslt extra functions + (node-set, document, write, output) * Substantial speedup in XPath.evaluate() Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed May 17 13:44:39 2006 @@ -423,18 +423,9 @@ result._xslt = xslt return result -# do not register all libxslt extra functions, provide only "node-set" -# functions like "output" and "write" are a potential security risk -#xslt.xsltRegisterAllExtras() -xslt.xsltRegisterExtModuleFunction("node-set", - xslt.XSLT_LIBXSLT_NAMESPACE, - xslt.xsltFunctionNodeSet) -xslt.xsltRegisterExtModuleFunction("node-set", - xslt.XSLT_SAXON_NAMESPACE, - xslt.xsltFunctionNodeSet) -xslt.xsltRegisterExtModuleFunction("node-set", - xslt.XSLT_XT_NAMESPACE, - xslt.xsltFunctionNodeSet) +# functions like "output" and "write" are a potential security risk, but we +# rely on the user to configure XSLTAccessControl as needed +xslt.xsltRegisterAllExtras() # 
enable EXSLT support for XSLT xslt.exsltRegisterAll() From scoder at codespeak.net Wed May 17 13:51:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 13:51:54 +0200 (CEST) Subject: [Lxml-checkins] r27348 - lxml/trunk/doc Message-ID: <20060517115154.DC51610063@code0.codespeak.net> Author: scoder Date: Wed May 17 13:51:53 2006 New Revision: 27348 Modified: lxml/trunk/doc/api.txt Log: short note on EXSLT etc. support in XSLT Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 17 13:51:53 2006 @@ -333,6 +333,13 @@ >>> str(result) '\nA\n' +By default, XSLT supports all extension functions from libxslt and libexslt as +well as Python regular expressions through EXSLT. Note that some extensions +enable style sheets to read and write files on the local file system. See the +`document loader documentation`_ on how to deal with this. + +.. _`resolver documentation`: resolvers.html + RelaxNG ------- From scoder at codespeak.net Wed May 17 14:02:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:02:11 +0200 (CEST) Subject: [Lxml-checkins] r27349 - lxml/trunk/doc Message-ID: <20060517120211.B2DF410063@code0.codespeak.net> Author: scoder Date: Wed May 17 14:02:08 2006 New Revision: 27349 Modified: lxml/trunk/doc/main.txt Log: typo Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Wed May 17 14:02:08 2006 @@ -89,7 +89,7 @@ simple way to write arbitrary XML driven APIs on top of lxml. lxml also offers a `SAX compliant API`_, that works with the SAX support -in the standar dlibrary. +in the standard library. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm .. 
_`ElementTree compatibility overview`: compatibility.html From faassen at codespeak.net Wed May 17 14:17:03 2006 From: faassen at codespeak.net (faassen at codespeak.net) Date: Wed, 17 May 2006 14:17:03 +0200 (CEST) Subject: [Lxml-checkins] r27351 - lxml/branch/lxml-0.9.x/doc Message-ID: <20060517121703.5B31110063@code0.codespeak.net> Author: faassen Date: Wed May 17 14:17:01 2006 New Revision: 27351 Modified: lxml/branch/lxml-0.9.x/doc/main.txt Log: Fix typo. Modified: lxml/branch/lxml-0.9.x/doc/main.txt ============================================================================== --- lxml/branch/lxml-0.9.x/doc/main.txt (original) +++ lxml/branch/lxml-0.9.x/doc/main.txt Wed May 17 14:17:01 2006 @@ -89,7 +89,7 @@ simple way to write arbitrary XML driven APIs on top of lxml. lxml also offers a `SAX compliant API`_, that works with the SAX support -in the standar dlibrary. +in the standard library. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm .. _`ElementTree compatibility overview`: compatibility.html From faassen at codespeak.net Wed May 17 14:17:14 2006 From: faassen at codespeak.net (faassen at codespeak.net) Date: Wed, 17 May 2006 14:17:14 +0200 (CEST) Subject: [Lxml-checkins] r27352 - lxml/www Message-ID: <20060517121714.2E95010063@code0.codespeak.net> Author: faassen Date: Wed May 17 14:17:13 2006 New Revision: 27352 Modified: lxml/www/publish.py Log: Add another file to produce. 
Modified: lxml/www/publish.py ============================================================================== --- lxml/www/publish.py (original) +++ lxml/www/publish.py Wed May 17 14:17:13 2006 @@ -8,7 +8,8 @@ shutil.copy(stylesheet_url, dirname) for name in ['main.txt', 'intro.txt', 'api.txt', 'compatibility.txt', - 'extensions.txt', 'namespace_extensions.txt', 'sax.txt']: + 'extensions.txt', 'namespace_extensions.txt', 'sax.txt', + 'build.txt']: path = os.path.join(lxml_path, 'doc', name) outname = os.path.splitext(name)[0] + '.html' outpath = os.path.join(dirname, outname) From scoder at codespeak.net Wed May 17 14:25:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:25:08 +0200 (CEST) Subject: [Lxml-checkins] r27353 - lxml/trunk Message-ID: <20060517122508.28BC110063@code0.codespeak.net> Author: scoder Date: Wed May 17 14:25:06 2006 New Revision: 27353 Modified: lxml/trunk/MANIFEST.in Log: include generated .html pages in source tgz Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Wed May 17 14:25:06 2006 @@ -1,5 +1,5 @@ include setup.py MANIFEST.in *.txt recursive-include src *.pyx *.pxd *.pxi *.py etree.c etree.h recursive-include src/lxml/tests *.rng *.xslt *.xml -recursive-include doc *.txt *.xml *.mgp +recursive-include doc *.txt *.html *.xml *.mgp exclude doc/pyrex.txt From scoder at codespeak.net Wed May 17 14:25:22 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:25:22 +0200 (CEST) Subject: [Lxml-checkins] r27354 - lxml/trunk/doc Message-ID: <20060517122522.74C5C10063@code0.codespeak.net> Author: scoder Date: Wed May 17 14:25:21 2006 New Revision: 27354 Modified: lxml/trunk/doc/api.txt lxml/trunk/doc/compatibility.txt lxml/trunk/doc/main.txt Log: doc re-reads and fixes Modified: lxml/trunk/doc/api.txt 
============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Wed May 17 14:25:21 2006 @@ -9,10 +9,11 @@ ---------- lxml.etree tries to follow the `ElementTree API`_ wherever it can. There are -however some incompatibilities (see compatibility.txt). The extensions are +however some incompatibilities (see `compatibility`_). The extensions are documented here. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm +.. _`compatibility`: compatibility.html If you need to know which version of lxml is installed, you can access the ``lxml.etree.LXML_VERSION`` attribute to retrieve a version tuple. Note, @@ -338,7 +339,7 @@ enable style sheets to read and write files on the local file system. See the `document loader documentation`_ on how to deal with this. -.. _`resolver documentation`: resolvers.html +.. _`document loader documentation`: resolvers.html RelaxNG Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Wed May 17 14:25:21 2006 @@ -36,17 +36,17 @@ In most parts of the API, ElementTree uses plain strings and unicode strings as what they are. This includes Element.text, Element.tail and many other places. However, the ElementTree parsers assume by default that any string - (`str` or `unicode`) contains ASCII data and raise an exception if strings - do not match the expected encoding. + (`str` or `unicode`) contains ASCII data. They raise an exception if + strings do not match the expected encoding. etree has the same idea about plain strings (`str`) as ElementTree. For unicode strings, however, etree assumes throughout the API that they are Python unicode encoded strings rather than byte data. This includes the parsers. 
It is therefore perfectly correct to pass XML unicode data into the etree parsers in form of Python unicode strings. It is an error, on the - other hand, if unicode strings specify an encoding in their XML declaration. - Note also that Python unicode strings are platform specific. Such an - encoding specifier would not be portable. + other hand, if unicode strings specify an encoding in their XML declaration, + as this conflicts with the characteristic encoding of Python unicode + strings. * ElementTree allows you to place an Element in two different trees as the same time. Thus, this:: @@ -114,7 +114,7 @@ like ElementTree's. copy.copy() however does *not* create a shallow copy where elements are shared between trees, as this makes no sense in the context of libxml2 trees. Note that lxml can deep-copy trees considerably - faster than than ElementTree. + faster than ElementTree. * etree allows navigation to the parent of a node by the ``getparent()`` method. This is not possible in ElementTree as the underlying tree model Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Wed May 17 14:25:21 2006 @@ -99,7 +99,7 @@ .. _`Relax NG`: http://www.relaxng.org/ .. _`XML Schema`: http://www.w3.org/XML/Schema .. _`XSLT`: http://www.w3.org/TR/xslt -.. _`c14n`: http://www.w3.org/TR/2001/REC-xml-c14n-20010315 +.. _`c14n`: http://www.w3.org/TR/xml-c14n .. _`implementing namespaces`: namespace_extensions.html .. 
_`SAX compliant API`: sax.html From scoder at codespeak.net Wed May 17 14:35:44 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:35:44 +0200 (CEST) Subject: [Lxml-checkins] r27356 - lxml/trunk/doc Message-ID: <20060517123544.6FDA010063@code0.codespeak.net> Author: scoder Date: Wed May 17 14:35:43 2006 New Revision: 27356 Modified: lxml/trunk/doc/build.txt Log: doc fixes Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 14:35:43 2006 @@ -103,7 +103,8 @@ libxml2, libxslt, and zlib libraries are all available from xmlsoft.org. The place to go on the ftp site is ftp://xmlsoft.org/libxml2/win32. -Your directory should now have something like the following files in it:: +Your directory should now have the following files in it (although possibly +different versions):: iconv-1.9.1.win32.zip libxml2-2.6.23.win32.zip @@ -111,7 +112,7 @@ lxml-0.9.2.tgz zlib-1.2.3.win32.zip -Now extract each of those files in the _same_ directory. Now you should have +Now extract each of those files in the *same* directory. Now you should have something like this:: iconv-1.9.1.win32/ From scoder at codespeak.net Wed May 17 14:39:35 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:39:35 +0200 (CEST) Subject: [Lxml-checkins] r27358 - in lxml/trunk: . 
doc Message-ID: <20060517123935.BA79710068@code0.codespeak.net> Author: scoder Date: Wed May 17 14:39:34 2006 New Revision: 27358 Modified: lxml/trunk/CREDITS.txt lxml/trunk/doc/build.txt Log: merged in doc updates from branch Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Wed May 17 14:39:34 2006 @@ -7,7 +7,8 @@ Stefan Behnel - core development work (SAX support, misc patches) -Olivier Grisel - improved (c)ElementTree compatibility patches +Olivier Grisel - improved (c)ElementTree compatibility patches, + website improvements. Florian Wagner - help with copy.deepcopy support, bug reporting @@ -41,6 +42,10 @@ Trent Mick - setup.py patch +Steve Howe - Windows builds + +David Sankel - building statically on Windows + Thanks also to: * the libxml2 project for a great XML library. @@ -53,4 +58,3 @@ Holger Krekel for hosting it on codespeak.net * Infrae for initiating the project. - Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 14:39:34 2006 @@ -106,25 +106,25 @@ Your directory should now have the following files in it (although possibly different versions):: -iconv-1.9.1.win32.zip -libxml2-2.6.23.win32.zip -libxslt-1.1.15.win32.zip -lxml-0.9.2.tgz -zlib-1.2.3.win32.zip + iconv-1.9.1.win32.zip + libxml2-2.6.23.win32.zip + libxslt-1.1.15.win32.zip + lxml-0.9.2.tgz + zlib-1.2.3.win32.zip Now extract each of those files in the *same* directory. 
Now you should have something like this:: -iconv-1.9.1.win32/ -iconv-1.9.1.win32.zip -libxml2-2.6.23.win32/ -libxml2-2.6.23.win32.zip -libxslt-1.1.15.win32/ -libxslt-1.1.15.win32.zip -lxml-0.9.2/ -lxml-0.9.2.tgz -zlib-1.2.3.win32/ -zlib-1.2.3.win32.zip + iconv-1.9.1.win32/ + iconv-1.9.1.win32.zip + libxml2-2.6.23.win32/ + libxml2-2.6.23.win32.zip + libxslt-1.1.15.win32/ + libxslt-1.1.15.win32.zip + lxml-0.9.2/ + lxml-0.9.2.tgz + zlib-1.2.3.win32/ + zlib-1.2.3.win32.zip Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should be a section that looks like this:: From scoder at codespeak.net Wed May 17 14:43:06 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 14:43:06 +0200 (CEST) Subject: [Lxml-checkins] r27359 - lxml/trunk/doc Message-ID: <20060517124306.D0CFB1006B@code0.codespeak.net> Author: scoder Date: Wed May 17 14:43:05 2006 New Revision: 27359 Modified: lxml/trunk/doc/build.txt Log: doc fixes Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Wed May 17 14:43:05 2006 @@ -112,7 +112,7 @@ lxml-0.9.2.tgz zlib-1.2.3.win32.zip -Now extract each of those files in the *same* directory. Now you should have +Now extract each of those files in the *same* directory. 
This should give you something like this:: iconv-1.9.1.win32/ From scoder at codespeak.net Wed May 17 15:01:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 17 May 2006 15:01:39 +0200 (CEST) Subject: [Lxml-checkins] r27362 - lxml/trunk Message-ID: <20060517130139.588A510060@code0.codespeak.net> Author: scoder Date: Wed May 17 15:01:37 2006 New Revision: 27362 Modified: lxml/trunk/TODO.txt Log: cleanup in TODO.txt Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Wed May 17 15:01:37 2006 @@ -6,24 +6,10 @@ * potential threading issues in XPath extension functions? -* Python extension functions, threading issues. - -* Improved Relax NG error reporting. Right now we only get valid or invalid. - -* Improved XML Schema error reporting. Right now we only get valid or invalid. - -* Improved error handling in general; test structured exceptions in more - detail. - * See whether XInclude support can mimic ElementTree's API. * Test XML entities, also in an ElementTree context. -* Support for loading files from other places than filesystem, for - instance xslt:include, xslt:import, XInclude, Relax NG import. - -* More tests for error handling. - In general ---------- @@ -31,43 +17,24 @@ * will namespaces nodes of unknown namespaces be added (and never freed?) -* Various (c)ElementTree builders and parser APIs. Are they needed? - * iterparse support would be nice. -* memory errors and memory leaks when returning nodes from XPath - extension functions. - Top level --------- -* parse() support for custom parsers. (?) - * ProcessingInstruction -* XMLID - ElementInterface ----------------- -* improve getiterator() implementation to use Python-level iterators - ElementTree ----------- * _setroot(), even though this is not strictly a public method. -* parse() - this seems hard to implement sanely so this may be an - incompatibility. 
- -* improve write() and write_c14n() support to use file pointers - directly where possible, instead of going through memory. - QName ----- -Not yet implemented. - Features -------- From scoder at codespeak.net Thu May 18 08:38:23 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 08:38:23 +0200 (CEST) Subject: [Lxml-checkins] r27383 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20060518063823.C91BA10063@code0.codespeak.net> Author: scoder Date: Thu May 18 08:38:20 2006 New Revision: 27383 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlwriter.pxi Log: support XML pretty printing in output functions, major cleanup in xmlwriter.pxi Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 18 08:38:20 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Formatted output via ``pretty_print`` keyword to serialization functions + * XSLT can block access to file system and network via ``XSLTAccessControl`` * ElementTree.write() no longer serializes in memory (reduced memory Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 08:38:20 2006 @@ -348,7 +348,7 @@ def __get__(self): return DocInfo(self._doc) - def write(self, file, encoding='us-ascii'): + def write(self, file, encoding='us-ascii', pretty_print=False): if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' if encoding == 'UTF-8' or encoding == 'us-ascii': @@ -356,7 +356,8 @@ write_declaration = 0 else: write_declaration = 1 - _tofile(file, self._context_node, encoding, write_declaration) + _tofilelike(file, self._context_node, encoding, + 
write_declaration, bool(pretty_print)) def getiterator(self, tag=None): root = self.getroot() @@ -1405,12 +1406,15 @@ return isinstance(element, _Element) def dump(_NodeBase elem not None): - _dumpToFile(sys.stdout, elem._doc._c_doc, elem._c_node) + _dumpToFile(sys.stdout, elem._c_node) -def tostring(element_or_tree, encoding='us-ascii', xml_declaration=None): +def tostring(element_or_tree, encoding='us-ascii', + xml_declaration=None, pretty_print=False): "Serialize an element to an encoded string representation of its XML tree." cdef int write_declaration + cdef int c_pretty_print encoding = str(encoding) + c_pretty_print = bool(pretty_print) if xml_declaration is None: # by default, write an XML declaration only for non-standard encodings write_declaration = (encoding != 'us-ascii') @@ -1419,24 +1423,27 @@ if isinstance(element_or_tree, _NodeBase): return _tostring(<_NodeBase>element_or_tree, - encoding, write_declaration) + encoding, write_declaration, c_pretty_print) elif isinstance(element_or_tree, _ElementTree): return _tostring((<_ElementTree>element_or_tree)._context_node, - encoding, write_declaration) + encoding, write_declaration, c_pretty_print) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) -def tounicode(element_or_tree): +def tounicode(element_or_tree, pretty_print=False): """Serialize an element to the Python unicode representation of its XML tree. 
Note that the result does not carry an XML encoding declaration and is therefore not necessarily suited for serialization without further treatment.""" + cdef int c_pretty_print + c_pretty_print = bool(pretty_print) if isinstance(element_or_tree, _NodeBase): - return _tounicode(<_NodeBase>element_or_tree) + return _tounicode(<_NodeBase>element_or_tree, c_pretty_print) elif isinstance(element_or_tree, _ElementTree): - return _tounicode((<_ElementTree>element_or_tree)._context_node) + return _tounicode((<_ElementTree>element_or_tree)._context_node, + c_pretty_print) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Thu May 18 08:38:20 2006 @@ -6,7 +6,7 @@ ctypedef int Py_ssize_t cdef int INT_MAX - cdef FILE* PyFile_AsFile(PyObject* p) + cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) cdef object PyFile_Name(object p) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu May 18 08:38:20 2006 @@ -449,6 +449,24 @@ tostring = self.etree.tostring self.assertRaises(TypeError, self.etree.tostring, None) + def test_tostring_pretty(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + result = tostring(a) + self.assertEquals(result, "") + + result = tostring(a, pretty_print=False) + self.assertEquals(result, "") + + result = tostring(a, pretty_print=True) + self.assertEquals(result, "\n \n \n") + def test_tounicode(self): tounicode = self.etree.tounicode Element = self.etree.Element @@ -497,6 +515,24 @@ self.assert_(tounicode(b) 
== 'Foo' or tounicode(b) == 'Foo') + def test_tounicode_pretty(self): + tounicode = self.etree.tounicode + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + + result = tounicode(a) + self.assertEquals(result, "") + + result = tounicode(a, pretty_print=False) + self.assertEquals(result, "") + + result = tounicode(a, pretty_print=True) + self.assertEquals(result, "\n \n \n") + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Thu May 18 08:38:20 2006 @@ -202,6 +202,7 @@ cdef xmlBuffer* xmlBufferCreate() cdef char* xmlBufferContent(xmlBuffer* buf) cdef int xmlBufferLength(xmlBuffer* buf) + cdef int xmlKeepBlanksDefault(int val) cdef extern from "libxml/xmlIO.h": cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, char* str) Modified: lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlwriter.pxi (original) +++ lxml/trunk/src/lxml/xmlwriter.pxi Thu May 18 08:38:20 2006 @@ -1,8 +1,10 @@ # XML serialization and output functions -cdef _tostring(_NodeBase element, encoding, int write_xml_declaration): +tree.xmlKeepBlanksDefault(0) + +cdef _tostring(_NodeBase element, encoding, + int write_xml_declaration, int pretty_print): "Serialize an element to an encoded string representation of its XML tree." 
- cdef _Document doc cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlBuffer* c_result_buffer cdef tree.xmlCharEncodingHandler* enchandler @@ -12,7 +14,6 @@ return None if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' - doc = element._doc c_enc = encoding # it is necessary to *and* find the encoding handler *and* use # encoding during output @@ -22,8 +23,8 @@ raise LxmlError, "Failed to create output buffer" try: - _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, - doc._c_doc.version, c_enc, write_xml_declaration) + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, + write_xml_declaration, pretty_print) tree.xmlOutputBufferFlush(c_buffer) if c_buffer.conv is not NULL: c_result_buffer = c_buffer.conv @@ -36,20 +37,17 @@ tree.xmlOutputBufferClose(c_buffer) return result -cdef _tounicode(_NodeBase element): +cdef _tounicode(_NodeBase element, int pretty_print): "Serialize an element to the Python unicode representation of its XML tree." - cdef _Document doc cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlBuffer* c_result_buffer if element is None: return None - doc = element._doc c_buffer = tree.xmlAllocOutputBuffer(NULL) if c_buffer is NULL: raise LxmlError, "Failed to create output buffer" try: - _writeNodeToBuffer(c_buffer, doc._c_doc, element._c_node, - NULL, NULL, 0) + _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0, pretty_print) tree.xmlOutputBufferFlush(c_buffer) if c_buffer.conv is not NULL: c_result_buffer = c_buffer.conv @@ -64,14 +62,15 @@ return result cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, - xmlDoc* c_doc, xmlNode* c_node, - char* xml_version, char* encoding, - int write_xml_declaration): + xmlNode* c_node, char* encoding, + int write_xml_declaration, int pretty_print): + cdef xmlDoc* c_doc + c_doc = c_node.doc if write_xml_declaration: - _writeDeclarationToBuffer(c_buffer, xml_version, encoding) + _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) - tree.xmlNodeDumpOutput(c_buffer, c_doc, 
c_node, 0, 0, encoding) - _dumpNextNode(c_buffer, c_doc, c_node, encoding) + tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, pretty_print, encoding) + _writeTail(c_buffer, c_node, encoding, pretty_print) cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, char* version, char* encoding): @@ -83,6 +82,16 @@ tree.xmlOutputBufferWriteString(c_buffer, encoding) tree.xmlOutputBufferWriteString(c_buffer, "'?>\n") + +cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, + char* encoding, int pretty_print): + "Write the element tail." + c_node = c_node.next + while c_node is not NULL and c_node.type == tree.XML_TEXT_NODE: + tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, + pretty_print, encoding) + c_node = c_node.next + # output to file-like objects cdef class _FileWriter: @@ -127,7 +136,8 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FileWriter>ctxt).close() -cdef _tofile(f, _NodeBase element, encoding, int write_declaration): +cdef _tofilelike(f, _NodeBase element, encoding, + int write_xml_declaration, int pretty_print): cdef _FileWriter writer cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlCharEncodingHandler* enchandler @@ -148,34 +158,21 @@ else: raise TypeError, "File or filename expected, got '%s'" % type(f) - _writeNodeToBuffer(c_buffer, - element._doc._c_doc, element._c_node, - element._doc._c_doc.version, c_enc, - write_declaration) + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, + write_xml_declaration, pretty_print) tree.xmlOutputBufferClose(c_buffer) if writer is not None: writer._exc_context._raise_if_stored() -# node dump functions (mainly for debug) +# dump node to file (mainly for debug) -cdef _dumpToFile(f, xmlDoc* c_doc, xmlNode* c_node): - cdef python.PyObject* o +cdef _dumpToFile(f, xmlNode* c_node): cdef tree.xmlOutputBuffer* c_buffer - if not python.PyFile_Check(f): raise ValueError, "Not a file" - o = f - c_buffer = tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(o), NULL) - 
tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, 0, NULL) - # dump next node if it's a text node - _dumpNextNode(c_buffer, c_doc, c_node, NULL) + c_buffer = tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(f), NULL) + tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, 0, NULL) + _writeTail(c_buffer, c_node, NULL, 0) tree.xmlOutputBufferWriteString(c_buffer, '\n') tree.xmlOutputBufferFlush(c_buffer) - -cdef void _dumpNextNode(tree.xmlOutputBuffer* c_buffer, xmlDoc* c_doc, - xmlNode* c_node, char* encoding): - cdef xmlNode* c_next - c_next = c_node.next - if c_next is not NULL and c_next.type == tree.XML_TEXT_NODE: - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_next, 0, 0, encoding) From scoder at codespeak.net Thu May 18 08:55:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 08:55:39 +0200 (CEST) Subject: [Lxml-checkins] r27384 - lxml/trunk/src/lxml Message-ID: <20060518065539.6A27910063@code0.codespeak.net> Author: scoder Date: Thu May 18 08:55:38 2006 New Revision: 27384 Modified: lxml/trunk/src/lxml/xmlwriter.pxi Log: prettification Modified: lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlwriter.pxi (original) +++ lxml/trunk/src/lxml/xmlwriter.pxi Thu May 18 08:55:38 2006 @@ -137,7 +137,7 @@ return (<_FileWriter>ctxt).close() cdef _tofilelike(f, _NodeBase element, encoding, - int write_xml_declaration, int pretty_print): + int write_xml_declaration, int pretty_print): cdef _FileWriter writer cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlCharEncodingHandler* enchandler From scoder at codespeak.net Thu May 18 08:59:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 08:59:08 +0200 (CEST) Subject: [Lxml-checkins] r27385 - in lxml/trunk: . 
src/lxml Message-ID: <20060518065908.C0F0110063@code0.codespeak.net> Author: scoder Date: Thu May 18 08:59:07 2006 New Revision: 27385 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx Log: fix: prevent ElementTree methods from treating empty root node (raise AssertionError) Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 18 08:59:07 2006 @@ -59,6 +59,9 @@ Bugs fixed ---------- +* Some ElementTree methods could crash if the root node was not initialized + (neither file nor element passed to the constructor) + * Element/SubElement failed to set attribute namespaces from passed ``attrib`` dictionary Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 08:59:07 2006 @@ -330,6 +330,11 @@ cdef _Document _doc cdef _NodeBase _context_node + # we have to take care here: the document may not have a root node! + cdef _assertHasRoot(self): + assert self._context_node is not None, \ + "ElementTree not initialized, missing root" + def parse(self, source, parser=None): """Updates self with the content of source and returns its root """ @@ -349,6 +354,7 @@ return DocInfo(self._doc) def write(self, file, encoding='us-ascii', pretty_print=False): + self._assertHasRoot() if encoding in ('utf8', 'UTF8', 'utf-8'): encoding = 'UTF-8' if encoding == 'UTF-8' or encoding == 'us-ascii': @@ -366,22 +372,22 @@ return root.getiterator(tag) def find(self, path): + self._assertHasRoot() root = self.getroot() - assert root is not None if path[:1] == "/": path = "." + path return root.find(path) def findtext(self, path, default=None): + self._assertHasRoot() root = self.getroot() - assert root is not None if path[:1] == "/": path = "." 
+ path return root.findtext(path, default) def findall(self, path): + self._assertHasRoot() root = self.getroot() - assert root is not None if path[:1] == "/": path = "." + path return root.findall(path) @@ -402,6 +408,7 @@ against the same document, it is more efficient to use XPathEvaluator directly. """ + self._assertHasRoot() evaluator = XPathElementEvaluator(self._context_node, namespaces) return evaluator.evaluate(_path, **_variables) @@ -417,6 +424,7 @@ multiple documents, it is more efficient to use the XSLT class directly. """ + self._assertHasRoot() style = XSLT(_xslt, extensions) return style(self, **_kw) @@ -432,6 +440,7 @@ multiple documents, it is more efficient to use the RelaxNG class directly. """ + self._assertHasRoot() schema = RelaxNG(relaxng) return schema.validate(self) @@ -447,6 +456,7 @@ multiple documents, it is more efficient to use the XMLSchema class directly. """ + self._assertHasRoot() schema = XMLSchema(xmlschema) return schema.validate(self) @@ -460,6 +470,7 @@ # at all. The XInclude nodes appear to be still being in the same # parent and same document, but they must not be connected to the # tree.. + self._assertHasRoot() result = xinclude.xmlXIncludeProcessTree(self._context_node._c_node) if result == -1: raise XIncludeError, "XInclude processing failed" @@ -471,6 +482,7 @@ cdef xmlDoc* c_doc cdef char* data cdef int bytes + self._assertHasRoot() c_base_doc = self._doc._c_doc c_doc = _fakeRootDoc(c_base_doc, self._context_node._c_node) From scoder at codespeak.net Thu May 18 10:37:20 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 10:37:20 +0200 (CEST) Subject: [Lxml-checkins] r27388 - in lxml/trunk: . 
doc Message-ID: <20060518083720.AE7FF1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 10:37:19 2006 New Revision: 27388 Added: lxml/trunk/doc/pubkey.asc Modified: lxml/trunk/MANIFEST.in Log: added public key used for package signing Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Thu May 18 10:37:19 2006 @@ -1,5 +1,5 @@ include setup.py MANIFEST.in *.txt recursive-include src *.pyx *.pxd *.pxi *.py etree.c etree.h recursive-include src/lxml/tests *.rng *.xslt *.xml -recursive-include doc *.txt *.html *.xml *.mgp +recursive-include doc *.txt *.html *.xml *.mgp pubkey.asc exclude doc/pyrex.txt Added: lxml/trunk/doc/pubkey.asc ============================================================================== --- (empty file) +++ lxml/trunk/doc/pubkey.asc Thu May 18 10:37:19 2006 @@ -0,0 +1,36 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1.4.2 (GNU/Linux) + +mQGiBEQf3JQRBACciSqxoX0q3VurkRENVVtG/pVqtFh/d2CohbVJlLCrO4s7nnPj +CTfZFt6tmykZjsLJl24XpEJt0O/C0jLcaBqvXVgVvRXHz4DjEYYuQF4LPthhI4MA +4T7ExptX4lU5g3BVJ46vPU8uRBbbxarBRas9rYewgnrYKWpZZCa7yMq+9wCgnyyR +Si4E3viLwi77jda135nA6vcD/iqu8zIl9/dFuUcOvxJrhrm+UdY72puZ1TVczSAH +GOqMjrKkfyHlaJh/ZzWENpTZIfOdVhy7Chvva18vH4Wz7jKj5UeIpRrBvjAD28r3 +Y3W5bfsnpPkvDOyU1vqBsw4q+/250GXEX0JqV2Rbf5yLVgEZPdGrswO460dr4UVS +8RS0BACYTmyrz57AugHc5tRkqNw6o7ux2deOT0c3AbUcOWtOocGumCsUf+M1nOrc +VWkeBWTv4HIIiecWYY/KwIemTthQGjxywaZDxOlBT0BOL/+vfYTq/plZULXr+g90 +rSe82+kLl9N5onkBDJKeDIcJDzRoxIRPV1i0Om/5JBI4jmUnv7QnU3RlZmFuIEJl +aG5lbCA8c2NvZGVyQHVzZXJzLmJlcmxpb3MuZGU+iF8EExECACAFAkQiqKYCGwMG +CwkIBwMCBBUCCAMEFgIDAQIeAQIXgAAKCRANPVNpCNOgHi+2AJ0a0JH8iP3RqrOL +JefvHz1dSl3MxACYo7Ma6CeIgsGnyaSSdNOmNVXn+IhGBBARAgAGBQJEIqk0AAoJ +ELO5mMzzmgZbmCcAoKZ2En1IlsxBpaPPxgWYrUOWfc6hAKCBWODMMOYptCBkSrjg +m3gsrjHgYbQsU3RlZmFuIEJlaG5lbCA8c2NvZGVyQHVzZXJzLnNvdXJjZWZvcmdl 
+Lm5ldD6IYAQTEQIAIAUCRB/clAIbAwYLCQgHAwIEFQIIAwQWAgMBAh4BAheAAAoJ +EA09U2kI06Aen2YAn0hvuDs+Gslq9vPRFFbsFNJI40PmAJ0chjiiEy0xV5C+n6YX +XFuldRDILYhGBBARAgAGBQJEIp4AAAoJELO5mMzzmgZbgKQAn3pWrmFdj8YaEyuR +tEjKVZJDQ6ZVAJ0Y1igwADT40BPra+G/xiLa3YbCrrkCDQREH9ynEAgAiR4/0r0d +doViNECfSLClllu5K0Bo1SEiMtvVNC3sJYgVzBddD8Xn8UAdjyAgmaL5FC2FsNQu +RxxKkNlHNYCq8ZSWtZaL2MQ+SyMUyHv6VXVCGuSW0COpzbx58u+SZpjyESJ1kaZc +73SaIw6kv/dVQHjurwmlo1lg3dLZ3PG08WGCYUMqkkv2K+J7+puzE2Cjo31gTq4s +LYDCV26wjVQ6BqT2EcHQhVEjh0xq5ugc908cr/2FQAKkTifEbF+OVBGWiFMGgri+ +6+G54/BV/RakpvNCFYBiZHn/M9mQaWt7XoTmnEQ1ldq5KNlRhkqnQRF/NK5VpGcQ +29As28aqpZTECwADBgf/WlRvBRI1Q1eIv2falEv7C6sOxqc3kr5z1uUBTRG5v9t6 +ff9k/J4oC6cnQx00GK3ZR8ija6bl8zwu+0m0M3rW49Krb1rsiT7r4ahOZ7p9RRro +oG3NbUJYgMG10D1nxpaioYqa/m+PpILJM0wfYZZEuX0xkZcOB24yb+J7EIcGR09T +mMd5sXtdTU+w/p7Xi2cP61uQ8qixyHBH8E06qgW2JtVFV9rGn7CNUOvkNaUBRnY5 +QxhdkvKJRx7voOLYWZFUBIWgto+6vmTgKmc2Ho6qddzME9UgwUNcknRgm0cf6Cxr +6zPtxZl8a6KemjQcK7kARSmMNCDkqp/Pohe519A5vYhJBBgRAgAJBQJEH9ynAhsM +AAoJEA09U2kI06Aesv4AnjiVQVLzqnNS/64vvMMP1UARY3HtAJ90YxNGhRNIhWYL +UU16oJlGD/9M1Q== +=gWy2 +-----END PGP PUBLIC KEY BLOCK----- From scoder at codespeak.net Thu May 18 10:38:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 10:38:08 +0200 (CEST) Subject: [Lxml-checkins] r27389 - lxml/trunk/doc Message-ID: <20060518083808.6F8551006B@code0.codespeak.net> Author: scoder Date: Thu May 18 10:38:07 2006 New Revision: 27389 Modified: lxml/trunk/doc/build.txt Log: added section by Andreas Pakulat on how to build Debian packages from SVN sources Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 10:38:07 2006 @@ -169,3 +169,26 @@ python setup.py bdist_wininst This will create a windows installer in the ``pkg`` directory. 
+ + +Building Debian packages from Subversion sources +------------------------------------------------ + +`Andreas Pakulat`_ proposed the following approach. + +.. _`Andreas Pakulat`: http://codespeak.net/pipermail/lxml-dev/2006-May/001254.html + +* ``apt-get source lxml`` +* remove the unpacked directory +* tar.gz the trunk version and replace the orig.tar.gz that lies in the + directory +* do ``dpkg -x lxml-...dsc`` and cd into the newly created directory +* run ``dch -i`` and add a comment like "use trunk version", this will + increase the debian version number so apt/dpkg don't get confused +* run ``dpkg-buildpackage -rfakeroot -us -uc`` to build the package + +Eventually dpkg-buildpackage will tell you that some dependencies are missing, +you can either install them manually or run apt-get build-dep lxml + +That will give you .deb packages in the parent directory which can be +installed using ``dpkg -i``. From scoder at codespeak.net Thu May 18 10:41:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 10:41:04 +0200 (CEST) Subject: [Lxml-checkins] r27390 - lxml/trunk/doc Message-ID: <20060518084104.DF4D81006B@code0.codespeak.net> Author: scoder Date: Thu May 18 10:41:03 2006 New Revision: 27390 Modified: lxml/trunk/doc/build.txt Log: doc cleanup Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 10:41:03 2006 @@ -171,8 +171,8 @@ This will create a windows installer in the ``pkg`` directory. -Building Debian packages from Subversion sources ------------------------------------------------- +Building Debian packages from SVN sources +----------------------------------------- `Andreas Pakulat`_ proposed the following approach.
@@ -180,7 +180,7 @@ * ``apt-get source lxml`` * remove the unpacked directory -* tar.gz the trunk version and replace the orig.tar.gz that lies in the +* tar.gz the lxml SVN version and replace the orig.tar.gz that lies in the directory * do ``dpkg -x lxml-...dsc`` and cd into the newly created directory * run ``dch -i`` and add a comment like "use trunk version", this will From scoder at codespeak.net Thu May 18 11:00:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:00:04 +0200 (CEST) Subject: [Lxml-checkins] r27391 - lxml/trunk/src/lxml Message-ID: <20060518090004.D5C3F1006D@code0.codespeak.net> Author: scoder Date: Thu May 18 11:00:03 2006 New Revision: 27391 Modified: lxml/trunk/src/lxml/etree.pyx Log: allow 'alpha' and 'beta' in version strings, represent as -2 and -1 in version tuple Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 11:00:03 2006 @@ -64,6 +64,7 @@ try: version_list.append(int(item)) except ValueError: + item = {'alpha':-2, 'beta':-1}.get(item.lower(), item) version_list.append(item) return tuple(version_list) From scoder at codespeak.net Thu May 18 11:03:51 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:03:51 +0200 (CEST) Subject: [Lxml-checkins] r27392 - lxml/trunk/src/lxml Message-ID: <20060518090351.9479D10061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:03:48 2006 New Revision: 27392 Modified: lxml/trunk/src/lxml/etree.pyx Log: cleanup: make special casing more explicit Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 11:03:48 2006 @@ -64,7 +64,10 @@ try: version_list.append(int(item)) except ValueError: - item = {'alpha':-2, 
'beta':-1}.get(item.lower(), item) + if item == 'alpha': + item = -2 + elif item == 'beta': + item = -1 version_list.append(item) return tuple(version_list) From scoder at codespeak.net Thu May 18 11:10:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:10:37 +0200 (CEST) Subject: [Lxml-checkins] r27393 - lxml/trunk Message-ID: <20060518091037.9FF5610061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:10:36 2006 New Revision: 27393 Modified: lxml/trunk/setup.py Log: fix commit 26466: clean up appending '-lexslt' to xslt_libs Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Thu May 18 11:10:36 2006 @@ -68,8 +68,7 @@ # compile also against libexslt! xslt_libs = flags('xslt-config --libs') -xslt_libs.append('-lexslt') -for i, libname in (): # enumerate(xslt_libs): +for i, libname in enumerate(xslt_libs): if 'exslt' in libname: break if 'xslt' in libname: From scoder at codespeak.net Thu May 18 11:48:31 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:48:31 +0200 (CEST) Subject: [Lxml-checkins] r27395 - in lxml/trunk: . doc Message-ID: <20060518094831.A5E8310061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:48:30 2006 New Revision: 27395 Modified: lxml/trunk/doc/build.txt lxml/trunk/setup.py Log: simplify static compilation by providing a place to fill in the library names Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 11:48:30 2006 @@ -100,8 +100,8 @@ .. _`David Sankel`: http://codespeak.net/pipermail/lxml-dev/2006-May/001196.html Download lxml and all required libraries to the same directory. The iconv, -libxml2, libxslt, and zlib libraries are all available from xmlsoft.org. 
The -place to go on the ftp site is ftp://xmlsoft.org/libxml2/win32. +libxml2, libxslt, and zlib libraries are all available from the ftp site +ftp://ftp.zlatkovic.com/pub/libxml/. Your directory should now have the following files in it (although possibly different versions):: @@ -126,47 +126,47 @@ zlib-1.2.3.win32/ zlib-1.2.3.win32.zip -Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should be -a section that looks like this:: +Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should +be a section near the top that looks like this:: - ext_modules = [ Extension( - "lxml.etree", - sources = sources, - extra_compile_args = ['-w'] + flags('xslt-config --cflags'), - extra_link_args = flags('xslt-config --libs') - )], - -The problem here is that the Windows version of libxslt does not install the -little program ``xslt-config``, which would normally auto-configure the build -process. + def setupStaticBuild(): + cflags = [ + ] + xslt_libs = [ + ] + result = (cflags, xslt_libs) + # return result + raise NotImplementedError, \ + "Static build not configured, see doc/build.txt" Change this section to something like this, but take care to use the correct version numbers:: - ext_modules = [ Extension( - "lxml.etree", - sources = sources, - extra_compile_args = ['-w'] + [ + def setupStaticBuild(): + cflags = [ "-I..\\libxml2-2.6.23.win32\\include ", "-I..\\libxslt-1.1.15.win32\\include", "-I..\\zlib-1.2.3.win32\\include", "-I..\\iconv-1.9.1.win32\\include" - ], - extra_link_args = [ + ] + xslt_libs = [ "..\\libxml2-2.6.23.win32\\lib\\libxml2_a.lib", "..\\libxslt-1.1.15.win32\\lib\\libxslt_a.lib", + "..\\libxslt-1.1.15.win32\\lib\\libexslt_a.lib", "..\\zlib-1.2.3.win32\\lib\\zlib.lib", - "..\\iconv- 1.9.1.win32\\lib\\iconv_a.lib" + "..\\iconv-1.9.1.win32\\lib\\iconv_a.lib" ] - )], + result = (cflags, xslt_libs) + return result The ``_a`` part of the library names means that we are linking statically against the named library files. 
If you want to use DLLs, you need to link against the DLL version of the libraries. -Now you should be able to use setup.py and everything should work well. Try calling:: +Now you should be able to use setup.py and everything should work well. Try +calling:: - python setup.py bdist_wininst + python setup.py bdist_wininst --static This will create a windows installer in the ``pkg`` directory. Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Thu May 18 11:48:30 2006 @@ -1,9 +1,32 @@ import sys, os, os.path, re +setup_args = {} +try: + from setuptools import setup + from setuptools.extension import Extension + # prevent setuptools from making local etree.so copies: + setup_args['zip_safe'] = False +except ImportError: + from distutils.core import setup + from distutils.extension import Extension + +# This is called if the '--static' option is passed +def setupStaticBuild(): + "See doc/build.txt to make this work." 
+ cflags = [ + ] + xslt_libs = [ + ] + result = (cflags, xslt_libs) + # return result + raise NotImplementedError, \ + "Static build not configured, see doc/build.txt" + def flags(cmd): wf, rf, ef = os.popen3(cmd) return rf.read().strip().split(' ') + src_dir = os.path.join(os.getcwd(), os.path.dirname(sys.argv[0])) version = open(os.path.join(src_dir, 'version.txt')).read().strip() @@ -26,17 +49,7 @@ print "Building lxml version", svn_version -setup_args = {} -changelog_text = "" - -try: - from setuptools import setup - from setuptools.extension import Extension - # prevent setuptools from making local etree.so copies: - setup_args['zip_safe'] = False -except ImportError: - from distutils.core import setup - from distutils.extension import Extension +# setup etree extension building try: from Pyrex.Distutils import build_ext as build_pyx @@ -47,6 +60,35 @@ sources = ["src/lxml/etree.c"] try: + sys.argv.remove('--static') +except ValueError: + # we are not compiling statically + cflags = flags('xslt-config --cflags') + xslt_libs = flags('xslt-config --libs') + + # compile also against libexslt! + for i, libname in enumerate(xslt_libs): + if 'exslt' in libname: + break + if 'xslt' in libname: + xslt_libs.insert(i, libname.replace('xslt', 'exslt')) + break +else: + # use the static setup as configured in setupStaticBuild + cflags, xslt_libs = setupStaticBuild() + +ext_modules = [ Extension( + "lxml.etree", + sources = sources, + extra_compile_args = ['-w'] + cflags, + extra_link_args = xslt_libs + )] + + +# setup ChangeLog entry + +changelog_text = "" +try: changelog = open(os.path.join(src_dir, "CHANGES.txt"), 'r') except: print "*NOTE*: couldn't open CHANGES.txt !" @@ -66,14 +108,6 @@ changelog.close() -# compile also against libexslt! 
-xslt_libs = flags('xslt-config --libs') -for i, libname in enumerate(xslt_libs): - if 'exslt' in libname: - break - if 'xslt' in libname: - xslt_libs.insert(i, libname.replace('xslt', 'exslt')) - break setup( name = "lxml", @@ -109,11 +143,6 @@ package_dir = {'': 'src'}, packages = ['lxml'], - ext_modules = [ Extension( - "lxml.etree", - sources = sources, - extra_compile_args = ['-w'] + flags('xslt-config --cflags'), - extra_link_args = xslt_libs - )], + ext_modules = ext_modules, **setup_args ) From scoder at codespeak.net Thu May 18 11:54:27 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:54:27 +0200 (CEST) Subject: [Lxml-checkins] r27396 - lxml/trunk Message-ID: <20060518095427.4362A10061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:54:26 2006 New Revision: 27396 Modified: lxml/trunk/version.txt Log: set version to 1.0.beta Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Thu May 18 11:54:26 2006 @@ -1 +1 @@ -0.9.2 +1.0.beta From scoder at codespeak.net Thu May 18 11:55:05 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:55:05 +0200 (CEST) Subject: [Lxml-checkins] r27397 - lxml/trunk/src/lxml Message-ID: <20060518095505.C517810061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:55:04 2006 New Revision: 27397 Modified: lxml/trunk/src/lxml/etree.pyx Log: small cleanup in version extraction code Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 11:55:04 2006 @@ -62,13 +62,13 @@ l = (version.replace('-', '.').split('.') + [0]*4)[:4] for item in l: try: - version_list.append(int(item)) + item = int(item) except ValueError: if item == 'alpha': item = -2 elif item == 'beta': item = -1 - 
version_list.append(item) + version_list.append(item) return tuple(version_list) cdef __unpackIntVersion(int c_version): From scoder at codespeak.net Thu May 18 11:57:38 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 11:57:38 +0200 (CEST) Subject: [Lxml-checkins] r27398 - lxml/trunk/src/lxml/tests Message-ID: <20060518095738.3494710061@code0.codespeak.net> Author: scoder Date: Thu May 18 11:57:37 2006 New Revision: 27398 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: test case for etree.__version__ string and LXML_VERSION tuple Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu May 18 11:57:37 2006 @@ -16,7 +16,18 @@ class ETreeOnlyTestCase(HelperTestCase): """Tests only for etree, not ElementTree""" etree = etree - + + def test_version(self): + self.assert_(isinstance(etree.__version__, str)) + self.assert_(isinstance(etree.LXML_VERSION, tuple)) + self.assertEqual(len(etree.LXML_VERSION), 4) + self.assert_(isinstance(etree.LXML_VERSION[0], int)) + self.assert_(isinstance(etree.LXML_VERSION[1], int)) + self.assert_(isinstance(etree.LXML_VERSION[2], int)) + self.assert_(isinstance(etree.LXML_VERSION[3], int)) + self.assert_(etree.__version__.startswith( + str(etree.LXML_VERSION[0]))) + def test_parse_error(self): parse = self.etree.parse # from StringIO From scoder at codespeak.net Thu May 18 12:27:47 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 12:27:47 +0200 (CEST) Subject: [Lxml-checkins] r27399 - lxml/trunk/doc Message-ID: <20060518102747.77A421006B@code0.codespeak.net> Author: scoder Date: Thu May 18 12:27:45 2006 New Revision: 27399 Modified: lxml/trunk/doc/build.txt Log: clarifications in doc/build.txt Modified: lxml/trunk/doc/build.txt 
============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 12:27:45 2006 @@ -92,10 +92,11 @@ ------------------------- Most operating systems have proper package management that makes installing -current versions of libxml2 and libxslt easy. However, Microsoft Windows -lacks these capabilities. It can therefore be interesting to statically link -the external libraries into lxml.etree to avoid having to install them -separately. `David Sankel`_ proposed the following approach. +current versions of libxml2 and libxslt easy. The most famous exception is +Microsoft Windows, which entirely lacks these capabilities. It can therefore +be interesting to statically link the external libraries into lxml.etree to +avoid having to install them separately. `David Sankel`_ proposed the +following approach. .. _`David Sankel`: http://codespeak.net/pipermail/lxml-dev/2006-May/001196.html @@ -160,11 +161,11 @@ return result The ``_a`` part of the library names means that we are linking statically -against the named library files. If you want to use DLLs, you need to link -against the DLL version of the libraries. +against the named library files. If you want to use dynamic libraries, you +need to link against the DLL version of the libraries. -Now you should be able to use setup.py and everything should work well. Try -calling:: +Now you should be able to pass the ``--static`` option to setup.py and +everything should work well. 
Try calling:: python setup.py bdist_wininst --static From scoder at codespeak.net Thu May 18 13:01:58 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:01:58 +0200 (CEST) Subject: [Lxml-checkins] r27402 - lxml/trunk Message-ID: <20060518110158.34E581006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:01:57 2006 New Revision: 27402 Modified: lxml/trunk/MANIFEST.in Log: forgot to include .css file for generated HTML pages Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Thu May 18 13:01:57 2006 @@ -1,5 +1,5 @@ include setup.py MANIFEST.in *.txt recursive-include src *.pyx *.pxd *.pxi *.py etree.c etree.h recursive-include src/lxml/tests *.rng *.xslt *.xml -recursive-include doc *.txt *.html *.xml *.mgp pubkey.asc +recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc exclude doc/pyrex.txt From scoder at codespeak.net Thu May 18 13:02:30 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:02:30 +0200 (CEST) Subject: [Lxml-checkins] r27403 - lxml/trunk/doc Message-ID: <20060518110230.0DDCB1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:02:29 2006 New Revision: 27403 Modified: lxml/trunk/doc/build.txt Log: refer to version 1.0.0 in docs Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 13:02:29 2006 @@ -122,12 +122,12 @@ libxml2-2.6.23.win32.zip libxslt-1.1.15.win32/ libxslt-1.1.15.win32.zip - lxml-0.9.2/ - lxml-0.9.2.tgz + lxml-1.0.0/ + lxml-1.0.0.tgz zlib-1.2.3.win32/ zlib-1.2.3.win32.zip -Go to the lxml-0.9.2 directory and edit the file ``setup.py``. There should +Go to the lxml-1.0.0 directory and edit the file ``setup.py``. 
There should be a section near the top that looks like this:: def setupStaticBuild(): From scoder at codespeak.net Thu May 18 13:02:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:02:54 +0200 (CEST) Subject: [Lxml-checkins] r27404 - lxml/trunk/doc Message-ID: <20060518110254.40A9E1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:02:53 2006 New Revision: 27404 Modified: lxml/trunk/doc/build.txt Log: refer to version 1.0.0 in docs Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu May 18 13:02:53 2006 @@ -110,7 +110,7 @@ iconv-1.9.1.win32.zip libxml2-2.6.23.win32.zip libxslt-1.1.15.win32.zip - lxml-0.9.2.tgz + lxml-1.0.0.tgz zlib-1.2.3.win32.zip Now extract each of those files in the *same* directory. This should give you From scoder at codespeak.net Thu May 18 13:13:27 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:13:27 +0200 (CEST) Subject: [Lxml-checkins] r27406 - lxml/trunk/doc Message-ID: <20060518111327.AB0FC1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:13:26 2006 New Revision: 27406 Modified: lxml/trunk/doc/api.txt Log: clarification on tounicode() vs. tostring() Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Thu May 18 13:13:26 2006 @@ -172,12 +172,14 @@ >>> etree.tounicode(et) u'' -Note that the unicode strings returned by ``tounicode()`` never have an XML -declaration and therefore do not specify an encoding. This makes it possible -to pass them back into the lxml parsers. However, you may have to add a -declaration yourself if you want to serialize such a unicode string to a byte -stream later. 
In contrast, the ``tostring()`` function automatically adds a -declaration as needed that reflects the encoding of the returned byte string. +If you want to save the result to a file or pass it over the network, you +should use ``write()`` or ``tostring()`` with an encoding argument (typically +UTF-8) to serialize the XML. The main reason is that unicode strings returned +by ``tounicode()`` never have an XML declaration and therefore do not specify +an encoding. In contrast, the ``tostring()`` function automatically adds a +declaration as needed that reflects the encoding of the returned string. This +makes it possible for other parsers to correctly parse the XML byte stream. +Note that using ``tostring()`` with UTF-8 is also typically faster. xpath method on ElementTree, Element From scoder at codespeak.net Thu May 18 13:27:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:27:40 +0200 (CEST) Subject: [Lxml-checkins] r27408 - lxml/trunk Message-ID: <20060518112740.089CF1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:27:38 2006 New Revision: 27408 Modified: lxml/trunk/CHANGES.txt Log: set current version in CHANGES.txt to 1.0.beta Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 18 13:27:38 2006 @@ -1,8 +1,8 @@ lxml changelog ============== -current -======= +1.0.beta (2006-05-18) +===================== Features added -------------- From scoder at codespeak.net Thu May 18 13:30:52 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:30:52 +0200 (CEST) Subject: [Lxml-checkins] r27409 - lxml/tag/lxml-1.0.beta Message-ID: <20060518113052.E267E10061@code0.codespeak.net> Author: scoder Date: Thu May 18 13:30:51 2006 New Revision: 27409 Added: lxml/tag/lxml-1.0.beta/ - copied from r27408, lxml/trunk/ Log: tag for 1.0.beta From scoder at codespeak.net 
Thu May 18 13:45:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:45:11 +0200 (CEST) Subject: [Lxml-checkins] r27410 - lxml/trunk Message-ID: <20060518114511.B312F1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:45:10 2006 New Revision: 27410 Modified: lxml/trunk/MANIFEST.in Log: explicitly name .txt files in root directory Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Thu May 18 13:45:10 2006 @@ -1,4 +1,5 @@ -include setup.py MANIFEST.in *.txt +include setup.py MANIFEST.in version.txt +include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.txt TODO.txt recursive-include src *.pyx *.pxd *.pxi *.py etree.c etree.h recursive-include src/lxml/tests *.rng *.xslt *.xml recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc From scoder at codespeak.net Thu May 18 13:53:49 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 13:53:49 +0200 (CEST) Subject: [Lxml-checkins] r27411 - lxml/trunk Message-ID: <20060518115349.52B581006B@code0.codespeak.net> Author: scoder Date: Thu May 18 13:53:48 2006 New Revision: 27411 Modified: lxml/trunk/setup.py Log: automise setting trove devel status (alpha/beta/stable) from version string Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Thu May 18 13:53:48 2006 @@ -47,6 +47,13 @@ ''' % svn_version) version_h.close() +if 'alpha' in version: + dev_status = 'Development Status :: 3 - Alpha' +elif 'beta' in version: + dev_status = 'Development Status :: 4 - Beta' +else: + dev_status = 'Development Status :: 5 - Production/Stable' + print "Building lxml version", svn_version # setup etree extension building @@ -130,7 +137,7 @@ """ + changelog_text, classifiers = [ - 'Development Status :: 5 - 
Production/Stable', + dev_status, 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', 'License :: OSI Approved :: BSD License', From scoder at codespeak.net Thu May 18 15:42:55 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 15:42:55 +0200 (CEST) Subject: [Lxml-checkins] r27413 - lxml/trunk Message-ID: <20060518134255.9030A1006B@code0.codespeak.net> Author: scoder Date: Thu May 18 15:42:54 2006 New Revision: 27413 Modified: lxml/trunk/bench.py Log: fix bench.py xslt_extensions_old and xpath_extensions_old, used a non-public API that's no longer available Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Thu May 18 15:42:54 2006 @@ -449,7 +449,7 @@ return element[0] else: return () - extensions = {None : {'child' : return_child}} + extensions = {(None, 'child') : return_child} xpath = self.etree.XPath("child(.)", extensions=extensions) for child in root: xpath(child) @@ -474,7 +474,7 @@ def return_child(_, elements): return elements[0][0] - extensions = {'testns' : {'child' : return_child}} + extensions = {('testns', 'child') : return_child} transform = self.etree.XSLT(tree, extensions) for i in range(10): From scoder at codespeak.net Thu May 18 16:19:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 16:19:08 +0200 (CEST) Subject: [Lxml-checkins] r27414 - in lxml/trunk: . 
src/lxml Message-ID: <20060518141908.D4E741006B@code0.codespeak.net> Author: scoder Date: Thu May 18 16:19:07 2006 New Revision: 27414 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx Log: cleanup changeDocumentBelow, rename it to moveNodeToDocument to reflect its use Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 18 16:19:07 2006 @@ -1,6 +1,16 @@ lxml changelog ============== +current +======= + +Features added +-------------- + +Bugs fixed +---------- + + 1.0.beta (2006-05-18) ===================== Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Thu May 18 16:19:07 2006 @@ -388,7 +388,7 @@ return source.geturl() return None -cdef void changeDocumentBelow(_NodeBase node, _Document doc, int recursive): +cdef void moveNodeToDocument(_NodeBase node, _Document doc): """For a node and all nodes below, change document. A node can change document in certain operations as an XML @@ -396,11 +396,11 @@ tree below (including the current node). It also reconciliates namespaces so they're correct inside the new environment. 
""" - if recursive: - changeDocumentBelowHelper(node._c_node, doc) + if node._doc is not doc: + changeDocumentBelow(node._c_node, doc) tree.xmlReconciliateNs(doc._c_doc, node._c_node) - -cdef void changeDocumentBelowHelper(xmlNode* c_node, _Document doc): + +cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): cdef ProxyRef* ref cdef xmlNode* c_current cdef xmlAttr* c_attr_current @@ -410,22 +410,23 @@ return # different _c_doc c_node.doc = doc._c_doc - - if c_node._private is not NULL: - ref = c_node._private - while ref is not NULL: - proxy = <_NodeBase>ref.proxy - proxy._doc = doc - ref = ref.next # adjust all children c_current = c_node.children while c_current is not NULL: - changeDocumentBelowHelper(c_current, doc) + changeDocumentBelow(c_current, doc) c_current = c_current.next # adjust all attributes c_attr_current = c_node.properties while c_attr_current is not NULL: - changeDocumentBelowHelper(c_current, doc) + changeDocumentBelow(c_current, doc) c_attr_current = c_attr_current.next + + # adjust Python references last + if c_node._private is not NULL: + ref = c_node._private + while ref is not NULL: + proxy = <_NodeBase>ref.proxy + proxy._doc = doc + ref = ref.next Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 16:19:07 2006 @@ -524,16 +524,14 @@ def __setitem__(self, Py_ssize_t index, _NodeBase element): cdef xmlNode* c_node cdef xmlNode* c_next - cdef int foreign c_node = _findChild(self._c_node, index) if c_node is NULL: raise IndexError - foreign = self._doc is not element._doc c_next = element._c_node.next _removeText(c_node.next) tree.xmlReplaceNode(c_node, element._c_node) _moveTail(c_next, element._c_node) - changeDocumentBelow(element, self._doc, foreign) + moveNodeToDocument(element, self._doc) def __delitem__(self, Py_ssize_t index): cdef xmlNode* c_node @@ -552,7 +550,6 @@ 
cdef xmlNode* c_node cdef xmlNode* c_next cdef _Element mynode - cdef int foreign # first, find start of slice c_node = _findChild(self._c_node, start) # now delete the slice @@ -568,7 +565,6 @@ for mynode in value: if mynode is None: raise TypeError, "Node must not be None." - foreign = self._doc is not mynode._doc # store possible text tail c_next = mynode._c_node.next # now move node previous to insertion point @@ -577,7 +573,7 @@ # and move tail just behind his node _moveTail(c_next, mynode._c_node) # move it into a new document - changeDocumentBelow(mynode, self._doc, foreign) + moveNodeToDocument(mynode, self._doc) def __deepcopy__(self, memo): return self.__copy__() @@ -600,8 +596,6 @@ def append(self, _Element element not None): cdef xmlNode* c_next cdef xmlNode* c_node - cdef int foreign - foreign = self._doc is not element._doc c_node = element._c_node # store possible text node c_next = c_node.next @@ -612,7 +606,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - changeDocumentBelow(element, self._doc, foreign) + moveNodeToDocument(element, self._doc) def clear(self): cdef xmlAttr* c_attr @@ -642,16 +636,14 @@ def insert(self, index, _Element element not None): cdef xmlNode* c_node cdef xmlNode* c_next - cdef int foreign c_node = _findChild(self._c_node, index) if c_node is NULL: self.append(element) return - foreign = self._doc is not element._doc c_next = element._c_node.next tree.xmlAddPrevSibling(c_node, element._c_node) _moveTail(c_next, element._c_node) - changeDocumentBelow(element, self._doc, foreign) + moveNodeToDocument(element, self._doc) def remove(self, _Element element not None): cdef xmlNode* c_node @@ -1381,16 +1373,7 @@ c_doc = _newDoc() doc = _documentFactory(c_doc, parser) - etree = _elementTreeFactory(doc, element) - -## # XXX what if element and file are both not None? 
-## if element is not None: -## c_next = element._c_node.next -## tree.xmlDocSetRootElement(etree._c_doc, element._c_node) -## _moveTail(c_next, element._c_node) -## changeDocumentBelow(element, etree) - - return etree + return _elementTreeFactory(doc, element) def HTML(text): cdef _Document doc From scoder at codespeak.net Thu May 18 21:07:47 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 21:07:47 +0200 (CEST) Subject: [Lxml-checkins] r27420 - lxml/trunk/src/lxml Message-ID: <20060518190747.8FD3B1006E@code0.codespeak.net> Author: scoder Date: Thu May 18 21:07:46 2006 New Revision: 27420 Modified: lxml/trunk/src/lxml/etree.pyx Log: whitespace Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 21:07:46 2006 @@ -1148,7 +1148,7 @@ return True else: return False - + cdef _Attrib _attribFactory(_Document doc, xmlNode* c_node): cdef _Attrib result result = getProxy(c_node, PROXY_ATTRIB) From scoder at codespeak.net Thu May 18 21:09:30 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 21:09:30 +0200 (CEST) Subject: [Lxml-checkins] r27421 - lxml/trunk/src/lxml Message-ID: <20060518190930.16B291006E@code0.codespeak.net> Author: scoder Date: Thu May 18 21:09:29 2006 New Revision: 27421 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: do not update C pointers of elements and attributes to the xmlDoc in changeDocumentBelow: already done by libxml2's xmlSetTreeDoc Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Thu May 18 21:09:29 2006 @@ -401,6 +401,11 @@ tree.xmlReconciliateNs(doc._c_doc, node._c_node) cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): + """Update the Python 
references in the tree below the node. + + Note that we expect C pointers to the document to be updated already by + libxml2. + """ cdef ProxyRef* ref cdef xmlNode* c_current cdef xmlAttr* c_attr_current @@ -408,22 +413,14 @@ if c_node is NULL: return - # different _c_doc - c_node.doc = doc._c_doc # adjust all children c_current = c_node.children while c_current is not NULL: changeDocumentBelow(c_current, doc) c_current = c_current.next - - # adjust all attributes - c_attr_current = c_node.properties - while c_attr_current is not NULL: - changeDocumentBelow(c_current, doc) - c_attr_current = c_attr_current.next - # adjust Python references last + # adjust Python references last (may trigger GC on _Document) if c_node._private is not NULL: ref = c_node._private while ref is not NULL: From scoder at codespeak.net Thu May 18 21:10:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 21:10:39 +0200 (CEST) Subject: [Lxml-checkins] r27422 - lxml/trunk Message-ID: <20060518191039.B7C5D1006E@code0.codespeak.net> Author: scoder Date: Thu May 18 21:10:38 2006 New Revision: 27422 Modified: lxml/trunk/bench.py Log: cleanup in get_attributes benchmark: do not set them, only read them Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Thu May 18 21:10:38 2006 @@ -361,9 +361,8 @@ @with_attributes(True) def bench_get_attributes(self, root): for child in root: - child.set('a', 'bla') - for child in root: - child.get('a') + child.get('bla1') + child.get('{attr}test1') def bench_setget_attributes(self, root): for child in root: From scoder at codespeak.net Thu May 18 21:25:19 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 21:25:19 +0200 (CEST) Subject: [Lxml-checkins] r27423 - lxml/trunk/src/lxml Message-ID: <20060518192519.2DA2D1006E@code0.codespeak.net> Author: scoder Date: Thu May 18 21:25:18 
2006 New Revision: 27423 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: more cleanup in changeDocumentBelow to remove redundancy Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Thu May 18 21:25:18 2006 @@ -408,22 +408,16 @@ """ cdef ProxyRef* ref cdef xmlNode* c_current - cdef xmlAttr* c_attr_current cdef _NodeBase proxy - - if c_node is NULL: - return - - # adjust all children + # adjust all children recursively c_current = c_node.children while c_current is not NULL: changeDocumentBelow(c_current, doc) c_current = c_current.next - # adjust Python references last (may trigger GC on _Document) - if c_node._private is not NULL: - ref = c_node._private - while ref is not NULL: - proxy = <_NodeBase>ref.proxy - proxy._doc = doc - ref = ref.next + # adjust Python references of current node + ref = c_node._private + while ref is not NULL: + proxy = <_NodeBase>ref.proxy + proxy._doc = doc + ref = ref.next From scoder at codespeak.net Thu May 18 23:42:41 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 23:42:41 +0200 (CEST) Subject: [Lxml-checkins] r27425 - lxml/trunk/src/lxml Message-ID: <20060518214241.7252C1006E@code0.codespeak.net> Author: scoder Date: Thu May 18 23:42:39 2006 New Revision: 27425 Modified: lxml/trunk/src/lxml/parser.pxi Log: helper functions for copying xmlDoc's: central point to work around libxml2 URL bug and hand on parser dict Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu May 18 23:42:39 2006 @@ -623,6 +623,32 @@ __GLOBAL_PARSER_CONTEXT._initDocDict(result) return result +cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive): + cdef xmlDoc* result + result = tree.xmlCopyDoc(c_doc, recursive) + 
if c_doc.URL is not NULL: + # handle a bug in older libxml2 versions + if result.URL is not NULL: + tree.xmlFree(result.URL) + result.URL = tree.xmlStrdup(c_doc.URL) + __GLOBAL_PARSER_CONTEXT._initDocDict(result) + return result + +cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root): + "Recursively copy the document and make c_new_root the new root node." + cdef xmlDoc* result + cdef xmlDoc* fake_c_doc + fake_c_doc = _fakeRootDoc(c_doc, c_new_root) + result = tree.xmlCopyDoc(fake_c_doc, 1) + _destroyFakeDoc(c_doc, fake_c_doc) + if c_doc.URL is not NULL: + # handle a bug in older libxml2 versions + if result.URL is not NULL: + tree.xmlFree(result.URL) + result.URL = tree.xmlStrdup(c_doc.URL) + __GLOBAL_PARSER_CONTEXT._initDocDict(result) + return result + ############################################################ ## API level helper functions for _Document creation ## (here we convert to UTF-8) From scoder at codespeak.net Thu May 18 23:49:25 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 18 May 2006 23:49:25 +0200 (CEST) Subject: [Lxml-checkins] r27426 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060518214925.ED4991006E@code0.codespeak.net> Author: scoder Date: Thu May 18 23:49:24 2006 New Revision: 27426 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/xslt.pxi Log: fix memory deallocation crash introduced by new Element.__copy__ method, also clean up and fix copying documents by use of _copyDoc and _copyDocRoot helper functions Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu May 18 23:49:24 2006 @@ -7,9 +7,13 @@ Features added -------------- +* Deep copying Elements and ElementTrees maintains the document information + Bugs fixed ---------- +* Memory deallocation crash resulting from deep copying elements + 1.0.beta (2006-05-18) ===================== Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Thu May 18 23:49:24 2006 @@ -53,7 +53,7 @@ # already the root node return c_base_doc - c_doc = tree.xmlCopyDoc(c_base_doc, 0) # non recursive! + c_doc = _copyDoc(c_base_doc, 0) # non recursive! c_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! 
c_root.children = c_node.children Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu May 18 23:49:24 2006 @@ -581,15 +581,13 @@ def __copy__(self): cdef xmlNode* c_node cdef xmlDoc* c_doc - cdef xmlDoc* fake_c_doc cdef _Document doc + cdef _Document new_doc doc = self._doc - fake_c_doc = _fakeRootDoc(doc._c_doc, self._c_node) - c_doc = tree.xmlCopyDoc(fake_c_doc, 1) # recursive copy - _destroyFakeDoc(doc._c_doc, fake_c_doc) - doc = _documentFactory(c_doc, doc._parser) - return doc.getroot() - + c_doc = _copyDocRoot(doc._c_doc, self._c_node) # recursive + new_doc = _documentFactory(c_doc, doc._parser) + return new_doc.getroot() + def set(self, key, value): self.attrib[key] = value Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu May 18 23:49:24 2006 @@ -1804,6 +1804,14 @@ self.assertEquals('Foo', a.text) # XXX ElementTree will share nodes, but lxml.etree won't.. 
+ def test_deepcopy_append(self): + # previously caused a crash + Element = self.etree.Element + + a = Element('a') + b = copy.deepcopy(a) + b.append( Element('c') ) + def test_element_boolean(self): etree = self.etree e = etree.Element('foo') Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Thu May 18 23:49:24 2006 @@ -81,7 +81,7 @@ c_doc = xslt_resolver_context._c_style_doc if c_doc is not NULL and c_doc.URL is not NULL: if cstd.strcmp(c_uri, c_doc.URL) == 0: - return tree.xmlCopyDoc(c_doc, 1) + return _copyDoc(c_doc, 1) # call the Python document loaders c_doc = NULL @@ -236,23 +236,16 @@ self._access_control = access_control # make a copy of the document as stylesheet parsing modifies it - fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) - c_doc = tree.xmlCopyDoc(fake_c_doc, 1) - _destroyFakeDoc(doc._c_doc, fake_c_doc) + c_doc = _copyDocRoot(doc._c_doc, root_node._c_node) # make sure we always have a stylesheet URL - if c_doc.URL is not NULL: - # handle a bug in older libxml2 versions - tree.xmlFree(c_doc.URL) - if doc._c_doc.URL is not NULL: - c_doc.URL = tree.xmlStrdup(doc._c_doc.URL) - else: + if c_doc.URL is NULL: doc_url_utf = "XSLT:__STRING__XSLT__%s" % id(self) c_doc.URL = tree.xmlStrdup(_cstr(doc_url_utf)) self._xslt_resolver_context = _XSLTResolverContext(doc._parser) # keep a copy in case we need to access the stylesheet via 'document()' - self._xslt_resolver_context._c_style_doc = tree.xmlCopyDoc(c_doc, 1) + self._xslt_resolver_context._c_style_doc = _copyDoc(c_doc, 1) c_doc._private = self._xslt_resolver_context c_style = xslt.xsltParseStylesheetDoc(c_doc) @@ -274,7 +267,7 @@ if self._xslt_resolver_context is not None and \ self._xslt_resolver_context._c_style_doc is not NULL: tree.xmlFreeDoc(self._xslt_resolver_context._c_style_doc) - # this cleans up copy of doc as well + # this cleans up the doc 
copy as well xslt.xsltFreeStylesheet(self._c_style) property error_log: From scoder at codespeak.net Fri May 19 00:10:45 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 00:10:45 +0200 (CEST) Subject: [Lxml-checkins] r27429 - lxml/trunk Message-ID: <20060518221045.A252A10063@code0.codespeak.net> Author: scoder Date: Fri May 19 00:10:43 2006 New Revision: 27429 Modified: lxml/trunk/bench.py Log: fix running all tests in bench.py Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 00:10:43 2006 @@ -574,12 +574,12 @@ # sorted by name and tree tuple benchmarks = [ sorted(b.benchmarks()) for b in benchmark_suites ] - if len(sys.argv) > 1: - selected = [] - for name in sys.argv[1:]: - if not name.startswith('bench_'): - name = 'bench_' + name - selected.append(name) + selected = [] + for name in sys.argv[1:]: + if not name.startswith('bench_'): + name = 'bench_' + name + selected.append(name) + if selected: benchmarks = [ [ b for b in bs if [ match for match in selected if match in b[0] ] ] From scoder at codespeak.net Fri May 19 00:15:14 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 00:15:14 +0200 (CEST) Subject: [Lxml-checkins] r27430 - lxml/trunk Message-ID: <20060518221514.838D710063@code0.codespeak.net> Author: scoder Date: Fri May 19 00:15:12 2006 New Revision: 27430 Modified: lxml/trunk/bench.py Log: 2nd try: fix running all tests in bench.py Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 00:15:12 2006 @@ -555,13 +555,13 @@ except ImportError: pass - if '-a' in sys.argv: + try: + sys.argv.remove('-a') # 'all' ? 
- try: - from elementtree import ElementTree as ET - _etrees.append(ET) - except ImportError: - pass + from elementtree import ElementTree as ET + _etrees.append(ET) + except (ValueError, ImportError): + pass if not _etrees: print "No library to test. Exiting." @@ -574,12 +574,11 @@ # sorted by name and tree tuple benchmarks = [ sorted(b.benchmarks()) for b in benchmark_suites ] - selected = [] - for name in sys.argv[1:]: - if not name.startswith('bench_'): - name = 'bench_' + name - selected.append(name) - if selected: + if len(sys.argv) > 1: + selected = [] + for name in sys.argv[1:]: + selected.append(name) + print selected benchmarks = [ [ b for b in bs if [ match for match in selected if match in b[0] ] ] From scoder at codespeak.net Fri May 19 07:22:28 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 07:22:28 +0200 (CEST) Subject: [Lxml-checkins] r27436 - lxml/trunk/src/lxml/tests Message-ID: <20060519052228.1BB6B1006E@code0.codespeak.net> Author: scoder Date: Fri May 19 07:22:26 2006 New Revision: 27436 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: extended test case Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri May 19 07:22:26 2006 @@ -1807,10 +1807,17 @@ def test_deepcopy_append(self): # previously caused a crash Element = self.etree.Element + tostring = self.etree.tostring a = Element('a') b = copy.deepcopy(a) - b.append( Element('c') ) + a.append( Element('C') ) + b.append( Element('X') ) + + self.assertEquals('', + tostring(a).replace(' ', '')) + self.assertEquals('', + tostring(b).replace(' ', '')) def test_element_boolean(self): etree = self.etree From scoder at codespeak.net Fri May 19 07:38:19 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 07:38:19 +0200 (CEST) 
Subject: [Lxml-checkins] r27437 - lxml/trunk Message-ID: <20060519053819.ED2A91006E@code0.codespeak.net> Author: scoder Date: Fri May 19 07:38:18 2006 New Revision: 27437 Modified: lxml/trunk/bench.py Log: allow benchmarks to actively skip a testrun by raising 'SkippedTest', also catch Exceptions raised in tests Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 07:38:18 2006 @@ -49,6 +49,8 @@ return function return set_libs +class SkippedTest(Exception): + pass class BenchMarkBase(object): atoz = string.ascii_lowercase @@ -408,14 +410,17 @@ for i in repeat: child.text + @onlylib('lxe') def bench_index(self, root): for child in root: root.index(child) + @onlylib('lxe') def bench_index_slice(self, root): for child in root[5:100]: root.index(child, 5, 100) + @onlylib('lxe') def bench_index_slice_neg(self, root): for child in root[-100:-5]: root.index(child, start=-100, stop=-5) @@ -647,12 +652,17 @@ print "(%-10s)" % tree_set_name, sys.stdout.flush() - result = run_bench(bench, *benchmark_setup) - - print "%9.4f msec/pass, best of (" % min(result), - for t in result: - print "%9.4f" % t, - print ")" + try: + result = run_bench(bench, *benchmark_setup) + except SkippedTest: + print "skipped" + except Exception, e: + print "failed: %s: %s" % (e.__class__.__name__, e) + else: + print "%9.4f msec/pass, best of (" % min(result), + for t in result: + print "%9.4f" % t, + print ")" if len(benchmark_suites) > 1: print # empty line between different benchmarks From scoder at codespeak.net Fri May 19 07:56:27 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 07:56:27 +0200 (CEST) Subject: [Lxml-checkins] r27438 - lxml/trunk Message-ID: <20060519055627.CAB3F1006E@code0.codespeak.net> Author: scoder Date: Fri May 19 07:56:26 2006 New Revision: 27438 Modified: lxml/trunk/bench.py Log: fix test comparison in bench.py if 
tests are skipped via 'onlylib': previously could end up showing unrelated benchmarks next to each other and stopping before finishing all tests Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 07:56:26 2006 @@ -203,8 +203,9 @@ continue method = getattr(self, name) if hasattr(method, 'LIBS') and self.lib_name not in method.LIBS: - benchmarks.append((name, None, (), 0, 0)) - continue + method_call = None + else: + method_call = method if method.__doc__: tree_sets = method.__doc__.split() else: @@ -223,7 +224,7 @@ for tree_tuple in tree_tuples: for tn in sorted(getattr(method, 'TEXT', (0,))): for an in sorted(getattr(method, 'ATTRIBUTES', (0,))): - benchmarks.append((name, method, tree_tuple, tn, an)) + benchmarks.append((name, method_call, tree_tuple, tn, an)) return benchmarks @@ -591,6 +592,9 @@ import time def run_bench(suite, method_name, method_call, tree_set, tn, an): + if method_call is None: + raise SkippedTest + current_time = time.time call_repeat = range(10) @@ -642,13 +646,9 @@ for bench_calls in izip(*benchmarks): for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)): - bench_name, method_call = benchmark_setup[:2] + bench_name = benchmark_setup[0] tree_set_name = build_treeset_name(*benchmark_setup[-3:]) print "%-3s: %-28s" % (bench.lib_name, bench_name[6:34]), - if method_call is None: - print "skipped" - continue - print "(%-10s)" % tree_set_name, sys.stdout.flush() From scoder at codespeak.net Fri May 19 08:17:18 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 08:17:18 +0200 (CEST) Subject: [Lxml-checkins] r27439 - lxml/trunk Message-ID: <20060519061718.8A5601006E@code0.codespeak.net> Author: scoder Date: Fri May 19 08:17:17 2006 New Revision: 27439 Modified: lxml/trunk/bench.py Log: fix: add text to children of tree roots, previous benchmarks did not 
actually use it Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 08:17:17 2006 @@ -4,8 +4,8 @@ TREE_FACTOR = 1 # increase tree size with '-l / '-L' cmd option -_TEXT = "some ASCII text" * 10 * TREE_FACTOR -_UTEXT = u"some klingon: \F8D2" * 10 * TREE_FACTOR +_TEXT = "some ASCII text" * TREE_FACTOR +_UTEXT = u"some klingon: \F8D2" * TREE_FACTOR _ATTRIBUTES = { '{attr}test1' : _UTEXT, '{attr}test2' : _UTEXT, @@ -140,6 +140,7 @@ root = self.etree.Element('{abc}rootnode') for ch1 in atoz: el = SubElement(root, "{bcd}"+ch1*5, attributes) + el.text = text for ch2 in atoz: for i in range(20 * TREE_FACTOR): SubElement(el, "{cdefg}%s%05d" % (ch2, i)) @@ -156,6 +157,7 @@ for ch1 in atoz: for i in range(20 * TREE_FACTOR): el = SubElement(root, "{bcd}"+ch1*5, attributes) + el.text = text for ch2 in atoz: SubElement(el, "{cdefg}%s%05d" % (ch2, i)) t = current_time() - t @@ -172,6 +174,8 @@ tag_no = count().next children = [ SubElement(c, "{bcd}a%05d" % i, attributes) for i,c in enumerate(chain(children, children, children)) ] + for child in root: + child.text = text t = current_time() - t return (root, t) @@ -185,6 +189,7 @@ children = [root] for ch1 in atoz: el = SubElement(root, "{bcd}"+ch1*5, attributes) + el.text = text SubElement(el, "{cdefg}abcde", attributes) SubElement(el, "{cdefg}bcdef", attributes) t = current_time() - t From scoder at codespeak.net Fri May 19 08:40:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 08:40:11 +0200 (CEST) Subject: [Lxml-checkins] r27440 - lxml/trunk Message-ID: <20060519064011.B5CE610063@code0.codespeak.net> Author: scoder Date: Fri May 19 08:40:10 2006 New Revision: 27440 Modified: lxml/trunk/bench.py Log: cleanup in bench.py Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ 
lxml/trunk/bench.py Fri May 19 08:40:10 2006 @@ -72,7 +72,7 @@ setattr(self, fname, lambda : deepcopy(root)) else: def set_property(root, fname): - setattr(self, fname, self.et_make_factory(root)) + setattr(self, fname, self.et_make_clone_factory(root)) attribute_list = list(izip(count(), ({}, _ATTRIBUTES))) text_list = list(izip(count(), (None, _TEXT, _UTEXT))) @@ -95,7 +95,7 @@ def tree_builder(self, tree, tn, an): return getattr(self, self._tree_builder_name(tree, tn, an)) - def et_make_factory(self, elem): + def et_make_clone_factory(self, elem): def generate_elem(append, elem, level): var = "e" + str(level) arg = repr(elem.tag) @@ -343,7 +343,6 @@ child.append(el) def bench_makeelement(self, root): - Element = self.etree.Element empty_attrib = {} for child in root: child.makeelement('{test}test', empty_attrib) @@ -567,8 +566,8 @@ pass try: - sys.argv.remove('-a') # 'all' ? + sys.argv.remove('-a') from elementtree import ElementTree as ET _etrees.append(ET) except (ValueError, ImportError): From scoder at codespeak.net Fri May 19 08:43:34 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 08:43:34 +0200 (CEST) Subject: [Lxml-checkins] r27441 - lxml/trunk Message-ID: <20060519064334.ED59F10063@code0.codespeak.net> Author: scoder Date: Fri May 19 08:43:34 2006 New Revision: 27441 Modified: lxml/trunk/bench.py Log: do not use unicode in attribute values Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 08:43:34 2006 @@ -7,8 +7,8 @@ _TEXT = "some ASCII text" * TREE_FACTOR _UTEXT = u"some klingon: \F8D2" * TREE_FACTOR _ATTRIBUTES = { - '{attr}test1' : _UTEXT, - '{attr}test2' : _UTEXT, + '{attr}test1' : _TEXT, + '{attr}test2' : _TEXT, 'bla1' : _TEXT, 'bla2' : _TEXT, 'bla3' : _TEXT From scoder at codespeak.net Fri May 19 10:44:22 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 
2006 10:44:22 +0200 (CEST) Subject: [Lxml-checkins] r27448 - lxml/trunk Message-ID: <20060519084422.0FDE110063@code0.codespeak.net> Author: scoder Date: Fri May 19 10:44:21 2006 New Revision: 27448 Modified: lxml/trunk/bench.py Log: fix getiterator benchmarks to actually find the searched elements, new benchmarks for findall and replacing children within same document Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 10:44:21 2006 @@ -61,6 +61,8 @@ 'cElementTree' : 'cET' } + SEARCH_TAG = "{cdefg}00001" + def __init__(self, etree): self.etree = etree libname = etree.__name__.split('.')[-1] @@ -172,7 +174,7 @@ children = [root] for i in range(6 + TREE_FACTOR): tag_no = count().next - children = [ SubElement(c, "{bcd}a%05d" % i, attributes) + children = [ SubElement(c, "{cdefg}a%05d" % i, attributes) for i,c in enumerate(chain(children, children, children)) ] for child in root: child.text = text @@ -190,8 +192,8 @@ for ch1 in atoz: el = SubElement(root, "{bcd}"+ch1*5, attributes) el.text = text - SubElement(el, "{cdefg}abcde", attributes) - SubElement(el, "{cdefg}bcdef", attributes) + SubElement(el, "{cdefg}00001", attributes) + SubElement(el, "{cdefg}00002", attributes) t = current_time() - t return (root, t) @@ -347,12 +349,17 @@ for child in root: child.makeelement('{test}test', empty_attrib) - def bench_replace_children(self, root): + def bench_replace_children_element(self, root): Element = self.etree.Element for child in root: el = Element('{test}test') child[:] = [el] + def bench_replace_children(self, root): + Element = self.etree.Element + for child in root: + child[:] = [ child[0] ] + def bench_remove_children(self, root): for child in root: root.remove(child) @@ -430,14 +437,20 @@ for child in root[-100:-5]: root.index(child, start=-100, stop=-5) - def bench_getiterator(self, root): + def bench_getiterator_all(self, root): + 
list(root.getiterator()) + + def bench_getiterator_islice(self, root): list(islice(root.getiterator(), 10, 110)) def bench_getiterator_tag(self, root): - list(islice(root.getiterator("{b}a"), 3, 10)) + list(islice(root.getiterator(self.SEARCH_TAG), 3, 10)) def bench_getiterator_tag_all(self, root): - list(root.getiterator("{b}a")) + list(root.getiterator(self.SEARCH_TAG)) + + def bench_findall(self, root): + root.findall(".//" + self.SEARCH_TAG) @onlylib('lxe') def bench_xpath_class(self, root): From scoder at codespeak.net Fri May 19 11:35:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 11:35:37 +0200 (CEST) Subject: [Lxml-checkins] r27449 - lxml/trunk Message-ID: <20060519093537.A055610061@code0.codespeak.net> Author: scoder Date: Fri May 19 11:35:36 2006 New Revision: 27449 Modified: lxml/trunk/bench.py Log: xpath and findall benchmarks Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 11:35:36 2006 @@ -450,6 +450,9 @@ list(root.getiterator(self.SEARCH_TAG)) def bench_findall(self, root): + root.findall(".//*") + + def bench_findall_tag(self, root): root.findall(".//" + self.SEARCH_TAG) @onlylib('lxe') @@ -459,12 +462,23 @@ xpath(child) @onlylib('lxe') + def bench_xpath_class_repeat(self, root): + for child in root: + xpath = self.etree.XPath("./*[0]") + xpath(child) + + @onlylib('lxe') def bench_xpath_element(self, root): + xpath = self.etree.XPathElementEvaluator(root) for child in root: - xpath = self.etree.XPathElementEvaluator(child) xpath.evaluate("./*[0]") @onlylib('lxe') + def bench_xpath_method(self, root): + for child in root: + child.xpath("./*[0]") + + @onlylib('lxe') def bench_xpath_extensions_old(self, root): def return_child(_, element): if element: From scoder at codespeak.net Fri May 19 11:54:57 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 
11:54:57 +0200 (CEST) Subject: [Lxml-checkins] r27453 - lxml/trunk/doc Message-ID: <20060519095457.D1A7610061@code0.codespeak.net> Author: scoder Date: Fri May 19 11:54:56 2006 New Revision: 27453 Added: lxml/trunk/doc/performance.txt Modified: lxml/trunk/doc/main.txt Log: new doc/performance.txt to compare lxml with ET and cET Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri May 19 11:54:56 2006 @@ -75,9 +75,10 @@ Documentation ------------- -lxml follows the `ElementTree API`_ as much as possible, building it -on top of the native libxml2 tree. See also the `ElementTree -compatibility overview`_. +lxml follows the ElementTree_ API as much as possible, building it on top of +the native libxml2 tree. See also the `ElementTree compatibility overview`_ +and the `benchmark results`_ comparing lxml to the original ElementTree_ and +cElementTree_ implementations. lxml also `extends this API`_ to expose libxml2 and libxslt specific functionality, such as XPath_, `Relax NG`_, `XML Schema`_, `XSLT`_, and @@ -91,17 +92,21 @@ lxml also offers a `SAX compliant API`_, that works with the SAX support in the standard library. -.. _`ElementTree API`: http://effbot.org/zone/element-index.htm +.. _ElementTree: http://effbot.org/zone/element-index.htm +.. _cElementTree: http://effbot.org/zone/celementtree.htm + +.. _`benchmark results`: performance.html .. _`ElementTree compatibility overview`: compatibility.html .. _`extends this API`: api.html .. _`extension functions`: extensions.html +.. _`implementing namespaces`: namespace_extensions.html +.. _`SAX compliant API`: sax.html + .. _XPath: http://www.w3.org/TR/xpath .. _`Relax NG`: http://www.relaxng.org/ .. _`XML Schema`: http://www.w3.org/XML/Schema .. _`XSLT`: http://www.w3.org/TR/xslt .. _`c14n`: http://www.w3.org/TR/xml-c14n -.. _`implementing namespaces`: namespace_extensions.html -.. 
_`SAX compliant API`: sax.html Mailing list ------------ Added: lxml/trunk/doc/performance.txt ============================================================================== --- (empty file) +++ lxml/trunk/doc/performance.txt Fri May 19 11:54:56 2006 @@ -0,0 +1,245 @@ +Benchmarks and speed +==================== + +As an XML library, lxml.etree is very fast. It is also slow. It depends on +what you do with it. This text describes where lxml.etree (lxe) excels, gives +hints on some performance traps and compares the overall performance to the +original ElementTree_ (ET) and cElementTree_ (cET) libraries by Fredrik Lundh. +The cElementTree library is a fast C-implementation of the original +ElementTree. + +The statements made here are backed by the benchmark script `bench.py`_ that +comes with the lxml source distribution. The numbers cited below compare lxml +1.0, ElementTree 1.2.6 and cElementTree 1.0.5. + +.. _ElementTree: http://effbot.org/zone/element-index.htm +.. _cElementTree: http://effbot.org/zone/celementtree.htm +.. _`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py + +The ``bench.py`` script runs a number of simple tests on the different +libraries, using different XML tree configurations: different tree sizes, with +or without attributes (-/A) and with or without ASCII or unicode text (-/S/U). +In the result extracts cited below, T1 refers to a 3-level tree with many +children at the third level, T2 is swapped around to have many children at the +root element, T3 is a deep tree with few children at each level and T4 is a +small tree, slightly broader than deep. + + +Bad things first +---------------- + +First thing to say: there *is* an overhead involved in having a C library +mimic the ElementTree API. As opposed to ElementTree, lxml has to generate +Python objects on the fly when asked for them. What this means is: the more +of your code runs in Python, the slower your application gets. 
Note, however, +that this is true for most performance critical Python applications. + + +Parsing and Serialising +----------------------- + +This is one of the areas where lxml excels. The reason is that both parts are +executed entirely at the C level, without major interaction with Python code. +The results are rather impressive. Compared to cElementTree, lxml is about 20 +to 40 times faster on serialisation:: + + lxe: tostring_utf16 (SA T2) 30.9846 msec/pass + cET: tostring_utf16 (SA T2) 715.5002 msec/pass + ET : tostring_utf16 (SA T2) 758.5271 msec/pass + + lxe: tostring_utf16 (U- T3) 3.0509 msec/pass + cET: tostring_utf16 (U- T3) 72.4721 msec/pass + ET : tostring_utf16 (U- T3) 87.0735 msec/pass + + lxe: tostring_utf8 (UA T2) 26.8996 msec/pass + cET: tostring_utf8 (UA T2) 700.4889 msec/pass + ET : tostring_utf8 (UA T2) 745.3317 msec/pass + + lxe: tostring_utf8 (S- T3) 2.1876 msec/pass + cET: tostring_utf8 (S- T3) 71.1290 msec/pass + ET : tostring_utf8 (S- T3) 87.1525 msec/pass + +For parsing, the difference between the libraries is smaller. The (c)ET +libraries use the expat parser, which is known to be fast and similar in +performance to the libxml2 parser. If you take a complete serialize-parse +cycle, the numbers will look like this:: + + lxe: write_utf8_parse_stringIO (S- T1) 187.0444 msec/pass + cET: write_utf8_parse_stringIO (S- T1) 828.4068 msec/pass + ET : write_utf8_parse_stringIO (S- T1) 1181.0658 msec/pass + + lxe: write_utf8_parse_stringIO (UA T2) 213.6599 msec/pass + cET: write_utf8_parse_stringIO (UA T2) 927.2374 msec/pass + ET : write_utf8_parse_stringIO (UA T2) 1297.9678 msec/pass + +So, lxml also wins this contest, but considering the previous numbers on +serialization, parser performance is otherwise roughly comparable between cET +and lxml. + + +The ElementTree API +------------------- + +Since all three libraries implement the same API, their performance is easy to +compare in this area. 
A major disadvantage for lxml is the different tree +model that underlies libxml2. It allows lxml to provide parent pointers for +elements, but also increases the overhead of tree restructuring. This can be +seen from the tree setup times of the benchmark:: + + Setup times for trees in seconds: + lxe: -- S- U- -A SA UA + T1: 0.1658 0.1236 0.1241 0.1243 0.1261 0.1254 + T2: 0.1281 0.1282 0.1299 0.1381 0.1389 0.1395 + T3: 0.0366 0.0300 0.0290 0.0850 0.0851 0.0893 + T4: 0.0010 0.0006 0.0006 0.0018 0.0018 0.0019 + cET: -- S- U- -A SA UA + T1: 0.0417 0.0409 0.0403 0.0410 0.0410 0.0415 + T2: 0.0413 0.0414 0.0413 0.0417 0.0411 0.0417 + T3: 0.0097 0.0100 0.0099 0.0187 0.0142 0.0146 + T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 + ET : -- S- U- -A SA UA + T1: 0.2189 0.2832 0.2210 0.2646 0.2905 0.2214 + T2: 0.3022 0.2322 0.2868 0.3192 0.2290 0.3075 + T3: 0.0519 0.0553 0.0527 0.0601 0.0572 0.0911 + T4: 0.0009 0.0008 0.0008 0.0008 0.0009 0.0009 + +While lxml is still faster than ET in most cases (30-60%), cET can be three to +four times as fast as lxml here. So, if the main performance bottleneck of an +application is creating large XML trees in memory through calls to Element and +SubElement, cET is the best choice. Note, however, that the serialisation +performance may even out this advantage. + +A critical action for lxml is moving elements between document contexts. It +requires lxml to do recursive adaptations throughout the moved tree structure. 
+ +The following benchmark appends all root children of the second tree to the +root of the first tree:: + + lxe: append_from_document (-- T1,T2) 11.7905 msec/pass + cET: append_from_document (-- T1,T2) 0.4673 msec/pass + ET : append_from_document (-- T1,T2) 2.0460 msec/pass + + lxe: append_from_document (-- T3,T4) 0.2017 msec/pass + cET: append_from_document (-- T3,T4) 0.0227 msec/pass + ET : append_from_document (-- T3,T4) 0.1563 msec/pass + +Although these are fairly small numbers compared to parsing, this easily shows +the different performance classes for lxml and (c)ET. Where the latter do not +have to care about parent pointers and tree structures, lxml has to deep +traverse the appended tree. The performance difference therefore increases +with the size of the tree that is moved. + +This difference is not always as visible, but applies to most parts of the +API, like inserting newly created elements:: + + lxe: insert_from_document (-- T1,T2) 16.4772 msec/pass + cET: insert_from_document (-- T1,T2) 1.1874 msec/pass + ET : insert_from_document (-- T1,T2) 3.5447 msec/pass + +Or replacing the child slice by a new element:: + + lxe: replace_children_element (-- T1 ) 9.1834 msec/pass + cET: replace_children_element (-- T1 ) 0.9731 msec/pass + ET : replace_children_element (-- T1 ) 14.8213 msec/pass + +You should keep this difference in mind when you merge very large trees. On +the other hand, deep copying a tree is fast in lxml:: + + lxe: deepcopy (-- T1 ) 24.7359 msec/pass + cET: deepcopy (-- T1 ) 450.5479 msec/pass + ET : deepcopy (-- T1 ) 717.8308 msec/pass + + lxe: deepcopy (-- T3 ) 2.1182 msec/pass + cET: deepcopy (-- T3 ) 107.2124 msec/pass + ET : deepcopy (-- T3 ) 173.9782 msec/pass + +So, if you often need to create independent subtrees from a large tree that +you have parsed in, lxml is the best choice here. + + +Tree traversal +-------------- + +Another area where lxml is very fast is iteration for tree traversal.
If your +algorithms can benefit from step-by-step traversal of the XML tree and +especially if few elements are of interest, lxml is a good choice:: + + lxe: getiterator_all (-- T2 ) 32.3100 msec/pass + cET: getiterator_all (-- T2 ) 37.2489 msec/pass + ET : getiterator_all (-- T2 ) 46.2996 msec/pass + + lxe: getiterator_islice (-- T2 ) 3.3567 msec/pass + cET: getiterator_islice (-- T2 ) 0.3289 msec/pass + ET : getiterator_islice (-- T2 ) 43.9938 msec/pass + + lxe: getiterator_tag (-- T2 ) 4.7438 msec/pass + cET: getiterator_tag (-- T2 ) 31.8628 msec/pass + ET : getiterator_tag (-- T2 ) 36.4583 msec/pass + + lxe: getiterator_tag_all (-- T2 ) 4.6267 msec/pass + cET: getiterator_tag_all (-- T2 ) 32.1669 msec/pass + ET : getiterator_tag_all (-- T2 ) 36.3365 msec/pass + +This similarly shows in ``Element.findall()``:: + + lxe: findall (-- T2 ) 36.4730 msec/pass + cET: findall (-- T2 ) 38.8718 msec/pass + ET : findall (-- T2 ) 50.9692 msec/pass + + lxe: findall (-- T3 ) 4.3956 msec/pass + cET: findall (-- T3 ) 11.8051 msec/pass + ET : findall (-- T3 ) 11.2570 msec/pass + + lxe: findall_tag (-- T2 ) 4.3950 msec/pass + cET: findall_tag (-- T2 ) 31.3107 msec/pass + ET : findall_tag (-- T2 ) 36.7813 msec/pass + + lxe: findall_tag (-- T3 ) 0.5946 msec/pass + cET: findall_tag (-- T3 ) 7.4491 msec/pass + ET : findall_tag (-- T3 ) 9.2943 msec/pass + +Note that all three libraries currently use the same Python implementation for +``findall()``, except for their native tree iterator. + + +XPath +----- + +This part of lxml does not have an equivalent in ElementTree. However, lxml +provides more than one way of accessing it and you should take care which part +of the lxml API you use. 
The most straightforward way is to call the +``xpath()`` method on an Element or ElementTree:: + + lxe: xpath_method (-- T1) 9.9304 msec/pass + lxe: xpath_method (-- T2) 29.3595 msec/pass + lxe: xpath_method (-- T3) 0.2791 msec/pass + lxe: xpath_method (-- T4) 0.9906 msec/pass + +This is well suited for testing and when the XPath expressions are as diverse +as the trees they are called on. However, if you have a single XPath +expression that you want to apply to a larger number of different elements, +the ``XPath`` class is the most efficient way to do it:: + + lxe: xpath_class (-- T1) 4.7921 msec/pass + lxe: xpath_class (-- T2) 9.6187 msec/pass + lxe: xpath_class (-- T3) 0.2215 msec/pass + lxe: xpath_class (-- T4) 0.2697 msec/pass + +Note that this still allows you to use variables in the expression, so you can +parse it once and then adapt it through variables at call time. In other +cases, where you have a fixed Element or ElementTree and want to run different +expressions on it, you should consider the ``XPathEvaluator``:: + + lxe: xpath_element (-- T1) 5.3826 msec/pass + lxe: xpath_element (-- T2) 11.3929 msec/pass + lxe: xpath_element (-- T3) 0.2514 msec/pass + lxe: xpath_element (-- T4) 0.3038 msec/pass + +While it looks slightly slower, creating an XPath object for each of the +expressions generates a much higher overhead here:: + + lxe: xpath_class_repeat (-- T1) 6.8099 msec/pass + lxe: xpath_class_repeat (-- T2) 26.7462 msec/pass + lxe: xpath_class_repeat (-- T3) 0.3126 msec/pass + lxe: xpath_class_repeat (-- T4) 1.1111 msec/pass + From scoder at codespeak.net Fri May 19 12:01:38 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:01:38 +0200 (CEST) Subject: [Lxml-checkins] r27454 - lxml/trunk/doc Message-ID: <20060519100138.C957E10063@code0.codespeak.net> Author: scoder Date: Fri May 19 12:01:37 2006 New Revision: 27454 Modified: lxml/trunk/doc/performance.txt Log: small clarifications Modified: 
lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:01:37 2006 @@ -62,7 +62,7 @@ For parsing, the difference between the libraries is smaller. The (c)ET libraries use the expat parser, which is known to be fast and similar in performance to the libxml2 parser. If you take a complete serialize-parse -cycle, the numbers will look like this:: +cycle, the numbers will look similar to these:: lxe: write_utf8_parse_stringIO (S- T1) 187.0444 msec/pass cET: write_utf8_parse_stringIO (S- T1) 828.4068 msec/pass @@ -153,8 +153,8 @@ cET: deepcopy (-- T3 ) 107.2124 msec/pass ET : deepcopy (-- T3 ) 173.9782 msec/pass -So, if you often need to create independent subtrees from a large tree that -you have parsed in, lxml is the best choice here. +So, for example, if you often need to create independent subtrees from a large +tree that you have parsed in, lxml is by far the best choice here. 
Tree traversal From scoder at codespeak.net Fri May 19 12:05:58 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:05:58 +0200 (CEST) Subject: [Lxml-checkins] r27455 - lxml/trunk/doc Message-ID: <20060519100558.DCCFD1006E@code0.codespeak.net> Author: scoder Date: Fri May 19 12:05:57 2006 New Revision: 27455 Modified: lxml/trunk/doc/performance.txt Log: readability Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:05:57 2006 @@ -164,39 +164,39 @@ algorithms can benefit from step-by-step traversal of the XML tree and especially if few elements are of interest, lxml is a good choice:: - lxe: getiterator_all (-- T2 ) 32.3100 msec/pass - cET: getiterator_all (-- T2 ) 37.2489 msec/pass - ET : getiterator_all (-- T2 ) 46.2996 msec/pass - - lxe: getiterator_islice (-- T2 ) 3.3567 msec/pass - cET: getiterator_islice (-- T2 ) 0.3289 msec/pass - ET : getiterator_islice (-- T2 ) 43.9938 msec/pass - - lxe: getiterator_tag (-- T2 ) 4.7438 msec/pass - cET: getiterator_tag (-- T2 ) 31.8628 msec/pass - ET : getiterator_tag (-- T2 ) 36.4583 msec/pass - - lxe: getiterator_tag_all (-- T2 ) 4.6267 msec/pass - cET: getiterator_tag_all (-- T2 ) 32.1669 msec/pass - ET : getiterator_tag_all (-- T2 ) 36.3365 msec/pass + lxe: getiterator_all (-- T2 ) 32.3100 msec/pass + cET: getiterator_all (-- T2 ) 37.2489 msec/pass + ET : getiterator_all (-- T2 ) 46.2996 msec/pass + + lxe: getiterator_islice (-- T2 ) 3.3567 msec/pass + cET: getiterator_islice (-- T2 ) 0.3289 msec/pass + ET : getiterator_islice (-- T2 ) 43.9938 msec/pass + + lxe: getiterator_tag (-- T2 ) 4.7438 msec/pass + cET: getiterator_tag (-- T2 ) 31.8628 msec/pass + ET : getiterator_tag (-- T2 ) 36.4583 msec/pass + + lxe: getiterator_tag_all (-- T2 ) 4.6267 msec/pass + cET: getiterator_tag_all (-- T2 ) 32.1669 msec/pass + ET : 
getiterator_tag_all (-- T2 ) 36.3365 msec/pass This similarly shows in ``Element.findall()``:: - lxe: findall (-- T2 ) 36.4730 msec/pass - cET: findall (-- T2 ) 38.8718 msec/pass - ET : findall (-- T2 ) 50.9692 msec/pass - - lxe: findall (-- T3 ) 4.3956 msec/pass - cET: findall (-- T3 ) 11.8051 msec/pass - ET : findall (-- T3 ) 11.2570 msec/pass - - lxe: findall_tag (-- T2 ) 4.3950 msec/pass - cET: findall_tag (-- T2 ) 31.3107 msec/pass - ET : findall_tag (-- T2 ) 36.7813 msec/pass - - lxe: findall_tag (-- T3 ) 0.5946 msec/pass - cET: findall_tag (-- T3 ) 7.4491 msec/pass - ET : findall_tag (-- T3 ) 9.2943 msec/pass + lxe: findall (-- T2 ) 36.4730 msec/pass + cET: findall (-- T2 ) 38.8718 msec/pass + ET : findall (-- T2 ) 50.9692 msec/pass + + lxe: findall (-- T3 ) 4.3956 msec/pass + cET: findall (-- T3 ) 11.8051 msec/pass + ET : findall (-- T3 ) 11.2570 msec/pass + + lxe: findall_tag (-- T2 ) 4.3950 msec/pass + cET: findall_tag (-- T2 ) 31.3107 msec/pass + ET : findall_tag (-- T2 ) 36.7813 msec/pass + + lxe: findall_tag (-- T3 ) 0.5946 msec/pass + cET: findall_tag (-- T3 ) 7.4491 msec/pass + ET : findall_tag (-- T3 ) 9.2943 msec/pass Note that all three libraries currently use the same Python implementation for ``findall()``, except for their native tree iterator. From scoder at codespeak.net Fri May 19 12:08:12 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:08:12 +0200 (CEST) Subject: [Lxml-checkins] r27456 - lxml/trunk/doc Message-ID: <20060519100812.2DA141006E@code0.codespeak.net> Author: scoder Date: Fri May 19 12:08:11 2006 New Revision: 27456 Modified: lxml/trunk/doc/performance.txt Log: readability Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:08:11 2006 @@ -8,14 +8,14 @@ The cElementTree library is a fast C-implementation of the original ElementTree. 
-The statements made here are backed by the benchmark script `bench.py`_ that -comes with the lxml source distribution. The numbers cited below compare lxml -1.0, ElementTree 1.2.6 and cElementTree 1.0.5. - .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm .. _`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py +The statements made here are backed by the benchmark script `bench.py`_ that +comes with the lxml source distribution. The numbers cited below compare lxml +1.0, ElementTree 1.2.6 and cElementTree 1.0.5. + The ``bench.py`` script runs a number of simple tests on the different libraries, using different XML tree configurations: different tree sizes, with or without attributes (-/A) and with or without ASCII or unicode text (-/S/U). From scoder at codespeak.net Fri May 19 12:08:45 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:08:45 +0200 (CEST) Subject: [Lxml-checkins] r27457 - lxml/trunk/doc Message-ID: <20060519100845.96B071006E@code0.codespeak.net> Author: scoder Date: Fri May 19 12:08:44 2006 New Revision: 27457 Modified: lxml/trunk/doc/performance.txt Log: readability Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:08:44 2006 @@ -10,12 +10,13 @@ .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm -.. _`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py The statements made here are backed by the benchmark script `bench.py`_ that comes with the lxml source distribution. The numbers cited below compare lxml 1.0, ElementTree 1.2.6 and cElementTree 1.0.5. +.. 
_`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py + The ``bench.py`` script runs a number of simple tests on the different libraries, using different XML tree configurations: different tree sizes, with or without attributes (-/A) and with or without ASCII or unicode text (-/S/U). From scoder at codespeak.net Fri May 19 12:10:20 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:10:20 +0200 (CEST) Subject: [Lxml-checkins] r27458 - lxml/trunk/doc Message-ID: <20060519101020.8BA591006E@code0.codespeak.net> Author: scoder Date: Fri May 19 12:10:19 2006 New Revision: 27458 Modified: lxml/trunk/doc/performance.txt Log: readability Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:10:19 2006 @@ -12,8 +12,8 @@ .. _cElementTree: http://effbot.org/zone/celementtree.htm The statements made here are backed by the benchmark script `bench.py`_ that -comes with the lxml source distribution. The numbers cited below compare lxml -1.0, ElementTree 1.2.6 and cElementTree 1.0.5. +comes with the lxml source distribution. The numbers that are cited below +compare lxml 1.0, ElementTree 1.2.6 and cElementTree 1.0.5. .. 
_`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py From scoder at codespeak.net Fri May 19 12:13:02 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:13:02 +0200 (CEST) Subject: [Lxml-checkins] r27459 - lxml/trunk/doc Message-ID: <20060519101302.072AD1006E@code0.codespeak.net> Author: scoder Date: Fri May 19 12:13:02 2006 New Revision: 27459 Modified: lxml/trunk/doc/performance.txt Log: clarify how the benchmarks work Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:13:02 2006 @@ -23,7 +23,8 @@ In the result extracts cited below, T1 refers to a 3-level tree with many children at the third level, T2 is swapped around to have many children at the root element, T3 is a deep tree with few children at each level and T4 is a -small tree, slightly broader than deep. +small tree, slightly broader than deep. Most benchmarks run in a loop over +all children of the tree root. 
Bad things first From scoder at codespeak.net Fri May 19 12:17:25 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:17:25 +0200 (CEST) Subject: [Lxml-checkins] r27462 - lxml/trunk/doc Message-ID: <20060519101725.6C6FC10071@code0.codespeak.net> Author: scoder Date: Fri May 19 12:17:24 2006 New Revision: 27462 Modified: lxml/trunk/doc/performance.txt Log: clarify that cET and lxml are close in parser performance Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:17:24 2006 @@ -75,8 +75,7 @@ ET : write_utf8_parse_stringIO (UA T2) 1297.9678 msec/pass So, lxml also wins this contest, but considering the previous numbers on -serialization, parser performance is otherwise roughly comparable between cET -and lxml. +serialization, cET comes rather close in plain parser performance. The ElementTree API From scoder at codespeak.net Fri May 19 12:34:46 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:34:46 +0200 (CEST) Subject: [Lxml-checkins] r27463 - lxml/trunk/doc Message-ID: <20060519103446.A264D10063@code0.codespeak.net> Author: scoder Date: Fri May 19 12:34:45 2006 New Revision: 27463 Modified: lxml/trunk/doc/performance.txt Log: make clear why lxml is slower in tree construction Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:34:45 2006 @@ -82,33 +82,40 @@ ------------------- Since all three libraries implement the same API, their performance is easy to -compare in this area. A major disadvantage for lxml is the different tree -model that underlies libxml2. It allows lxml to provide parent pointers for -elements, but also increases the overhead of tree restructuring. 
This can be -seen from the tree setup times of the benchmark:: +compare in this area. A major disadvantage for lxml's performance is the +different tree model that underlies libxml2. It allows lxml to provide parent +pointers for elements, but also increases the overhead of tree building and +restructuring. This can be seen from the tree setup times of the benchmark +(given in seconds):: - Setup times for trees in seconds: lxe: -- S- U- -A SA UA T1: 0.1658 0.1236 0.1241 0.1243 0.1261 0.1254 T2: 0.1281 0.1282 0.1299 0.1381 0.1389 0.1395 T3: 0.0366 0.0300 0.0290 0.0850 0.0851 0.0893 T4: 0.0010 0.0006 0.0006 0.0018 0.0018 0.0019 + cET: -- S- U- -A SA UA T1: 0.0417 0.0409 0.0403 0.0410 0.0410 0.0415 T2: 0.0413 0.0414 0.0413 0.0417 0.0411 0.0417 T3: 0.0097 0.0100 0.0099 0.0187 0.0142 0.0146 T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 + ET : -- S- U- -A SA UA T1: 0.2189 0.2832 0.2210 0.2646 0.2905 0.2214 T2: 0.3022 0.2322 0.2868 0.3192 0.2290 0.3075 T3: 0.0519 0.0553 0.0527 0.0601 0.0572 0.0911 T4: 0.0009 0.0008 0.0008 0.0008 0.0009 0.0009 -While lxml is still faster than ET in most cases (30-60%), cET can be three to -four times as fast as lxml here. So, if the main performance bottleneck of an -application is creating large XML trees in memory through calls to Element and -SubElement, cET is the best choice. Note, however, that the serialisation -performance may even out this advantage. +While lxml is still faster than ET in most cases (30-60%), cET can be up to +three times faster than lxml here. One of the reasons is that lxml must +additionally discard the created Python elements after their use, when they +are no longer referenced. ET and cET represent the tree itself through these +objects, which reduces their overhead in creating them. + +So, if the main performance bottleneck of an application is creating large XML +trees in memory through calls to Element and SubElement, cET is the best +choice. 
Note, however, that the serialisation performance may even out this +advantage. A critical action for lxml is moving elements between document contexts. It requires lxml to do recursive adaptations throughout the moved tree structure. From scoder at codespeak.net Fri May 19 12:36:58 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 12:36:58 +0200 (CEST) Subject: [Lxml-checkins] r27464 - lxml/trunk/doc Message-ID: <20060519103658.D140C10063@code0.codespeak.net> Author: scoder Date: Fri May 19 12:36:57 2006 New Revision: 27464 Modified: lxml/trunk/doc/performance.txt Log: fix outlier in results Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 12:36:57 2006 @@ -89,7 +89,7 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.1658 0.1236 0.1241 0.1243 0.1261 0.1254 + T1: 0.1360 0.1236 0.1241 0.1243 0.1261 0.1254 T2: 0.1281 0.1282 0.1299 0.1381 0.1389 0.1395 T3: 0.0366 0.0300 0.0290 0.0850 0.0851 0.0893 T4: 0.0010 0.0006 0.0006 0.0018 0.0018 0.0019 From scoder at codespeak.net Fri May 19 13:36:31 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 13:36:31 +0200 (CEST) Subject: [Lxml-checkins] r27467 - lxml/trunk/src/lxml Message-ID: <20060519113631.507BB1006E@code0.codespeak.net> Author: scoder Date: Fri May 19 13:36:29 2006 New Revision: 27467 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx Log: refactoring of Element.get/set and Attrib.get/set etc. 
to use external helper functions, reduces code duplication Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 19 13:36:29 2006 @@ -100,6 +100,36 @@ c_attrib_node.ns.href) return funicode(value) +cdef object _getAttributeValue(_NodeBase element, key, default): + cdef char* c_result + cdef char* c_tag + ns, tag = _getNsTag(key) + c_tag = _cstr(tag) + if ns is None: + c_result = tree.xmlGetNoNsProp(element._c_node, c_tag) + else: + c_result = tree.xmlGetNsProp(element._c_node, c_tag, _cstr(ns)) + if c_result is NULL: + # XXX free namespace that is not in use..? + return default + result = funicode(c_result) + tree.xmlFree(c_result) + return result + +cdef void _setAttributeValue(_NodeBase element, key, value): + cdef xmlNs* c_ns + cdef char* c_value + cdef char* c_tag + ns, tag = _getNsTag(key) + c_tag = _cstr(tag) + value = _utf8(value) + c_value = _cstr(value) + if ns is None: + tree.xmlSetProp(element._c_node, c_tag, c_value) + else: + c_ns = element._doc._findOrBuildNodeNs(element._c_node, _cstr(ns)) + tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value) + cdef object __REPLACE_XML_ENCODING __REPLACE_XML_ENCODING = re.compile( r'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U).sub Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 19 13:36:29 2006 @@ -589,7 +589,7 @@ return new_doc.getroot() def set(self, key, value): - self.attrib[key] = value + _setAttributeValue(self, key, value) def append(self, _Element element not None): cdef xmlNode* c_next @@ -839,22 +839,7 @@ raise ValueError, "list.index(x): x not in list" def get(self, key, default=None): - # XXX more redundancy, but might be slightly faster than - # return 
self.attrib.get(key, default) - cdef char* cresult - cdef char* c_tag - ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - if ns is None: - cresult = tree.xmlGetNoNsProp(self._c_node, c_tag) - else: - cresult = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) - if cresult is NULL: - result = default - else: - result = funicode(cresult) - tree.xmlFree(cresult) - return result + return _getAttributeValue(self, key, default) def keys(self): return self.attrib.keys() @@ -1000,21 +985,9 @@ cdef class _Attrib(_NodeBase): # MANIPULATORS def __setitem__(self, key, value): - cdef xmlNs* c_ns - cdef char* c_value - cdef char* c_tag - ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - value = _utf8(value) - c_value = _cstr(value) - if ns is None: - tree.xmlSetProp(self._c_node, c_tag, c_value) - else: - c_ns = self._doc._findOrBuildNodeNs(self._c_node, _cstr(ns)) - tree.xmlSetNsProp(self._c_node, c_ns, c_tag, c_value) + _setAttributeValue(self, key, value) def __delitem__(self, key): - cdef xmlNs* c_ns cdef xmlAttr* c_attr cdef char* c_tag ns, tag = _getNsTag(key) @@ -1036,21 +1009,20 @@ return repr(result) def __getitem__(self, key): - cdef xmlNs* c_ns - cdef char* cresult - cdef char* c_tag - ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - if ns is None: - cresult = tree.xmlGetNoNsProp(self._c_node, c_tag) - else: - cresult = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) - if cresult is NULL: - # XXX free namespace that is not in use..? 
+ result = _getAttributeValue(self, key, None) + if result is None: raise KeyError, key - result = funicode(cresult) - tree.xmlFree(cresult) - return result + else: + return result + + def __nonzero__(self): + cdef xmlNode* c_node + c_node = (self._c_node.properties) + while c_node is not NULL: + if c_node.type == tree.XML_ATTRIBUTE_NODE: + return True + c_node = c_node.next + return False def __len__(self): cdef Py_ssize_t c @@ -1064,10 +1036,7 @@ return c def get(self, key, default=None): - try: - return self.__getitem__(key) - except KeyError: - return default + return _getAttributeValue(self, key, default) def keys(self): result = [] @@ -1116,36 +1085,25 @@ return iter(self.items()) def has_key(self, key): - cdef xmlNs* c_ns - cdef char* result - cdef char* c_tag - ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - if ns is None: - result = tree.xmlGetNoNsProp(self._c_node, c_tag) - else: - result = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) - if result is not NULL: - tree.xmlFree(result) + if key in self: return True else: return False def __contains__(self, key): - cdef xmlNs* c_ns - cdef char* result + cdef char* c_result cdef char* c_tag ns, tag = _getNsTag(key) c_tag = _cstr(tag) if ns is None: - result = tree.xmlGetNoNsProp(self._c_node, c_tag) - else: - result = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) - if result is not NULL: - tree.xmlFree(result) - return True + c_result = tree.xmlGetNoNsProp(self._c_node, c_tag) else: + c_result = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) + if c_result is NULL: return False + else: + tree.xmlFree(c_result) + return True cdef _Attrib _attribFactory(_Document doc, xmlNode* c_node): cdef _Attrib result From scoder at codespeak.net Fri May 19 14:01:20 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 14:01:20 +0200 (CEST) Subject: [Lxml-checkins] r27468 - lxml/trunk/src/lxml Message-ID: <20060519120120.6F02310063@code0.codespeak.net> Author: scoder Date: Fri May 19 
14:01:18 2006 New Revision: 27468 Modified: lxml/trunk/src/lxml/etree.pyx Log: be more conservative in ElementDepthFirstIterator to prevent possible problems under tree modifications, some cleanup Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 19 14:01:18 2006 @@ -1163,12 +1163,12 @@ tree it traverses is modified during iteration. """ # we keep Python references here to control GC - # keep next node to return and a stack of position state in the tree + # keep next node to return and a depth counter in the tree + cdef _NodeBase _next_node + cdef Py_ssize_t _depth cdef object _pystrings cdef char* _href cdef char* _name - cdef Py_ssize_t _depth - cdef _NodeBase _next_node def __init__(self, _NodeBase node not None, tag=None): self._next_node = node self._depth = 0 @@ -1226,7 +1226,7 @@ c_node = _findDepthFirstInFollowingSiblings( c_parent, self._href, self._name) - if c_node is NULL: + if c_node is NULL or not _isElement(c_parent): self._next_node = None return # all found, nothing left # we are at a sibling, so set c_parent to our parent From scoder at codespeak.net Fri May 19 14:21:15 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 14:21:15 +0200 (CEST) Subject: [Lxml-checkins] r27469 - lxml/trunk/src/lxml Message-ID: <20060519122115.8102210063@code0.codespeak.net> Author: scoder Date: Fri May 19 14:21:14 2006 New Revision: 27469 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/tree.pxd Log: some cleanup in _getNsTag, use cstd.strchr instead of tree.Strchr Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 19 14:21:14 2006 @@ -371,23 +371,26 @@ Return None for NS 
uri if no namespace URI available. """ cdef char* c_tag - cdef char* c_pos - cdef int nslen + cdef char* c_ns_end + cdef Py_ssize_t taglen + cdef Py_ssize_t nslen if isinstance(tag, QName): tag = (tag).text tag = _utf8(tag) c_tag = _cstr(tag) if c_tag[0] == c'{': - c_pos = tree.xmlStrchr(c_tag+1, c'}') - if c_pos is NULL: + c_tag = c_tag + 1 + c_ns_end = cstd.strchr(c_tag, c'}') + if c_ns_end is NULL: raise ValueError, "Invalid tag name" - nslen = c_pos - c_tag - 1 - ns = python.PyString_FromStringAndSize(c_tag+1, nslen) - tag = python.PyString_FromString(c_pos+1) + nslen = c_ns_end - c_tag + taglen = python.PyString_GET_SIZE(tag) - nslen - 2 + ns = python.PyString_FromStringAndSize(c_tag, nslen) + tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen) else: ns = None return ns, tag - + cdef object _namespacedName(xmlNode* c_node): cdef char* href cdef char* name Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Fri May 19 14:21:14 2006 @@ -6,6 +6,7 @@ ctypedef int size_t cdef int strlen(char* s) cdef char* strstr(char* haystack, char* needle) + cdef char* strchr(char* haystack, int needle) cdef int strcmp(char* s1, char* s2) cdef int strncmp(char* s1, char* s2, size_t len) cdef void* memcpy(void* dest, void* src, size_t len) Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri May 19 14:21:14 2006 @@ -236,7 +236,6 @@ cdef extern from "libxml/xmlstring.h": cdef char* xmlStrdup(char* cur) - cdef char* xmlStrchr(char* cur, char value) cdef extern from "etree.h": cdef int _isElement(xmlNode* node) From scoder at codespeak.net Fri May 19 14:22:12 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 14:22:12 +0200 (CEST) Subject: [Lxml-checkins] 
r27470 - lxml/trunk Message-ID: <20060519122212.ED7A710063@code0.codespeak.net> Author: scoder Date: Fri May 19 14:22:11 2006 New Revision: 27470 Modified: lxml/trunk/bench.py Log: forgotten debug output in bench.py Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 14:22:11 2006 @@ -615,7 +615,6 @@ selected = [] for name in sys.argv[1:]: selected.append(name) - print selected benchmarks = [ [ b for b in bs if [ match for match in selected if match in b[0] ] ] From scoder at codespeak.net Fri May 19 15:09:18 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 15:09:18 +0200 (CEST) Subject: [Lxml-checkins] r27473 - lxml/trunk Message-ID: <20060519130918.F0F5010061@code0.codespeak.net> Author: scoder Date: Fri May 19 15:09:17 2006 New Revision: 27473 Modified: lxml/trunk/bench.py Log: more forgotten debug code in bench.py Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 15:09:17 2006 @@ -281,7 +281,6 @@ @with_text(text=True, utext=True) def bench_tostring_utf8_unicode_XML(self, root): xml = unicode(self.etree.tostring(root, 'UTF-8'), 'UTF-8') - open("test%03d.txt" % len(root), 'w').write(xml.encode('UTF-8')) self.etree.XML(xml) @with_attributes(True) From scoder at codespeak.net Fri May 19 15:19:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 15:19:54 +0200 (CEST) Subject: [Lxml-checkins] r27474 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20060519131954.433D710063@code0.codespeak.net> Author: scoder Date: Fri May 19 15:19:53 2006 New Revision: 27474 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py Log: Element.__contains__ for quick check if an element has a certain child, some cleanup Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 19 15:19:53 2006 @@ -764,6 +764,13 @@ c_node = _findChildBackwards(self._c_node, 0) return c_node != NULL + def __contains__(self, element): + cdef xmlNode* c_node + if not isinstance(element, _NodeBase): + return 0 + c_node = (<_NodeBase>element)._c_node + return c_node is not NULL and c_node.parent is self._c_node + def __iter__(self): return ElementChildIterator(self) @@ -1020,9 +1027,9 @@ c_node = (self._c_node.properties) while c_node is not NULL: if c_node.type == tree.XML_ATTRIBUTE_NODE: - return True + return 1 c_node = c_node.next - return False + return 0 def __len__(self): cdef Py_ssize_t c @@ -1100,10 +1107,10 @@ else: c_result = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) if c_result is NULL: - return False + return 0 else: tree.xmlFree(c_result) - return True + return 1 cdef _Attrib _attribFactory(_Document doc, xmlNode* c_node): cdef _Attrib result Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri May 19 15:19:53 2006 @@ -89,6 +89,24 @@ self.assertEquals('two', root[1].tag) self.assertEquals('three', root[2].tag) + def test_element_contains(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + root1 = Element('root') + SubElement(root1, 'one') + self.assert_(root1[0] in root1) + + root2 = Element('root') + 
SubElement(root2, 'two') + SubElement(root2, 'three') + self.assert_(root2[0] in root2) + self.assert_(root2[1] in root2) + + self.assertFalse(root1[0] in root2) + self.assertFalse(root2[0] in root1) + self.assertFalse(None in root2) + def test_element_indexing_with_text(self): ElementTree = self.etree.ElementTree From scoder at codespeak.net Fri May 19 15:37:59 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 15:37:59 +0200 (CEST) Subject: [Lxml-checkins] r27475 - lxml/trunk Message-ID: <20060519133759.4AE8C10063@code0.codespeak.net> Author: scoder Date: Fri May 19 15:37:58 2006 New Revision: 27475 Modified: lxml/trunk/bench.py Log: fix tag names in tree 4 Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Fri May 19 15:37:58 2006 @@ -61,7 +61,7 @@ 'cElementTree' : 'cET' } - SEARCH_TAG = "{cdefg}00001" + SEARCH_TAG = "{cdefg}a00001" def __init__(self, etree): self.etree = etree @@ -183,17 +183,16 @@ def _setup_tree4(self, text, attributes): "small tree with 26 2nd level and 2 3rd level children" - atoz = self.atoz SubElement = self.etree.SubElement current_time = time.time t = current_time() root = self.etree.Element('{abc}rootnode') children = [root] - for ch1 in atoz: + for ch1 in self.atoz: el = SubElement(root, "{bcd}"+ch1*5, attributes) el.text = text - SubElement(el, "{cdefg}00001", attributes) - SubElement(el, "{cdefg}00002", attributes) + SubElement(el, "{cdefg}a00001", attributes) + SubElement(el, "{cdefg}a00002", attributes) t = current_time() - t return (root, t) From scoder at codespeak.net Fri May 19 15:47:02 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 19 May 2006 15:47:02 +0200 (CEST) Subject: [Lxml-checkins] r27476 - lxml/trunk/doc Message-ID: <20060519134702.B252710063@code0.codespeak.net> Author: scoder Date: Fri May 19 15:47:01 2006 New Revision: 27476 Modified: 
lxml/trunk/doc/performance.txt Log: update benchmark results in doc/performance.txt for bench.py changes Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri May 19 15:47:01 2006 @@ -172,21 +172,21 @@ algorithms can benefit from step-by-step traversal of the XML tree and especially if few elements are of interest, lxml is a good choice:: - lxe: getiterator_all (-- T2 ) 32.3100 msec/pass - cET: getiterator_all (-- T2 ) 37.2489 msec/pass - ET : getiterator_all (-- T2 ) 46.2996 msec/pass - - lxe: getiterator_islice (-- T2 ) 3.3567 msec/pass - cET: getiterator_islice (-- T2 ) 0.3289 msec/pass - ET : getiterator_islice (-- T2 ) 43.9938 msec/pass - - lxe: getiterator_tag (-- T2 ) 4.7438 msec/pass - cET: getiterator_tag (-- T2 ) 31.8628 msec/pass - ET : getiterator_tag (-- T2 ) 36.4583 msec/pass - - lxe: getiterator_tag_all (-- T2 ) 4.6267 msec/pass - cET: getiterator_tag_all (-- T2 ) 32.1669 msec/pass - ET : getiterator_tag_all (-- T2 ) 36.3365 msec/pass + lxe: getiterator_all (-- T2 ) 31.2719 msec/pass + cET: getiterator_all (-- T2 ) 36.3687 msec/pass + ET : getiterator_all (-- T2 ) 46.2846 msec/pass + + lxe: getiterator_islice (-- T2 ) 2.8503 msec/pass + cET: getiterator_islice (-- T2 ) 0.3299 msec/pass + ET : getiterator_islice (-- T2 ) 44.5898 msec/pass + + lxe: getiterator_tag (-- T2 ) 3.0983 msec/pass + cET: getiterator_tag (-- T2 ) 11.2861 msec/pass + ET : getiterator_tag (-- T2 ) 37.5661 msec/pass + + lxe: getiterator_tag_all (-- T2 ) 4.9760 msec/pass + cET: getiterator_tag_all (-- T2 ) 33.2602 msec/pass + ET : getiterator_tag_all (-- T2 ) 37.6200 msec/pass This similarly shows in ``Element.findall()``:: From scoder at codespeak.net Sat May 20 13:06:57 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 20 May 2006 13:06:57 +0200 (CEST) Subject: [Lxml-checkins] r27500 - 
lxml/trunk/src/lxml/tests Message-ID: <20060520110657.041C910072@code0.codespeak.net> Author: scoder Date: Sat May 20 13:06:56 2006 New Revision: 27500 Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/tests/test_unicode.py Log: fix encoding used in test cases Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Sat May 20 13:06:56 2006 @@ -8,7 +8,7 @@ import tempfile from common_imports import StringIO, etree, fileInTestDir -from common_imports import SillyFileLike, HelperTestCase, unentitify +from common_imports import SillyFileLike, HelperTestCase class HtmlParserTestCaseBase(HelperTestCase): """HTML parser test cases @@ -29,7 +29,7 @@ def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) - self.assertEqual(unentitify(self.etree.tostring(element)), + self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'), self.uhtml_str) def test_module_parse_html_error(self): @@ -67,15 +67,15 @@ parser = self.etree.HTMLParser() f = SillyFileLike(self.html_str) tree = self.etree.parse(f, parser) - html = self.etree.tostring(tree.getroot()) - self.assertEqual(unentitify(html), self.html_str) + html = self.etree.tostring(tree.getroot(), 'UTF-8') + self.assertEqual(html, self.html_str) def test_module_parse_html_filelike_unicode(self): parser = self.etree.HTMLParser() f = SillyFileLike(self.uhtml_str) tree = self.etree.parse(f, parser) - html = self.etree.tostring(tree.getroot()) - self.assertEqual(unentitify(html), self.uhtml_str) + html = self.etree.tostring(tree.getroot(), 'UTF-8') + self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) def test_html_file_error(self): parser = self.etree.HTMLParser() Modified: lxml/trunk/src/lxml/tests/test_unicode.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_unicode.py (original) +++ lxml/trunk/src/lxml/tests/test_unicode.py Sat May 20 13:06:56 2006 @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import unittest, doctest -from common_imports import StringIO, etree, SillyFileLike, unentitify +from common_imports import StringIO, etree, SillyFileLike ascii_uni = u'a' @@ -45,7 +45,8 @@ # parse unicode from unamed file object (not support by ElementTree) f = SillyFileLike(uxml) root = etree.parse(f).getroot() - self.assertEquals(unentitify(etree.tostring(root)), uxml) + self.assertEquals(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'), + uxml) def test_suite(): suite = unittest.TestSuite() From scoder at codespeak.net Sat May 20 13:08:24 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 20 May 2006 13:08:24 +0200 (CEST) Subject: [Lxml-checkins] r27501 - lxml/trunk/src/lxml/tests Message-ID: <20060520110824.945811006E@code0.codespeak.net> Author: scoder Date: Sat May 20 13:08:23 2006 New Revision: 27501 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: removed broken test case that used an invalid encoding Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat May 20 13:08:23 2006 @@ -1675,19 +1675,9 @@ a.text = u'S?k p? nettet' self.assertXML( u'S?k p? nettet'.encode('UTF-8'), - a) - - def test_encoding2(self): - ElementTree = self.etree.ElementTree - Element = self.etree.Element - - a = Element('a') - a.text = u'S?k p? nettet' - self.assertXML( - u'S?k p? 
nettet'.encode('UTF-8'), a, 'UTF-8') - def test_encoding3(self): + def test_encoding2(self): ElementTree = self.etree.ElementTree Element = self.etree.Element From scoder at codespeak.net Sat May 20 13:21:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 20 May 2006 13:21:11 +0200 (CEST) Subject: [Lxml-checkins] r27502 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20060520112111.53D3210070@code0.codespeak.net> Author: scoder Date: Sat May 20 13:21:08 2006 New Revision: 27502 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlwriter.pxi Log: some cleanup in encoding setup for tostring and write, raise LookupError if requested encoding cannot be found Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat May 20 13:21:08 2006 @@ -12,6 +12,8 @@ Bugs fixed ---------- +* Serialization functions now raise LookupError for unknown encodings + * Memory deallocation crash resulting from deep copying elements Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat May 20 13:21:08 2006 @@ -334,8 +334,12 @@ cdef _Document _doc cdef _NodeBase _context_node - # we have to take care here: the document may not have a root node! cdef _assertHasRoot(self): + """We have to take care here: the document may not have a root node! + This can happen if ElementTree() is called without any argument and + the caller 'forgets' to call parse() afterwards, so this is a bug in + the caller program. 
+ """ assert self._context_node is not None, \ "ElementTree not initialized, missing root" @@ -357,15 +361,20 @@ def __get__(self): return DocInfo(self._doc) - def write(self, file, encoding='us-ascii', pretty_print=False): + def write(self, file, encoding=None, pretty_print=False): + """Write the tree to a file or file-like object. + + Defaults to ASCII encoding. + """ self._assertHasRoot() - if encoding in ('utf8', 'UTF8', 'utf-8'): - encoding = 'UTF-8' - if encoding == 'UTF-8' or encoding == 'us-ascii': - # XXX this is purely for ElementTree compatibility.. + # suppress decl. in default case (purely for ElementTree compatibility) + if encoding is None: + encoding = 'ASCII' write_declaration = 0 else: - write_declaration = 1 + encoding = encoding.upper() + write_declaration = encoding not in \ + ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') _tofilelike(file, self._context_node, encoding, write_declaration, bool(pretty_print)) @@ -521,7 +530,7 @@ # MANIPULATORS - def __setitem__(self, Py_ssize_t index, _NodeBase element): + def __setitem__(self, Py_ssize_t index, _NodeBase element not None): cdef xmlNode* c_node cdef xmlNode* c_next c_node = _findChild(self._c_node, index) @@ -1370,16 +1379,24 @@ def dump(_NodeBase elem not None): _dumpToFile(sys.stdout, elem._c_node) -def tostring(element_or_tree, encoding='us-ascii', +def tostring(element_or_tree, encoding=None, xml_declaration=None, pretty_print=False): - "Serialize an element to an encoded string representation of its XML tree." + """Serialize an element to an encoded string representation of its XML + tree. + + Defaults to ASCII encoding without XML declaration. 
+ """ cdef int write_declaration cdef int c_pretty_print - encoding = str(encoding) + if encoding is None: + encoding = 'ASCII' + else: + encoding = encoding.upper() c_pretty_print = bool(pretty_print) if xml_declaration is None: # by default, write an XML declaration only for non-standard encodings - write_declaration = (encoding != 'us-ascii') + write_declaration = encoding not in \ + ('ASCII', 'UTF-8', 'UTF8', 'US-ASCII') else: write_declaration = bool(xml_declaration) @@ -1397,8 +1414,9 @@ tree. Note that the result does not carry an XML encoding declaration and is - therefore not necessarily suited for serialization without further - treatment.""" + therefore not necessarily suited for serialization to byte streams without + further treatment. + """ cdef int c_pretty_print c_pretty_print = bool(pretty_print) if isinstance(element_or_tree, _NodeBase): Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat May 20 13:21:08 2006 @@ -1731,6 +1731,14 @@ a.text = u'S?k p? nettet' self.assert_(tostring(a, 'UTF-8') in [xml, prologue + xml]) + def test_encoding_tostring_unknown(self): + Element = self.etree.Element + tostring = self.etree.tostring + + a = Element('a') + a.text = u'S?k p? 
nettet' + self.assertRaises(LookupError, tostring, a, 'Invalid Encoding') + def test_encoding_tostring_sub(self): Element = self.etree.Element SubElement = self.etree.SubElement Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat May 20 13:21:08 2006 @@ -38,7 +38,7 @@ cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(int enc) cdef int xmlDetectCharEncoding(char* text, int len) - cdef char* xmlGetCharEncodingName(int enc) + cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable Modified: lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlwriter.pxi (original) +++ lxml/trunk/src/lxml/xmlwriter.pxi Sat May 20 13:21:08 2006 @@ -12,12 +12,16 @@ cdef char* c_version if element is None: return None - if encoding in ('utf8', 'UTF8', 'utf-8'): - encoding = 'UTF-8' - c_enc = encoding + if encoding is None: + c_enc = NULL + else: + c_enc = encoding # it is necessary to *and* find the encoding handler *and* use # encoding during output enchandler = tree.xmlFindCharEncodingHandler(c_enc) + if enchandler is NULL: + raise LookupError, python.PyString_FromFormat( + "unknown encoding: '%s'", c_enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) if c_buffer is NULL: raise LxmlError, "Failed to create output buffer" @@ -146,8 +150,11 @@ c_enc = NULL else: c_enc = encoding - enchandler = tree.xmlFindCharEncodingHandler(c_enc) + if enchandler is NULL: + raise LookupError, python.PyString_FromFormat( + "unknown encoding: '%s'", c_enc) + if python.PyString_Check(f) or python.PyUnicode_Check(f): filename = _utf8(f) c_buffer = tree.xmlOutputBufferCreateFilename( From scoder at codespeak.net Sat May 20 13:27:33 2006 From: 
scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 20 May 2006 13:27:33 +0200 (CEST) Subject: [Lxml-checkins] r27503 - lxml/trunk Message-ID: <20060520112733.78B0910070@code0.codespeak.net> Author: scoder Date: Sat May 20 13:27:32 2006 New Revision: 27503 Modified: lxml/trunk/CHANGES.txt Log: CHANGES.txt: make 'deep copy maintains doc info' feature a fixed bug Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat May 20 13:27:32 2006 @@ -7,11 +7,11 @@ Features added -------------- -* Deep copying Elements and ElementTrees maintains the document information - Bugs fixed ---------- +* Deep copying Elements and ElementTrees maintains the document information + * Serialization functions now raise LookupError for unknown encodings * Memory deallocation crash resulting from deep copying elements From scoder at codespeak.net Sat May 20 19:01:14 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 20 May 2006 19:01:14 +0200 (CEST) Subject: [Lxml-checkins] r27513 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20060520170114.E0CFB10070@code0.codespeak.net> Author: scoder Date: Sat May 20 19:01:11 2006 New Revision: 27513 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/compatibility.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_unicode.py lxml/trunk/src/lxml/tree.pxd Log: implemented setting text of comments, make whitespace handling around comment texts consistent in lxml (not ET compatible on serialization!) 
Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat May 20 19:01:11 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Comment texts can now be changed through the API + Bugs fixed ---------- Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Sat May 20 19:01:11 2006 @@ -81,18 +81,22 @@ will be hard to solve. It won't affect some applications, but if you want to port code you must unfortunately make sure that it doesn't. -* ElementTree has a bug when serializing an empty Comment (no text argument - given) to XML, etree serializes this successfully. - * When trying to set a subelement using __setitem__ that is in fact not an Element but some other object, etree raises a TypeError, and ElementTree raises an AssertionError. This also applies to some other places of the API. In general, etree tries to avoid AssertionErrors in favour of being more specific about the reason for the exception. +* ElementTree has a bug when serializing an empty Comment (no text argument + given) to XML, etree serializes this successfully. + * ElementTree ignores comments when parsing XML, while etree will read them in and treat them as Comment elements. +* ElementTree adds whitespace around comments on serialization, lxml does + not. This means that a comment text "text" that ElementTree serializes as + "<!-- text -->" will become "<!--text-->" in lxml. + * Because etree is built on top of libxml2, which is namespace prefix aware, etree preserves namespaces declarations and prefixes while ElementTree tends to come up with its own prefixes (ns0, ns1, etc). 
When no namespace prefix Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat May 20 19:01:11 2006 @@ -960,11 +960,30 @@ property text: def __get__(self): - return funicode(self._c_node.content) + if self._c_node.content is NULL: + return '' + else: + return funicode(self._c_node.content) def __set__(self, value): - pass - + cdef tree.xmlDict* c_dict + cdef char* c_text + if value is None: + value = '' + else: + value = _utf8(value) + c_text = self._c_node.content + if c_text is not NULL: + if self._c_node.doc is not NULL: + c_dict = self._c_node.doc.dict + else: + c_dict = NULL + # this code is copied from libxml2's DICT_FREE + if c_dict is NULL or \ + tree.xmlDictOwns(c_dict, c_text) == 0: + tree.xmlFree(c_text) + self._c_node.content = tree.xmlStrdup(_cstr(value)) + # ACCESSORS def __repr__(self): return "" % self.text @@ -1307,12 +1326,12 @@ cdef xmlNode* c_node cdef xmlDoc* c_doc if text is None: - text = ' ' + text = '' else: - text = ' %s ' % _utf8(text) + text = _utf8(text) c_doc = _newDoc() doc = _documentFactory(c_doc, None) - c_node = _createComment(c_doc, text) + c_node = _createComment(c_doc, _cstr(text)) tree.xmlAddChild(c_doc, c_node) return _commentFactory(doc, c_node) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat May 20 19:01:11 2006 @@ -706,9 +706,19 @@ a = Element('a') a.append(Comment('foo')) - self.assertXML( - '', - a) + self.assertEqual(a[0].text, 'foo') + + def test_comment_text(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + Comment = self.etree.Comment + + a = Element('a') + a.append(Comment('foo')) + self.assertEqual(a[0].text, 'foo') + + a[0].text = "TEST" + 
self.assertEqual(a[0].text, 'TEST') def test_comment_whitespace(self): Element = self.etree.Element @@ -717,9 +727,7 @@ a = Element('a') a.append(Comment(' foo ')) - self.assertXML( - '', - a) + self.assertEqual(a[0].text, ' foo ') def test_comment_nonsense(self): Comment = self.etree.Comment Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat May 20 19:01:11 2006 @@ -144,10 +144,26 @@ a = Element('a') a.append(Comment()) self.assertEquals( - '', + '', self._writeElement(a)) - # ignores Comment in ElementTree + # ElementTree ignores comments + def test_comment_parse_empty(self): + ElementTree = self.etree.ElementTree + tostring = self.etree.tostring + + xml = '' + f = StringIO(xml) + doc = ElementTree(file=f) + a = doc.getroot() + self.assertEquals( + '', + a[1].text) + self.assertEquals( + xml, + tostring(a)) + + # ElementTree ignores comments def test_comment_no_proxy_yet(self): ElementTree = self.etree.ElementTree @@ -158,6 +174,35 @@ ' hoi ', a[1].text) + # ElementTree adds whitespace around comments + def test_comment_text(self): + Element = self.etree.Element + Comment = self.etree.Comment + tostring = self.etree.tostring + + a = Element('a') + a.append(Comment('foo')) + self.assertEquals( + '', + tostring(a)) + + a[0].text = "TEST" + self.assertEquals( + '', + tostring(a)) + + # ElementTree adds whitespace around comments + def test_comment_whitespace(self): + Element = self.etree.Element + Comment = self.etree.Comment + tostring = self.etree.tostring + + a = Element('a') + a.append(Comment(' foo ')) + self.assertEquals( + '', + tostring(a)) + # test weird dictionary interaction leading to segfault previously def test_weird_dict_interaction(self): root = self.etree.Element('root') Modified: lxml/trunk/src/lxml/tests/test_unicode.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_unicode.py (original) +++ lxml/trunk/src/lxml/tests/test_unicode.py Sat May 20 19:01:11 2006 @@ -35,7 +35,7 @@ def test_unicode_comment(self): el = etree.Comment(uni) - self.assertEquals(' %s ' % uni, el.text) + self.assertEquals(uni, el.text) def test_unicode_parse_stringio(self): el = etree.parse(StringIO(u'

%s

' % uni)).getroot() Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat May 20 19:01:11 2006 @@ -51,6 +51,7 @@ # for some reason need to define this in this section; # libxml/dict.h appears to be broken to include in C ctypedef struct xmlDict + cdef int xmlDictOwns(xmlDict* dict, char* name) ctypedef struct xmlDoc ctypedef struct xmlAttr From scoder at codespeak.net Sun May 21 19:47:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 19:47:13 +0200 (CEST) Subject: [Lxml-checkins] r27542 - in lxml/trunk: . src/lxml Message-ID: <20060521174713.7865910075@code0.codespeak.net> Author: scoder Date: Sun May 21 19:47:05 2006 New Revision: 27542 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlwriter.pxi Log: fix memory leak when using iconv converters, support pretty_print in dump() defaulting to True (as it is for debug anyway), some cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun May 21 19:47:05 2006 @@ -12,6 +12,8 @@ Bugs fixed ---------- +* Memory leak when using iconv encoders in tostring/write + * Deep copying Elements and ElementTrees maintains the document information * Serialization functions now raise LookupError for unknown encodings Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sun May 21 19:47:05 2006 @@ -1395,8 +1395,8 @@ def iselement(element): return isinstance(element, _Element) -def dump(_NodeBase elem not None): - _dumpToFile(sys.stdout, elem._c_node) +def dump(_NodeBase elem not None, pretty_print=True): + 
_dumpToFile(sys.stdout, elem._c_node, bool(pretty_print)) def tostring(element_or_tree, encoding=None, xml_declaration=None, pretty_print=False): Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sun May 21 19:47:05 2006 @@ -37,6 +37,7 @@ ctypedef struct xmlCharEncodingHandler cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(int enc) + cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler) cdef int xmlDetectCharEncoding(char* text, int len) cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) Modified: lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlwriter.pxi (original) +++ lxml/trunk/src/lxml/xmlwriter.pxi Sun May 21 19:47:05 2006 @@ -24,6 +24,7 @@ "unknown encoding: '%s'", c_enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) if c_buffer is NULL: + tree.xmlCharEncCloseFunc(enchandler) raise LxmlError, "Failed to create output buffer" try: @@ -39,6 +40,7 @@ tree.xmlBufferLength(c_result_buffer)) finally: tree.xmlOutputBufferClose(c_buffer) + tree.xmlCharEncCloseFunc(enchandler) return result cdef _tounicode(_NodeBase element, int pretty_print): @@ -86,7 +88,6 @@ tree.xmlOutputBufferWriteString(c_buffer, encoding) tree.xmlOutputBufferWriteString(c_buffer, "'?>\n") - cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, char* encoding, int pretty_print): "Write the element tail." 
@@ -98,7 +99,7 @@ # output to file-like objects -cdef class _FileWriter: +cdef class _FilelikeWriter: cdef object _filelike cdef _ExceptionContext _exc_context def __init__(self, filelike, exc_context=None): @@ -135,14 +136,14 @@ return 0 cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int len): - return (<_FileWriter>ctxt).write(c_buffer, len) + return (<_FilelikeWriter>ctxt).write(c_buffer, len) cdef int _closeFilelikeWriter(void* ctxt): - return (<_FileWriter>ctxt).close() + return (<_FilelikeWriter>ctxt).close() cdef _tofilelike(f, _NodeBase element, encoding, int write_xml_declaration, int pretty_print): - cdef _FileWriter writer + cdef _FilelikeWriter writer cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc @@ -160,26 +161,28 @@ c_buffer = tree.xmlOutputBufferCreateFilename( _cstr(filename), enchandler, 0) elif hasattr(f, 'write'): - writer = _FileWriter(f) + writer = _FilelikeWriter(f) c_buffer = writer._createOutputBuffer(enchandler) else: + tree.xmlCharEncCloseFunc(enchandler) raise TypeError, "File or filename expected, got '%s'" % type(f) _writeNodeToBuffer(c_buffer, element._c_node, c_enc, write_xml_declaration, pretty_print) tree.xmlOutputBufferClose(c_buffer) + tree.xmlCharEncCloseFunc(enchandler) if writer is not None: writer._exc_context._raise_if_stored() # dump node to file (mainly for debug) -cdef _dumpToFile(f, xmlNode* c_node): +cdef _dumpToFile(f, xmlNode* c_node, int pretty_print): cdef tree.xmlOutputBuffer* c_buffer if not python.PyFile_Check(f): raise ValueError, "Not a file" c_buffer = tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(f), NULL) - tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, 0, NULL) + tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, pretty_print, NULL) _writeTail(c_buffer, c_node, NULL, 0) tree.xmlOutputBufferWriteString(c_buffer, '\n') tree.xmlOutputBufferFlush(c_buffer) From scoder at codespeak.net Sun May 21 20:18:17 2006 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 20:18:17 +0200 (CEST) Subject: [Lxml-checkins] r27543 - lxml/trunk/src/lxml Message-ID: <20060521181817.1314F10076@code0.codespeak.net> Author: scoder Date: Sun May 21 20:18:16 2006 New Revision: 27543 Added: lxml/trunk/src/lxml/serializer.pxi - copied unchanged from r27542, lxml/trunk/src/lxml/xmlwriter.pxi Removed: lxml/trunk/src/lxml/xmlwriter.pxi Modified: lxml/trunk/src/lxml/etree.pyx Log: renamed xmlwriter.pxi to serializer.pxi as xmlwriter is misleading in libxml2 context Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sun May 21 20:18:16 2006 @@ -1462,7 +1462,7 @@ include "nsclasses.pxi" # Namespace implementation and registry include "docloader.pxi" # Support for custom document loaders include "parser.pxi" # XML Parser -include "xmlwriter.pxi" # XML output functions +include "serializer.pxi" # XML output functions include "xmlid.pxi" # XMLID and IDDict include "extensions.pxi" # XPath/XSLT extension functions include "xpath.pxi" # XPath evaluation Deleted: /lxml/trunk/src/lxml/xmlwriter.pxi ============================================================================== --- /lxml/trunk/src/lxml/xmlwriter.pxi Sun May 21 20:18:16 2006 +++ (empty file) @@ -1,188 +0,0 @@ -# XML serialization and output functions - -tree.xmlKeepBlanksDefault(0) - -cdef _tostring(_NodeBase element, encoding, - int write_xml_declaration, int pretty_print): - "Serialize an element to an encoded string representation of its XML tree." 
- cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlBuffer* c_result_buffer - cdef tree.xmlCharEncodingHandler* enchandler - cdef char* c_enc - cdef char* c_version - if element is None: - return None - if encoding is None: - c_enc = NULL - else: - c_enc = encoding - # it is necessary to *and* find the encoding handler *and* use - # encoding during output - enchandler = tree.xmlFindCharEncodingHandler(c_enc) - if enchandler is NULL: - raise LookupError, python.PyString_FromFormat( - "unknown encoding: '%s'", c_enc) - c_buffer = tree.xmlAllocOutputBuffer(enchandler) - if c_buffer is NULL: - tree.xmlCharEncCloseFunc(enchandler) - raise LxmlError, "Failed to create output buffer" - - try: - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, - write_xml_declaration, pretty_print) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - c_result_buffer = c_buffer.conv - else: - c_result_buffer = c_buffer.buffer - result = python.PyString_FromStringAndSize( - tree.xmlBufferContent(c_result_buffer), - tree.xmlBufferLength(c_result_buffer)) - finally: - tree.xmlOutputBufferClose(c_buffer) - tree.xmlCharEncCloseFunc(enchandler) - return result - -cdef _tounicode(_NodeBase element, int pretty_print): - "Serialize an element to the Python unicode representation of its XML tree." 
- cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlBuffer* c_result_buffer - if element is None: - return None - c_buffer = tree.xmlAllocOutputBuffer(NULL) - if c_buffer is NULL: - raise LxmlError, "Failed to create output buffer" - try: - _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0, pretty_print) - tree.xmlOutputBufferFlush(c_buffer) - if c_buffer.conv is not NULL: - c_result_buffer = c_buffer.conv - else: - c_result_buffer = c_buffer.buffer - result = python.PyUnicode_DecodeUTF8( - tree.xmlBufferContent(c_result_buffer), - tree.xmlBufferLength(c_result_buffer), - 'strict') - finally: - tree.xmlOutputBufferClose(c_buffer) - return result - -cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, - xmlNode* c_node, char* encoding, - int write_xml_declaration, int pretty_print): - cdef xmlDoc* c_doc - c_doc = c_node.doc - if write_xml_declaration: - _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) - - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, pretty_print, encoding) - _writeTail(c_buffer, c_node, encoding, pretty_print) - -cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, - char* version, char* encoding): - if version is NULL: - version = "1.0" - tree.xmlOutputBufferWriteString(c_buffer, "\n") - -cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, - char* encoding, int pretty_print): - "Write the element tail." 
- c_node = c_node.next - while c_node is not NULL and c_node.type == tree.XML_TEXT_NODE: - tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, - pretty_print, encoding) - c_node = c_node.next - -# output to file-like objects - -cdef class _FilelikeWriter: - cdef object _filelike - cdef _ExceptionContext _exc_context - def __init__(self, filelike, exc_context=None): - self._filelike = filelike - if exc_context is None: - self._exc_context = _ExceptionContext() - else: - self._exc_context = exc_context - - cdef tree.xmlOutputBuffer* _createOutputBuffer( - self, tree.xmlCharEncodingHandler* enchandler) except NULL: - cdef tree.xmlOutputBuffer* c_buffer - c_buffer = tree.xmlOutputBufferCreateIO( - _writeFilelikeWriter, _closeFilelikeWriter, - self, enchandler) - if c_buffer is NULL: - raise IOError, "Could not create I/O writer context." - return c_buffer - - cdef int write(self, char* c_buffer, int len): - try: - if self._filelike is None: - raise IOError, "File is already closed" - py_buffer = python.PyString_FromStringAndSize(c_buffer, len) - self._filelike.write(py_buffer) - return len - except Exception: - self._exc_context._store_raised() - return -1 - - cdef int close(self): - # we should not close the file here as we didn't open it - self._filelike = None - return 0 - -cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int len): - return (<_FilelikeWriter>ctxt).write(c_buffer, len) - -cdef int _closeFilelikeWriter(void* ctxt): - return (<_FilelikeWriter>ctxt).close() - -cdef _tofilelike(f, _NodeBase element, encoding, - int write_xml_declaration, int pretty_print): - cdef _FilelikeWriter writer - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlCharEncodingHandler* enchandler - cdef char* c_enc - if encoding is None: - c_enc = NULL - else: - c_enc = encoding - enchandler = tree.xmlFindCharEncodingHandler(c_enc) - if enchandler is NULL: - raise LookupError, python.PyString_FromFormat( - "unknown encoding: '%s'", c_enc) - - if 
python.PyString_Check(f) or python.PyUnicode_Check(f): - filename = _utf8(f) - c_buffer = tree.xmlOutputBufferCreateFilename( - _cstr(filename), enchandler, 0) - elif hasattr(f, 'write'): - writer = _FilelikeWriter(f) - c_buffer = writer._createOutputBuffer(enchandler) - else: - tree.xmlCharEncCloseFunc(enchandler) - raise TypeError, "File or filename expected, got '%s'" % type(f) - - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, - write_xml_declaration, pretty_print) - - tree.xmlOutputBufferClose(c_buffer) - tree.xmlCharEncCloseFunc(enchandler) - if writer is not None: - writer._exc_context._raise_if_stored() - -# dump node to file (mainly for debug) - -cdef _dumpToFile(f, xmlNode* c_node, int pretty_print): - cdef tree.xmlOutputBuffer* c_buffer - if not python.PyFile_Check(f): - raise ValueError, "Not a file" - c_buffer = tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(f), NULL) - tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, pretty_print, NULL) - _writeTail(c_buffer, c_node, NULL, 0) - tree.xmlOutputBufferWriteString(c_buffer, '\n') - tree.xmlOutputBufferFlush(c_buffer) From scoder at codespeak.net Sun May 21 20:46:51 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 20:46:51 +0200 (CEST) Subject: [Lxml-checkins] r27544 - lxml/branch/xmlsave Message-ID: <20060521184651.36C2310076@code0.codespeak.net> Author: scoder Date: Sun May 21 20:46:50 2006 New Revision: 27544 Added: lxml/branch/xmlsave/ - copied from r27543, lxml/trunk/ Log: new branch for libxml2 xmlsave output support (XMLFormatter) From scoder at codespeak.net Sun May 21 21:46:45 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 21:46:45 +0200 (CEST) Subject: [Lxml-checkins] r27552 - in lxml/trunk: . 
src/lxml Message-ID: <20060521194645.582A810076@code0.codespeak.net> Author: scoder Date: Sun May 21 21:46:43 2006 New Revision: 27552 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/xmlerror.pxi Log: added last_error attribute in _ErrorLog to access last error or fatal error, some cleanup in xmlerror.pxi Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun May 21 21:46:43 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Error logs now have a ``last_error`` attribute for convenience + * Comment texts can now be changed through the API Bugs fixed Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sun May 21 21:46:43 2006 @@ -74,11 +74,13 @@ cdef class _BaseErrorLog: "Immutable base version of an error log." cdef object _entries - def __init__(self, entries): + cdef readonly object last_error + def __init__(self, entries, last_error=None): self._entries = entries + self.last_error = last_error def copy(self): - return _BaseErrorLog(self._entries) + return _BaseErrorLog(self._entries, self.last_error) def __iter__(self): return iter(self._entries) @@ -145,19 +147,7 @@ "Convenience method to get all warnings or worse." 
return self.filter_from_level(ErrorLevels.WARNING) -cdef class _ErrorLog(_BaseErrorLog): - def __init__(self): - _BaseErrorLog.__init__(self, []) - - def clear(self): - del self._entries[:] - - def copy(self): - return _BaseErrorLog(self._entries[:]) - - def __iter__(self): - return iter(self._entries[:]) - +cdef class _ExtensibleErrorLog(_BaseErrorLog): cdef void connect(self): del self._entries[:] xmlerror.xmlSetStructuredErrorFunc(self, _receiveError) @@ -166,12 +156,16 @@ xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) cdef void _receive(self, xmlerror.xmlError* error): + cdef int level cdef _LogEntry entry entry = _LogEntry() entry._set(error) if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) self.receive(entry) + level = error.level + if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + self.last_error = entry cdef void _receiveGeneric(self, int domain, int type, int level, int line, message, filename): @@ -181,35 +175,52 @@ if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) self.receive(entry) + if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + self.last_error = entry + +cdef class _ErrorLog(_ExtensibleErrorLog): + def __init__(self): + _ExtensibleErrorLog.__init__(self, []) + + def clear(self): + del self._entries[:] + + def copy(self): + return _BaseErrorLog(self._entries[:], self.last_error) + + def __iter__(self): + return iter(self._entries[:]) def receive(self, entry): python.PyList_Append(self._entries, entry) cdef class _DomainErrorLog(_ErrorLog): - def receive(self, entry): - if entry.domain in self._accepted_domains: - _ErrorLog.receive(self, entry) def __init__(self, domains): _ErrorLog.__init__(self) self._accepted_domains = tuple(domains) + def receive(self, entry): + if entry.domain in self._accepted_domains: + _ErrorLog.receive(self, entry) + cdef class _RotatingErrorLog(_ErrorLog): cdef int _max_len def __init__(self, max_len): _ErrorLog.__init__(self) 
self._max_len = max_len + def receive(self, entry): entries = self._entries if python.PyList_GET_SIZE(entries) > self._max_len: del entries[0] python.PyList_Append(entries, entry) -cdef class PyErrorLog(_ErrorLog): +cdef class PyErrorLog(_ExtensibleErrorLog): cdef object _log cdef object _level_map cdef object _varsOf def __init__(self, logger_name=None): - _ErrorLog.__init__(self) + _ExtensibleErrorLog.__init__(self, []) import logging self._level_map = { ErrorLevels.WARNING : logging.WARNING, From scoder at codespeak.net Sun May 21 21:49:03 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 21:49:03 +0200 (CEST) Subject: [Lxml-checkins] r27553 - in lxml/branch/xmlsave: . src/lxml Message-ID: <20060521194903.625B310076@code0.codespeak.net> Author: scoder Date: Sun May 21 21:49:01 2006 New Revision: 27553 Modified: lxml/branch/xmlsave/CHANGES.txt lxml/branch/xmlsave/src/lxml/xmlerror.pxi Log: merged in ErrorLog.last_error support from trunk Modified: lxml/branch/xmlsave/CHANGES.txt ============================================================================== --- lxml/branch/xmlsave/CHANGES.txt (original) +++ lxml/branch/xmlsave/CHANGES.txt Sun May 21 21:49:01 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Error logs now have a ``last_error`` attribute for convenience + * Comment texts can now be changed through the API Bugs fixed Modified: lxml/branch/xmlsave/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/xmlerror.pxi (original) +++ lxml/branch/xmlsave/src/lxml/xmlerror.pxi Sun May 21 21:49:01 2006 @@ -74,11 +74,13 @@ cdef class _BaseErrorLog: "Immutable base version of an error log." 
cdef object _entries - def __init__(self, entries): + cdef readonly object last_error + def __init__(self, entries, last_error=None): self._entries = entries + self.last_error = last_error def copy(self): - return _BaseErrorLog(self._entries) + return _BaseErrorLog(self._entries, self.last_error) def __iter__(self): return iter(self._entries) @@ -145,19 +147,7 @@ "Convenience method to get all warnings or worse." return self.filter_from_level(ErrorLevels.WARNING) -cdef class _ErrorLog(_BaseErrorLog): - def __init__(self): - _BaseErrorLog.__init__(self, []) - - def clear(self): - del self._entries[:] - - def copy(self): - return _BaseErrorLog(self._entries[:]) - - def __iter__(self): - return iter(self._entries[:]) - +cdef class _ExtensibleErrorLog(_BaseErrorLog): cdef void connect(self): del self._entries[:] xmlerror.xmlSetStructuredErrorFunc(self, _receiveError) @@ -166,12 +156,16 @@ xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) cdef void _receive(self, xmlerror.xmlError* error): + cdef int level cdef _LogEntry entry entry = _LogEntry() entry._set(error) if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) self.receive(entry) + level = error.level + if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + self.last_error = entry cdef void _receiveGeneric(self, int domain, int type, int level, int line, message, filename): @@ -181,35 +175,52 @@ if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) self.receive(entry) + if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + self.last_error = entry + +cdef class _ErrorLog(_ExtensibleErrorLog): + def __init__(self): + _ExtensibleErrorLog.__init__(self, []) + + def clear(self): + del self._entries[:] + + def copy(self): + return _BaseErrorLog(self._entries[:], self.last_error) + + def __iter__(self): + return iter(self._entries[:]) def receive(self, entry): python.PyList_Append(self._entries, entry) cdef class _DomainErrorLog(_ErrorLog): - def 
receive(self, entry): - if entry.domain in self._accepted_domains: - _ErrorLog.receive(self, entry) def __init__(self, domains): _ErrorLog.__init__(self) self._accepted_domains = tuple(domains) + def receive(self, entry): + if entry.domain in self._accepted_domains: + _ErrorLog.receive(self, entry) + cdef class _RotatingErrorLog(_ErrorLog): cdef int _max_len def __init__(self, max_len): _ErrorLog.__init__(self) self._max_len = max_len + def receive(self, entry): entries = self._entries if python.PyList_GET_SIZE(entries) > self._max_len: del entries[0] python.PyList_Append(entries, entry) -cdef class PyErrorLog(_ErrorLog): +cdef class PyErrorLog(_ExtensibleErrorLog): cdef object _log cdef object _level_map cdef object _varsOf def __init__(self, logger_name=None): - _ErrorLog.__init__(self) + _ExtensibleErrorLog.__init__(self, []) import logging self._level_map = { ErrorLevels.WARNING : logging.WARNING, From scoder at codespeak.net Sun May 21 22:00:14 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 22:00:14 +0200 (CEST) Subject: [Lxml-checkins] r27554 - in lxml/trunk: doc src/lxml Message-ID: <20060521200014.55DF110075@code0.codespeak.net> Author: scoder Date: Sun May 21 22:00:13 2006 New Revision: 27554 Modified: lxml/trunk/doc/api.txt lxml/trunk/src/lxml/xmlerror.pxi Log: doctest and bug fix for last_error attribute Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Sun May 21 22:00:13 2006 @@ -122,6 +122,13 @@ >>> print entry.domain_name, entry.type_name, entry.filename PARSER ERR_TAG_NOT_FINISHED +There is also a convenience attribute ``last_error`` that returns the last +error:: + + >>> entry = e.error_log.last_error + >>> print entry.domain_name, entry.type_name, entry.filename + PARSER ERR_TAG_NOT_FINISHED + Python unicode strings ---------------------- Modified: lxml/trunk/src/lxml/xmlerror.pxi 
============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sun May 21 22:00:13 2006 @@ -156,15 +156,18 @@ xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) cdef void _receive(self, xmlerror.xmlError* error): - cdef int level + cdef int is_error cdef _LogEntry entry entry = _LogEntry() entry._set(error) + is_error = error.level == xmlerror.XML_ERR_ERROR or \ + error.level == xmlerror.XML_ERR_FATAL if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry self.receive(entry) - level = error.level - if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + if is_error: self.last_error = entry cdef void _receiveGeneric(self, int domain, int type, int level, int line, @@ -172,10 +175,14 @@ cdef _LogEntry entry entry = _LogEntry() entry._setGeneric(domain, type, level, line, message, filename) + is_error = level == xmlerror.XML_ERR_ERROR or \ + level == xmlerror.XML_ERR_FATAL if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry self.receive(entry) - if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + if is_error: self.last_error = entry cdef class _ErrorLog(_ExtensibleErrorLog): From scoder at codespeak.net Sun May 21 22:00:52 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 22:00:52 +0200 (CEST) Subject: [Lxml-checkins] r27555 - in lxml/branch/xmlsave: doc src/lxml Message-ID: <20060521200052.7D0B610075@code0.codespeak.net> Author: scoder Date: Sun May 21 22:00:51 2006 New Revision: 27555 Modified: lxml/branch/xmlsave/doc/api.txt lxml/branch/xmlsave/src/lxml/xmlerror.pxi Log: merged in bug fix for ErrorLog.last_error support from trunk Modified: lxml/branch/xmlsave/doc/api.txt ============================================================================== 
--- lxml/branch/xmlsave/doc/api.txt (original) +++ lxml/branch/xmlsave/doc/api.txt Sun May 21 22:00:51 2006 @@ -122,6 +122,13 @@ >>> print entry.domain_name, entry.type_name, entry.filename PARSER ERR_TAG_NOT_FINISHED +There is also a convenience attribute ``last_error`` that returns the last +error:: + + >>> entry = e.error_log.last_error + >>> print entry.domain_name, entry.type_name, entry.filename + PARSER ERR_TAG_NOT_FINISHED + Python unicode strings ---------------------- Modified: lxml/branch/xmlsave/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/xmlerror.pxi (original) +++ lxml/branch/xmlsave/src/lxml/xmlerror.pxi Sun May 21 22:00:51 2006 @@ -156,15 +156,18 @@ xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) cdef void _receive(self, xmlerror.xmlError* error): - cdef int level + cdef int is_error cdef _LogEntry entry entry = _LogEntry() entry._set(error) + is_error = error.level == xmlerror.XML_ERR_ERROR or \ + error.level == xmlerror.XML_ERR_FATAL if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry self.receive(entry) - level = error.level - if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + if is_error: self.last_error = entry cdef void _receiveGeneric(self, int domain, int type, int level, int line, @@ -172,10 +175,14 @@ cdef _LogEntry entry entry = _LogEntry() entry._setGeneric(domain, type, level, line, message, filename) + is_error = level == xmlerror.XML_ERR_ERROR or \ + level == xmlerror.XML_ERR_FATAL if __GLOBAL_ERROR_LOG is not self: __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry self.receive(entry) - if level == xmlerror.XML_ERR_ERROR or level == xmlerror.XML_ERR_FATAL: + if is_error: self.last_error = entry cdef class _ErrorLog(_ExtensibleErrorLog): From scoder at codespeak.net Sun May 21 22:54:11 2006 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Sun, 21 May 2006 22:54:11 +0200 (CEST) Subject: [Lxml-checkins] r27556 - lxml/branch/xmlsave/src/lxml Message-ID: <20060521205411.15D5A10076@code0.codespeak.net> Author: scoder Date: Sun May 21 22:54:10 2006 New Revision: 27556 Modified: lxml/branch/xmlsave/src/lxml/etree.pyx lxml/branch/xmlsave/src/lxml/serializer.pxi lxml/branch/xmlsave/src/lxml/tree.pxd Log: first shot on XMLFormatter, two entity encoding related test cases still fail Modified: lxml/branch/xmlsave/src/lxml/etree.pyx ============================================================================== --- lxml/branch/xmlsave/src/lxml/etree.pyx (original) +++ lxml/branch/xmlsave/src/lxml/etree.pyx Sun May 21 22:54:10 2006 @@ -361,7 +361,7 @@ def __get__(self): return DocInfo(self._doc) - def write(self, file, encoding=None, pretty_print=False): + def write(self, file, encoding=None, formatter=None): """Write the tree to a file or file-like object. Defaults to ASCII encoding. @@ -375,8 +375,7 @@ encoding = encoding.upper() write_declaration = encoding not in \ ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') - _tofilelike(file, self._context_node, encoding, - write_declaration, bool(pretty_print)) + _tofilelike(file, self._context_node, encoding, formatter) def getiterator(self, tag=None): root = self.getroot() Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Sun May 21 22:54:10 2006 @@ -2,6 +2,83 @@ tree.xmlKeepBlanksDefault(0) +class XMLOutputError(LxmlError): + pass + +cdef class XMLFormatter: + cdef int _pretty_print + cdef int _write_declaration + cdef int _save_options + cdef _ErrorLog _error_log + + def __init__(self, pretty_print=False, write_declaration=None, + split_empty_tags=False): + cdef int save_options + save_options = tree.XML_SAVE_NO_XHTML + + if pretty_print: + 
self._pretty_print = True + save_options = save_options | tree.XML_SAVE_FORMAT + else: + self._pretty_print = False + + if split_empty_tags: + save_options = save_options | tree.XML_SAVE_NO_EMPTY + + if write_declaration is None: + self._write_declaration = -1 + elif write_declaration: + self._write_declaration = True + else: + self._write_declaration = False + save_options = save_options | tree.XML_SAVE_NO_DECL + + self._save_options = save_options + self._error_log = _ErrorLog() + + property error_log: + def __get__(self): + return self._error_log.copy() + + cdef int _optionsForEncoding(self, encoding): + cdef int save_options + if self._write_declaration != -1: + return self._save_options + # purely for ElementTree compatibility: suppress decl. in default cases + save_options = self._save_options + if encoding is None: + save_options = save_options | tree.XML_SAVE_NO_DECL + elif encoding.upper() in ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8'): + save_options = save_options | tree.XML_SAVE_NO_DECL + return save_options + + cdef int _saveNode(self, tree.xmlSaveCtxt* save_ctxt, + xmlNode* c_node) except -1: + cdef long result + self._error_log.connect() + result = tree.xmlSaveTree(save_ctxt, c_node) + tree.xmlSaveClose(save_ctxt) + self._error_log.disconnect() + if result < 0: + error = self._error_log.last_error + if error is not None: + if error.domain == xmlerror.XML_FROM_IO: + raise IOError, error.message + else: + raise XMLOutputError, error.message + else: + raise XMLOutputError, "Error serializing the tree" + else: + return 0 + +cdef class XHTMLFormatter(XMLFormatter): + def __init__(self, **kwargs): + XMLFormatter.__init__(self, **kwargs) + self._save_options = self._save_options & (~tree.XML_SAVE_NO_XHTML) + +cdef XMLFormatter __DEFAULT_XML_FORMATTER +__DEFAULT_XML_FORMATTER = XMLFormatter() + cdef _tostring(_NodeBase element, encoding, int write_xml_declaration, int pretty_print): "Serialize an element to an encoded string representation of its XML tree." 
@@ -119,6 +196,24 @@ raise IOError, "Could not create I/O writer context." return c_buffer + cdef tree.xmlSaveCtxt* _createSaveContext( + self, char* encoding, int save_options) except NULL: + cdef tree.xmlCharEncodingHandler* enchandler + cdef tree.xmlSaveCtxt* c_ctxt + c_ctxt = tree.xmlSaveToIO( + _writeFilelikeWriter, _closeFilelikeWriter, + self, encoding, save_options) + if c_ctxt is NULL: + # this is only done to check if we knew the encoding + enchandler = tree.xmlFindCharEncodingHandler(encoding) + if enchandler is NULL: + raise LookupError, python.PyString_FromFormat( + "unknown encoding: '%s'", encoding) + else: + tree.xmlCharEncCloseFunc(enchandler) + raise IOError, "Could not create I/O writer context." + return c_ctxt + cdef int write(self, char* c_buffer, int len): try: if self._filelike is None: @@ -141,37 +236,31 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FilelikeWriter>ctxt).close() -cdef _tofilelike(f, _NodeBase element, encoding, - int write_xml_declaration, int pretty_print): +cdef _tofilelike(f, _NodeBase element, encoding, XMLFormatter formatter): cdef _FilelikeWriter writer - cdef tree.xmlOutputBuffer* c_buffer - cdef tree.xmlCharEncodingHandler* enchandler + cdef tree.xmlSaveCtxt* save_ctxt cdef char* c_enc + cdef int save_options if encoding is None: c_enc = NULL else: + encoding = encoding.upper() c_enc = encoding - enchandler = tree.xmlFindCharEncodingHandler(c_enc) - if enchandler is NULL: - raise LookupError, python.PyString_FromFormat( - "unknown encoding: '%s'", c_enc) + if formatter is None: + formatter = __DEFAULT_XML_FORMATTER + save_options = formatter._optionsForEncoding(encoding) if python.PyString_Check(f) or python.PyUnicode_Check(f): - filename = _utf8(f) - c_buffer = tree.xmlOutputBufferCreateFilename( - _cstr(filename), enchandler, 0) + filename = _utf8(f) + save_ctxt = tree.xmlSaveToFilename( + _cstr(filename), c_enc, save_options) elif hasattr(f, 'write'): - writer = _FilelikeWriter(f) - c_buffer = 
writer._createOutputBuffer(enchandler) + writer = _FilelikeWriter(f) + save_ctxt = writer._createSaveContext(c_enc, save_options) else: - tree.xmlCharEncCloseFunc(enchandler) raise TypeError, "File or filename expected, got '%s'" % type(f) - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, - write_xml_declaration, pretty_print) - - tree.xmlOutputBufferClose(c_buffer) - tree.xmlCharEncCloseFunc(enchandler) + formatter._saveNode(save_ctxt, element._c_node) if writer is not None: writer._exc_context._raise_if_stored() Modified: lxml/branch/xmlsave/src/lxml/tree.pxd ============================================================================== --- lxml/branch/xmlsave/src/lxml/tree.pxd (original) +++ lxml/branch/xmlsave/src/lxml/tree.pxd Sun May 21 22:54:10 2006 @@ -227,13 +227,26 @@ FILE* file, xmlCharEncodingHandler* encoder) cdef xmlOutputBuffer* xmlOutputBufferCreateFilename( char* URI, xmlCharEncodingHandler* encoder, int compression) + cdef extern from "libxml/xmlsave.h": + ctypedef enum xmlSaveOption: + XML_SAVE_FORMAT = 1 # format save output + XML_SAVE_NO_DECL = 2 # drop the xml declaration + XML_SAVE_NO_EMPTY = 4 # no empty tags + XML_SAVE_NO_XHTML = 8 # disable XHTML1 specific rules + ctypedef struct xmlSaveCtxt: pass cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding, int options) + cdef xmlSaveCtxt* xmlSaveToBuffer(xmlBuffer* buffer, char* encoding, + int options) + cdef xmlSaveCtxt* xmlSaveToIO(xmlOutputWriteCallback iowrite, + xmlOutputCloseCallback ioclose, + void* ioctx, char* encoding, int options) cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) + cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node) cdef int xmlSaveClose(xmlSaveCtxt* ctxt) cdef extern from "libxml/xmlstring.h": From scoder at codespeak.net Mon May 22 07:16:25 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 07:16:25 +0200 (CEST) Subject: [Lxml-checkins] r27572 - lxml/branch/xmlsave/src/lxml/tests Message-ID: 
<20060522051625.CCB6010071@code0.codespeak.net> Author: scoder Date: Mon May 22 07:16:24 2006 New Revision: 27572 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Log: fix prologue testing in test_elementtree.py (ET only special cases 'utf-8' encoding name, not 'UTF-8') Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 07:16:24 2006 @@ -1683,8 +1683,8 @@ a.text = u'S?k p? nettet' self.assertXML( u'S?k p? nettet'.encode('UTF-8'), - a, 'UTF-8') - + a, 'utf-8') + def test_encoding2(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1694,13 +1694,9 @@ f = StringIO() tree = ElementTree(element=a) - tree.write(f, 'UTF-8') - data = f.getvalue() - - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(data in [xml, prologue + xml]) + tree.write(f, 'utf-8') + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + f.getvalue()) ## # ignore wrong (left-over?) encoding declaration in unicode strings ## def _test_wrong_unicode_encoding(self): @@ -1731,13 +1727,10 @@ Element = self.etree.Element tostring = self.etree.tostring - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - a = Element('a') a.text = u'S?k p? nettet' - self.assert_(tostring(a, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + tostring(a, 'utf-8')) def test_encoding_tostring_unknown(self): Element = self.etree.Element @@ -1755,11 +1748,8 @@ a = Element('a') b = SubElement(a, 'b') b.text = u'S?k p? nettet' - - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? 
nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(tostring(b, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_sub_tail(self): Element = self.etree.Element @@ -1770,9 +1760,8 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' b.tail = u'S?k' - xml = u'S?k p? nettetS?k'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(tostring(b, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? nettetS?k'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_default_encoding(self): Element = self.etree.Element From scoder at codespeak.net Mon May 22 07:17:11 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 07:17:11 +0200 (CEST) Subject: [Lxml-checkins] r27573 - lxml/trunk/src/lxml/tests Message-ID: <20060522051711.11B7310071@code0.codespeak.net> Author: scoder Date: Mon May 22 07:17:09 2006 New Revision: 27573 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: fix prologue testing in test_elementtree.py (ET only special cases 'utf-8' encoding name, not 'UTF-8') Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 22 07:17:09 2006 @@ -1683,8 +1683,8 @@ a.text = u'S?k p? nettet' self.assertXML( u'S?k p? nettet'.encode('UTF-8'), - a, 'UTF-8') - + a, 'utf-8') + def test_encoding2(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1694,13 +1694,9 @@ f = StringIO() tree = ElementTree(element=a) - tree.write(f, 'UTF-8') - data = f.getvalue() - - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? 
nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(data in [xml, prologue + xml]) + tree.write(f, 'utf-8') + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + f.getvalue()) ## # ignore wrong (left-over?) encoding declaration in unicode strings ## def _test_wrong_unicode_encoding(self): @@ -1731,13 +1727,10 @@ Element = self.etree.Element tostring = self.etree.tostring - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - a = Element('a') a.text = u'S?k p? nettet' - self.assert_(tostring(a, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + tostring(a, 'utf-8')) def test_encoding_tostring_unknown(self): Element = self.etree.Element @@ -1755,11 +1748,8 @@ a = Element('a') b = SubElement(a, 'b') b.text = u'S?k p? nettet' - - # XXX prologue generation seems to be inconsistent between libraries.. - xml = u'S?k p? nettet'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(tostring(b, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_sub_tail(self): Element = self.etree.Element @@ -1770,9 +1760,8 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' b.tail = u'S?k' - xml = u'S?k p? nettetS?k'.encode('UTF-8') - prologue = u'\n'.encode('UTF-8') - self.assert_(tostring(b, 'UTF-8') in [xml, prologue + xml]) + self.assertEqual(u'S?k p? 
nettetS?k'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_default_encoding(self): Element = self.etree.Element From scoder at codespeak.net Mon May 22 07:23:35 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 07:23:35 +0200 (CEST) Subject: [Lxml-checkins] r27574 - lxml/trunk/src/lxml/tests Message-ID: <20060522052335.2A92F10071@code0.codespeak.net> Author: scoder Date: Mon May 22 07:23:34 2006 New Revision: 27574 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test raising parser error on XML declaration in unicode strings Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 22 07:23:34 2006 @@ -1698,16 +1698,12 @@ self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), f.getvalue()) -## # ignore wrong (left-over?) encoding declaration in unicode strings -## def _test_wrong_unicode_encoding(self): -## XML = self.etree.XML - -## test_utf = u'S?k p? nettet' -## parsed = XML(test_utf) -## self.assertXML( -## u'S?k p? nettet'.encode('UTF-8'), -## parsed, 'UTF-8') - + # raise error on wrong (left-over?) encoding declaration in unicode strings + def _test_wrong_unicode_encoding(self): + XML = self.etree.XML + test_utf = u'S?k p? 
nettet' + self.assertRaises(SyntaxError, XML, test_utf) + def test_encoding_default_encoding(self): ElementTree = self.etree.ElementTree Element = self.etree.Element From scoder at codespeak.net Mon May 22 07:24:59 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 07:24:59 +0200 (CEST) Subject: [Lxml-checkins] r27575 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522052459.25E1910071@code0.codespeak.net> Author: scoder Date: Mon May 22 07:24:58 2006 New Revision: 27575 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Log: merged in test case from trunk Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 07:24:58 2006 @@ -1698,16 +1698,12 @@ self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), f.getvalue()) -## # ignore wrong (left-over?) encoding declaration in unicode strings -## def _test_wrong_unicode_encoding(self): -## XML = self.etree.XML - -## test_utf = u'S?k p? nettet' -## parsed = XML(test_utf) -## self.assertXML( -## u'S?k p? nettet'.encode('UTF-8'), -## parsed, 'UTF-8') - + # raise error on wrong (left-over?) encoding declaration in unicode strings + def _test_wrong_unicode_encoding(self): + XML = self.etree.XML + test_utf = u'S?k p? 
nettet' + self.assertRaises(SyntaxError, XML, test_utf) + def test_encoding_default_encoding(self): ElementTree = self.etree.ElementTree Element = self.etree.Element From scoder at codespeak.net Mon May 22 07:48:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 07:48:37 +0200 (CEST) Subject: [Lxml-checkins] r27576 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522054837.572F710071@code0.codespeak.net> Author: scoder Date: Mon May 22 07:48:36 2006 New Revision: 27576 Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi Log: doc string Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 07:48:36 2006 @@ -41,10 +41,11 @@ return self._error_log.copy() cdef int _optionsForEncoding(self, encoding): + """Purely for ElementTree compatibility: suppress XML declaration in + default cases.""" cdef int save_options if self._write_declaration != -1: return self._save_options - # purely for ElementTree compatibility: suppress decl. 
in default cases save_options = self._save_options if encoding is None: save_options = save_options | tree.XML_SAVE_NO_DECL From scoder at codespeak.net Mon May 22 08:43:16 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 08:43:16 +0200 (CEST) Subject: [Lxml-checkins] r27577 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522064316.51C8610071@code0.codespeak.net> Author: scoder Date: Mon May 22 08:43:14 2006 New Revision: 27577 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py lxml/branch/xmlsave/src/lxml/tests/test_htmlparser.py lxml/branch/xmlsave/src/lxml/tests/test_io.py Log: prevent test cases from leaking temp files, some cleanup in test_elementtree.py Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 08:43:14 2006 @@ -1676,7 +1676,6 @@ ) def test_encoding(self): - ElementTree = self.etree.ElementTree Element = self.etree.Element a = Element('a') @@ -1704,7 +1703,7 @@ test_utf = u'S?k p? nettet' self.assertRaises(SyntaxError, XML, test_utf) - def test_encoding_default_encoding(self): + def test_encoding_write_default_encoding(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1716,7 +1715,7 @@ tree.write(f) data = f.getvalue() self.assertEquals( - 'Søk på nettet', + u'S?k p? 
nettet'.encode('ASCII', 'xmlcharrefreplace'), data) def test_encoding_tostring(self): @@ -1783,8 +1782,9 @@ # the same, just hex versus decimal expected = 'Søk på nettet' - expected2 = 'Søk på nettet' - self.assert_(tostring(b) in [expected, expected2]) + self.assertEquals( + expected, + tostring(b)) def test_deepcopy(self): Element = self.etree.Element @@ -1885,15 +1885,17 @@ """ ElementTree = self.etree.ElementTree handle, filename = tempfile.mkstemp() - f = open(filename, 'wb') - tree = ElementTree(element=element) - tree.write(f, encoding) - f.close() - f = open(filename, 'rb') - data = unicode(f.read(), encoding) - f.close() - os.close(handle) - os.remove(filename) + try: + f = open(filename, 'wb') + tree = ElementTree(element=element) + tree.write(f, encoding) + f.close() + f = open(filename, 'rb') + data = unicode(f.read(), encoding) + f.close() + finally: + os.close(handle) + os.remove(filename) return canonicalize(data) def assertXML(self, expected, element, encoding='us-ascii'): Modified: lxml/branch/xmlsave/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_htmlparser.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_htmlparser.py Mon May 22 08:43:14 2006 @@ -5,7 +5,7 @@ """ import unittest -import tempfile +import tempfile, os from common_imports import StringIO, etree, fileInTestDir from common_imports import SillyFileLike, HelperTestCase @@ -59,9 +59,13 @@ parser = self.etree.HTMLParser() filename = tempfile.mktemp(suffix=".html") open(filename, 'wb').write(self.html_str) - f = open(filename, 'r') - tree = self.etree.parse(f, parser) - self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + try: + f = open(filename, 'r') + tree = self.etree.parse(f, parser) + f.close() + self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + finally: + os.remove(filename) def test_module_parse_html_filelike(self): parser = 
self.etree.HTMLParser() Modified: lxml/branch/xmlsave/src/lxml/tests/test_io.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_io.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_io.py Mon May 22 08:43:14 2006 @@ -5,7 +5,7 @@ """ import unittest -import tempfile, gzip +import tempfile, gzip, os from common_imports import etree, ElementTree, fileInTestDir from common_imports import SillyFileLike, LargeFileLike @@ -27,15 +27,22 @@ filename = tempfile.mktemp(suffix=".xml") self.tree.write(filename) - self.assertEqual(open(filename).read(), self.root_str) + try: + self.assertEqual(open(filename).read(), self.root_str) + finally: + os.remove(filename) def test_module_parse_gzipobject(self): # (c)ElementTree supports gzip instance as parse argument filename = tempfile.mktemp(suffix=".xml.gz") gzip.open(filename, 'wb').write(self.root_str) - f_gz = gzip.open(filename, 'r') - tree = self.etree.parse(f_gz) - self.assertEqual(self.etree.tostring(tree.getroot()), self.root_str) + try: + f_gz = gzip.open(filename, 'r') + tree = self.etree.parse(f_gz) + self.assertEqual(self.etree.tostring(tree.getroot()), self.root_str) + finally: + os.remove(filename) + def test_class_parse_filename(self): # (c)ElementTree class ElementTree has a 'parse' method that returns @@ -45,26 +52,32 @@ filename = tempfile.mktemp(suffix=".xml") open(filename, 'wb').write(self.root_str) - tree = self.etree.ElementTree() - root = tree.parse(filename) - self.assertEqual(self.etree.tostring(root), self.root_str) + try: + tree = self.etree.ElementTree() + root = tree.parse(filename) + self.assertEqual(self.etree.tostring(root), self.root_str) + finally: + os.remove(filename) def test_class_parse_filename_remove_previous(self): filename = tempfile.mktemp(suffix=".xml") open(filename, "wb").write(self.root_str) - tree = self.etree.ElementTree() - root = tree.parse(filename) - # and now do it again; previous content should 
still be there - root2 = tree.parse(filename) - self.assertEquals('a', root.tag) - self.assertEquals('a', root2.tag) - # now remove all references to root2, and parse again - del root2 - root3 = tree.parse(filename) - self.assertEquals('a', root.tag) - self.assertEquals('a', root3.tag) - # root2's memory should've been freed here - # XXX how to check? + try: + tree = self.etree.ElementTree() + root = tree.parse(filename) + # and now do it again; previous content should still be there + root2 = tree.parse(filename) + self.assertEquals('a', root.tag) + self.assertEquals('a', root2.tag) + # now remove all references to root2, and parse again + del root2 + root3 = tree.parse(filename) + self.assertEquals('a', root.tag) + self.assertEquals('a', root3.tag) + # root2's memory should've been freed here + # XXX how to check? + finally: + os.remove(filename) def test_class_parse_fileobject(self): # (c)ElementTree class ElementTree has a 'parse' method that returns @@ -74,10 +87,13 @@ filename = tempfile.mktemp(suffix=".xml") open(filename, 'wb').write(self.root_str) - f = open(filename, 'r') - tree = self.etree.ElementTree() - root = tree.parse(f) - self.assertEqual(self.etree.tostring(root), self.root_str) + try: + f = open(filename, 'r') + tree = self.etree.ElementTree() + root = tree.parse(f) + self.assertEqual(self.etree.tostring(root), self.root_str) + finally: + os.remove(filename) def test_class_parse_unamed_fileobject(self): # (c)ElementTree class ElementTree has a 'parse' method that returns From scoder at codespeak.net Mon May 22 08:43:50 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 08:43:50 +0200 (CEST) Subject: [Lxml-checkins] r27578 - lxml/trunk/src/lxml/tests Message-ID: <20060522064350.60D7010071@code0.codespeak.net> Author: scoder Date: Mon May 22 08:43:49 2006 New Revision: 27578 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/tests/test_io.py Log: 
prevent test cases from leaking temp files, some cleanup in test_elementtree.py Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 22 08:43:49 2006 @@ -1676,7 +1676,6 @@ ) def test_encoding(self): - ElementTree = self.etree.ElementTree Element = self.etree.Element a = Element('a') @@ -1704,7 +1703,7 @@ test_utf = u'S?k p? nettet' self.assertRaises(SyntaxError, XML, test_utf) - def test_encoding_default_encoding(self): + def test_encoding_write_default_encoding(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1716,7 +1715,7 @@ tree.write(f) data = f.getvalue() self.assertEquals( - 'Søk på nettet', + u'S?k p? nettet'.encode('ASCII', 'xmlcharrefreplace'), data) def test_encoding_tostring(self): @@ -1783,8 +1782,9 @@ # the same, just hex versus decimal expected = 'Søk på nettet' - expected2 = 'Søk på nettet' - self.assert_(tostring(b) in [expected, expected2]) + self.assertEquals( + expected, + tostring(b)) def test_deepcopy(self): Element = self.etree.Element @@ -1885,15 +1885,17 @@ """ ElementTree = self.etree.ElementTree handle, filename = tempfile.mkstemp() - f = open(filename, 'wb') - tree = ElementTree(element=element) - tree.write(f, encoding) - f.close() - f = open(filename, 'rb') - data = unicode(f.read(), encoding) - f.close() - os.close(handle) - os.remove(filename) + try: + f = open(filename, 'wb') + tree = ElementTree(element=element) + tree.write(f, encoding) + f.close() + f = open(filename, 'rb') + data = unicode(f.read(), encoding) + f.close() + finally: + os.close(handle) + os.remove(filename) return canonicalize(data) def assertXML(self, expected, element, encoding='us-ascii'): Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- 
lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Mon May 22 08:43:49 2006 @@ -5,7 +5,7 @@ """ import unittest -import tempfile +import tempfile, os from common_imports import StringIO, etree, fileInTestDir from common_imports import SillyFileLike, HelperTestCase @@ -59,9 +59,13 @@ parser = self.etree.HTMLParser() filename = tempfile.mktemp(suffix=".html") open(filename, 'wb').write(self.html_str) - f = open(filename, 'r') - tree = self.etree.parse(f, parser) - self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + try: + f = open(filename, 'r') + tree = self.etree.parse(f, parser) + f.close() + self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str) + finally: + os.remove(filename) def test_module_parse_html_filelike(self): parser = self.etree.HTMLParser() Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Mon May 22 08:43:49 2006 @@ -5,7 +5,7 @@ """ import unittest -import tempfile, gzip +import tempfile, gzip, os from common_imports import etree, ElementTree, fileInTestDir from common_imports import SillyFileLike, LargeFileLike @@ -27,15 +27,22 @@ filename = tempfile.mktemp(suffix=".xml") self.tree.write(filename) - self.assertEqual(open(filename).read(), self.root_str) + try: + self.assertEqual(open(filename).read(), self.root_str) + finally: + os.remove(filename) def test_module_parse_gzipobject(self): # (c)ElementTree supports gzip instance as parse argument filename = tempfile.mktemp(suffix=".xml.gz") gzip.open(filename, 'wb').write(self.root_str) - f_gz = gzip.open(filename, 'r') - tree = self.etree.parse(f_gz) - self.assertEqual(self.etree.tostring(tree.getroot()), self.root_str) + try: + f_gz = gzip.open(filename, 'r') + tree = self.etree.parse(f_gz) + self.assertEqual(self.etree.tostring(tree.getroot()), 
self.root_str) + finally: + os.remove(filename) + def test_class_parse_filename(self): # (c)ElementTree class ElementTree has a 'parse' method that returns @@ -45,26 +52,32 @@ filename = tempfile.mktemp(suffix=".xml") open(filename, 'wb').write(self.root_str) - tree = self.etree.ElementTree() - root = tree.parse(filename) - self.assertEqual(self.etree.tostring(root), self.root_str) + try: + tree = self.etree.ElementTree() + root = tree.parse(filename) + self.assertEqual(self.etree.tostring(root), self.root_str) + finally: + os.remove(filename) def test_class_parse_filename_remove_previous(self): filename = tempfile.mktemp(suffix=".xml") open(filename, "wb").write(self.root_str) - tree = self.etree.ElementTree() - root = tree.parse(filename) - # and now do it again; previous content should still be there - root2 = tree.parse(filename) - self.assertEquals('a', root.tag) - self.assertEquals('a', root2.tag) - # now remove all references to root2, and parse again - del root2 - root3 = tree.parse(filename) - self.assertEquals('a', root.tag) - self.assertEquals('a', root3.tag) - # root2's memory should've been freed here - # XXX how to check? + try: + tree = self.etree.ElementTree() + root = tree.parse(filename) + # and now do it again; previous content should still be there + root2 = tree.parse(filename) + self.assertEquals('a', root.tag) + self.assertEquals('a', root2.tag) + # now remove all references to root2, and parse again + del root2 + root3 = tree.parse(filename) + self.assertEquals('a', root.tag) + self.assertEquals('a', root3.tag) + # root2's memory should've been freed here + # XXX how to check? 
+ finally: + os.remove(filename) def test_class_parse_fileobject(self): # (c)ElementTree class ElementTree has a 'parse' method that returns @@ -74,10 +87,13 @@ filename = tempfile.mktemp(suffix=".xml") open(filename, 'wb').write(self.root_str) - f = open(filename, 'r') - tree = self.etree.ElementTree() - root = tree.parse(f) - self.assertEqual(self.etree.tostring(root), self.root_str) + try: + f = open(filename, 'r') + tree = self.etree.ElementTree() + root = tree.parse(f) + self.assertEqual(self.etree.tostring(root), self.root_str) + finally: + os.remove(filename) def test_class_parse_unamed_fileobject(self): # (c)ElementTree class ElementTree has a 'parse' method that returns From scoder at codespeak.net Mon May 22 08:54:34 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 08:54:34 +0200 (CEST) Subject: [Lxml-checkins] r27579 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522065434.4F29C10071@code0.codespeak.net> Author: scoder Date: Mon May 22 08:54:33 2006 New Revision: 27579 Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi Log: renamed internal method Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 08:54:33 2006 @@ -53,7 +53,7 @@ save_options = save_options | tree.XML_SAVE_NO_DECL return save_options - cdef int _saveNode(self, tree.xmlSaveCtxt* save_ctxt, + cdef int _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, xmlNode* c_node) except -1: cdef long result self._error_log.connect() @@ -261,7 +261,7 @@ else: raise TypeError, "File or filename expected, got '%s'" % type(f) - formatter._saveNode(save_ctxt, element._c_node) + formatter._saveNodeAndClose(save_ctxt, element._c_node) if writer is not None: writer._exc_context._raise_if_stored() From scoder at codespeak.net Mon May 22 08:56:09 2006 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 08:56:09 +0200 (CEST) Subject: [Lxml-checkins] r27580 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522065609.0077510071@code0.codespeak.net> Author: scoder Date: Mon May 22 08:56:09 2006 New Revision: 27580 Modified: lxml/branch/xmlsave/src/lxml/tests/common_imports.py Log: utility function unhex_entities() in tests/common_imports.py to replace hex entities by their plain integer equivalent Modified: lxml/branch/xmlsave/src/lxml/tests/common_imports.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/common_imports.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/common_imports.py Mon May 22 08:56:09 2006 @@ -86,3 +86,8 @@ for entity_name, value in re.findall("(&#([0-9]+);)", xml): xml = xml.replace(entity_name, unichr(int(value))) return xml + +def unhex_entities(xml): + for entity_name, value in re.findall("(&#(x[0-9a-fA-F]+);)", xml): + xml = xml.replace(entity_name, "&#%s;" % eval('0'+value)) + return xml From scoder at codespeak.net Mon May 22 08:56:53 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 08:56:53 +0200 (CEST) Subject: [Lxml-checkins] r27581 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522065653.B56B910071@code0.codespeak.net> Author: scoder Date: Mon May 22 08:56:52 2006 New Revision: 27581 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Log: use unhex_entities() to compare write() results (fixes one test case) Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 08:56:52 2006 @@ -11,7 +11,8 @@ import unittest, doctest import os, shutil, tempfile, copy -from common_imports import StringIO, etree, ElementTree, 
HelperTestCase, fileInTestDir, canonicalize +from common_imports import StringIO, etree, ElementTree, HelperTestCase, fileInTestDir +from common_imports import canonicalize, unhex_entities class ETreeTestCaseBase(unittest.TestCase): etree = None @@ -1716,7 +1717,7 @@ data = f.getvalue() self.assertEquals( u'S?k p? nettet'.encode('ASCII', 'xmlcharrefreplace'), - data) + unhex_entities(data)) def test_encoding_tostring(self): Element = self.etree.Element From scoder at codespeak.net Mon May 22 10:19:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 10:19:08 +0200 (CEST) Subject: [Lxml-checkins] r27583 - lxml/trunk/src/lxml Message-ID: <20060522081908.D790C10071@code0.codespeak.net> Author: scoder Date: Mon May 22 10:19:07 2006 New Revision: 27583 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tree.pxd Log: use correct API in comment text setting Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 22 10:19:07 2006 @@ -969,20 +969,11 @@ cdef tree.xmlDict* c_dict cdef char* c_text if value is None: - value = '' + c_text = NULL else: value = _utf8(value) - c_text = self._c_node.content - if c_text is not NULL: - if self._c_node.doc is not NULL: - c_dict = self._c_node.doc.dict - else: - c_dict = NULL - # this code is copied from libxml2's DICT_FREE - if c_dict is NULL or \ - tree.xmlDictOwns(c_dict, c_text) == 0: - tree.xmlFree(c_text) - self._c_node.content = tree.xmlStrdup(_cstr(value)) + c_text = _cstr(value) + tree.xmlNodeSetContent(self._c_node, c_text) # ACCESSORS def __repr__(self): Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 22 10:19:07 2006 @@ -198,6 +198,7 @@ xmlDoc* doc, xmlNode* cur, int level, int 
format, char* encoding) cdef void xmlNodeSetName(xmlNode* cur, char* name) + cdef void xmlNodeSetContent(xmlNode* cur, char* content) cdef xmlDoc* xmlCopyDoc(xmlDoc* doc, int recursive) cdef xmlNode* xmlCopyNode(xmlNode* node, int extended) cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) From scoder at codespeak.net Mon May 22 10:44:17 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 10:44:17 +0200 (CEST) Subject: [Lxml-checkins] r27584 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522084417.EB53C10071@code0.codespeak.net> Author: scoder Date: Mon May 22 10:44:16 2006 New Revision: 27584 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Log: test XML escaping and latin1 encoding Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 10:44:16 2006 @@ -188,6 +188,32 @@ self.assertEquals(None, root.text) self.assertEquals('One', root[0].text) + def test_text_escape_in(self): + ElementTree = self.etree.ElementTree + + f = StringIO('This is > than a text') + doc = ElementTree(file=f) + root = doc.getroot() + self.assertEquals('This is > than a text', root.text) + + def test_text_escape_out(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + + a = Element("a") + a.text = "<>&" + self.assertXML('<>&', + a) + + def test_text_escape_tostring(self): + tostring = self.etree.tostring + Element = self.etree.Element + + a = Element("a") + a.text = "<>&" + self.assertEqual('<>&', + tostring(a)) + def test_tail(self): ElementTree = self.etree.ElementTree @@ -1698,6 +1724,24 @@ self.assertEqual(u'S?k p? 
nettet'.encode('UTF-8'), f.getvalue()) + def test_encoding3(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + + a = Element('a') + a.text = u'S?k p? nettet' + + f = StringIO() + tree = ElementTree(element=a) + tree.write(f, 'iso-8859-1') + result = f.getvalue() + declaration = "" + self.assertEqual(result[:len(declaration)], + declaration) + result = result[len(declaration):].strip() + self.assertEqual(u'S?k p? nettet'.encode('iso-8859-1'), + result) + # raise error on wrong (left-over?) encoding declaration in unicode strings def _test_wrong_unicode_encoding(self): XML = self.etree.XML @@ -1781,7 +1825,6 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' - # the same, just hex versus decimal expected = 'Søk på nettet' self.assertEquals( expected, From scoder at codespeak.net Mon May 22 11:41:01 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 11:41:01 +0200 (CEST) Subject: [Lxml-checkins] r27585 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522094101.6E5F610071@code0.codespeak.net> Author: scoder Date: Mon May 22 11:41:00 2006 New Revision: 27585 Modified: lxml/branch/xmlsave/src/lxml/tests/test_sax.py Log: compare c14n output in test_sax.py Modified: lxml/branch/xmlsave/src/lxml/tests/test_sax.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_sax.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_sax.py Mon May 22 11:41:00 2006 @@ -7,7 +7,7 @@ import unittest, doctest from StringIO import StringIO -from common_imports import HelperTestCase +from common_imports import HelperTestCase, canonicalize from lxml import sax class ETreeSaxTestCase(HelperTestCase): @@ -15,7 +15,7 @@ def test_etree_sax_simple(self): tree = self.parse('abba') xml_out = self._saxify_serialize(tree) - self.assertEquals('abba', + self.assertEquals('abba', xml_out) def test_etree_sax_double(self): @@ -27,7 +27,7 @@ def 
test_etree_sax_attributes(self): tree = self.parse('abba') xml_out = self._saxify_serialize(tree) - self.assertEquals('abba', + self.assertEquals('abba', xml_out) def test_etree_sax_ns1(self): @@ -54,11 +54,11 @@ b = a[0] xml_out = self._saxify_serialize(a) - self.assertEquals('', + self.assertEquals('', xml_out) xml_out = self._saxify_serialize(b) - self.assertEquals('', + self.assertEquals('', xml_out) def test_element_sax_ns(self): @@ -167,7 +167,7 @@ new_tree = self._saxify_unsaxify(tree) f = StringIO() new_tree.write(f) - return f.getvalue() + return canonicalize(f.getvalue()) def test_suite(): From scoder at codespeak.net Mon May 22 11:48:03 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 11:48:03 +0200 (CEST) Subject: [Lxml-checkins] r27586 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522094803.C64D310071@code0.codespeak.net> Author: scoder Date: Mon May 22 11:48:02 2006 New Revision: 27586 Modified: lxml/branch/xmlsave/src/lxml/tests/test_sax.py Log: actually, use tostring() for comparison in test_sax.py Modified: lxml/branch/xmlsave/src/lxml/tests/test_sax.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_sax.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_sax.py Mon May 22 11:48:02 2006 @@ -7,15 +7,15 @@ import unittest, doctest from StringIO import StringIO -from common_imports import HelperTestCase, canonicalize -from lxml import sax +from common_imports import HelperTestCase +from lxml import etree, sax class ETreeSaxTestCase(HelperTestCase): def test_etree_sax_simple(self): tree = self.parse('abba') xml_out = self._saxify_serialize(tree) - self.assertEquals('abba', + self.assertEquals('abba', xml_out) def test_etree_sax_double(self): @@ -27,7 +27,7 @@ def test_etree_sax_attributes(self): tree = self.parse('abba') xml_out = self._saxify_serialize(tree) - self.assertEquals('abba', + self.assertEquals('abba', xml_out) def 
test_etree_sax_ns1(self): @@ -54,11 +54,11 @@ b = a[0] xml_out = self._saxify_serialize(a) - self.assertEquals('', + self.assertEquals('', xml_out) xml_out = self._saxify_serialize(b) - self.assertEquals('', + self.assertEquals('', xml_out) def test_element_sax_ns(self): @@ -165,9 +165,7 @@ def _saxify_serialize(self, tree): new_tree = self._saxify_unsaxify(tree) - f = StringIO() - new_tree.write(f) - return canonicalize(f.getvalue()) + return etree.tostring(new_tree.getroot()) def test_suite(): From scoder at codespeak.net Mon May 22 11:50:38 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 11:50:38 +0200 (CEST) Subject: [Lxml-checkins] r27587 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522095038.768EC10071@code0.codespeak.net> Author: scoder Date: Mon May 22 11:50:37 2006 New Revision: 27587 Modified: lxml/branch/xmlsave/src/lxml/etree.pyx Log: check for NULL results Modified: lxml/branch/xmlsave/src/lxml/etree.pyx ============================================================================== --- lxml/branch/xmlsave/src/lxml/etree.pyx (original) +++ lxml/branch/xmlsave/src/lxml/etree.pyx Mon May 22 11:50:37 2006 @@ -702,6 +702,8 @@ text = _utf8(value) c_text_node = tree.xmlNewDocText(self._doc._c_doc, _cstr(text)) + if c_text_node is NULL: + raise LxmlError, "Error creating text node" if self._c_node.children is NULL: tree.xmlAddChild(self._c_node, c_text_node) else: @@ -720,6 +722,8 @@ return text = _utf8(value) c_text_node = tree.xmlNewDocText(self._doc._c_doc, _cstr(text)) + if c_text_node is NULL: + raise LxmlError, "Error creating text node" # XXX what if we're the top element? 
tree.xmlAddNextSibling(self._c_node, c_text_node) From scoder at codespeak.net Mon May 22 11:54:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 11:54:13 +0200 (CEST) Subject: [Lxml-checkins] r27588 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522095413.8B2DA10071@code0.codespeak.net> Author: scoder Date: Mon May 22 11:54:12 2006 New Revision: 27588 Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi lxml/branch/xmlsave/src/lxml/tree.pxd Log: use xmlDocSave in XMLFormatter._saveNodeAndClose() to make it write the XML declaration, prevent escaping characters (libxml2 bug) except on escape_characters keyword Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 11:54:12 2006 @@ -8,11 +8,12 @@ cdef class XMLFormatter: cdef int _pretty_print cdef int _write_declaration + cdef int _escape_characters cdef int _save_options cdef _ErrorLog _error_log def __init__(self, pretty_print=False, write_declaration=None, - split_empty_tags=False): + split_empty_tags=False, escape_characters=None): cdef int save_options save_options = tree.XML_SAVE_NO_XHTML @@ -33,6 +34,11 @@ self._write_declaration = False save_options = save_options | tree.XML_SAVE_NO_DECL + if escape_characters is None: + self._escape_characters = -1 + else: + self._escape_characters = bool(escape_characters) + self._save_options = save_options self._error_log = _ErrorLog() @@ -40,6 +46,16 @@ def __get__(self): return self._error_log.copy() + cdef _raiseError(self): + error = self._error_log.last_error + if error is not None: + if error.domain == xmlerror.XML_FROM_IO: + raise IOError, error.message + else: + raise XMLOutputError, error.message + else: + raise XMLOutputError, "Error serializing the tree" + cdef int _optionsForEncoding(self, encoding): """Purely for ElementTree 
compatibility: suppress XML declaration in default cases.""" @@ -49,31 +65,41 @@ save_options = self._save_options if encoding is None: save_options = save_options | tree.XML_SAVE_NO_DECL - elif encoding.upper() in ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8'): + elif encoding in ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8'): save_options = save_options | tree.XML_SAVE_NO_DECL return save_options - cdef int _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, - xmlNode* c_node) except -1: + cdef void _setupCharacterEscaping(self, tree.xmlSaveCtxt* save_ctxt, + encoding): + """libxml2 defaults to escaping every non-ascii character whatever the + encoding, but we only want that for ASCII encoding.""" + if self._escape_characters == -1: + if encoding is not None and encoding not in ('US-ASCII', 'ASCII'): + tree.xmlSaveSetEscape(save_ctxt, NULL) + + cdef _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, + xmlNode* c_node): cdef long result + cdef xmlDoc* c_doc + cdef xmlDoc* c_root_doc self._error_log.connect() - result = tree.xmlSaveTree(save_ctxt, c_node) + if self._escape_characters == 0: + tree.xmlSaveSetEscape(save_ctxt, NULL) + + c_doc = c_node.doc + c_root_doc = _fakeRootDoc(c_doc, c_node) + result = tree.xmlSaveDoc(save_ctxt, c_root_doc) tree.xmlSaveClose(save_ctxt) + _destroyFakeDoc(c_doc, c_root_doc) + self._error_log.disconnect() if result < 0: - error = self._error_log.last_error - if error is not None: - if error.domain == xmlerror.XML_FROM_IO: - raise IOError, error.message - else: - raise XMLOutputError, error.message - else: - raise XMLOutputError, "Error serializing the tree" - else: - return 0 + self._raiseError() cdef class XHTMLFormatter(XMLFormatter): def __init__(self, **kwargs): + if 'escape_entities' not in kwargs: + kwargs['escape_entities'] = True XMLFormatter.__init__(self, **kwargs) self._save_options = self._save_options & (~tree.XML_SAVE_NO_XHTML) @@ -261,6 +287,7 @@ else: raise TypeError, "File or filename expected, got '%s'" % type(f) + 
formatter._setupCharacterEscaping(save_ctxt, encoding) formatter._saveNodeAndClose(save_ctxt, element._c_node) if writer is not None: writer._exc_context._raise_if_stored() Modified: lxml/branch/xmlsave/src/lxml/tree.pxd ============================================================================== --- lxml/branch/xmlsave/src/lxml/tree.pxd (original) +++ lxml/branch/xmlsave/src/lxml/tree.pxd Mon May 22 11:54:12 2006 @@ -248,6 +248,7 @@ cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node) cdef int xmlSaveClose(xmlSaveCtxt* ctxt) + cdef int xmlSaveSetEscape(xmlSaveCtxt* ctxt, void* escape_function) cdef extern from "libxml/xmlstring.h": cdef char* xmlStrdup(char* cur) From scoder at codespeak.net Mon May 22 11:55:36 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 11:55:36 +0200 (CEST) Subject: [Lxml-checkins] r27589 - lxml/branch/xmlsave/src/lxml/tests Message-ID: <20060522095536.A2A6110071@code0.codespeak.net> Author: scoder Date: Mon May 22 11:55:35 2006 New Revision: 27589 Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py lxml/branch/xmlsave/src/lxml/tests/test_io.py Log: cleanup in test cases, ignore acceptable incompatibilities between ET and etree in terms of whitespace Modified: lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_elementtree.py Mon May 22 11:55:35 2006 @@ -9,7 +9,7 @@ """ import unittest, doctest -import os, shutil, tempfile, copy +import os, re, shutil, tempfile, copy from common_imports import StringIO, etree, ElementTree, HelperTestCase, fileInTestDir from common_imports import canonicalize, unhex_entities @@ -211,7 +211,7 @@ a = Element("a") a.text = "<>&" - self.assertEqual('<>&', + self.assertEquals('<>&', tostring(a)) def test_tail(self): 
@@ -733,7 +733,7 @@ a = Element('a') a.append(Comment('foo')) - self.assertEqual(a[0].text, 'foo') + self.assertEquals(a[0].text, 'foo') def test_comment_text(self): Element = self.etree.Element @@ -742,10 +742,10 @@ a = Element('a') a.append(Comment('foo')) - self.assertEqual(a[0].text, 'foo') + self.assertEquals(a[0].text, 'foo') a[0].text = "TEST" - self.assertEqual(a[0].text, 'TEST') + self.assertEquals(a[0].text, 'TEST') def test_comment_whitespace(self): Element = self.etree.Element @@ -754,7 +754,7 @@ a = Element('a') a.append(Comment(' foo ')) - self.assertEqual(a[0].text, ' foo ') + self.assertEquals(a[0].text, ' foo ') def test_comment_nonsense(self): Comment = self.etree.Comment @@ -1711,7 +1711,7 @@ u'S?k p? nettet'.encode('UTF-8'), a, 'utf-8') - def test_encoding2(self): + def test_encoding_exact(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1721,26 +1721,24 @@ f = StringIO() tree = ElementTree(element=a) tree.write(f, 'utf-8') - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), - f.getvalue()) + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), + f.getvalue().strip()) - def test_encoding3(self): + def test_encoding_latin1(self): ElementTree = self.etree.ElementTree Element = self.etree.Element a = Element('a') a.text = u'S?k p? nettet' - + f = StringIO() tree = ElementTree(element=a) tree.write(f, 'iso-8859-1') result = f.getvalue() declaration = "" - self.assertEqual(result[:len(declaration)], - declaration) - result = result[len(declaration):].strip() - self.assertEqual(u'S?k p? nettet'.encode('iso-8859-1'), - result) + self.assertEncodingDeclaration(result,'iso-8859-1') + self.assertEquals(u'S?k p? nettet'.encode('iso-8859-1'), + result.split('?>', 1)[-1].strip()) # raise error on wrong (left-over?) encoding declaration in unicode strings def _test_wrong_unicode_encoding(self): @@ -1761,7 +1759,7 @@ data = f.getvalue() self.assertEquals( u'S?k p? 
nettet'.encode('ASCII', 'xmlcharrefreplace'), - unhex_entities(data)) + unhex_entities(data.strip())) def test_encoding_tostring(self): Element = self.etree.Element @@ -1769,8 +1767,8 @@ a = Element('a') a.text = u'S?k p? nettet' - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), - tostring(a, 'utf-8')) + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), + tostring(a, 'utf-8')) def test_encoding_tostring_unknown(self): Element = self.etree.Element @@ -1788,8 +1786,8 @@ a = Element('a') b = SubElement(a, 'b') b.text = u'S?k p? nettet' - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), - tostring(b, 'utf-8')) + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_sub_tail(self): Element = self.etree.Element @@ -1800,8 +1798,8 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' b.tail = u'S?k' - self.assertEqual(u'S?k p? nettetS?k'.encode('UTF-8'), - tostring(b, 'utf-8')) + self.assertEquals(u'S?k p? nettetS?k'.encode('UTF-8'), + tostring(b, 'utf-8')) def test_encoding_tostring_default_encoding(self): Element = self.etree.Element @@ -1949,9 +1947,17 @@ """ self.assertEquals(expected, self._writeElement(element, encoding)) self.assertEquals(expected, self._writeElementFile(element, encoding)) - + + def assertEncodingDeclaration(self, result, encoding): + "Checks if the result XML byte string specifies the encoding." 
+ has_encoding = re.compile(r"<\?xml[^>]+ encoding=[\"']([^\"']+)[\"']").match + self.assert_(has_encoding(result)) + result_encoding = has_encoding(result).group(1) + self.assertEqual(result_encoding.upper(), encoding.upper()) + def _rootstring(self, tree): - return self.etree.tostring(tree.getroot()).replace(' ', '').replace('\n', '') + return self.etree.tostring( + tree.getroot()).replace(' ', '').replace('\n', '') def _check_element_tree(self, tree): self._check_element(tree.getroot()) Modified: lxml/branch/xmlsave/src/lxml/tests/test_io.py ============================================================================== --- lxml/branch/xmlsave/src/lxml/tests/test_io.py (original) +++ lxml/branch/xmlsave/src/lxml/tests/test_io.py Mon May 22 11:55:35 2006 @@ -28,7 +28,7 @@ filename = tempfile.mktemp(suffix=".xml") self.tree.write(filename) try: - self.assertEqual(open(filename).read(), self.root_str) + self.assertEqual(open(filename).read().strip(), self.root_str) finally: os.remove(filename) From scoder at codespeak.net Mon May 22 12:06:43 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 12:06:43 +0200 (CEST) Subject: [Lxml-checkins] r27590 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522100643.8C8C510071@code0.codespeak.net> Author: scoder Date: Mon May 22 12:06:42 2006 New Revision: 27590 Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi Log: cleanup Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 12:06:42 2006 @@ -71,29 +71,42 @@ cdef void _setupCharacterEscaping(self, tree.xmlSaveCtxt* save_ctxt, encoding): - """libxml2 defaults to escaping every non-ascii character whatever the - encoding, but we only want that for ASCII encoding.""" + """Work-around for libxml2 bug: it defaults to escaping every + non-ascii character 
whatever the encoding, but we only want that for + ASCII encoding.""" if self._escape_characters == -1: if encoding is not None and encoding not in ('US-ASCII', 'ASCII'): tree.xmlSaveSetEscape(save_ctxt, NULL) - cdef _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, - xmlNode* c_node): - cdef long result + cdef _saveDocAndClose(self, tree.xmlSaveCtxt* save_ctxt, + xmlNode* c_root_node): + cdef long bytes_written cdef xmlDoc* c_doc cdef xmlDoc* c_root_doc self._error_log.connect() if self._escape_characters == 0: tree.xmlSaveSetEscape(save_ctxt, NULL) - c_doc = c_node.doc - c_root_doc = _fakeRootDoc(c_doc, c_node) - result = tree.xmlSaveDoc(save_ctxt, c_root_doc) + c_doc = c_root_node.doc + c_root_doc = _fakeRootDoc(c_doc, c_root_node) + bytes_written = tree.xmlSaveDoc(save_ctxt, c_root_doc) tree.xmlSaveClose(save_ctxt) _destroyFakeDoc(c_doc, c_root_doc) self._error_log.disconnect() - if result < 0: + if bytes_written < 0: + self._raiseError() + + cdef _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, + xmlNode* c_node): + cdef long bytes_written + self._error_log.connect() + if self._escape_characters == 0: + tree.xmlSaveSetEscape(save_ctxt, NULL) + bytes_written = tree.xmlSaveTree(save_ctxt, c_node) + tree.xmlSaveClose(save_ctxt) + self._error_log.disconnect() + if bytes_written < 0: self._raiseError() cdef class XHTMLFormatter(XMLFormatter): @@ -129,7 +142,7 @@ c_buffer = tree.xmlAllocOutputBuffer(enchandler) if c_buffer is NULL: tree.xmlCharEncCloseFunc(enchandler) - raise LxmlError, "Failed to create output buffer" + raise LxmlOutputError, "Failed to create output buffer" try: _writeNodeToBuffer(c_buffer, element._c_node, c_enc, @@ -155,7 +168,7 @@ return None c_buffer = tree.xmlAllocOutputBuffer(NULL) if c_buffer is NULL: - raise LxmlError, "Failed to create output buffer" + raise LxmlOutputError, "Failed to create output buffer" try: _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0, pretty_print) tree.xmlOutputBufferFlush(c_buffer) @@ -288,7 +301,7 
@@ raise TypeError, "File or filename expected, got '%s'" % type(f) formatter._setupCharacterEscaping(save_ctxt, encoding) - formatter._saveNodeAndClose(save_ctxt, element._c_node) + formatter._saveDocAndClose(save_ctxt, element._c_node) if writer is not None: writer._exc_context._raise_if_stored() From scoder at codespeak.net Mon May 22 12:31:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 12:31:39 +0200 (CEST) Subject: [Lxml-checkins] r27592 - in lxml/trunk: . doc src/lxml Message-ID: <20060522103139.D512210071@code0.codespeak.net> Author: scoder Date: Mon May 22 12:31:38 2006 New Revision: 27592 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tree.pxd Log: getpath() method on Element to return a structural XPath expression for the element Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 22 12:31:38 2006 @@ -7,6 +7,9 @@ Features added -------------- +* Element.getpath() returns an XPath expression to find the node in the tree + structure + * Error logs now have a ``last_error`` attribute for convenience * Comment texts can now be changed through the API Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Mon May 22 12:31:38 2006 @@ -244,6 +244,19 @@ >>> r[0].text 'Text' +A related convenience method of Elements is ``getpath()``, which returns a +structural XPath expression for the respective element:: + + >>> a = etree.Element("a") + >>> b = etree.SubElement(a, "b") + >>> c = etree.SubElement(a, "c") + >>> d1 = etree.SubElement(c, "d") + >>> d2 = etree.SubElement(c, "d") + >>> print d2.getpath() + /a/c/d[2] + >>> a.xpath(d2.getpath()) == [d2] + True + XSLT ---- Modified: lxml/trunk/src/lxml/etree.pyx 
============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 22 12:31:38 2006 @@ -787,6 +787,10 @@ return ElementChildIterator(self, reversed=True) def index(self, _Element x not None, start=None, stop=None): + """Find the position of the child within the parent. + + This method is not part of the original ElementTree API. + """ cdef Py_ssize_t k, l cdef Py_ssize_t c_start, c_stop cdef xmlNode* c_child @@ -885,6 +889,15 @@ return _elementFactory(self._doc, c_node) return None + def getpath(self): + cdef char* c_path + c_path = tree.xmlGetNodePath(self._c_node) + if c_path is NULL: + raise LxmlError, "Cannot create node path." + path = c_path + tree.xmlFree(c_path) + return path + def getiterator(self, tag=None): return ElementDepthFirstIterator(self, tag) Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 22 12:31:38 2006 @@ -177,6 +177,7 @@ cdef void xmlSetProp(xmlNode* node, char* name, char* value) cdef void xmlSetNsProp(xmlNode* node, xmlNs* ns, char* name, char* value) cdef void xmlRemoveProp(xmlAttr* cur) + cdef char* xmlGetNodePath(xmlNode* node) cdef void xmlDocDumpMemory(xmlDoc* cur, char** mem, int* size) cdef void xmlDocDumpMemoryEnc(xmlDoc* cur, char** mem, int* size, char* encoding) From scoder at codespeak.net Mon May 22 12:33:00 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 12:33:00 +0200 (CEST) Subject: [Lxml-checkins] r27593 - lxml/trunk/src/lxml Message-ID: <20060522103300.165F210071@code0.codespeak.net> Author: scoder Date: Mon May 22 12:32:59 2006 New Revision: 27593 Modified: lxml/trunk/src/lxml/etree.pyx Log: cleanup Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- 
lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 22 12:32:59 2006 @@ -893,7 +893,7 @@ cdef char* c_path c_path = tree.xmlGetNodePath(self._c_node) if c_path is NULL: - raise LxmlError, "Cannot create node path." + raise LxmlError, "Error creating node path." path = c_path tree.xmlFree(c_path) return path From scoder at codespeak.net Mon May 22 14:24:57 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 14:24:57 +0200 (CEST) Subject: [Lxml-checkins] r27595 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522122457.7572A10071@code0.codespeak.net> Author: scoder Date: Mon May 22 14:24:56 2006 New Revision: 27595 Modified: lxml/branch/xmlsave/src/lxml/etree.pyx lxml/branch/xmlsave/src/lxml/serializer.pxi lxml/branch/xmlsave/src/lxml/tree.pxd Log: support writing .tail to files, cleanup Modified: lxml/branch/xmlsave/src/lxml/etree.pyx ============================================================================== --- lxml/branch/xmlsave/src/lxml/etree.pyx (original) +++ lxml/branch/xmlsave/src/lxml/etree.pyx Mon May 22 14:24:56 2006 @@ -367,15 +367,7 @@ Defaults to ASCII encoding. """ self._assertHasRoot() - # suppress decl. 
in default case (purely for ElementTree compatibility) - if encoding is None: - encoding = 'ASCII' - write_declaration = 0 - else: - encoding = encoding.upper() - write_declaration = encoding not in \ - ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') - _tofilelike(file, self._context_node, encoding, formatter) + _tofilelike(file, self._context_node, 0, encoding, formatter) def getiterator(self, tag=None): root = self.getroot() Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 14:24:56 2006 @@ -78,9 +78,10 @@ if encoding is not None and encoding not in ('US-ASCII', 'ASCII'): tree.xmlSaveSetEscape(save_ctxt, NULL) - cdef _saveDocAndClose(self, tree.xmlSaveCtxt* save_ctxt, - xmlNode* c_root_node): + cdef _saveDocNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, + xmlNode* c_root_node, int add_tail): cdef long bytes_written + cdef xmlNode* c_node cdef xmlDoc* c_doc cdef xmlDoc* c_root_doc self._error_log.connect() @@ -90,21 +91,35 @@ c_doc = c_root_node.doc c_root_doc = _fakeRootDoc(c_doc, c_root_node) bytes_written = tree.xmlSaveDoc(save_ctxt, c_root_doc) - tree.xmlSaveClose(save_ctxt) _destroyFakeDoc(c_doc, c_root_doc) + if add_tail: + c_node = c_root_node.next + while bytes_written >= 0 and c_node is not NULL and \ + c_node.type == tree.XML_TEXT_NODE: + bytes_written = tree.xmlSaveTree(save_ctxt, c_node) + c_node = c_node.next + tree.xmlSaveClose(save_ctxt) self._error_log.disconnect() if bytes_written < 0: self._raiseError() cdef _saveNodeAndClose(self, tree.xmlSaveCtxt* save_ctxt, - xmlNode* c_node): + xmlNode* c_node, int add_tail): cdef long bytes_written self._error_log.connect() if self._escape_characters == 0: tree.xmlSaveSetEscape(save_ctxt, NULL) + bytes_written = tree.xmlSaveTree(save_ctxt, c_node) + if add_tail: + c_node = c_node.next + while bytes_written >= 0 and 
c_node is not NULL and \ + c_node.type == tree.XML_TEXT_NODE: + bytes_written = tree.xmlSaveTree(save_ctxt, c_node) + c_node = c_node.next tree.xmlSaveClose(save_ctxt) + self._error_log.disconnect() if bytes_written < 0: self._raiseError() @@ -126,7 +141,6 @@ cdef tree.xmlBuffer* c_result_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc - cdef char* c_version if element is None: return None if encoding is None: @@ -276,7 +290,8 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FilelikeWriter>ctxt).close() -cdef _tofilelike(f, _NodeBase element, encoding, XMLFormatter formatter): +cdef _tofilelike(f, _NodeBase element, int add_tail, + encoding, XMLFormatter formatter): cdef _FilelikeWriter writer cdef tree.xmlSaveCtxt* save_ctxt cdef char* c_enc @@ -294,6 +309,8 @@ filename = _utf8(f) save_ctxt = tree.xmlSaveToFilename( _cstr(filename), c_enc, save_options) + if save_ctxt is NULL: + raise IOError, "Failed to create I/O writer context" elif hasattr(f, 'write'): writer = _FilelikeWriter(f) save_ctxt = writer._createSaveContext(c_enc, save_options) @@ -301,7 +318,7 @@ raise TypeError, "File or filename expected, got '%s'" % type(f) formatter._setupCharacterEscaping(save_ctxt, encoding) - formatter._saveDocAndClose(save_ctxt, element._c_node) + formatter._saveDocNodeAndClose(save_ctxt, element._c_node, add_tail) if writer is not None: writer._exc_context._raise_if_stored() Modified: lxml/branch/xmlsave/src/lxml/tree.pxd ============================================================================== --- lxml/branch/xmlsave/src/lxml/tree.pxd (original) +++ lxml/branch/xmlsave/src/lxml/tree.pxd Mon May 22 14:24:56 2006 @@ -202,6 +202,7 @@ cdef xmlNode* xmlCopyNode(xmlNode* node, int extended) cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) cdef xmlBuffer* xmlBufferCreate() + cdef void xmlBufferFree(xmlBuffer* buf) cdef char* xmlBufferContent(xmlBuffer* buf) cdef int xmlBufferLength(xmlBuffer* buf) cdef int xmlKeepBlanksDefault(int val) From 
scoder at codespeak.net Mon May 22 15:12:15 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 15:12:15 +0200 (CEST) Subject: [Lxml-checkins] r27596 - lxml/branch/xmlsave/src/lxml Message-ID: <20060522131215.8833610071@code0.codespeak.net> Author: scoder Date: Mon May 22 15:12:14 2006 New Revision: 27596 Modified: lxml/branch/xmlsave/src/lxml/etree.pyx lxml/branch/xmlsave/src/lxml/serializer.pxi lxml/branch/xmlsave/src/lxml/xmlerror.pxi Log: some cleanup, fixed threading issues with global libxml2 settings, renamed XMLFormatter to XMLSerializer and reverted write() back to 'xml_declaration' keyword instead of 'formatter' to make XMLSerializer a separate API Modified: lxml/branch/xmlsave/src/lxml/etree.pyx ============================================================================== --- lxml/branch/xmlsave/src/lxml/etree.pyx (original) +++ lxml/branch/xmlsave/src/lxml/etree.pyx Mon May 22 15:12:14 2006 @@ -33,6 +33,11 @@ # make the compiled-in debug state publicly available DEBUG = __DEBUG +def initThread(): + "Call this method to set up the library from within a new thread." + _initThreadLogging() + tree.xmlKeepBlanksDefault(0) + # Error superclass for ElementTree compatibility class Error(Exception): pass @@ -361,13 +366,17 @@ def __get__(self): return DocInfo(self._doc) - def write(self, file, encoding=None, formatter=None): + def write(self, file, encoding=None, xml_declaration=None): """Write the tree to a file or file-like object. Defaults to ASCII encoding. """ self._assertHasRoot() - _tofilelike(file, self._context_node, 0, encoding, formatter) + if xml_declaration is None: + serializer = None + else: + serializer = XMLSerializer(write_declaration=xml_declaration) + _tofilelike(file, self._context_node, 0, encoding, serializer) def getiterator(self, tag=None): root = self.getroot() @@ -1400,7 +1409,7 @@ Defaults to ASCII encoding without XML declaration. 
""" - cdef int write_declaration + cdef int c_write_declaration cdef int c_pretty_print if encoding is None: encoding = 'ASCII' @@ -1409,17 +1418,17 @@ c_pretty_print = bool(pretty_print) if xml_declaration is None: # by default, write an XML declaration only for non-standard encodings - write_declaration = encoding not in \ + c_write_declaration = encoding not in \ ('ASCII', 'UTF-8', 'UTF8', 'US-ASCII') else: - write_declaration = bool(xml_declaration) + c_write_declaration = bool(xml_declaration) if isinstance(element_or_tree, _NodeBase): return _tostring(<_NodeBase>element_or_tree, - encoding, write_declaration, c_pretty_print) + encoding, c_write_declaration, c_pretty_print) elif isinstance(element_or_tree, _ElementTree): return _tostring((<_ElementTree>element_or_tree)._context_node, - encoding, write_declaration, c_pretty_print) + encoding, c_write_declaration, c_pretty_print) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) @@ -1498,3 +1507,6 @@ include "relaxng.pxi" # RelaxNG include "xmlschema.pxi" # XMLSchema + +# configure main thread +initThread() Modified: lxml/branch/xmlsave/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/serializer.pxi (original) +++ lxml/branch/xmlsave/src/lxml/serializer.pxi Mon May 22 15:12:14 2006 @@ -1,11 +1,9 @@ # XML serialization and output functions -tree.xmlKeepBlanksDefault(0) - class XMLOutputError(LxmlError): pass -cdef class XMLFormatter: +cdef class XMLSerializer: cdef int _pretty_print cdef int _write_declaration cdef int _escape_characters @@ -124,15 +122,15 @@ if bytes_written < 0: self._raiseError() -cdef class XHTMLFormatter(XMLFormatter): +cdef class XHTMLSerializer(XMLSerializer): def __init__(self, **kwargs): if 'escape_entities' not in kwargs: kwargs['escape_entities'] = True - XMLFormatter.__init__(self, **kwargs) + XMLSerializer.__init__(self, **kwargs) self._save_options = 
self._save_options & (~tree.XML_SAVE_NO_XHTML) -cdef XMLFormatter __DEFAULT_XML_FORMATTER -__DEFAULT_XML_FORMATTER = XMLFormatter() +cdef XMLSerializer __DEFAULT_XML_SERIALIZER +__DEFAULT_XML_SERIALIZER = XMLSerializer() cdef _tostring(_NodeBase element, encoding, int write_xml_declaration, int pretty_print): @@ -290,8 +288,8 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FilelikeWriter>ctxt).close() -cdef _tofilelike(f, _NodeBase element, int add_tail, - encoding, XMLFormatter formatter): +cdef _tofilelike(f, _NodeBase element, int add_tail, encoding, + XMLSerializer serializer): cdef _FilelikeWriter writer cdef tree.xmlSaveCtxt* save_ctxt cdef char* c_enc @@ -301,9 +299,9 @@ else: encoding = encoding.upper() c_enc = encoding - if formatter is None: - formatter = __DEFAULT_XML_FORMATTER - save_options = formatter._optionsForEncoding(encoding) + if serializer is None: + serializer = __DEFAULT_XML_SERIALIZER + save_options = serializer._optionsForEncoding(encoding) if python.PyString_Check(f) or python.PyUnicode_Check(f): filename = _utf8(f) @@ -317,8 +315,8 @@ else: raise TypeError, "File or filename expected, got '%s'" % type(f) - formatter._setupCharacterEscaping(save_ctxt, encoding) - formatter._saveDocNodeAndClose(save_ctxt, element._c_node, add_tail) + serializer._setupCharacterEscaping(save_ctxt, encoding) + serializer._saveDocNodeAndClose(save_ctxt, element._c_node, add_tail) if writer is not None: writer._exc_context._raise_if_stored() Modified: lxml/branch/xmlsave/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/xmlsave/src/lxml/xmlerror.pxi (original) +++ lxml/branch/xmlsave/src/lxml/xmlerror.pxi Mon May 22 15:12:14 2006 @@ -9,8 +9,10 @@ Note that this log is already bounded to a fixed size.""" __GLOBAL_ERROR_LOG.clear() -def initThreadLogging(): - "Setup logging for the current thread." +cdef void _initThreadLogging(): + "Setup logging for the current thread. 
Called from etree.initThread()." + # switch on line number reporting + xmlparser.xmlLineNumbersDefault(1) _logLibxmlErrors() try: _logLibxsltErrors() @@ -18,7 +20,6 @@ # compiled without libxslt pass - # Logging classes cdef class _LogEntry: @@ -339,12 +340,6 @@ xmlerror.xmlSetGenericErrorFunc(NULL, _nullGenericErrorFunc) xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) -# init global logging -initThreadLogging() - -# switch on line number reporting -xmlparser.xmlLineNumbersDefault(1) - ################################################################################ ## CONSTANTS FROM "xmlerror.pxd" ################################################################################ From scoder at codespeak.net Mon May 22 16:29:41 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 16:29:41 +0200 (CEST) Subject: [Lxml-checkins] r27599 - lxml/trunk/src/lxml/tests Message-ID: <20060522142941.BA2C510071@code0.codespeak.net> Author: scoder Date: Mon May 22 16:29:40 2006 New Revision: 27599 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: some cleanup in test cases, new test cases merged in from xmlsave branch Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 22 16:29:40 2006 @@ -9,7 +9,7 @@ """ import unittest, doctest -import os, shutil, tempfile, copy +import os, re, shutil, tempfile, copy from common_imports import StringIO, etree, ElementTree, HelperTestCase, fileInTestDir, canonicalize @@ -187,6 +187,31 @@ self.assertEquals(None, root.text) self.assertEquals('One', root[0].text) + def test_text_escape_in(self): + ElementTree = self.etree.ElementTree + + f = StringIO('This is > than a text') + doc = ElementTree(file=f) + root = doc.getroot() + self.assertEquals('This is > than a 
text', root.text) + + def test_text_escape_out(self): + Element = self.etree.Element + + a = Element("a") + a.text = "<>&" + self.assertXML('<>&', + a) + + def test_text_escape_tostring(self): + tostring = self.etree.tostring + Element = self.etree.Element + + a = Element("a") + a.text = "<>&" + self.assertEquals('<>&', + tostring(a)) + def test_tail(self): ElementTree = self.etree.ElementTree @@ -706,7 +731,7 @@ a = Element('a') a.append(Comment('foo')) - self.assertEqual(a[0].text, 'foo') + self.assertEquals(a[0].text, 'foo') def test_comment_text(self): Element = self.etree.Element @@ -715,10 +740,10 @@ a = Element('a') a.append(Comment('foo')) - self.assertEqual(a[0].text, 'foo') + self.assertEquals(a[0].text, 'foo') a[0].text = "TEST" - self.assertEqual(a[0].text, 'TEST') + self.assertEquals(a[0].text, 'TEST') def test_comment_whitespace(self): Element = self.etree.Element @@ -727,7 +752,7 @@ a = Element('a') a.append(Comment(' foo ')) - self.assertEqual(a[0].text, ' foo ') + self.assertEquals(a[0].text, ' foo ') def test_comment_nonsense(self): Comment = self.etree.Comment @@ -1684,7 +1709,7 @@ u'S?k p? nettet'.encode('UTF-8'), a, 'utf-8') - def test_encoding2(self): + def test_encoding_exact(self): ElementTree = self.etree.ElementTree Element = self.etree.Element @@ -1694,8 +1719,27 @@ f = StringIO() tree = ElementTree(element=a) tree.write(f, 'utf-8') - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), - f.getvalue()) + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), + f.getvalue()) + + def test_encoding_latin1(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + + a = Element('a') + a.text = u'S?k p? nettet' + + f = StringIO() + tree = ElementTree(element=a) + tree.write(f, 'iso-8859-1') + result = f.getvalue() + declaration = "" + self.assertEncodingDeclaration(result,'iso-8859-1') + result = result.split('?>', 1)[-1] + if result[0] == '\n': + result = result[1:] + self.assertEquals(u'S?k p? 
nettet'.encode('iso-8859-1'), + result) # raise error on wrong (left-over?) encoding declaration in unicode strings def _test_wrong_unicode_encoding(self): @@ -1724,7 +1768,7 @@ a = Element('a') a.text = u'S?k p? nettet' - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), tostring(a, 'utf-8')) def test_encoding_tostring_unknown(self): @@ -1743,7 +1787,7 @@ a = Element('a') b = SubElement(a, 'b') b.text = u'S?k p? nettet' - self.assertEqual(u'S?k p? nettet'.encode('UTF-8'), + self.assertEquals(u'S?k p? nettet'.encode('UTF-8'), tostring(b, 'utf-8')) def test_encoding_tostring_sub_tail(self): @@ -1755,7 +1799,7 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' b.tail = u'S?k' - self.assertEqual(u'S?k p? nettetS?k'.encode('UTF-8'), + self.assertEquals(u'S?k p? nettetS?k'.encode('UTF-8'), tostring(b, 'utf-8')) def test_encoding_tostring_default_encoding(self): @@ -1780,7 +1824,6 @@ b = SubElement(a, 'b') b.text = u'S?k p? nettet' - # the same, just hex versus decimal expected = 'Søk på nettet' self.assertEquals( expected, @@ -1905,6 +1948,13 @@ """ self.assertEquals(expected, self._writeElement(element, encoding)) self.assertEquals(expected, self._writeElementFile(element, encoding)) + + def assertEncodingDeclaration(self, result, encoding): + "Checks if the result XML byte string specifies the encoding." 
+ has_encoding = re.compile(r"<\?xml[^>]+ encoding=[\"']([^\"']+)[\"']").match + self.assert_(has_encoding(result)) + result_encoding = has_encoding(result).group(1) + self.assertEquals(result_encoding.upper(), encoding.upper()) def _rootstring(self, tree): return self.etree.tostring(tree.getroot()).replace(' ', '').replace('\n', '') Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon May 22 16:29:40 2006 @@ -599,7 +599,7 @@ data = f.getvalue() return canonicalize(data) - + class ETreeXIncludeTestCase(HelperTestCase): def test_xinclude(self): tree = etree.parse(fileInTestDir('test_xinclude.xml')) From scoder at codespeak.net Mon May 22 16:30:35 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 22 May 2006 16:30:35 +0200 (CEST) Subject: [Lxml-checkins] r27600 - in lxml/trunk: . src/lxml Message-ID: <20060522143035.ED66E10071@code0.codespeak.net> Author: scoder Date: Mon May 22 16:30:34 2006 New Revision: 27600 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/xmlerror.pxi Log: cleanup for thread setup: initThread() instead of initThreadLogging(), xml_declaration keyword in ET.write() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 22 16:30:34 2006 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Removed public function ``initThreadLogging()``, replaced by more general + ``initThread()`` which fixes a number of setup problems in threads + * Memory leak when using iconv encoders in tostring/write * Deep copying Elements and ElementTrees maintains the document information Modified: lxml/trunk/src/lxml/etree.pyx 
============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 22 16:30:34 2006 @@ -33,6 +33,11 @@ # make the compiled-in debug state publicly available DEBUG = __DEBUG +def initThread(): + "Call this method to set up the library from within a new thread." + _initThreadLogging() + tree.xmlKeepBlanksDefault(0) + # Error superclass for ElementTree compatibility class Error(Exception): pass @@ -361,22 +366,28 @@ def __get__(self): return DocInfo(self._doc) - def write(self, file, encoding=None, pretty_print=False): + def write(self, file, encoding=None, + pretty_print=False, xml_declaration=None): """Write the tree to a file or file-like object. - Defaults to ASCII encoding. + Defaults to ASCII encoding and writing a declaration as needed. """ + cdef int c_write_declaration self._assertHasRoot() # suppress decl. in default case (purely for ElementTree compatibility) - if encoding is None: + if xml_declaration is not None: + c_write_declaration = bool(xml_declaration) + if encoding is None: + encoding = 'ASCII' + elif encoding is None: encoding = 'ASCII' - write_declaration = 0 + c_write_declaration = 0 else: encoding = encoding.upper() - write_declaration = encoding not in \ - ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') + c_write_declaration = encoding not in \ + ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') _tofilelike(file, self._context_node, encoding, - write_declaration, bool(pretty_print)) + c_write_declaration, bool(pretty_print)) def getiterator(self, tag=None): root = self.getroot() @@ -1507,3 +1518,6 @@ include "relaxng.pxi" # RelaxNG include "xmlschema.pxi" # XMLSchema + +# configure main thread +initThread() Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Mon May 22 16:30:34 2006 @@ -1,7 +1,5 @@ # XML 
serialization and output functions -tree.xmlKeepBlanksDefault(0) - cdef _tostring(_NodeBase element, encoding, int write_xml_declaration, int pretty_print): "Serialize an element to an encoded string representation of its XML tree." Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Mon May 22 16:30:34 2006 @@ -9,8 +9,10 @@ Note that this log is already bounded to a fixed size.""" __GLOBAL_ERROR_LOG.clear() -def initThreadLogging(): - "Setup logging for the current thread." +cdef void _initThreadLogging(): + "Setup logging for the current thread. Called from etree.initThread()." + # switch on line number reporting + xmlparser.xmlLineNumbersDefault(1) _logLibxmlErrors() try: _logLibxsltErrors() @@ -339,12 +341,6 @@ xmlerror.xmlSetGenericErrorFunc(NULL, _nullGenericErrorFunc) xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) -# init global logging -initThreadLogging() - -# switch on line number reporting -xmlparser.xmlLineNumbersDefault(1) - ################################################################################ ## CONSTANTS FROM "xmlerror.pxd" ################################################################################ From scoder at codespeak.net Tue May 23 08:51:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 23 May 2006 08:51:08 +0200 (CEST) Subject: [Lxml-checkins] r27612 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20060523065108.408C410064@code0.codespeak.net> Author: scoder Date: Tue May 23 08:51:04 2006 New Revision: 27612 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: let public API functions raise TypeError on the 'parser' argument rather than type checking later (why bother if Pyrex does it for us) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 23 08:51:04 2006 @@ -270,7 +270,7 @@ if node_ns_utf is not None: self._setNodeNs(c_node, node_ns_utf) -cdef _Document _documentFactory(xmlDoc* c_doc, parser): +cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser): cdef _Document result result = _Document() result._c_doc = c_doc @@ -348,7 +348,7 @@ assert self._context_node is not None, \ "ElementTree not initialized, missing root" - def parse(self, source, parser=None): + def parse(self, source, _BaseParser parser=None): """Updates self with the content of source and returns its root """ self._doc = _parseDocument(source, parser) @@ -1363,7 +1363,7 @@ _initNodeAttributes(c_node, doc, attrib, _extra) return _elementFactory(doc, c_node) -def ElementTree(_Element element=None, file=None, parser=None): +def ElementTree(_Element element=None, file=None, _BaseParser parser=None): cdef xmlNode* c_next cdef xmlNode* c_node cdef xmlNode* c_node_copy @@ -1461,7 +1461,7 @@ else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) -def parse(source, parser=None): +def parse(source, _BaseParser parser=None): """Return an ElementTree object loaded with source elements. If no parser is provided as second argument, the default parser is used. 
""" Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 23 08:51:04 2006 @@ -517,7 +517,7 @@ cdef _BaseParser __DEFAULT_PARSER __DEFAULT_PARSER = __DEFAULT_XML_PARSER -def set_default_parser(parser=None): +def set_default_parser(_BaseParser parser=None): """Set a default parser. This parser is used globally whenever no parser is supplied to the various parse functions of the lxml API. If this function is called without a parser (or if it is None), the default parser @@ -530,10 +530,8 @@ global __DEFAULT_PARSER if parser is None: __DEFAULT_PARSER = __DEFAULT_XML_PARSER - elif isinstance(parser, _BaseParser): - __DEFAULT_PARSER = parser else: - raise TypeError, "Invalid parser" + __DEFAULT_PARSER = parser def get_default_parser(): return __DEFAULT_PARSER @@ -580,12 +578,10 @@ ## helper functions for document creation ############################################################ -cdef xmlDoc* _parseDoc(text, filename, parser) except NULL: +cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL: cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER - elif not isinstance(parser, _BaseParser): - raise TypeError, "invalid parser" __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL @@ -596,20 +592,17 @@ else: return (<_BaseParser>parser)._parseDoc(_cstr(text), c_filename) -cdef xmlDoc* _parseDocFromFile(filename, parser) except NULL: +cdef xmlDoc* _parseDocFromFile(filename, _BaseParser parser) except NULL: if parser is None: parser = __DEFAULT_PARSER - elif not isinstance(parser, _BaseParser): - raise TypeError, "invalid parser" __GLOBAL_PARSER_CONTEXT._initParser() return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename)) -cdef xmlDoc* _parseDocFromFilelike(source, filename, parser) except NULL: +cdef xmlDoc* _parseDocFromFilelike(source, filename, + _BaseParser 
parser) except NULL: cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER - elif not isinstance(parser, _BaseParser): - raise TypeError, "invalid parser" __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL @@ -654,7 +647,7 @@ ## (here we convert to UTF-8) ############################################################ -cdef _Document _parseDocument(source, parser): +cdef _Document _parseDocument(source, _BaseParser parser): cdef xmlDoc* c_doc filename = _getFilenameForFile(source) if hasattr(source, 'getvalue') and hasattr(source, 'tell'): @@ -673,7 +666,7 @@ c_doc = _parseDocFromFile(_utf8(filename), parser) return _documentFactory(c_doc, parser) -cdef _Document _parseMemoryDocument(text, url, parser): +cdef _Document _parseMemoryDocument(text, url, _BaseParser parser): cdef xmlDoc* c_doc if python.PyUnicode_Check(text): # pass native unicode only if libxml2 can handle it @@ -686,7 +679,7 @@ c_doc = _parseDoc(text, url, parser) return _documentFactory(c_doc, parser) -cdef _Document _parseFilelikeDocument(source, url, parser): +cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser): cdef xmlDoc* c_doc if url is not None: url = _utf8(url) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue May 23 08:51:04 2006 @@ -35,6 +35,10 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_parse_parser_type_error(self): + parse = self.etree.parse + self.assertRaises(TypeError, parse, 'notthere.xml', object()) + def test_parse_error_logging(self): parse = self.etree.parse # from StringIO From scoder at codespeak.net Fri May 26 09:04:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 09:04:40 +0200 (CEST) Subject: [Lxml-checkins] r27690 - lxml/trunk Message-ID: <20060526070440.5F2D910053@code0.codespeak.net> 
Author: scoder Date: Fri May 26 09:04:36 2006 New Revision: 27690 Modified: lxml/trunk/CHANGES.txt Log: typo Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 09:04:36 2006 @@ -35,7 +35,7 @@ Features added -------------- -* Formatted output via ``pretty_print`` keyword to serialization functions +* Formatted output via ``pretty_print`` keyword in serialization functions * XSLT can block access to file system and network via ``XSLTAccessControl`` From scoder at codespeak.net Fri May 26 09:35:55 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 09:35:55 +0200 (CEST) Subject: [Lxml-checkins] r27691 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20060526073555.C6DDC10061@code0.codespeak.net> Author: scoder Date: Fri May 26 09:35:30 2006 New Revision: 27691 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_relaxng.py lxml/trunk/src/lxml/tests/test_xmlschema.py lxml/trunk/src/lxml/tests/test_xpathevaluator.py lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xmlschema.pxi lxml/trunk/src/lxml/xpath.pxi lxml/trunk/src/lxml/xslt.pxi Log: fix crashes when calling API functions with uninitialized ElementTree objects Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 09:35:30 2006 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Crashes when calling XSLT, RelaxNG, etc. 
with uninitialized ElementTree + objects + * Removed public function ``initThreadLogging()``, replaced by more general ``initThread()`` which fixes a number of setup problems in threads Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 26 09:35:30 2006 @@ -10,16 +10,49 @@ c_child = c_child.next cdef _Document _documentOrRaise(object input): + """Call this to get the document of a _Document, _ElementTree or _NodeBase + object, or to raise an exception if it can't be determined. + + Should be used in all API functions for consistency. + """ cdef _Document doc - doc = _documentOf(input) - if doc is None: + if isinstance(input, _ElementTree): + doc = (<_ElementTree>input)._doc + elif isinstance(input, _NodeBase): + doc = (<_NodeBase>input)._doc + elif isinstance(input, _Document): + doc = <_Document>input + else: raise TypeError, "Invalid input object: %s" % type(input) + if doc is None: + raise ValueError, "Input object has no document: %s" % type(input) else: return doc +cdef _NodeBase _rootNodeOrRaise(object input): + """Call this to get the root node of a _Document, _ElementTree or + _NodeBase object, or to raise an exception if it can't be determined. + + Should be used in all API functions for consistency. + """ + cdef _NodeBase node + if isinstance(input, _ElementTree): + node = (<_ElementTree>input)._context_node + elif isinstance(input, _NodeBase): + node = <_NodeBase>input + elif isinstance(input, _Document): + node = (<_Document>input).getroot() + else: + raise TypeError, "Invalid input object: %s" % type(input) + if node is None: + raise ValueError, "Input object has no element: %s" % type(input) + else: + return node + cdef _Document _documentOf(object input): # call this to get the document of a # _Document, _ElementTree or _NodeBase object + # may return None! 
if isinstance(input, _ElementTree): return (<_ElementTree>input)._doc elif isinstance(input, _NodeBase): @@ -32,6 +65,7 @@ cdef _NodeBase _rootNodeOf(object input): # call this to get the root node of a # _Document, _ElementTree or _NodeBase object + # may return None! if isinstance(input, _ElementTree): return (<_ElementTree>input)._context_node elif isinstance(input, _NodeBase): Modified: lxml/trunk/src/lxml/relaxng.pxi ============================================================================== --- lxml/trunk/src/lxml/relaxng.pxi (original) +++ lxml/trunk/src/lxml/relaxng.pxi Fri May 26 09:35:30 2006 @@ -28,7 +28,7 @@ fake_c_doc = NULL if etree is not None: doc = _documentOrRaise(etree) - root_node = _rootNodeOf(etree) + root_node = _rootNodeOrRaise(etree) c_node = root_node._c_node # work around for libxml2 bug if document is not RNG at all if c_node.ns is NULL or c_node.ns.href is NULL or \ @@ -78,7 +78,7 @@ cdef int ret doc = _documentOrRaise(etree) - root_node = _rootNodeOf(etree) + root_node = _rootNodeOrRaise(etree) self._error_log.connect() valid_ctxt = relaxng.xmlRelaxNGNewValidCtxt(self._c_schema) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri May 26 09:35:30 2006 @@ -1678,7 +1678,7 @@ def test_parse_file_nonexistent(self): parse = self.etree.parse self.assertRaises(IOError, parse, fileInTestDir('notthere.xml')) - + def test_parse_file_object(self): parse = self.etree.parse # from file object Modified: lxml/trunk/src/lxml/tests/test_relaxng.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_relaxng.py (original) +++ lxml/trunk/src/lxml/tests/test_relaxng.py Fri May 26 09:35:30 2006 @@ -25,6 +25,9 @@ self.assert_(schema.validate(tree_valid)) self.assert_(not schema.validate(tree_invalid)) 
+ def test_relaxng_elementtree_error(self): + self.assertRaises(ValueError, etree.RelaxNG, etree.ElementTree()) + def test_relaxng_error(self): tree_invalid = self.parse('') schema = self.parse('''\ Modified: lxml/trunk/src/lxml/tests/test_xmlschema.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xmlschema.py (original) +++ lxml/trunk/src/lxml/tests/test_xmlschema.py Fri May 26 09:35:30 2006 @@ -26,6 +26,9 @@ self.assert_(schema.validate(tree_valid)) self.assert_(not schema.validate(tree_invalid)) + def test_xmlschema_elementtree_error(self): + self.assertRaises(ValueError, etree.XMLSchema, etree.ElementTree()) + def test_xmlschema_invalid_schema1(self): schema = self.parse('''\ Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 26 09:35:30 2006 @@ -300,6 +300,9 @@ def test_xpath_compile_error(self): self.assertRaises(SyntaxError, etree.XPath, '\\fad') + def test_xpath_elementtree_error(self): + self.assertRaises(ValueError, etree.XPath('*'), etree.ElementTree()) + class ETreeETXPathClassTestCase(HelperTestCase): "Tests for the ETXPath class" def test_xpath_compile_ns(self): Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Fri May 26 09:35:30 2006 @@ -30,6 +30,9 @@ ''', st.tostring(res)) + def test_xslt_elementtree_error(self): + self.assertRaises(ValueError, etree.XSLT, etree.ElementTree()) + def test_xslt_utf8(self): tree = self.parse(u'\uF8D2\uF8D2') style = self.parse('''\ Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- 
lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Fri May 26 09:35:30 2006 @@ -26,7 +26,7 @@ self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) - root_node = _rootNodeOf(etree) + root_node = _rootNodeOrRaise(etree) # work around for libxml2 bug if document is not XML schema at all c_node = root_node._c_node @@ -73,7 +73,7 @@ cdef int ret doc = _documentOrRaise(etree) - root_node = _rootNodeOf(etree) + root_node = _rootNodeOrRaise(etree) self._error_log.connect() valid_ctxt = xmlschema.xmlSchemaNewValidCtxt(self._c_schema) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 09:35:30 2006 @@ -179,7 +179,7 @@ cdef _XPathContext context document = _documentOrRaise(_etree_or_element) - element = _rootNodeOf(_etree_or_element) + element = _rootNodeOrRaise(_etree_or_element) xpathCtxt = self._xpathCtxt xpathCtxt.doc = document._c_doc Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri May 26 09:35:30 2006 @@ -230,7 +230,7 @@ cdef _NodeBase root_node doc = _documentOrRaise(xslt_input) - root_node = _rootNodeOf(xslt_input) + root_node = _rootNodeOrRaise(xslt_input) # set access control or raise TypeError self._access_control = access_control @@ -287,7 +287,7 @@ cdef Py_ssize_t i, kw_count input_doc = _documentOrRaise(_input) - root_node = _rootNodeOf(_input) + root_node = _rootNodeOrRaise(_input) resolver_context = _XSLTResolverContext(input_doc._parser) resolver_context._c_style_doc = self._xslt_resolver_context._c_style_doc From scoder at codespeak.net Fri May 26 09:45:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 09:45:39 +0200 (CEST) Subject: [Lxml-checkins] r27692 - 
lxml/trunk Message-ID: <20060526074539.5E60110068@code0.codespeak.net> Author: scoder Date: Fri May 26 09:45:37 2006 New Revision: 27692 Modified: lxml/trunk/CHANGES.txt Log: cleanup in CHANGES.txt Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 09:45:37 2006 @@ -7,12 +7,12 @@ Features added -------------- -* Element.getpath() returns an XPath expression to find the node in the tree - structure +* Element.getpath() returns a simple XPath expression to find the node in the + tree structure -* Error logs now have a ``last_error`` attribute for convenience +* Error logs have a ``last_error`` attribute for convenience -* Comment texts can now be changed through the API +* Comment texts can be changed through the API Bugs fixed ---------- @@ -27,7 +27,7 @@ * Deep copying Elements and ElementTrees maintains the document information -* Serialization functions now raise LookupError for unknown encodings +* Serialization functions raise LookupError for unknown encodings * Memory deallocation crash resulting from deep copying elements @@ -55,8 +55,8 @@ * Parsing a unicode string no longer copies the string (reduced memory footprint) -* Parsing file-like objects now reads chunks rather than the whole file - (reduced memory footprint) +* Parsing file-like objects reads chunks rather than the whole file (reduced + memory footprint) * Parsing StringIO objects from the start avoids copying the string (reduced memory footprint) @@ -69,7 +69,7 @@ * Better error messages in parser exceptions -* Error reporting now also works in XSLT +* Error reporting also works in XSLT * Support for custom document loaders (URI resolvers) in parsers and XSLT, resolvers are registered at parser level @@ -96,7 +96,7 @@ * Element/SubElement failed to set attribute namespaces from passed ``attrib`` dictionary -* ``tostring()`` now adds an XML declaration for 
non-ASCII encodings +* ``tostring()`` adds an XML declaration for non-ASCII encodings * ``tostring()`` failed to serialize encodings that contain 0-bytes @@ -111,14 +111,14 @@ Features added -------------- -* Speedup for Element.makeelement(): the new element now reuses the original +* Speedup for Element.makeelement(): the new element reuses the original libxml2 document instead of creating a new empty one * Speedup for reversed() iteration over element children (Py2.4+ only) * ElementTree compatible QName class -* RelaxNG and XMLSchema now accept any Element, not only ElementTrees +* RelaxNG and XMLSchema accept any Element, not only ElementTrees Bugs fixed ---------- @@ -140,7 +140,7 @@ * lxml.sax.ElementTreeContentHandler checks closing elements and raises SaxError on mismatch -* lxml.sax.ElementTreeContentHandler now supports namespace-less SAX events +* lxml.sax.ElementTreeContentHandler supports namespace-less SAX events (startElement, endElement) and defaults to empty attributes (keyword argument) @@ -209,8 +209,8 @@ * ElementTree objects no longer interfere, Elements can be root of different ElementTrees at the same time -* document('') now works in XSLT documents read from files (in-memory - documents cannot support this due to libxslt deficiencies) +* document('') works in XSLT documents read from files (in-memory documents + cannot support this due to libxslt deficiencies) 0.8 (2005-11-03) ================ @@ -225,7 +225,7 @@ that it works than if copy.copy() isn't supported at all. * Increased compatibility with (c)ElementTree; .parse() on ElementTree is - now supported and parsing of gzipped XML files works. + supported and parsing of gzipped XML files works. * implemented index() on elements, allowing one to find the index of a SubElement. @@ -249,11 +249,10 @@ * Fixed error with uncaught exception in Pyrex code. -* Calling lxml.etree.fromstring('') now throws XMLSyntaxError instead - of a segfault. 
+* Calling lxml.etree.fromstring('') throws XMLSyntaxError instead of a + segfault. -* has_key() now works on attrib. 'in' tests also work correctly now on - attrib. +* has_key() works on attrib. 'in' tests also work correctly on attrib. * INSTALL.txt was saying 2.2.16 instead of 2.6.16 as a supported libxml2 version, as it should. @@ -267,8 +266,8 @@ Features added -------------- -* parameters (XPath expressions) can now be passed to XSLT using - keyword parameters. +* parameters (XPath expressions) can be passed to XSLT using keyword + parameters. * Simple XInclude support. Calling the xinclude() method on a tree will process any XInclude statements in the document. @@ -315,16 +314,15 @@ * Can pass None to 'dump()' without segfaults. -* tostring() now works properly for non-root elements as well. +* tostring() works properly for non-root elements as well. * Cleaned out the tostring() method so it should handle encoding correctly. -* Cleaned out the ElementTree.write() method so it should handle - encoding correctly. Writing directly to a file should also be faster - now, as there is no need to go through a Python string in that - case. Made sure the test cases test both serializing to StringIO as - well as serializing to a real file. +* Cleaned out the ElementTree.write() method so it should handle encoding + correctly. Writing directly to a file should also be faster, as there is no + need to go through a Python string in that case. Made sure the test cases + test both serializing to StringIO as well as serializing to a real file. 0.6 (2005-05-14) ================ From scoder at codespeak.net Fri May 26 10:20:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 10:20:37 +0200 (CEST) Subject: [Lxml-checkins] r27697 - in lxml/trunk: . 
doc src/lxml Message-ID: <20060526082037.07E841007C@code0.codespeak.net> Author: scoder Date: Fri May 26 10:20:33 2006 New Revision: 27697 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx Log: moved getpath() method from _Element to _ElementTree as we are dealing with absolute paths, so access through ElementTree makes more sense Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 10:20:33 2006 @@ -7,8 +7,8 @@ Features added -------------- -* Element.getpath() returns a simple XPath expression to find the node in the - tree structure +* ElementTree.getpath(element) returns a simple, absolute XPath expression to + find the element in the tree structure * Error logs have a ``last_error`` attribute for convenience Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri May 26 10:20:33 2006 @@ -244,19 +244,25 @@ >>> r[0].text 'Text' -A related convenience method of Elements is ``getpath()``, which returns a -structural XPath expression for the respective element:: +A related convenience method of ElementTree is ``getpath(element)``, which +returns a structural XPath expression for an element:: >>> a = etree.Element("a") >>> b = etree.SubElement(a, "b") >>> c = etree.SubElement(a, "c") >>> d1 = etree.SubElement(c, "d") >>> d2 = etree.SubElement(c, "d") - >>> print d2.getpath() + + >>> tree = etree.ElementTree(a) + >>> print tree.getpath(d2) /a/c/d[2] - >>> a.xpath(d2.getpath()) == [d2] + >>> a.xpath(tree.getpath(d2)) == [d2] True + >>> tree = etree.ElementTree(c) + >>> print tree.getpath(d2) + /c/d[2] + XSLT ---- Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ 
lxml/trunk/src/lxml/etree.pyx Fri May 26 10:20:33 2006 @@ -389,6 +389,20 @@ _tofilelike(file, self._context_node, encoding, c_write_declaration, bool(pretty_print)) + def getpath(self, _NodeBase element not None): + cdef xmlDoc* c_doc + cdef char* c_path + if element._doc is not self._doc: + raise ValueError, "Element is not in this tree." + c_doc = _fakeRootDoc(self._doc._c_doc, self._context_node._c_node) + c_path = tree.xmlGetNodePath(element._c_node) + _destroyFakeDoc(self._doc._c_doc, c_doc) + if c_path is NULL: + raise LxmlError, "Error creating node path." + path = c_path + tree.xmlFree(c_path) + return path + def getiterator(self, tag=None): root = self.getroot() if root is None: @@ -900,15 +914,6 @@ return _elementFactory(self._doc, c_node) return None - def getpath(self): - cdef char* c_path - c_path = tree.xmlGetNodePath(self._c_node) - if c_path is NULL: - raise LxmlError, "Error creating node path." - path = c_path - tree.xmlFree(c_path) - return path - def getiterator(self, tag=None): return ElementDepthFirstIterator(self, tag) From scoder at codespeak.net Fri May 26 11:48:44 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 11:48:44 +0200 (CEST) Subject: [Lxml-checkins] r27705 - lxml/trunk/src/lxml Message-ID: <20060526094844.3E78F1007B@code0.codespeak.net> Author: scoder Date: Fri May 26 11:48:42 2006 New Revision: 27705 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/proxy.pxi Log: moved _fakeRootDoc and _destroyFakeDoc to proxy.pxi: is related as it changes node._private and hooks into in-memory stucture Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 26 11:48:42 2006 @@ -75,56 +75,6 @@ else: return None -cdef xmlDoc* _fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node): - # build a temporary document that has the given node as 
root node - # note that copy and original must not be modified during its lifetime!! - # always call _destroyFakeDoc() after use! - cdef xmlNode* c_child - cdef xmlNode* c_root - cdef xmlDoc* c_doc - c_root = tree.xmlDocGetRootElement(c_base_doc) - if c_root == c_node: - # already the root node - return c_base_doc - - c_doc = _copyDoc(c_base_doc, 0) # non recursive! - c_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! - - c_root.children = c_node.children - c_root.last = c_node.last - c_root.next = c_root.prev = c_root.parent = NULL - - # store original node - c_root._private = c_node - - # divert parent pointers of children - c_child = c_root.children - while c_child is not NULL: - c_child.parent = c_root - c_child = c_child.next - - c_doc.children = c_root - return c_doc - -cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc): - # delete a temporary document - cdef xmlNode* c_child - cdef xmlNode* c_parent - cdef xmlNode* c_root - if c_doc != c_base_doc: - c_root = tree.xmlDocGetRootElement(c_doc) - - # restore parent pointers of children - c_parent = c_root._private - c_child = c_root.children - while c_child is not NULL: - c_child.parent = c_parent - c_child = c_child.next - - # prevent recursive removal of children - c_root.children = c_root.last = c_root._private = NULL - tree.xmlFreeDoc(c_doc) - cdef object _attributeValue(xmlNode* c_element, xmlNode* c_attrib_node): cdef char* value if c_attrib_node.ns is NULL or c_attrib_node.ns.href is NULL: Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri May 26 11:48:42 2006 @@ -75,6 +75,58 @@ #print "Proxy:", proxy, "Proxy type:", proxy_type assert 0, "Tried to unregister unknown proxy" +################################################################################ +# temporarily make a node the root node of its document + +cdef xmlDoc* 
_fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node): + # build a temporary document that has the given node as root node + # note that copy and original must not be modified during its lifetime!! + # always call _destroyFakeDoc() after use! + cdef xmlNode* c_child + cdef xmlNode* c_root + cdef xmlDoc* c_doc + c_root = tree.xmlDocGetRootElement(c_base_doc) + if c_root == c_node: + # already the root node + return c_base_doc + + c_doc = _copyDoc(c_base_doc, 0) # non recursive! + c_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! + + c_root.children = c_node.children + c_root.last = c_node.last + c_root.next = c_root.prev = c_root.parent = NULL + + # store original node + c_root._private = c_node + + # divert parent pointers of children + c_child = c_root.children + while c_child is not NULL: + c_child.parent = c_root + c_child = c_child.next + + c_doc.children = c_root + return c_doc + +cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc): + # delete a temporary document + cdef xmlNode* c_child + cdef xmlNode* c_parent + cdef xmlNode* c_root + if c_doc != c_base_doc: + c_root = tree.xmlDocGetRootElement(c_doc) + + # restore parent pointers of children + c_parent = c_root._private + c_child = c_root.children + while c_child is not NULL: + c_child.parent = c_parent + c_child = c_child.next + + # prevent recursive removal of children + c_root.children = c_root.last = c_root._private = NULL + tree.xmlFreeDoc(c_doc) ################################################################################ # support for freeing tree elements when proxy objects are destroyed From scoder at codespeak.net Fri May 26 13:38:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 13:38:40 +0200 (CEST) Subject: [Lxml-checkins] r27724 - lxml/trunk/src/lxml Message-ID: <20060526113840.3B2E01007C@code0.codespeak.net> Author: scoder Date: Fri May 26 13:38:18 2006 New Revision: 27724 Modified: lxml/trunk/src/lxml/proxy.pxi 
lxml/trunk/src/lxml/xslt.pxi Log: fixes in _fakeRootDoc(): store original root node in document rather than new root node to allow instantiating new root node Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri May 26 13:38:18 2006 @@ -86,19 +86,20 @@ cdef xmlNode* c_root cdef xmlDoc* c_doc c_root = tree.xmlDocGetRootElement(c_base_doc) - if c_root == c_node: + if c_root is c_node: # already the root node return c_base_doc c_doc = _copyDoc(c_base_doc, 0) # non recursive! c_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! + tree.xmlDocSetRootElement(c_doc, c_root) c_root.children = c_node.children c_root.last = c_node.last c_root.next = c_root.prev = c_root.parent = NULL # store original node - c_root._private = c_node + c_doc._private = c_node # divert parent pointers of children c_child = c_root.children @@ -118,14 +119,14 @@ c_root = tree.xmlDocGetRootElement(c_doc) # restore parent pointers of children - c_parent = c_root._private + c_parent = c_doc._private c_child = c_root.children while c_child is not NULL: c_child.parent = c_parent c_child = c_child.next # prevent recursive removal of children - c_root.children = c_root.last = c_root._private = NULL + c_root.children = c_root.last = NULL tree.xmlFreeDoc(c_doc) ################################################################################ Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri May 26 13:38:18 2006 @@ -306,7 +306,7 @@ if self._access_control is not None: self._access_control._register_in_context(transform_ctxt) - ptemp = c_doc._private + ptemp = c_doc._private # store original _private pointer! 
c_doc._private = resolver_context kw_count = python.PyDict_Size(_kw) From scoder at codespeak.net Fri May 26 15:35:07 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 15:35:07 +0200 (CEST) Subject: [Lxml-checkins] r27726 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20060526133507.CB88F1007C@code0.codespeak.net> Author: scoder Date: Fri May 26 15:35:02 2006 New Revision: 27726 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_xpathevaluator.py lxml/trunk/src/lxml/xmlid.pxi lxml/trunk/src/lxml/xpath.pxi Log: fix semantics of absolute XPath expressions in XPathDocumentEvaluator and ET.xpath() by using _fakeRootDoc(), raise exception on Element.xpath('/...') etc. Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 15:35:02 2006 @@ -17,6 +17,11 @@ Bugs fixed ---------- +* Running absolute XPath expressions on Elements now raises an exception in + most cases. Otherwise, the behaviour is explicitly marked as undefined. + +* Evaluating absolute XPath expressions ('/*') on an ElementTree could fail + * Crashes when calling XSLT, RelaxNG, etc. 
with uninitialized ElementTree objects Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri May 26 15:35:02 2006 @@ -244,8 +244,8 @@ >>> r[0].text 'Text' -A related convenience method of ElementTree is ``getpath(element)``, which -returns a structural XPath expression for an element:: +A related convenience method of ElementTree objects is ``getpath(element)``, +which returns a structural, absolute XPath expression to find that element:: >>> a = etree.Element("a") >>> b = etree.SubElement(a, "b") @@ -253,16 +253,11 @@ >>> d1 = etree.SubElement(c, "d") >>> d2 = etree.SubElement(c, "d") - >>> tree = etree.ElementTree(a) - >>> print tree.getpath(d2) - /a/c/d[2] - >>> a.xpath(tree.getpath(d2)) == [d2] - True - >>> tree = etree.ElementTree(c) >>> print tree.getpath(d2) /c/d[2] - + >>> tree.xpath(tree.getpath(d2)) == [d2] + True XSLT ---- Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 26 15:35:02 2006 @@ -447,7 +447,7 @@ XPathEvaluator directly. 
""" self._assertHasRoot() - evaluator = XPathElementEvaluator(self._context_node, namespaces) + evaluator = XPathDocumentEvaluator(self, namespaces) return evaluator.evaluate(_path, **_variables) def xslt(self, _xslt, extensions=None, **_kw): Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 26 15:35:02 2006 @@ -74,8 +74,8 @@ c = root[0] self.assertEquals([c[0], c[1]], c.xpath('b')) - self.assertEquals([c[0], c[1], root[1][0]], - c.xpath('//b')) + self.assertEquals([c[0], c[1]], + c.xpath('.//b')) def test_xpath_ns(self): tree = self.parse('') @@ -88,15 +88,41 @@ tree.xpath('//foo:b', {'foo': 'uri:c'})) self.assertEquals( [root[0]], - root.xpath('//baz:b', {'baz': 'uri:a'})) + root.xpath('.//baz:b', {'baz': 'uri:a'})) self.assertRaises( TypeError, - root.xpath, '//b', {None: 'uri:a'}) + root.xpath, './/b', {None: 'uri:a'}) def test_xpath_error(self): tree = self.parse('') self.assertRaises(SyntaxError, tree.xpath, '\\fad') + def test_elementtree_getpath(self): + a = etree.Element("a") + b = etree.SubElement(a, "b") + c = etree.SubElement(a, "c") + d1 = etree.SubElement(c, "d") + d2 = etree.SubElement(c, "d") + + tree = etree.ElementTree(a) + self.assertEqual('/a/c/d', + tree.getpath(d2)[:6]) + self.assertEqual([d2], + tree.xpath(tree.getpath(d2))) + + def test_elementtree_getpath_partial(self): + a = etree.Element("a") + b = etree.SubElement(a, "b") + c = etree.SubElement(a, "c") + d1 = etree.SubElement(c, "d") + d2 = etree.SubElement(c, "d") + + tree = etree.ElementTree(c) + self.assertEqual('/c/d', + tree.getpath(d2)[:4]) + self.assertEqual([d2], + tree.xpath(tree.getpath(d2))) + def test_xpath_evaluator(self): tree = self.parse('') e = etree.XPathEvaluator(tree) Modified: lxml/trunk/src/lxml/xmlid.pxi 
============================================================================== --- lxml/trunk/src/lxml/xmlid.pxi (original) +++ lxml/trunk/src/lxml/xmlid.pxi Fri May 26 15:35:02 2006 @@ -8,7 +8,7 @@ root = XML(text) # ElementTree compatible implementation: look for 'id' attributes dic = {} - for elem in root.xpath('//*[string(@id)]'): + for elem in ElementTree(root).xpath('//*[string(@id)]'): python.PyDict_SetItem(dic, elem.get('id'), elem) return (root, dic) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 15:35:02 2006 @@ -88,6 +88,9 @@ cdef class XPathElementEvaluator(XPathEvaluatorBase): """Create an XPath evaluator for an element. + Note that the result of evaluating absolute XPath expressions (starting + with '/') is undefined for Elements. Use an ElementTree instead. + XPath evaluators must not be shared between threads. """ cdef _Element _element @@ -114,25 +117,33 @@ add = self._context.addNamespace for prefix, uri in namespaces.items(): add(prefix, uri) - + def evaluate(self, _path, **_variables): - """Evaluate an XPath expression on the document. Variables may be - provided as keyword arguments. Note that namespaces are currently not - supported for variables.""" + """Evaluate an XPath expression on the document. + + Variables may be provided as keyword arguments. Note that namespaces + are currently not supported for variables. + + The result of evaluating absolute XPath expressions (starting with + '/') is undefined for Elements. Use an ElementTree instead. 
+ """ cdef xpath.xmlXPathContext* xpathCtxt cdef xpath.xmlXPathObject* xpathObj cdef xmlNode* c_node cdef _Document doc + path = _utf8(_path) + if path.lstrip().startswith('/'): + raise LxmlSyntaxError, "cannot use absolute path on element" xpathCtxt = self._xpathCtxt xpathCtxt.node = self._element._c_node doc = self._element._doc self._context.register_context(xpathCtxt, doc) - self._context.registerVariables(_variables) - - path = _utf8(_path) - xpathObj = xpath.xmlXPathEvalExpression(_cstr(path), xpathCtxt) - self._context.unregister_context() + try: + self._context.registerVariables(_variables) + xpathObj = xpath.xmlXPathEvalExpression(_cstr(path), xpathCtxt) + finally: + self._context.unregister_context() return self._handle_result(xpathObj, doc) @@ -146,9 +157,39 @@ XPathElementEvaluator.__init__( self, etree._context_node, namespaces, extensions) + def evaluate(self, _path, **_variables): + """Evaluate an XPath expression on the document. + + Variables may be provided as keyword arguments. Note that namespaces + are currently not supported for variables. + """ + cdef xpath.xmlXPathContext* xpathCtxt + cdef xpath.xmlXPathObject* xpathObj + cdef xmlNode* c_node + cdef xmlDoc* c_doc + cdef _Document doc + path = _utf8(_path) + xpathCtxt = self._xpathCtxt + doc = self._element._doc + + self._context.register_context(xpathCtxt, doc) + c_doc = _fakeRootDoc(doc._c_doc, self._element._c_node) + try: + self._context.registerVariables(_variables) + xpathCtxt.doc = c_doc + xpathCtxt.node = tree.xmlDocGetRootElement(c_doc) + xpathObj = xpath.xmlXPathEvalExpression(_cstr(path), xpathCtxt) + finally: + _destroyFakeDoc(doc._c_doc, c_doc) + self._context.unregister_context() + + return self._handle_result(xpathObj, doc) + def XPathEvaluator(etree_or_element, namespaces=None, extensions=None): - """Creates and XPath evaluator for an ElementTree or an Element. + """Creates and XPath evaluator for an ElementTree or an Element. 
Note + that the result of absolute XPath expressions (starting with '/') is + undefined for Elements. Use an ElementTree instead. XPath evaluators must not be shared between threads. """ @@ -161,11 +202,14 @@ cdef class XPath(XPathEvaluatorBase): cdef xpath.xmlXPathCompExpr* _xpath cdef readonly object path + cdef int _absolute def __init__(self, path, namespaces=None, extensions=None): XPathEvaluatorBase.__init__(self, namespaces, extensions, None) + self._xpath = NULL self.path = path path = _utf8(path) + self._absolute = path.lstrip().startswith('/') self._xpath = xpath.xmlXPathCompile(_cstr(path)) if self._xpath is NULL: self._raise_parse_error() @@ -180,18 +224,21 @@ document = _documentOrRaise(_etree_or_element) element = _rootNodeOrRaise(_etree_or_element) + + if self._absolute and element is _etree_or_element: + raise ValueError, "cannot use absolute path on element" xpathCtxt = self._xpathCtxt xpathCtxt.doc = document._c_doc xpathCtxt.node = element._c_node context = self._context - context._release_temp_refs() context.register_context(xpathCtxt, document) - context.registerVariables(_variables) - - xpathObj = xpath.xmlXPathCompiledEval(self._xpath, xpathCtxt) - context.unregister_context() + try: + context.registerVariables(_variables) + xpathObj = xpath.xmlXPathCompiledEval(self._xpath, xpathCtxt) + finally: + context.unregister_context() return self._handle_result(xpathObj, document) def evaluate(self, _tree, **_variables): From scoder at codespeak.net Fri May 26 16:13:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 16:13:40 +0200 (CEST) Subject: [Lxml-checkins] r27727 - lxml/trunk/doc Message-ID: <20060526141340.139891007C@code0.codespeak.net> Author: scoder Date: Fri May 26 16:13:38 2006 New Revision: 27727 Modified: lxml/trunk/doc/api.txt Log: doc clarification on using absolute vs. 
relative expressions on ElementTree and Element Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri May 26 16:13:38 2006 @@ -192,14 +192,24 @@ xpath method on ElementTree, Element ------------------------------------ -lxml.etree extends the ElementTree and Element interfaces with an xpath -method. For ElementTree, the xpath method performs a global xpath query -against the document. When xpath is used on an element, the xpath expression -is performed taking the element as the xpath context node. +lxml.etree supports the simple path syntax of the ``findall()`` etc. methods +on ElementTree and Element, as known from the original ElementTree library. +As an extension, these classes also provide an ``xpath()`` method that +supports expressions in the complete XPath syntax. + +For ElementTree, the xpath method performs a global xpath query against the +document (if absolute) or against the root node (if relative). + +When xpath is used on an element, the xpath expression is performed taking the +element as the xpath context node. Note that it is illegal to run an absolute +XPath expression (like ``/a``) against an element. The result is undefined. You call the xpath() method with the XPath expression to use. Optionally, you can provide a second argument, which should be a dictionary mapping the -namespace prefixes used in the XPath expression to namespace URIs. +namespace prefixes used in the XPath expression to namespace URIs. The +optional third argument is used to define `extension functions`_. + +.. 
_`extension functions`: extensions.html The return values of xpath vary, depending on the XPath expression used: From scoder at codespeak.net Fri May 26 16:13:50 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 16:13:50 +0200 (CEST) Subject: [Lxml-checkins] r27728 - lxml/trunk/src/lxml Message-ID: <20060526141350.4B5E81007C@code0.codespeak.net> Author: scoder Date: Fri May 26 16:13:49 2006 New Revision: 27728 Modified: lxml/trunk/src/lxml/xpath.pxi Log: cleanup Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 16:13:49 2006 @@ -129,7 +129,6 @@ """ cdef xpath.xmlXPathContext* xpathCtxt cdef xpath.xmlXPathObject* xpathObj - cdef xmlNode* c_node cdef _Document doc path = _utf8(_path) if path.lstrip().startswith('/'): @@ -165,7 +164,6 @@ """ cdef xpath.xmlXPathContext* xpathCtxt cdef xpath.xmlXPathObject* xpathObj - cdef xmlNode* c_node cdef xmlDoc* c_doc cdef _Document doc path = _utf8(_path) @@ -209,10 +207,10 @@ self._xpath = NULL self.path = path path = _utf8(path) - self._absolute = path.lstrip().startswith('/') self._xpath = xpath.xmlXPathCompile(_cstr(path)) if self._xpath is NULL: self._raise_parse_error() + self._absolute = path.lstrip().startswith('/') self._xpathCtxt = xpath.xmlXPathNewContext(NULL) def __call__(self, _etree_or_element, **_variables): From scoder at codespeak.net Fri May 26 18:25:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 18:25:04 +0200 (CEST) Subject: [Lxml-checkins] r27731 - in lxml/trunk: . 
src/lxml Message-ID: <20060526162504.54BED10088@code0.codespeak.net> Author: scoder Date: Fri May 26 18:25:01 2006 New Revision: 27731 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/xslt.pxi Log: fix: document reference in ElementTree objects was not updated when their root element was moved to a different document Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 18:25:01 2006 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Document reference in ElementTree objects was not updated when the root + element was moved to a different document + * Running absolute XPath expressions on Elements now raises an exception in most cases. Otherwise, the behaviour is explicitly marked as undefined. Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri May 26 18:25:01 2006 @@ -16,8 +16,11 @@ Should be used in all API functions for consistency. """ cdef _Document doc + cdef _NodeBase element if isinstance(input, _ElementTree): - doc = (<_ElementTree>input)._doc + element = (<_ElementTree>input)._context_node + if element is not None: + doc = element._doc elif isinstance(input, _NodeBase): doc = (<_NodeBase>input)._doc elif isinstance(input, _Document): @@ -53,14 +56,16 @@ # call this to get the document of a # _Document, _ElementTree or _NodeBase object # may return None! 
+ cdef _NodeBase element if isinstance(input, _ElementTree): - return (<_ElementTree>input)._doc + element = (<_ElementTree>input)._context_node + if element is not None: + return element._doc elif isinstance(input, _NodeBase): return (<_NodeBase>input)._doc elif isinstance(input, _Document): return <_Document>input - else: - return None + return None cdef _NodeBase _rootNodeOf(object input): # call this to get the root node of a Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 26 18:25:01 2006 @@ -2,6 +2,7 @@ from tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement from python cimport isinstance, issubclass, hasattr, callable from python cimport iter, str, _cstr, Py_ssize_t +cimport xpath cimport xinclude cimport c14n cimport cstd @@ -339,6 +340,10 @@ cdef _Document _doc cdef _NodeBase _context_node + # Note that _doc is only used to store the original document if we do not + # have a _context_node. All methods should prefer self._context_node._doc + # to honour tree restructuring + cdef _assertHasRoot(self): """We have to take care here: the document may not have a root node! This can happen if ElementTree() is called without any argument and @@ -351,10 +356,11 @@ def parse(self, source, _BaseParser parser=None): """Updates self with the content of source and returns its root """ - self._doc = _parseDocument(source, parser) - self._context_node = self._doc.getroot() + cdef _Document doc + doc = _parseDocument(source, parser) + self._context_node = doc.getroot() return self._context_node - + def getroot(self): return self._context_node @@ -364,7 +370,8 @@ of a parsed document (e.g. those returned by the parse functions). 
""" def __get__(self): - return DocInfo(self._doc) + self._assertHasRoot() + return DocInfo(self._context_node._doc) def write(self, file, encoding=None, pretty_print=False, xml_declaration=None): @@ -390,13 +397,15 @@ c_write_declaration, bool(pretty_print)) def getpath(self, _NodeBase element not None): + cdef _Document doc cdef xmlDoc* c_doc cdef char* c_path - if element._doc is not self._doc: + doc = self._context_node._doc + if element._doc is not doc: raise ValueError, "Element is not in this tree." - c_doc = _fakeRootDoc(self._doc._c_doc, self._context_node._c_node) + c_doc = _fakeRootDoc(doc._c_doc, self._context_node._c_node) c_path = tree.xmlGetNodePath(element._c_node) - _destroyFakeDoc(self._doc._c_doc, c_doc) + _destroyFakeDoc(doc._c_doc, c_doc) if c_path is NULL: raise LxmlError, "Error creating node path." path = c_path @@ -521,7 +530,7 @@ cdef char* data cdef int bytes self._assertHasRoot() - c_base_doc = self._doc._c_doc + c_base_doc = self._context_node._doc._c_doc c_doc = _fakeRootDoc(c_base_doc, self._context_node._c_node) bytes = c14n.xmlC14NDocDumpMemory(c_doc, NULL, 0, NULL, 1, &data) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri May 26 18:25:01 2006 @@ -375,9 +375,16 @@ cdef class _XSLTResultTree(_ElementTree): cdef XSLT _xslt cdef _saveToStringAndSize(self, char** s, int* l): + cdef _Document doc cdef int r - r = xslt.xsltSaveResultToString(s, l, self._doc._c_doc, - self._xslt._c_style) + if self._context_node is not None: + doc = self._context_node._doc + if doc is None: + doc = self._doc + if doc is None: + s[0] = NULL + return + r = xslt.xsltSaveResultToString(s, l, doc._c_doc, self._xslt._c_style) if r == -1: raise XSLTSaveError, "Error saving XSLT result to string" From scoder at codespeak.net Fri May 26 18:27:36 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 
26 May 2006 18:27:36 +0200 (CEST) Subject: [Lxml-checkins] r27732 - lxml/trunk/src/lxml Message-ID: <20060526162736.A3C0010088@code0.codespeak.net> Author: scoder Date: Fri May 26 18:27:33 2006 New Revision: 27732 Modified: lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xpath.pxd lxml/trunk/src/lxml/xpath.pxi Log: reuse the parser dictionary also for XPath parsing Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri May 26 18:27:33 2006 @@ -1,7 +1,5 @@ # supports for extension functions in XPath and XSLT -cimport xpath - class XPathError(LxmlError): pass Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri May 26 18:27:33 2006 @@ -42,6 +42,15 @@ pctxt.dict = self._c_dict xmlparser.xmlDictReference(pctxt.dict) + cdef void _initXPathParserDict(self, xpath.xmlXPathContext* pctxt): + "Assure we always use the same string dictionary." 
+ if self._c_dict is NULL or self._c_dict is pctxt.dict: + return + if pctxt.dict is not NULL: + xmlparser.xmlDictFree(pctxt.dict) + pctxt.dict = self._c_dict + xmlparser.xmlDictReference(pctxt.dict) + cdef void _initDocDict(self, xmlDoc* result): "Store dict of last object parsed if no shared dict yet" if result is NULL: Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri May 26 18:27:33 2006 @@ -52,7 +52,6 @@ # for some reason need to define this in this section; # libxml/dict.h appears to be broken to include in C ctypedef struct xmlDict - cdef int xmlDictOwns(xmlDict* dict, char* name) ctypedef struct xmlDoc ctypedef struct xmlAttr Modified: lxml/trunk/src/lxml/xpath.pxd ============================================================================== --- lxml/trunk/src/lxml/xpath.pxd (original) +++ lxml/trunk/src/lxml/xpath.pxd Fri May 26 18:27:33 2006 @@ -54,6 +54,7 @@ ctypedef struct xmlXPathContext: tree.xmlDoc* doc tree.xmlNode* node + tree.xmlDict* dict char* function char* functionURI # actually signature is void (*error)(void*, xmlError*) @@ -81,6 +82,8 @@ cdef xmlXPathObject* xmlXPathCompiledEval(xmlXPathCompExpr* comp, xmlXPathContext* ctxt) cdef xmlXPathCompExpr* xmlXPathCompile(char* str) + cdef xmlXPathCompExpr* xmlXPathCtxtCompile(xmlXPathContext* ctxt, + char* str) cdef void xmlXPathFreeContext(xmlXPathContext* ctxt) cdef void xmlXPathFreeCompExpr(xmlXPathCompExpr* comp) cdef void xmlXPathFreeObject(xmlXPathObject* obj) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 18:27:33 2006 @@ -44,6 +44,8 @@ xpath.xmlXPathRegisterVariable( self._xpathCtxt, _cstr(name_utf), _wrapXPathObject(value)) +cdef void _setupDict(xpath.xmlXPathContext* xpathCtxt): + 
__GLOBAL_PARSER_CONTEXT._initXPathParserDict(xpathCtxt) cdef class XPathEvaluatorBase: cdef xpath.xmlXPathContext* _xpathCtxt @@ -103,6 +105,7 @@ self._xpathCtxt = xpathCtxt if xpathCtxt is NULL: raise XPathContextError, "Unable to create new XPath context" + _setupDict(xpathCtxt) self._element = element XPathEvaluatorBase.__init__(self, namespaces, extensions) @@ -207,11 +210,12 @@ self._xpath = NULL self.path = path path = _utf8(path) - self._xpath = xpath.xmlXPathCompile(_cstr(path)) + self._xpathCtxt = xpath.xmlXPathNewContext(NULL) + _setupDict(self._xpathCtxt) + self._xpath = xpath.xmlXPathCtxtCompile(self._xpathCtxt, _cstr(path)) if self._xpath is NULL: self._raise_parse_error() self._absolute = path.lstrip().startswith('/') - self._xpathCtxt = xpath.xmlXPathNewContext(NULL) def __call__(self, _etree_or_element, **_variables): cdef xpath.xmlXPathContext* xpathCtxt From scoder at codespeak.net Fri May 26 19:03:05 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 19:03:05 +0200 (CEST) Subject: [Lxml-checkins] r27733 - lxml/trunk/src/lxml Message-ID: <20060526170305.C5A7210088@code0.codespeak.net> Author: scoder Date: Fri May 26 19:03:04 2006 New Revision: 27733 Modified: lxml/trunk/src/lxml/xpath.pxi Log: factored out test for absolute paths into evaluator base class method Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 19:03:04 2006 @@ -58,6 +58,20 @@ if self._xpathCtxt is not NULL: xpath.xmlXPathFreeContext(self._xpathCtxt) + cdef int _checkAbsolutePath(self, char* path): + cdef char c + if path is NULL: + return 0 + c = path[0] + while c != c'\0': + if c == c'/': + return 1 + elif c != c' ' and c != c'\t': + break + path = path + 1 + c = path[0] + return 0 + cdef _raise_parse_error(self): if self._xpathCtxt is not NULL and \ self._xpathCtxt.lastError.message is not 
NULL: @@ -133,8 +147,10 @@ cdef xpath.xmlXPathContext* xpathCtxt cdef xpath.xmlXPathObject* xpathObj cdef _Document doc + cdef char* c_path path = _utf8(_path) - if path.lstrip().startswith('/'): + c_path = _cstr(path) + if self._checkAbsolutePath(c_path): raise LxmlSyntaxError, "cannot use absolute path on element" xpathCtxt = self._xpathCtxt xpathCtxt.node = self._element._c_node @@ -143,7 +159,7 @@ self._context.register_context(xpathCtxt, doc) try: self._context.registerVariables(_variables) - xpathObj = xpath.xmlXPathEvalExpression(_cstr(path), xpathCtxt) + xpathObj = xpath.xmlXPathEvalExpression(c_path, xpathCtxt) finally: self._context.unregister_context() @@ -206,16 +222,18 @@ cdef int _absolute def __init__(self, path, namespaces=None, extensions=None): + cdef char* c_path XPathEvaluatorBase.__init__(self, namespaces, extensions, None) self._xpath = NULL self.path = path path = _utf8(path) + c_path = _cstr(path) self._xpathCtxt = xpath.xmlXPathNewContext(NULL) _setupDict(self._xpathCtxt) - self._xpath = xpath.xmlXPathCtxtCompile(self._xpathCtxt, _cstr(path)) + self._xpath = xpath.xmlXPathCtxtCompile(self._xpathCtxt, c_path) if self._xpath is NULL: self._raise_parse_error() - self._absolute = path.lstrip().startswith('/') + self._absolute = self._checkAbsolutePath(c_path) def __call__(self, _etree_or_element, **_variables): cdef xpath.xmlXPathContext* xpathCtxt From scoder at codespeak.net Fri May 26 19:07:19 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 19:07:19 +0200 (CEST) Subject: [Lxml-checkins] r27734 - lxml/trunk/src/lxml Message-ID: <20060526170719.DA54610088@code0.codespeak.net> Author: scoder Date: Fri May 26 19:07:18 2006 New Revision: 27734 Modified: lxml/trunk/src/lxml/xpath.pxi Log: cleanup Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 19:07:18 
2006 @@ -63,14 +63,10 @@ if path is NULL: return 0 c = path[0] - while c != c'\0': - if c == c'/': - return 1 - elif c != c' ' and c != c'\t': - break + while c == c' ' or c == c'\t': path = path + 1 c = path[0] - return 0 + return c == c'/' cdef _raise_parse_error(self): if self._xpathCtxt is not NULL and \ From scoder at codespeak.net Fri May 26 19:26:50 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 19:26:50 +0200 (CEST) Subject: [Lxml-checkins] r27736 - lxml/trunk/src/lxml/tests Message-ID: <20060526172650.47C2E10088@code0.codespeak.net> Author: scoder Date: Fri May 26 19:26:48 2006 New Revision: 27736 Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: test cases for exceptions on calling absolute XPath expression on an Element Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 26 19:26:48 2006 @@ -329,6 +329,13 @@ def test_xpath_elementtree_error(self): self.assertRaises(ValueError, etree.XPath('*'), etree.ElementTree()) + def test_xpath_element_absolute_error(self): + self.assertRaises(ValueError, etree.XPath(' / * '), etree.Element("test")) + + def test_xpath_element_absolute_error2(self): + el = etree.Element("test") + self.assertRaises(SyntaxError, el.xpath, ' /* ') + class ETreeETXPathClassTestCase(HelperTestCase): "Tests for the ETXPath class" def test_xpath_compile_ns(self): From scoder at codespeak.net Fri May 26 19:39:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 19:39:54 +0200 (CEST) Subject: [Lxml-checkins] r27737 - lxml/trunk/src/lxml Message-ID: <20060526173954.8E9881008D@code0.codespeak.net> Author: scoder Date: Fri May 26 19:39:53 2006 New Revision: 27737 Modified: lxml/trunk/src/lxml/xpath.pxi Log: cleanup Modified: lxml/trunk/src/lxml/xpath.pxi 
============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 19:39:53 2006 @@ -280,7 +280,7 @@ cdef _nsextract_path(self, path): # replace {namespaces} by new prefixes cdef int i - path_utf = path.encode('UTF-8') + path_utf = _utf8(path) stripped_path = _replace_strings('', path_utf) # remove string literals namespaces = {} namespace_defs = [] From scoder at codespeak.net Fri May 26 21:37:48 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 21:37:48 +0200 (CEST) Subject: [Lxml-checkins] r27741 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20060526193748.E34A010097@code0.codespeak.net> Author: scoder Date: Fri May 26 21:37:45 2006 New Revision: 27741 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_xpathevaluator.py lxml/trunk/src/lxml/xpath.pxi Log: new method Element.getroottree() to return root ElementTree of the document, make absolute XPaths available to elements again, define element.xpath('/...') as element.getroottree().xpath('/...') Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 21:37:45 2006 @@ -7,6 +7,9 @@ Features added -------------- +* ``element.getroottree()`` returns an ElementTree for the root node of the + document that contains the element. + * ElementTree.getpath(element) returns a simple, absolute XPath expression to find the element in the tree structure @@ -20,8 +23,8 @@ * Document reference in ElementTree objects was not updated when the root element was moved to a different document -* Running absolute XPath expressions on Elements now raises an exception in - most cases. Otherwise, the behaviour is explicitly marked as undefined. 
+* Running absolute XPath expressions on an Elements now correctly evaluates + against the root tree * Evaluating absolute XPath expressions ('/*') on an ElementTree could fail Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri May 26 21:37:45 2006 @@ -197,17 +197,44 @@ As an extension, these classes also provide an ``xpath()`` method that supports expressions in the complete XPath syntax. -For ElementTree, the xpath method performs a global xpath query against the -document (if absolute) or against the root node (if relative). +For ElementTree, the xpath method performs a global XPath query against the +document (if absolute) or against the root node (if relative):: -When xpath is used on an element, the xpath expression is performed taking the -element as the xpath context node. Note that it is illegal to run an absolute -XPath expression (like ``/a``) against an element. The result is undefined. - -You call the xpath() method with the XPath expression to use. Optionally, you -can provide a second argument, which should be a dictionary mapping the -namespace prefixes used in the XPath expression to namespace URIs. The -optional third argument is used to define `extension functions`_. 
+ >>> f = StringIO('<foo><bar></bar></foo>') + >>> tree = etree.parse(f) + + >>> r = tree.xpath('/foo/bar') + >>> len(r) + 1 + >>> r[0].tag + 'bar' + + >>> r = tree.xpath('bar') + >>> r[0].tag + 'bar' + +When ``xpath()`` is used on an element, the XPath expression is evaluated +against the element (if relative) or against the root tree (if absolute):: + + >>> root = tree.getroot() + >>> r = root.xpath('bar') + >>> r[0].tag + 'bar' + + >>> bar = root[0] + >>> r = bar.xpath('/foo/bar') + >>> r[0].tag + 'bar' + + >>> tree = bar.getroottree() + >>> r = tree.xpath('/foo/bar') + >>> r[0].tag + 'bar' + +Optionally, you can provide a ``namespaces`` keyword argument, which should be +a dictionary mapping the namespace prefixes used in the XPath expression to +namespace URIs. The optional ``extensions`` argument is used to define +`extension functions`_ in Python. .. _`extension functions`: extensions.html Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 26 21:37:45 2006 @@ -440,11 +440,12 @@ return root.findall(path) # extensions to ElementTree API - def xpath(self, _path, namespaces=None, **_variables): + def xpath(self, _path, namespaces=None, extensions=None, **_variables): """XPath evaluate in context of document. - namespaces is an optional dictionary with prefix to namespace URI - mappings, used by XPath. + ``namespaces`` is an optional dictionary with prefix to namespace URI + mappings, used by XPath. ``extensions`` defines additional extension + functions. Returns a list (nodeset), or bool, float or string. @@ -456,7 +457,7 @@ XPathEvaluator directly. 
""" self._assertHasRoot() - evaluator = XPathDocumentEvaluator(self, namespaces) + evaluator = XPathDocumentEvaluator(self, namespaces, extensions) return evaluator.evaluate(_path, **_variables) def xslt(self, _xslt, extensions=None, **_kw): @@ -923,6 +924,11 @@ return _elementFactory(self._doc, c_node) return None + def getroottree(self): + """Return an ElementTree for the root node of the document that + contains this element.""" + return _elementTreeFactory(self._doc, None) + def getiterator(self, tag=None): return ElementDepthFirstIterator(self, tag) @@ -950,8 +956,8 @@ def findall(self, path): return _elementpath.findall(self, path) - def xpath(self, _path, namespaces=None, **_variables): - evaluator = XPathElementEvaluator(self, namespaces) + def xpath(self, _path, namespaces=None, extensions=None, **_variables): + evaluator = XPathElementEvaluator(self, namespaces, extensions) return evaluator.evaluate(_path, **_variables) cdef _Element _elementFactory(_Document doc, xmlNode* c_node): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri May 26 21:37:45 2006 @@ -264,6 +264,24 @@ b, d.getparent()) + def test_getroottree(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + c = SubElement(a, 'c') + d = SubElement(b, 'd') + self.assertEquals( + a, + a.getroottree().getroot()) + self.assertEquals( + a, + b.getroottree().getroot()) + self.assertEquals( + a, + d.getroottree().getroot()) + def test_parseid(self): parseid = self.etree.parseid XML = self.etree.XML Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri May 26 
21:37:45 2006 @@ -74,8 +74,8 @@ c = root[0] self.assertEquals([c[0], c[1]], c.xpath('b')) - self.assertEquals([c[0], c[1]], - c.xpath('.//b')) + self.assertEquals([c[0], c[1], root[1][0]], + c.xpath('//b')) def test_xpath_ns(self): tree = self.parse('') @@ -88,10 +88,10 @@ tree.xpath('//foo:b', {'foo': 'uri:c'})) self.assertEquals( [root[0]], - root.xpath('.//baz:b', {'baz': 'uri:a'})) + root.xpath('//baz:b', {'baz': 'uri:a'})) self.assertRaises( TypeError, - root.xpath, './/b', {None: 'uri:a'}) + root.xpath, '//b', {None: 'uri:a'}) def test_xpath_error(self): tree = self.parse('') @@ -329,13 +329,6 @@ def test_xpath_elementtree_error(self): self.assertRaises(ValueError, etree.XPath('*'), etree.ElementTree()) - def test_xpath_element_absolute_error(self): - self.assertRaises(ValueError, etree.XPath(' / * '), etree.Element("test")) - - def test_xpath_element_absolute_error2(self): - el = etree.Element("test") - self.assertRaises(SyntaxError, el.xpath, ' /* ') - class ETreeETXPathClassTestCase(HelperTestCase): "Tests for the ETXPath class" def test_xpath_compile_ns(self): Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri May 26 21:37:45 2006 @@ -100,8 +100,8 @@ cdef class XPathElementEvaluator(XPathEvaluatorBase): """Create an XPath evaluator for an element. - Note that the result of evaluating absolute XPath expressions (starting - with '/') is undefined for Elements. Use an ElementTree instead. + Absolute XPath expressions (starting with '/') will be evaluated against + the ElementTree as returned by getroottree(). XPath evaluators must not be shared between threads. """ @@ -137,17 +137,14 @@ Variables may be provided as keyword arguments. Note that namespaces are currently not supported for variables. - The result of evaluating absolute XPath expressions (starting with - '/') is undefined for Elements. 
Use an ElementTree instead. + Absolute XPath expressions (starting with '/') will be evaluated + against the ElementTree as returned by getroottree(). """ cdef xpath.xmlXPathContext* xpathCtxt cdef xpath.xmlXPathObject* xpathObj cdef _Document doc cdef char* c_path path = _utf8(_path) - c_path = _cstr(path) - if self._checkAbsolutePath(c_path): - raise LxmlSyntaxError, "cannot use absolute path on element" xpathCtxt = self._xpathCtxt xpathCtxt.node = self._element._c_node doc = self._element._doc @@ -155,7 +152,7 @@ self._context.register_context(xpathCtxt, doc) try: self._context.registerVariables(_variables) - xpathObj = xpath.xmlXPathEvalExpression(c_path, xpathCtxt) + xpathObj = xpath.xmlXPathEvalExpression(_cstr(path), xpathCtxt) finally: self._context.unregister_context() @@ -200,9 +197,7 @@ def XPathEvaluator(etree_or_element, namespaces=None, extensions=None): - """Creates and XPath evaluator for an ElementTree or an Element. Note - that the result of absolute XPath expressions (starting with '/') is - undefined for Elements. Use an ElementTree instead. + """Creates an XPath evaluator for an ElementTree or an Element. XPath evaluators must not be shared between threads. 
""" @@ -215,7 +210,6 @@ cdef class XPath(XPathEvaluatorBase): cdef xpath.xmlXPathCompExpr* _xpath cdef readonly object path - cdef int _absolute def __init__(self, path, namespaces=None, extensions=None): cdef char* c_path @@ -229,7 +223,6 @@ self._xpath = xpath.xmlXPathCtxtCompile(self._xpathCtxt, c_path) if self._xpath is NULL: self._raise_parse_error() - self._absolute = self._checkAbsolutePath(c_path) def __call__(self, _etree_or_element, **_variables): cdef xpath.xmlXPathContext* xpathCtxt @@ -240,9 +233,6 @@ document = _documentOrRaise(_etree_or_element) element = _rootNodeOrRaise(_etree_or_element) - - if self._absolute and element is _etree_or_element: - raise ValueError, "cannot use absolute path on element" xpathCtxt = self._xpathCtxt xpathCtxt.doc = document._c_doc From scoder at codespeak.net Fri May 26 21:41:56 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 21:41:56 +0200 (CEST) Subject: [Lxml-checkins] r27742 - lxml/trunk Message-ID: <20060526194156.E917010097@code0.codespeak.net> Author: scoder Date: Fri May 26 21:41:55 2006 New Revision: 27742 Modified: lxml/trunk/CHANGES.txt Log: typo Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri May 26 21:41:55 2006 @@ -23,8 +23,8 @@ * Document reference in ElementTree objects was not updated when the root element was moved to a different document -* Running absolute XPath expressions on an Elements now correctly evaluates - against the root tree +* Running absolute XPath expressions on an Element now evaluates against the + root tree * Evaluating absolute XPath expressions ('/*') on an ElementTree could fail From scoder at codespeak.net Fri May 26 22:16:18 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 26 May 2006 22:16:18 +0200 (CEST) Subject: [Lxml-checkins] r27746 - lxml/trunk/src/lxml Message-ID: 
<20060526201618.C9AA91009A@code0.codespeak.net> Author: scoder Date: Fri May 26 22:16:17 2006 New Revision: 27746 Modified: lxml/trunk/src/lxml/etree.pyx Log: make True/False a little more constant Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri May 26 22:16:17 2006 @@ -8,6 +8,12 @@ cimport cstd import re +import __builtin__ +cdef object True +cdef object False +True = __builtin__.True +False = __builtin__.False + import _elementpath from StringIO import StringIO import sys From scoder at codespeak.net Sat May 27 06:36:25 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 06:36:25 +0200 (CEST) Subject: [Lxml-checkins] r27748 - lxml/trunk Message-ID: <20060527043625.7A343100AD@code0.codespeak.net> Author: scoder Date: Sat May 27 06:36:21 2006 New Revision: 27748 Modified: lxml/trunk/CREDITS.txt Log: cleanup and updates in CREDITS.txt Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Sat May 27 06:36:21 2006 @@ -1,52 +1,58 @@ Credits ------- -Martijn Faassen - initial and main developer +Martijn Faassen - initial main developer -Marc-Antoine Parent - XPath extension function help and patches +Stefan Behnel - main developer and maintainer -Stefan Behnel - core development work (SAX support, misc patches) +Marc-Antoine Parent - XPath extension function help and patches Olivier Grisel - improved (c)ElementTree compatibility patches, website improvements. 
+Kasimier Buchcik - help with specs and libxml2 + Florian Wagner - help with copy.deepcopy support, bug reporting Emil Kroymann - help with encoding support, bug reporting Slou - help with index() support, bug reporting -Duncan Booth - bugfixing - Paul Everitt - bug reporting, feedback on API design -Julien Anguenot - bug reporting - Paul Clifford - Python 2.2 compatibility fixes -Wade Leftwich - unicode bug reporting +Victor Ng - Discussions on memory management strategies, vlibxml2 -Henrik Thostrup Jensen - bug reporting +Robert Kern - feedback on API design -dharana - bug reporting +Trent Mick - setup.py patch -Hamish Lawson - bug reporting +Steve Howe - Windows builds -Gavrie Philipson - bug reporting +David Sankel - building statically on Windows -Victor Ng - Discussions on memory management strategies, vlibxml2 +Duncan Booth - bugfixing -Robert Kern - feedback on API design +Dean Pavlekovic - bug reporting + +Julien Anguenot - bug reporting + +Wade Leftwich - unicode bug reporting Kieran Holland - iteration crash bug report -Trent Mick - setup.py patch +Henrik Thostrup Jensen - bug reporting -Steve Howe - Windows builds +dharana - bug reporting + +Hamish Lawson - bug reporting + +Gavrie Philipson - bug reporting -David Sankel - building statically on Windows Thanks also to: +--------------- * the libxml2 project for a great XML library. 
From scoder at codespeak.net Sat May 27 12:11:41 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:11:41 +0200 (CEST) Subject: [Lxml-checkins] r27751 - lxml/trunk/doc Message-ID: <20060527101141.96D8A100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:11:38 2006 New Revision: 27751 Added: lxml/trunk/doc/FAQ.txt Log: new FAQ.txt Added: lxml/trunk/doc/FAQ.txt ============================================================================== --- (empty file) +++ lxml/trunk/doc/FAQ.txt Sat May 27 12:11:38 2006 @@ -0,0 +1,78 @@ +Frequently Asked Questions +========================== + +See also the notes on compatibility_ to ElementTree_. + +.. _compatibility: compatibility.html +.. _ElementTree: http://effbot.org/zone/element-index.htm + + +1) Is there a tutorial? + + There is a `tutorial for ElementTree`_ which also works for lxml.etree. + The `API documentation`_ also contains many examples. + + .. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm + .. _`API documentation`: api.html + + +2) Where can I find more documentation about lxml? + + There is a lot of documentation as lxml implements the well-known + `ElementTree API`_ and tries to follow its documentation as closely as + possible. There are a couple of issues where lxml cannot keep up + compatibility. They are described in the compatibility_ documentation. + The lxml specific extensions to the API are described by individual files + in the ``doc`` directory of the distribution and on `the web page`_. + + .. _`ElementTree API`: http://effbot.org/zone/element-index.htm + .. _`the web page`: http://codespeak.net/lxml/#documentation + + +3) Why are there ``findall()`` and ``xpath()`` methods on Element(Tree)? + + ``findall()`` is specified in the `ElementTree API`_. It supports a + `simple subset of the XPath language`_, without predicates, conditions and + other advanced features. It is very handy for finding specific tags in a + tree. 
Another important difference is namespace handling, which uses the + ``{namespace}tagname`` notation. This is not supported by XPath. The + findall, find and findtext methods are compatible with other ElementTree + implementations and allow writing portable code that runs on ElementTree, + cElementTree and lxml.etree. + + ``xpath()``, on the other hand, supports the complete power of the XPath + language, including predicates, XPath functions and Python extension + functions. The syntax is defined by the `XPath specification`_. If you + need the expressiveness and selectivity of XPath, the xpath method, the + ``XPath`` class and the ``XPathEvaluator`` are the best choice. + + .. _`simple subset of the XPath language`: http://effbot.org/zone/element-xpath.htm + .. _`XPath specification`: http://www.w3.org/TR/xpath + + +4) Why doesn't ``findall()`` support XPath expressions? + + It was decided that it is more important to keep compatibility with + ElementTree_ to simplify code migration between the libraries. The main + difference compared to XPath is the ``{namespace}tagname`` notation used in + ``findall()``, which is not valid XPath. + + ElementTree and lxml.etree use the same implementation, which assures 100% + compatibility. Note that ``findall()`` is `so fast`_ in lxml that a native + implementation would not bring any performance benefits. + + .. _`so fast`: performance.html#tree-traversal + + +5) Why is my application so slow? + + lxml.etree is a very fast library for processing XML. There are, however, + `a few caveats`_ involved in the mapping of the powerful libxml2 library to + the simple and convenient ElementTree API. Not all operations are as fast + as the simplicity of the API might suggest. The `benchmark page`_ has a + comparison to other ElementTree implementations and a number of tips for + performance tweaking. + + .. _`a few caveats`: performance.html#the-elementtree-api + .. 
_`benchmark page`: performance.html + From scoder at codespeak.net Sat May 27 12:18:28 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:18:28 +0200 (CEST) Subject: [Lxml-checkins] r27752 - lxml/trunk/doc Message-ID: <20060527101828.D9C2F100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:18:27 2006 New Revision: 27752 Modified: lxml/trunk/doc/main.txt Log: link to FAQ from main doc page Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sat May 27 12:18:27 2006 @@ -4,14 +4,16 @@ Introduction ------------ -lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. See -the introduction_ for more information about background and goals. +lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. See the +introduction_ for more information about background and goals. Some common +questions are answered in the FAQ_. .. _libxml2: http://xmlsoft.org - .. _libxslt: http://xmlsoft.org/XSLT .. _introduction: intro.html +.. _FAQ: FAQ.html + Download -------- @@ -72,6 +74,7 @@ .. _`browse it through the web`: http://codespeak.net/svn/lxml .. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + Documentation ------------- @@ -108,6 +111,7 @@ .. _`XSLT`: http://www.w3.org/TR/xslt .. _`c14n`: http://www.w3.org/TR/xml-c14n + Mailing list ------------ @@ -115,6 +119,7 @@ .. 
_`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev + License ------- From scoder at codespeak.net Sat May 27 12:25:00 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:25:00 +0200 (CEST) Subject: [Lxml-checkins] r27753 - lxml/trunk/doc Message-ID: <20060527102500.4E0B7100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:24:58 2006 New Revision: 27753 Modified: lxml/trunk/doc/FAQ.txt Log: small updates in FAQ.txt Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sat May 27 12:24:58 2006 @@ -29,7 +29,7 @@ .. _`the web page`: http://codespeak.net/lxml/#documentation -3) Why are there ``findall()`` and ``xpath()`` methods on Element(Tree)? +3) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? ``findall()`` is specified in the `ElementTree API`_. It supports a `simple subset of the XPath language`_, without predicates, conditions and @@ -43,14 +43,15 @@ ``xpath()``, on the other hand, supports the complete power of the XPath language, including predicates, XPath functions and Python extension functions. The syntax is defined by the `XPath specification`_. If you - need the expressiveness and selectivity of XPath, the xpath method, the - ``XPath`` class and the ``XPathEvaluator`` are the best choice. + need the expressiveness and selectivity of XPath, the ``xpath()`` method, + the ``XPath`` class and the ``XPathEvaluator`` are the best choice_. .. _`simple subset of the XPath language`: http://effbot.org/zone/element-xpath.htm .. _`XPath specification`: http://www.w3.org/TR/xpath + .. _choice: performance.html#xpath -4) Why doesn't ``findall()`` support XPath expressions? +4) Why doesn't ``findall()`` support full XPath expressions? It was decided that it is more important to keep compatibility with ElementTree_ to simplify code migration between the libraries. 
The main From scoder at codespeak.net Sat May 27 12:26:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:26:13 +0200 (CEST) Subject: [Lxml-checkins] r27754 - lxml/trunk/doc Message-ID: <20060527102613.64A37100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:26:12 2006 New Revision: 27754 Modified: lxml/trunk/doc/FAQ.txt Log: small updates in FAQ.txt Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sat May 27 12:26:12 2006 @@ -31,7 +31,7 @@ 3) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? - ``findall()`` is specified in the `ElementTree API`_. It supports a + ``findall()`` is part of the original `ElementTree API`_. It supports a `simple subset of the XPath language`_, without predicates, conditions and other advanced features. It is very handy for finding specific tags in a tree. Another important difference is namespace handling, which uses the From scoder at codespeak.net Sat May 27 12:27:50 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:27:50 +0200 (CEST) Subject: [Lxml-checkins] r27755 - lxml/trunk/doc Message-ID: <20060527102750.22A8A100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:27:48 2006 New Revision: 27755 Modified: lxml/trunk/doc/performance.txt Log: typo Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Sat May 27 12:27:48 2006 @@ -131,7 +131,7 @@ cET: append_from_document (-- T3,T4) 0.0227 msec/pass ET : append_from_document (-- T3,T4) 0.1563 msec/pass -Although this are fairly small numbers compared to parsing, this easily shows +Although these are fairly small numbers compared to parsing, this easily shows the different performance classes for lxml and 
(c)ET. Where the latter do not have to care about parent pointers and tree structures, lxml has to deep traverse the appended tree. The performance difference therefore increases From scoder at codespeak.net Sat May 27 12:30:55 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 12:30:55 +0200 (CEST) Subject: [Lxml-checkins] r27756 - lxml/trunk/doc Message-ID: <20060527103055.2F59D100B3@code0.codespeak.net> Author: scoder Date: Sat May 27 12:30:53 2006 New Revision: 27756 Modified: lxml/trunk/doc/performance.txt Log: state Python version in performance.txt Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Sat May 27 12:30:53 2006 @@ -12,8 +12,8 @@ .. _cElementTree: http://effbot.org/zone/celementtree.htm The statements made here are backed by the benchmark script `bench.py`_ that -comes with the lxml source distribution. The numbers that are cited below -compare lxml 1.0, ElementTree 1.2.6 and cElementTree 1.0.5. +comes with the lxml source distribution. The timings cited below compare lxml +1.0, ElementTree 1.2.6 and cElementTree 1.0.5 under CPython 2.4.2. .. 
_`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py From scoder at codespeak.net Sat May 27 17:59:42 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 27 May 2006 17:59:42 +0200 (CEST) Subject: [Lxml-checkins] r27769 - in lxml/trunk: doc src/lxml/tests Message-ID: <20060527155942.AD84C100A0@code0.codespeak.net> Author: scoder Date: Sat May 27 17:59:40 2006 New Revision: 27769 Modified: lxml/trunk/doc/FAQ.txt lxml/trunk/src/lxml/tests/test_xslt.py Log: test case and FAQ entry for difference of str(xslt(doc)) and xslt(doc).write() Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sat May 27 17:59:40 2006 @@ -65,7 +65,23 @@ .. _`so fast`: performance.html#tree-traversal -5) Why is my application so slow? +5) What is the difference between str(xslt(doc)) and xslt(doc).write() ? + + The str() implementation knows about the output method chosen in the + stylesheet (xsl:output), write() doesn't. If you call write(), the result + will be a normal XML tree serialization in the requested encoding. Calling + this method may also fail for XSLT results that are not XML trees + (e.g. string results). + + If you call str(), it will return the serialized result as specified by the + XSL transform. This correctly serializes string results to encoded Python + strings and honours ``xsl:output`` options like ``indent``. This almost + certainly does what you want, so you should only use ``write()`` if you are + sure that the XSLT result is an XML tree and you want to override the + encoding and indentation options requested by the stylesheet. + + +6) Why is my application so slow? lxml.etree is a very fast library for processing XML. 
There are, however, `a few caveats`_ involved in the mapping of the powerful libxml2 library to Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Sat May 27 17:59:40 2006 @@ -6,7 +6,7 @@ import unittest, doctest -from common_imports import etree, HelperTestCase, fileInTestDir +from common_imports import etree, StringIO, HelperTestCase, fileInTestDir class ETreeXSLTTestCase(HelperTestCase): """XPath tests etree""" @@ -73,6 +73,29 @@ self.assertEquals(expected, unicode(str(res), 'UTF-16')) + def test_xslt_encoding_override(self): + tree = self.parse(u'\uF8D2\uF8D2') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = st.apply(tree) + expected = u"""\ + +\uF8D2""" + + f = StringIO() + res.write(f, 'UTF-16') + result = unicode(f.getvalue(), 'UTF-16') + self.assertEquals(expected, + result) + def test_xslt_unicode(self): tree = self.parse(u'\uF8D2\uF8D2') style = self.parse('''\ From scoder at codespeak.net Sun May 28 06:48:56 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 06:48:56 +0200 (CEST) Subject: [Lxml-checkins] r27779 - lxml/trunk/doc Message-ID: <20060528044856.AB213100A8@code0.codespeak.net> Author: scoder Date: Sun May 28 06:48:53 2006 New Revision: 27779 Modified: lxml/trunk/doc/FAQ.txt Log: clarification on XSLTResultTree Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun May 28 06:48:53 2006 @@ -67,11 +67,12 @@ 5) What is the difference between str(xslt(doc)) and xslt(doc).write() ? - The str() implementation knows about the output method chosen in the - stylesheet (xsl:output), write() doesn't. If you call write(), the result - will be a normal XML tree serialization in the requested encoding. 
Calling - this method may also fail for XSLT results that are not XML trees - (e.g. string results). + The str() implementation of the XSLTResultTree class (a subclass of + ElementTree) knows about the output method chosen in the stylesheet + (xsl:output), write() doesn't. If you call write(), the result will be a + normal XML tree serialization in the requested encoding. Calling this + method may also fail for XSLT results that are not XML trees (e.g. string + results). If you call str(), it will return the serialized result as specified by the XSL transform. This correctly serializes string results to encoded Python From scoder at codespeak.net Sun May 28 07:20:16 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 07:20:16 +0200 (CEST) Subject: [Lxml-checkins] r27780 - lxml/trunk/doc Message-ID: <20060528052016.3ACFF100AB@code0.codespeak.net> Author: scoder Date: Sun May 28 07:20:14 2006 New Revision: 27780 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ entry on crashes and bug reporting Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun May 28 07:20:14 2006 @@ -7,7 +7,7 @@ .. _ElementTree: http://effbot.org/zone/element-index.htm -1) Is there a tutorial? +#) Is there a tutorial? There is a `tutorial for ElementTree`_ which also works for lxml.etree. The `API documentation`_ also contains many examples. @@ -16,7 +16,7 @@ .. _`API documentation`: api.html -2) Where can I find more documentation about lxml? +#) Where can I find more documentation about lxml? There is a lot of documentation as lxml implements the well-known `ElementTree API`_ and tries to follow its documentation as closely as @@ -29,7 +29,39 @@ .. _`the web page`: http://codespeak.net/lxml/#documentation -3) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? +#) My application crashes! Why does lxml.etree do that? 
+ + a) If you are using threads, make sure that you are not sharing non + thread-safe objects between threads. Especially the default parser, + XSLT() and the validators are not thread-safe for performance reasons. + You have to create a new one for each thread, use a thread-safe object + pool or assure thread-safe access to them yourself. + + b) One of the goals of lxml is "no segfaults", so if there is no clear + warning in the documentation that you were doing something potentially + harmful, you have found a bug and we would like to hear about it. + Please report this bug to the mailing list. See the next section on how + to do that. + + +#) I think I have found a bug in lxml. What should I do? + + a) First, you should look at the `current developer changelog`_ to see if + this is a known problem that has already been fixed in the SVN trunk. + + .. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + + b) Otherwise, we would really like to hear about it. Please report it to + the `mailing list`_ so that we can fix it. It is very helpful in this + case if you can come up with a short code snippet that demonstrates your + problem. Please also report the version of lxml, libxml2 and libxslt + that you are using (see the module attributes ``etree.LXML_VERSION`` + etc.). + + .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev + + +#) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? ``findall()`` is part of the original `ElementTree API`_. It supports a `simple subset of the XPath language`_, without predicates, conditions and @@ -51,7 +83,7 @@ .. _choice: performance.html#xpath -4) Why doesn't ``findall()`` support full XPath expressions? +#) Why doesn't ``findall()`` support full XPath expressions? It was decided that it is more important to keep compatibility with ElementTree_ to simplify code migration between the libraries. The main @@ -65,7 +97,7 @@ .. 
_`so fast`: performance.html#tree-traversal -5) What is the difference between str(xslt(doc)) and xslt(doc).write() ? +#) What is the difference between str(xslt(doc)) and xslt(doc).write() ? The str() implementation of the XSLTResultTree class (a subclass of ElementTree) knows about the output method chosen in the stylesheet @@ -82,7 +114,7 @@ encoding and indentation options requested by the stylesheet. -6) Why is my application so slow? +#) Why is my application so slow? lxml.etree is a very fast library for processing XML. There are, however, `a few caveats`_ involved in the mapping of the powerful libxml2 library to From scoder at codespeak.net Sun May 28 09:32:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 09:32:13 +0200 (CEST) Subject: [Lxml-checkins] r27781 - lxml/trunk/doc Message-ID: <20060528073213.5CDBA100AC@code0.codespeak.net> Author: scoder Date: Sun May 28 09:32:12 2006 New Revision: 27781 Modified: lxml/trunk/doc/api.txt Log: cleaner test case Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Sun May 28 09:32:12 2006 @@ -79,8 +79,8 @@ >>> xml_header = '' >>> xhtml = xml_header + doctype_string + '' - >>> et = etree.parse(StringIO(xhtml)) - >>> docinfo = et.docinfo + >>> tree = etree.parse(StringIO(xhtml)) + >>> docinfo = tree.docinfo >>> print docinfo.public_id -//W3C//DTD XHTML 1.0 Transitional//EN >>> print docinfo.system_url From scoder at codespeak.net Sun May 28 09:32:58 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 09:32:58 +0200 (CEST) Subject: [Lxml-checkins] r27782 - lxml/trunk/src/lxml/tests Message-ID: <20060528073258.1BCD9100AD@code0.codespeak.net> Author: scoder Date: Sun May 28 09:32:55 2006 New Revision: 27782 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: new parser test case that fails in libxml2 <= 2.6.22 Modified: 
lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sun May 28 09:32:55 2006 @@ -1699,7 +1699,14 @@ '', tree.getroot() ) - + + def test_parse_with_encoding(self): + # this can fail in libxml2 <= 2.6.22 + parse = self.etree.parse + tree = parse(StringIO('')) + self.assertXML('', + tree.getroot()) + def test_encoding(self): Element = self.etree.Element From scoder at codespeak.net Sun May 28 10:27:59 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 10:27:59 +0200 (CEST) Subject: [Lxml-checkins] r27783 - in lxml/trunk: . src/lxml Message-ID: <20060528082759.CF840100B5@code0.codespeak.net> Author: scoder Date: Sun May 28 10:27:57 2006 New Revision: 27783 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi Log: fix parsing strings with encoding declaration under libxml2 <= 2.6.22 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun May 28 10:27:57 2006 @@ -20,6 +20,9 @@ Bugs fixed ---------- +* On libxml2 <= 2.6.22, parsing strings with encoding declaration could fail + in certain cases + * Document reference in ElementTree objects was not updated when the root element was moved to a different document Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sun May 28 10:27:57 2006 @@ -295,12 +295,11 @@ cdef Py_ssize_t py_buffer_len cdef int buffer_len cdef char* c_text - cdef char* c_encoding - cdef int enc py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext) if py_buffer_len > python.INT_MAX: text_utf = _utf8(utext) - return self._parseDoc(text_utf, c_filename) + 
py_buffer_len = python.PyString_GET_SIZE(text_utf) + return self._parseDoc(_cstr(text_utf), py_buffer_len, c_filename) buffer_len = py_buffer_len self._error_log.connect() @@ -321,22 +320,26 @@ recover = self._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(pctxt, result, NULL, recover) - cdef xmlDoc* _parseDoc(self, char* c_text, char* c_filename) except NULL: + cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len, + char* c_filename) except NULL: """Parse document, share dictionary if possible. """ cdef xmlDoc* result cdef xmlParserCtxt* pctxt cdef int recover + if c_len > python.INT_MAX: + raise ParserError, "string is too long to parse it with libxml2" + self._error_log.connect() pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT._initParserDict(pctxt) if self._parser_type == LXML_HTML_PARSER: - result = htmlparser.htmlCtxtReadDoc( - pctxt, c_text, c_filename, NULL, self._parse_options) + result = htmlparser.htmlCtxtReadMemory( + pctxt, c_text, c_len, c_filename, NULL, self._parse_options) else: - result = xmlparser.xmlCtxtReadDoc( - pctxt, c_text, c_filename, NULL, self._parse_options) + result = xmlparser.xmlCtxtReadMemory( + pctxt, c_text, c_len, c_filename, NULL, self._parse_options) self._error_log.disconnect() recover = self._parse_options & xmlparser.XML_PARSE_RECOVER @@ -589,6 +592,8 @@ cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL: cdef char* c_filename + cdef char* c_text + cdef Py_ssize_t c_len if parser is None: parser = __DEFAULT_PARSER __GLOBAL_PARSER_CONTEXT._initParser() @@ -599,7 +604,9 @@ if python.PyUnicode_Check(text): return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename) else: - return (<_BaseParser>parser)._parseDoc(_cstr(text), c_filename) + c_text = _cstr(text) + c_len = python.PyString_GET_SIZE(text) + return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename) cdef xmlDoc* _parseDocFromFile(filename, _BaseParser parser) except NULL: if parser is None: From scoder at 
codespeak.net Sun May 28 19:53:31 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 19:53:31 +0200 (CEST) Subject: [Lxml-checkins] r27795 - in lxml/trunk: . doc src/lxml Message-ID: <20060528175331.69BD310036@code0.codespeak.net> Author: scoder Date: Sun May 28 19:53:28 2006 New Revision: 27795 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/xmlerror.pxi Log: make logging to Python stdlib logging package work Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun May 28 19:53:28 2006 @@ -7,6 +7,8 @@ Features added -------------- +* PyErrorLog for error logging through the Python ``logging`` module + * ``element.getroottree()`` returns an ElementTree for the root node of the document that contains the element. Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Sun May 28 19:53:28 2006 @@ -123,12 +123,20 @@ PARSER ERR_TAG_NOT_FINISHED There is also a convenience attribute ``last_error`` that returns the last -error:: +error or fatal error that occurred:: >>> entry = e.error_log.last_error >>> print entry.domain_name, entry.type_name, entry.filename PARSER ERR_TAG_NOT_FINISHED +Alternatively, lxml.etree supports logging libxml2 messages to the Python +stdlib logging module. This is done through the ``etree.PyErrorLog`` class. +It disables the error reporting from exceptions and forwards log messages to a +Python logger. To use it, see the descriptions of the function +``etree.useGlobalPythonLog`` and the class ``etree.PyErrorLog`` for help. +Note that this does not affect the local error logs of XSLT, XMLSchema, +etc. which are described in their respective sections below. 
+ Python unicode strings ---------------------- @@ -462,7 +470,7 @@ it for relevant messages:: >>> log = relaxng.error_log - >>> print log.filter_from_errors() + >>> print log.last_error <string>:1:ERROR:RELAXNGV:ERR_LT_IN_ATTRIBUTE: Did not expect element c there You can see that the error (ERROR) happened during RelaxNG validation @@ -541,13 +549,15 @@ Error reporting works like for the RelaxNG class:: >>> log = xmlschema.error_log - >>> errors = log.filter_from_errors() - >>> print errors[0].domain_name + >>> error = log.last_error + >>> print error.domain_name SCHEMASV - >>> print errors[0].type_name + >>> print error.type_name SCHEMAV_ELEMENT_CONTENT -If you were to print this log entry, you would get something like the following:: +If you were to print this log entry, you would get something like the +following. Note that the error message depends on the libxml2 version in +use:: <string>:1:ERROR::SCHEMAV_ELEMENT_CONTENT: Element 'c': This element is not expected. Expected is ( b ). Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sun May 28 19:53:28 2006 @@ -30,7 +30,7 @@ cdef readonly object level cdef readonly object message cdef readonly object filename - cdef _set(self, xmlerror.xmlError* error): + cdef _setError(self, xmlerror.xmlError* error): self.domain = error.domain self.type = error.code self.level = error.level @@ -52,14 +52,9 @@ self.filename = filename def __repr__(self): - if self.filename: - return "%s:%d:%s:%s:%s: %s" % ( - self.filename, self.line, self.level_name, - self.domain_name, self.type_name, self.message) - else: - return "[]:%s:%s:%s: %s" % ( - self.level_name, self.domain_name, - self.type_name, self.message) + return "%s:%d:%s:%s:%s: %s" % ( + self.filename, self.line, self.level_name, + self.domain_name, self.type_name, self.message) property domain_name: def __get__(self): @@ -74,15 
+69,55 @@ return ErrorLevels._names[self.level] cdef class _BaseErrorLog: - "Immutable base version of an error log." - cdef object _entries cdef readonly object last_error + def __init__(self, last_error=None): + self.last_error = last_error + + def copy(self): + return _BaseErrorLog(self.last_error) + + def __repr__(self): + return '' + + cdef void _receive(self, xmlerror.xmlError* error): + cdef int is_error + cdef _LogEntry entry + entry = _LogEntry() + entry._setError(error) + is_error = error.level == xmlerror.XML_ERR_ERROR or \ + error.level == xmlerror.XML_ERR_FATAL + if __GLOBAL_ERROR_LOG is not self: + __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry + self.receive(entry) + if is_error: + self.last_error = entry + + cdef void _receiveGeneric(self, int domain, int type, int level, int line, + message, filename): + cdef _LogEntry entry + entry = _LogEntry() + entry._setGeneric(domain, type, level, line, message, filename) + is_error = level == xmlerror.XML_ERR_ERROR or \ + level == xmlerror.XML_ERR_FATAL + if __GLOBAL_ERROR_LOG is not self: + __GLOBAL_ERROR_LOG.receive(entry) + if is_error: + __GLOBAL_ERROR_LOG.last_error = entry + self.receive(entry) + if is_error: + self.last_error = entry + +cdef class _ListErrorLog(_BaseErrorLog): + "Immutable base version of a list based error log." 
+ cdef object _entries def __init__(self, entries, last_error=None): + _BaseErrorLog.__init__(self, last_error) self._entries = entries - self.last_error = last_error def copy(self): - return _BaseErrorLog(self._entries, self.last_error) + return _ListErrorLog(self._entries, self.last_error) def __iter__(self): return iter(self._entries) @@ -104,7 +139,7 @@ for entry in self._entries: if entry.domain in domains: python.PyList_Append(filtered, entry) - return _BaseErrorLog(filtered) + return _ListErrorLog(filtered) def filter_types(self, types): cdef _LogEntry entry @@ -114,7 +149,7 @@ for entry in self._entries: if entry.type in types: python.PyList_Append(filtered, entry) - return _BaseErrorLog(filtered) + return _ListErrorLog(filtered) def filter_levels(self, levels): """Return a log with all messages of the requested level(s). Takes a @@ -126,7 +161,7 @@ for entry in self._entries: if entry.level in levels: python.PyList_Append(filtered, entry) - return _BaseErrorLog(filtered) + return _ListErrorLog(filtered) def filter_from_level(self, level): "Return a log with all messages of the requested level of worse." @@ -135,7 +170,7 @@ for entry in self._entries: if entry.level >= level: python.PyList_Append(filtered, entry) - return _BaseErrorLog(filtered) + return _ListErrorLog(filtered) def filter_from_fatals(self): "Convenience method to get all fatal error messages." @@ -149,7 +184,10 @@ "Convenience method to get all warnings or worse." 
return self.filter_from_level(ErrorLevels.WARNING) -cdef class _ExtensibleErrorLog(_BaseErrorLog): +cdef class _ErrorLog(_ListErrorLog): + def __init__(self): + _ListErrorLog.__init__(self, []) + cdef void connect(self): del self._entries[:] xmlerror.xmlSetStructuredErrorFunc(self, _receiveError) @@ -157,45 +195,11 @@ cdef void disconnect(self): xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) - cdef void _receive(self, xmlerror.xmlError* error): - cdef int is_error - cdef _LogEntry entry - entry = _LogEntry() - entry._set(error) - is_error = error.level == xmlerror.XML_ERR_ERROR or \ - error.level == xmlerror.XML_ERR_FATAL - if __GLOBAL_ERROR_LOG is not self: - __GLOBAL_ERROR_LOG.receive(entry) - if is_error: - __GLOBAL_ERROR_LOG.last_error = entry - self.receive(entry) - if is_error: - self.last_error = entry - - cdef void _receiveGeneric(self, int domain, int type, int level, int line, - message, filename): - cdef _LogEntry entry - entry = _LogEntry() - entry._setGeneric(domain, type, level, line, message, filename) - is_error = level == xmlerror.XML_ERR_ERROR or \ - level == xmlerror.XML_ERR_FATAL - if __GLOBAL_ERROR_LOG is not self: - __GLOBAL_ERROR_LOG.receive(entry) - if is_error: - __GLOBAL_ERROR_LOG.last_error = entry - self.receive(entry) - if is_error: - self.last_error = entry - -cdef class _ErrorLog(_ExtensibleErrorLog): - def __init__(self): - _ExtensibleErrorLog.__init__(self, []) - def clear(self): del self._entries[:] def copy(self): - return _BaseErrorLog(self._entries[:], self.last_error) + return _ListErrorLog(self._entries[:], self.last_error) def __iter__(self): return iter(self._entries[:]) @@ -224,44 +228,72 @@ del entries[0] python.PyList_Append(entries, entry) -cdef class PyErrorLog(_ExtensibleErrorLog): +cdef class PyErrorLog(_BaseErrorLog): + """A global error log that connects to the Python stdlib logging package. + + The constructor accepts an optional logger name. 
+ + If you want to change the mapping between libxml2's ErrorLevels and Python + logging levels, you can modify the level_map dictionary from a subclass. + + The default mapping is:: + + ErrorLevels.WARNING = logging.WARNING + ErrorLevels.ERROR = logging.ERROR + ErrorLevels.FATAL = logging.CRITICAL + + You can also override the method ``receive()`` that takes a LogEntry + object and calls ``self.log(log_entry, format_string, arg1, arg2, ...)`` + with appropriate data. + """ + cdef public object level_map cdef object _log - cdef object _level_map - cdef object _varsOf def __init__(self, logger_name=None): - _ExtensibleErrorLog.__init__(self, []) + _BaseErrorLog.__init__(self) import logging - self._level_map = { + self.level_map = { ErrorLevels.WARNING : logging.WARNING, ErrorLevels.ERROR : logging.ERROR, ErrorLevels.FATAL : logging.CRITICAL } - self._varsOf = vars if logger_name: - logger = logging.getLogger(name) + logger = logging.getLogger(logger_name) else: logger = logging.getLogger() self._log = logger.log def copy(self): - return self + return _ListErrorLog([]) - def receive(self, entry): - py_level = self._level_map[entry.level] + def log(self, entry, message_format_string, *args): self._log( - py_level, - "%(asctime)s %(levelname)s %(domain_name)s %(message)s", - self._varsOf(entry) + self.level_map.get(entry.level, 0), + message_format_string, *args ) -# global list to collect error output messages from libxml2/libxslt -cdef _RotatingErrorLog __GLOBAL_ERROR_LOG + def receive(self, entry): + self.log(entry, entry) + +# global list log to collect error output messages from libxml2/libxslt +cdef _BaseErrorLog __GLOBAL_ERROR_LOG __GLOBAL_ERROR_LOG = _RotatingErrorLog(__MAX_LOG_SIZE) -def __copyGlobalErrorLog(): +cdef __copyGlobalErrorLog(): "Helper function for properties in exceptions." 
return __GLOBAL_ERROR_LOG.copy() +def useGlobalPythonLog(PyErrorLog log not None): + """Replace the global error log by an etree.PyErrorLog that uses the + standard Python logging package. + + Note that this slows down processing and disables access to the global + error log from exceptions. Parsers, XSLT etc. will continue to provide + their normal local error log. + """ + global __GLOBAL_ERROR_LOG + __GLOBAL_ERROR_LOG = log + + # local log function: forward error to logger object cdef void _receiveError(void* c_log_handler, xmlerror.xmlError* error): cdef _ErrorLog log_handler From scoder at codespeak.net Sun May 28 19:55:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 19:55:08 +0200 (CEST) Subject: [Lxml-checkins] r27796 - lxml/trunk/src/lxml Message-ID: <20060528175508.759AF10036@code0.codespeak.net> Author: scoder Date: Sun May 28 19:55:07 2006 New Revision: 27796 Modified: lxml/trunk/src/lxml/etree.pyx Log: cleanup: reuse Pyrex initialized None values Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sun May 28 19:55:07 2006 @@ -121,9 +121,6 @@ # class for temporarily storing exceptions raised in extensions cdef class _ExceptionContext: cdef object _exc_info - def __init__(self): - self._exc_info = None - cdef void clear(self): self._exc_info = None @@ -1207,9 +1204,7 @@ else: c_node = _findChildForwards(node._c_node, 0) self._next_element = _nextElement - if c_node is NULL: - self._node = None - else: + if c_node is not NULL: self._node = _elementFactory(node._doc, c_node) def __iter__(self): return self From scoder at codespeak.net Sun May 28 20:15:38 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 20:15:38 +0200 (CEST) Subject: [Lxml-checkins] r27799 - lxml/trunk/doc Message-ID: <20060528181538.EE04C1006B@code0.codespeak.net> Author: 
scoder Date: Sun May 28 20:15:32 2006 New Revision: 27799 Modified: lxml/trunk/doc/api.txt Log: whitespace Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Sun May 28 20:15:32 2006 @@ -304,6 +304,7 @@ >>> tree.xpath(tree.getpath(d2)) == [d2] True + XSLT ---- From scoder at codespeak.net Sun May 28 20:16:09 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 28 May 2006 20:16:09 +0200 (CEST) Subject: [Lxml-checkins] r27800 - lxml/trunk/doc Message-ID: <20060528181609.7F7A21006B@code0.codespeak.net> Author: scoder Date: Sun May 28 20:16:07 2006 New Revision: 27800 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ entry on threading Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun May 28 20:16:07 2006 @@ -61,6 +61,26 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +#) Can I use threads to concurrently access the lxml API? + + You should be able to use lxml in a multi-threaded environment, although + this is not very well tested. Note that lxml does not provide any + thread-safety by itself (mainly for performance reasons), so you have to + take care when you use parts of the API concurrently. Most importantly, + you must not forget to call ``etree.initThread()`` from each newly + generated thread to initialize lxml and libxml2 for the new thread context. + If you call API functions from a thread without having called this function + first, lxml can easily crash your program. + + Basically none of the API classes is thread-safe, including parsers, XPath, + XSLT and the validators. You cannot use such an object concurrently. + However, it is perfectly viable to create independent instances for each + thread. 
This is a cheap thing to do for parsers, but more expensive for + XSLT and validators, which have to compile trees recursively. So you might + want to consider a thread pool approach or threaded processing chains to + reduce the overhead if you require threading here. + + #) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? ``findall()`` is part of the original `ElementTree API`_. It supports a From scoder at codespeak.net Mon May 29 10:48:30 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 10:48:30 +0200 (CEST) Subject: [Lxml-checkins] r27824 - in lxml/trunk: . src/lxml Message-ID: <20060529084830.F020E10053@code0.codespeak.net> Author: scoder Date: Mon May 29 10:48:24 2006 New Revision: 27824 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/xmlerror.pxd lxml/trunk/src/lxml/xmlerror.pxi Log: made error name lookup more robust, updating constants from xmlerror.h is easier now, also compiles faster Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 29 10:48:24 2006 @@ -22,6 +22,11 @@ Bugs fixed ---------- +* Unknown constants from newer libxml2 versions could raise exceptions in the + error handlers + +* lxml.etree compiles much faster + * On libxml2 <= 2.6.22, parsing strings with encoding declaration could fail in certain cases Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Mon May 29 10:48:24 2006 @@ -50,6 +50,8 @@ cdef int PySequence_Check(object instance) cdef int PyType_Check(object instance) + cdef int PyObject_SetAttr(object o, object name, object value) + cdef void* PyMem_Malloc(size_t size) cdef void PyMem_Free(void* p) Modified: lxml/trunk/src/lxml/xmlerror.pxd 
============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxd (original) +++ lxml/trunk/src/lxml/xmlerror.pxd Mon May 29 10:48:24 2006 @@ -47,12 +47,12 @@ XML_FROM_CHECK = 24 # The error checking module XML_FROM_WRITER = 25 # The xmlwriter module XML_FROM_MODULE = 26 # The dynamically loaded module modu - + XML_FROM_I18N = 27 # The module handling character conversion ctypedef enum xmlParserErrors: XML_ERR_OK = 0 - XML_ERR_INTERNAL_ERROR = 1 - XML_ERR_NO_MEMORY = 2 + XML_ERR_INTERNAL_ERROR = 1 # 1 + XML_ERR_NO_MEMORY = 2 # 2 XML_ERR_DOCUMENT_START = 3 # 3 XML_ERR_DOCUMENT_EMPTY = 4 # 4 XML_ERR_DOCUMENT_END = 5 # 5 @@ -152,10 +152,17 @@ XML_WAR_NS_URI = 99 # 99 XML_WAR_NS_URI_RELATIVE = 100 # 100 XML_ERR_MISSING_ENCODING = 101 # 101 + XML_WAR_SPACE_VALUE = 102 # 102 + XML_ERR_NOT_STANDALONE = 103 # 103 + XML_ERR_ENTITY_PROCESSING = 104 # 104 + XML_ERR_NOTATION_PROCESSING = 105 # 105 + XML_WAR_NS_COLUMN = 106 # 106 + XML_WAR_ENTITY_REDEFINED = 107 # 107 XML_NS_ERR_XML_NAMESPACE = 200 XML_NS_ERR_UNDEFINED_NAMESPACE = 201 # 201 XML_NS_ERR_QNAME = 202 # 202 XML_NS_ERR_ATTRIBUTE_REDEFINED = 203 # 203 + XML_NS_ERR_EMPTY = 204 # 204 XML_DTD_ATTRIBUTE_DEFAULT = 500 XML_DTD_ATTRIBUTE_REDEFINED = 501 # 501 XML_DTD_ATTRIBUTE_VALUE = 502 # 502 @@ -610,6 +617,8 @@ XML_SCHEMAV_CVC_AU = 1874 # 1874 XML_SCHEMAV_CVC_TYPE_1 = 1875 # 1875 XML_SCHEMAV_CVC_TYPE_2 = 1876 # 1876 + XML_SCHEMAV_CVC_IDC = 1877 # 1877 + XML_SCHEMAV_CVC_WILDCARD = 1878 # 1878 XML_XPTR_UNKNOWN_SCHEME = 1900 XML_XPTR_CHILDSEQ_START = 1901 # 1901 XML_XPTR_EVAL_FAILED = 1902 # 1902 @@ -618,9 +627,12 @@ XML_C14N_REQUIRES_UTF8 = 1951 # 1951 XML_C14N_CREATE_STACK = 1952 # 1952 XML_C14N_INVALID_NODE = 1953 # 1953 + XML_C14N_UNKNOW_NODE = 1954 # 1954 + XML_C14N_RELATIVE_NAMESPACE = 1955 # 1955 XML_FTP_PASV_ANSWER = 2000 XML_FTP_EPSV_ANSWER = 2001 # 2001 XML_FTP_ACCNT = 2002 # 2002 + XML_FTP_URL_SYNTAX = 2003 # 2003 XML_HTTP_URL_SYNTAX = 2020 XML_HTTP_USE_IP = 2021 # 
2021 XML_HTTP_UNKNOWN_HOST = 2022 # 2022 @@ -704,6 +716,18 @@ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077 # 3077 XML_SCHEMAP_AU_PROPS_CORRECT_2 = 3078 # 3078 XML_SCHEMAP_A_PROPS_CORRECT_2 = 3079 # 3079 + XML_SCHEMAP_C_PROPS_CORRECT = 3080 # 3080 + XML_SCHEMAP_SRC_REDEFINE = 3081 # 3081 + XML_SCHEMAP_SRC_IMPORT = 3082 # 3082 + XML_SCHEMAP_WARN_SKIP_SCHEMA = 3083 # 3083 + XML_SCHEMAP_WARN_UNLOCATED_SCHEMA = 3084 # 3084 + XML_SCHEMAP_WARN_ATTR_REDECL_PROH = 3085 # 3085 + XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH = 3086 # 3085 + XML_SCHEMAP_AG_PROPS_CORRECT = 3087 # 3086 + XML_SCHEMAP_COS_CT_EXTENDS_1_2 = 3088 # 3087 + XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088 + XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089 + XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090 XML_MODULE_OPEN = 4900 # 4900 XML_MODULE_CLOSE = 4901 # 4901 XML_CHECK_FOUND_ELEMENT = 5000 @@ -744,6 +768,10 @@ XML_CHECK_OUTSIDE_DICT = 5035 # 5035 XML_CHECK_WRONG_NAME = 5036 # 5036 XML_CHECK_NAME_NOT_NULL = 5037 # 5037 - XML_CHECK_ = 5038 # 5033 - XML_CHECK_X = 5039 # 503 - + XML_I18N_NO_NAME = 6000 + XML_I18N_NO_HANDLER = 6001 # 6001 + XML_I18N_EXCESS_HANDLER = 6002 # 6002 + XML_I18N_CONV_FAILED = 6003 # 6003 + XML_I18N_NO_OUTPUT = 6004 # 6004 + XML_CHECK_ = 6005 # 5033 + XML_CHECK_X = 6006 # 503 Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Mon May 29 10:48:24 2006 @@ -9,6 +9,8 @@ Note that this log is already bounded to a fixed size.""" __GLOBAL_ERROR_LOG.clear() +# setup functions + cdef void _initThreadLogging(): "Setup logging for the current thread. Called from etree.initThread()." 
# switch on line number reporting @@ -58,15 +60,15 @@ property domain_name: def __get__(self): - return ErrorDomains._names[self.domain] + return ErrorDomains._getName(self.domain, "unknown") property type_name: def __get__(self): - return ErrorTypes._names[self.type] + return ErrorTypes._getName(self.type, "unknown") property level_name: def __get__(self): - return ErrorLevels._names[self.level] + return ErrorLevels._getName(self.level, "unknown") cdef class _BaseErrorLog: cdef readonly object last_error @@ -374,757 +376,801 @@ xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) ################################################################################ -## CONSTANTS FROM "xmlerror.pxd" +## CONSTANTS FROM "xmlerror.h" ################################################################################ +cdef void __initErrorConstants(): + find_constants = re.compile(r"\s*([a-zA-Z0-9_]+)\s*=\s*([0-9]+)").findall + for cls, constants, prefix in [(ErrorLevels, __ERROR_LEVELS, 'XML_ERR_'), + (ErrorDomains, __ERROR_DOMAINS, 'XML_FROM_'), + (ErrorTypes, __ERROR_TYPES, 'XML_')]: + prefix_len = len(prefix) + reverse_dict = {} + cls._names = reverse_dict + cls._getName = reverse_dict.get + for name, value in find_constants(constants): + if name[:prefix_len] == prefix and len(name) > prefix_len: + name = name[prefix_len:] + value = int(value) + python.PyObject_SetAttr(cls, name, value) + python.PyDict_SetItem(reverse_dict, value, name) + class ErrorLevels: "Libxml2 error levels" - _names = {} - NONE = 0 - WARNING = 1 # A simple warning - ERROR = 2 # A recoverable error - FATAL = 3 # A fatal error + +cdef object __ERROR_LEVELS +__ERROR_LEVELS = """ + XML_ERR_NONE = 0 + XML_ERR_WARNING = 1 : A simple warning + XML_ERR_ERROR = 2 : A recoverable error + XML_ERR_FATAL = 3 : A fatal error +""" class ErrorDomains: "Libxml2 error domains" - _names = {} - NONE = 0 - PARSER = 1 # The XML parser - TREE = 2 # The tree module - NAMESPACE = 3 # The XML Namespace module - DTD = 4 # 
The XML DTD validation with parser contex - HTML = 5 # The HTML parser - MEMORY = 6 # The memory allocator - OUTPUT = 7 # The serialization code - IO = 8 # The Input/Output stack - FTP = 9 # The FTP module - HTTP = 10 # The FTP module - XINCLUDE = 11 # The XInclude processing - XPATH = 12 # The XPath module - XPOINTER = 13 # The XPointer module - REGEXP = 14 # The regular expressions module - DATATYPE = 15 # The W3C XML Schemas Datatype module - SCHEMASP = 16 # The W3C XML Schemas parser module - SCHEMASV = 17 # The W3C XML Schemas validation module - RELAXNGP = 18 # The Relax-NG parser module - RELAXNGV = 19 # The Relax-NG validator module - CATALOG = 20 # The Catalog module - C14N = 21 # The Canonicalization module - XSLT = 22 # The XSLT engine from libxslt - VALID = 23 # The XML DTD validation with valid context - CHECK = 24 # The error checking module - WRITER = 25 # The xmlwriter module - MODULE = 26 # The dynamically loaded module modu + +cdef object __ERROR_DOMAINS +__ERROR_DOMAINS = """ + XML_FROM_NONE = 0 + XML_FROM_PARSER = 1 : The XML parser + XML_FROM_TREE = 2 : The tree module + XML_FROM_NAMESPACE = 3 : The XML Namespace module + XML_FROM_DTD = 4 : The XML DTD validation with parser contex + XML_FROM_HTML = 5 : The HTML parser + XML_FROM_MEMORY = 6 : The memory allocator + XML_FROM_OUTPUT = 7 : The serialization code + XML_FROM_IO = 8 : The Input/Output stack + XML_FROM_FTP = 9 : The FTP module + XML_FROM_HTTP = 10 : The HTTP module + XML_FROM_XINCLUDE = 11 : The XInclude processing + XML_FROM_XPATH = 12 : The XPath module + XML_FROM_XPOINTER = 13 : The XPointer module + XML_FROM_REGEXP = 14 : The regular expressions module + XML_FROM_DATATYPE = 15 : The W3C XML Schemas Datatype module + XML_FROM_SCHEMASP = 16 : The W3C XML Schemas parser module + XML_FROM_SCHEMASV = 17 : The W3C XML Schemas validation module + XML_FROM_RELAXNGP = 18 : The Relax-NG parser module + XML_FROM_RELAXNGV = 19 : The Relax-NG validator module + XML_FROM_CATALOG = 20 : The 
Catalog module + XML_FROM_C14N = 21 : The Canonicalization module + XML_FROM_XSLT = 22 : The XSLT engine from libxslt + XML_FROM_VALID = 23 : The XML DTD validation with valid context + XML_FROM_CHECK = 24 : The error checking module + XML_FROM_WRITER = 25 : The xmlwriter module + XML_FROM_MODULE = 26 : The dynamically loaded module modul + XML_FROM_I18N = 27 : The module handling character conversion +""" class ErrorTypes: "Libxml2 error types" - _names = {} - ERR_OK = 0 - ERR_INTERNAL_ERROR = 1 - ERR_NO_MEMORY = 2 - ERR_DOCUMENT_START = 3 # 3 - ERR_DOCUMENT_EMPTY = 4 # 4 - ERR_DOCUMENT_END = 5 # 5 - ERR_INVALID_HEX_CHARREF = 6 # 6 - ERR_INVALID_DEC_CHARREF = 7 # 7 - ERR_INVALID_CHARREF = 8 # 8 - ERR_INVALID_CHAR = 9 # 9 - ERR_CHARREF_AT_EOF = 10 # 10 - ERR_CHARREF_IN_PROLOG = 11 # 11 - ERR_CHARREF_IN_EPILOG = 12 # 12 - ERR_CHARREF_IN_DTD = 13 # 13 - ERR_ENTITYREF_AT_EOF = 14 # 14 - ERR_ENTITYREF_IN_PROLOG = 15 # 15 - ERR_ENTITYREF_IN_EPILOG = 16 # 16 - ERR_ENTITYREF_IN_DTD = 17 # 17 - ERR_PEREF_AT_EOF = 18 # 18 - ERR_PEREF_IN_PROLOG = 19 # 19 - ERR_PEREF_IN_EPILOG = 20 # 20 - ERR_PEREF_IN_INT_SUBSET = 21 # 21 - ERR_ENTITYREF_NO_NAME = 22 # 22 - ERR_ENTITYREF_SEMICOL_MISSING = 23 # 23 - ERR_PEREF_NO_NAME = 24 # 24 - ERR_PEREF_SEMICOL_MISSING = 25 # 25 - ERR_UNDECLARED_ENTITY = 26 # 26 - WAR_UNDECLARED_ENTITY = 27 # 27 - ERR_UNPARSED_ENTITY = 28 # 28 - ERR_ENTITY_IS_EXTERNAL = 29 # 29 - ERR_ENTITY_IS_PARAMETER = 30 # 30 - ERR_UNKNOWN_ENCODING = 31 # 31 - ERR_UNSUPPORTED_ENCODING = 32 # 32 - ERR_STRING_NOT_STARTED = 33 # 33 - ERR_STRING_NOT_CLOSED = 34 # 34 - ERR_NS_DECL_ERROR = 35 # 35 - ERR_ENTITY_NOT_STARTED = 36 # 36 - ERR_ENTITY_NOT_FINISHED = 37 # 37 - ERR_LT_IN_ATTRIBUTE = 38 # 38 - ERR_ATTRIBUTE_NOT_STARTED = 39 # 39 - ERR_ATTRIBUTE_NOT_FINISHED = 40 # 40 - ERR_ATTRIBUTE_WITHOUT_VALUE = 41 # 41 - ERR_ATTRIBUTE_REDEFINED = 42 # 42 - ERR_LITERAL_NOT_STARTED = 43 # 43 - ERR_LITERAL_NOT_FINISHED = 44 # 44 - ERR_COMMENT_NOT_FINISHED = 45 # 45 - ERR_PI_NOT_STARTED 
= 46 # 46 - ERR_PI_NOT_FINISHED = 47 # 47 - ERR_NOTATION_NOT_STARTED = 48 # 48 - ERR_NOTATION_NOT_FINISHED = 49 # 49 - ERR_ATTLIST_NOT_STARTED = 50 # 50 - ERR_ATTLIST_NOT_FINISHED = 51 # 51 - ERR_MIXED_NOT_STARTED = 52 # 52 - ERR_MIXED_NOT_FINISHED = 53 # 53 - ERR_ELEMCONTENT_NOT_STARTED = 54 # 54 - ERR_ELEMCONTENT_NOT_FINISHED = 55 # 55 - ERR_XMLDECL_NOT_STARTED = 56 # 56 - ERR_XMLDECL_NOT_FINISHED = 57 # 57 - ERR_CONDSEC_NOT_STARTED = 58 # 58 - ERR_CONDSEC_NOT_FINISHED = 59 # 59 - ERR_EXT_SUBSET_NOT_FINISHED = 60 # 60 - ERR_DOCTYPE_NOT_FINISHED = 61 # 61 - ERR_MISPLACED_CDATA_END = 62 # 62 - ERR_CDATA_NOT_FINISHED = 63 # 63 - ERR_RESERVED_XML_NAME = 64 # 64 - ERR_SPACE_REQUIRED = 65 # 65 - ERR_SEPARATOR_REQUIRED = 66 # 66 - ERR_NMTOKEN_REQUIRED = 67 # 67 - ERR_NAME_REQUIRED = 68 # 68 - ERR_PCDATA_REQUIRED = 69 # 69 - ERR_URI_REQUIRED = 70 # 70 - ERR_PUBID_REQUIRED = 71 # 71 - ERR_LT_REQUIRED = 72 # 72 - ERR_GT_REQUIRED = 73 # 73 - ERR_LTSLASH_REQUIRED = 74 # 74 - ERR_EQUAL_REQUIRED = 75 # 75 - ERR_TAG_NAME_MISMATCH = 76 # 76 - ERR_TAG_NOT_FINISHED = 77 # 77 - ERR_STANDALONE_VALUE = 78 # 78 - ERR_ENCODING_NAME = 79 # 79 - ERR_HYPHEN_IN_COMMENT = 80 # 80 - ERR_INVALID_ENCODING = 81 # 81 - ERR_EXT_ENTITY_STANDALONE = 82 # 82 - ERR_CONDSEC_INVALID = 83 # 83 - ERR_VALUE_REQUIRED = 84 # 84 - ERR_NOT_WELL_BALANCED = 85 # 85 - ERR_EXTRA_CONTENT = 86 # 86 - ERR_ENTITY_CHAR_ERROR = 87 # 87 - ERR_ENTITY_PE_INTERNAL = 88 # 88 - ERR_ENTITY_LOOP = 89 # 89 - ERR_ENTITY_BOUNDARY = 90 # 90 - ERR_INVALID_URI = 91 # 91 - ERR_URI_FRAGMENT = 92 # 92 - WAR_CATALOG_PI = 93 # 93 - ERR_NO_DTD = 94 # 94 - ERR_CONDSEC_INVALID_KEYWORD = 95 # 95 - ERR_VERSION_MISSING = 96 # 96 - WAR_UNKNOWN_VERSION = 97 # 97 - WAR_LANG_VALUE = 98 # 98 - WAR_NS_URI = 99 # 99 - WAR_NS_URI_RELATIVE = 100 # 100 - ERR_MISSING_ENCODING = 101 # 101 - NS_ERR_XML_NAMESPACE = 200 - NS_ERR_UNDEFINED_NAMESPACE = 201 # 201 - NS_ERR_QNAME = 202 # 202 - NS_ERR_ATTRIBUTE_REDEFINED = 203 # 203 - DTD_ATTRIBUTE_DEFAULT = 500 - 
DTD_ATTRIBUTE_REDEFINED = 501 # 501 - DTD_ATTRIBUTE_VALUE = 502 # 502 - DTD_CONTENT_ERROR = 503 # 503 - DTD_CONTENT_MODEL = 504 # 504 - DTD_CONTENT_NOT_DETERMINIST = 505 # 505 - DTD_DIFFERENT_PREFIX = 506 # 506 - DTD_ELEM_DEFAULT_NAMESPACE = 507 # 507 - DTD_ELEM_NAMESPACE = 508 # 508 - DTD_ELEM_REDEFINED = 509 # 509 - DTD_EMPTY_NOTATION = 510 # 510 - DTD_ENTITY_TYPE = 511 # 511 - DTD_ID_FIXED = 512 # 512 - DTD_ID_REDEFINED = 513 # 513 - DTD_ID_SUBSET = 514 # 514 - DTD_INVALID_CHILD = 515 # 515 - DTD_INVALID_DEFAULT = 516 # 516 - DTD_LOAD_ERROR = 517 # 517 - DTD_MISSING_ATTRIBUTE = 518 # 518 - DTD_MIXED_CORRUPT = 519 # 519 - DTD_MULTIPLE_ID = 520 # 520 - DTD_NO_DOC = 521 # 521 - DTD_NO_DTD = 522 # 522 - DTD_NO_ELEM_NAME = 523 # 523 - DTD_NO_PREFIX = 524 # 524 - DTD_NO_ROOT = 525 # 525 - DTD_NOTATION_REDEFINED = 526 # 526 - DTD_NOTATION_VALUE = 527 # 527 - DTD_NOT_EMPTY = 528 # 528 - DTD_NOT_PCDATA = 529 # 529 - DTD_NOT_STANDALONE = 530 # 530 - DTD_ROOT_NAME = 531 # 531 - DTD_STANDALONE_WHITE_SPACE = 532 # 532 - DTD_UNKNOWN_ATTRIBUTE = 533 # 533 - DTD_UNKNOWN_ELEM = 534 # 534 - DTD_UNKNOWN_ENTITY = 535 # 535 - DTD_UNKNOWN_ID = 536 # 536 - DTD_UNKNOWN_NOTATION = 537 # 537 - DTD_STANDALONE_DEFAULTED = 538 # 538 - DTD_XMLID_VALUE = 539 # 539 - DTD_XMLID_TYPE = 540 # 540 - HTML_STRUCURE_ERROR = 800 - HTML_UNKNOWN_TAG = 801 # 801 - RNGP_ANYNAME_ATTR_ANCESTOR = 1000 - RNGP_ATTR_CONFLICT = 1001 # 1001 - RNGP_ATTRIBUTE_CHILDREN = 1002 # 1002 - RNGP_ATTRIBUTE_CONTENT = 1003 # 1003 - RNGP_ATTRIBUTE_EMPTY = 1004 # 1004 - RNGP_ATTRIBUTE_NOOP = 1005 # 1005 - RNGP_CHOICE_CONTENT = 1006 # 1006 - RNGP_CHOICE_EMPTY = 1007 # 1007 - RNGP_CREATE_FAILURE = 1008 # 1008 - RNGP_DATA_CONTENT = 1009 # 1009 - RNGP_DEF_CHOICE_AND_INTERLEAVE = 1010 # 1010 - RNGP_DEFINE_CREATE_FAILED = 1011 # 1011 - RNGP_DEFINE_EMPTY = 1012 # 1012 - RNGP_DEFINE_MISSING = 1013 # 1013 - RNGP_DEFINE_NAME_MISSING = 1014 # 1014 - RNGP_ELEM_CONTENT_EMPTY = 1015 # 1015 - RNGP_ELEM_CONTENT_ERROR = 1016 # 1016 - 
RNGP_ELEMENT_EMPTY = 1017 # 1017 - RNGP_ELEMENT_CONTENT = 1018 # 1018 - RNGP_ELEMENT_NAME = 1019 # 1019 - RNGP_ELEMENT_NO_CONTENT = 1020 # 1020 - RNGP_ELEM_TEXT_CONFLICT = 1021 # 1021 - RNGP_EMPTY = 1022 # 1022 - RNGP_EMPTY_CONSTRUCT = 1023 # 1023 - RNGP_EMPTY_CONTENT = 1024 # 1024 - RNGP_EMPTY_NOT_EMPTY = 1025 # 1025 - RNGP_ERROR_TYPE_LIB = 1026 # 1026 - RNGP_EXCEPT_EMPTY = 1027 # 1027 - RNGP_EXCEPT_MISSING = 1028 # 1028 - RNGP_EXCEPT_MULTIPLE = 1029 # 1029 - RNGP_EXCEPT_NO_CONTENT = 1030 # 1030 - RNGP_EXTERNALREF_EMTPY = 1031 # 1031 - RNGP_EXTERNAL_REF_FAILURE = 1032 # 1032 - RNGP_EXTERNALREF_RECURSE = 1033 # 1033 - RNGP_FORBIDDEN_ATTRIBUTE = 1034 # 1034 - RNGP_FOREIGN_ELEMENT = 1035 # 1035 - RNGP_GRAMMAR_CONTENT = 1036 # 1036 - RNGP_GRAMMAR_EMPTY = 1037 # 1037 - RNGP_GRAMMAR_MISSING = 1038 # 1038 - RNGP_GRAMMAR_NO_START = 1039 # 1039 - RNGP_GROUP_ATTR_CONFLICT = 1040 # 1040 - RNGP_HREF_ERROR = 1041 # 1041 - RNGP_INCLUDE_EMPTY = 1042 # 1042 - RNGP_INCLUDE_FAILURE = 1043 # 1043 - RNGP_INCLUDE_RECURSE = 1044 # 1044 - RNGP_INTERLEAVE_ADD = 1045 # 1045 - RNGP_INTERLEAVE_CREATE_FAILED = 1046 # 1046 - RNGP_INTERLEAVE_EMPTY = 1047 # 1047 - RNGP_INTERLEAVE_NO_CONTENT = 1048 # 1048 - RNGP_INVALID_DEFINE_NAME = 1049 # 1049 - RNGP_INVALID_URI = 1050 # 1050 - RNGP_INVALID_VALUE = 1051 # 1051 - RNGP_MISSING_HREF = 1052 # 1052 - RNGP_NAME_MISSING = 1053 # 1053 - RNGP_NEED_COMBINE = 1054 # 1054 - RNGP_NOTALLOWED_NOT_EMPTY = 1055 # 1055 - RNGP_NSNAME_ATTR_ANCESTOR = 1056 # 1056 - RNGP_NSNAME_NO_NS = 1057 # 1057 - RNGP_PARAM_FORBIDDEN = 1058 # 1058 - RNGP_PARAM_NAME_MISSING = 1059 # 1059 - RNGP_PARENTREF_CREATE_FAILED = 1060 # 1060 - RNGP_PARENTREF_NAME_INVALID = 1061 # 1061 - RNGP_PARENTREF_NO_NAME = 1062 # 1062 - RNGP_PARENTREF_NO_PARENT = 1063 # 1063 - RNGP_PARENTREF_NOT_EMPTY = 1064 # 1064 - RNGP_PARSE_ERROR = 1065 # 1065 - RNGP_PAT_ANYNAME_EXCEPT_ANYNAME = 1066 # 1066 - RNGP_PAT_ATTR_ATTR = 1067 # 1067 - RNGP_PAT_ATTR_ELEM = 1068 # 1068 - RNGP_PAT_DATA_EXCEPT_ATTR = 1069 # 
1069 - RNGP_PAT_DATA_EXCEPT_ELEM = 1070 # 1070 - RNGP_PAT_DATA_EXCEPT_EMPTY = 1071 # 1071 - RNGP_PAT_DATA_EXCEPT_GROUP = 1072 # 1072 - RNGP_PAT_DATA_EXCEPT_INTERLEAVE = 1073 # 1073 - RNGP_PAT_DATA_EXCEPT_LIST = 1074 # 1074 - RNGP_PAT_DATA_EXCEPT_ONEMORE = 1075 # 1075 - RNGP_PAT_DATA_EXCEPT_REF = 1076 # 1076 - RNGP_PAT_DATA_EXCEPT_TEXT = 1077 # 1077 - RNGP_PAT_LIST_ATTR = 1078 # 1078 - RNGP_PAT_LIST_ELEM = 1079 # 1079 - RNGP_PAT_LIST_INTERLEAVE = 1080 # 1080 - RNGP_PAT_LIST_LIST = 1081 # 1081 - RNGP_PAT_LIST_REF = 1082 # 1082 - RNGP_PAT_LIST_TEXT = 1083 # 1083 - RNGP_PAT_NSNAME_EXCEPT_ANYNAME = 1084 # 1084 - RNGP_PAT_NSNAME_EXCEPT_NSNAME = 1085 # 1085 - RNGP_PAT_ONEMORE_GROUP_ATTR = 1086 # 1086 - RNGP_PAT_ONEMORE_INTERLEAVE_ATTR = 1087 # 1087 - RNGP_PAT_START_ATTR = 1088 # 1088 - RNGP_PAT_START_DATA = 1089 # 1089 - RNGP_PAT_START_EMPTY = 1090 # 1090 - RNGP_PAT_START_GROUP = 1091 # 1091 - RNGP_PAT_START_INTERLEAVE = 1092 # 1092 - RNGP_PAT_START_LIST = 1093 # 1093 - RNGP_PAT_START_ONEMORE = 1094 # 1094 - RNGP_PAT_START_TEXT = 1095 # 1095 - RNGP_PAT_START_VALUE = 1096 # 1096 - RNGP_PREFIX_UNDEFINED = 1097 # 1097 - RNGP_REF_CREATE_FAILED = 1098 # 1098 - RNGP_REF_CYCLE = 1099 # 1099 - RNGP_REF_NAME_INVALID = 1100 # 1100 - RNGP_REF_NO_DEF = 1101 # 1101 - RNGP_REF_NO_NAME = 1102 # 1102 - RNGP_REF_NOT_EMPTY = 1103 # 1103 - RNGP_START_CHOICE_AND_INTERLEAVE = 1104 # 1104 - RNGP_START_CONTENT = 1105 # 1105 - RNGP_START_EMPTY = 1106 # 1106 - RNGP_START_MISSING = 1107 # 1107 - RNGP_TEXT_EXPECTED = 1108 # 1108 - RNGP_TEXT_HAS_CHILD = 1109 # 1109 - RNGP_TYPE_MISSING = 1110 # 1110 - RNGP_TYPE_NOT_FOUND = 1111 # 1111 - RNGP_TYPE_VALUE = 1112 # 1112 - RNGP_UNKNOWN_ATTRIBUTE = 1113 # 1113 - RNGP_UNKNOWN_COMBINE = 1114 # 1114 - RNGP_UNKNOWN_CONSTRUCT = 1115 # 1115 - RNGP_UNKNOWN_TYPE_LIB = 1116 # 1116 - RNGP_URI_FRAGMENT = 1117 # 1117 - RNGP_URI_NOT_ABSOLUTE = 1118 # 1118 - RNGP_VALUE_EMPTY = 1119 # 1119 - RNGP_VALUE_NO_CONTENT = 1120 # 1120 - RNGP_XMLNS_NAME = 1121 # 1121 - 
RNGP_XML_NS = 1122 # 1122 - XPATH_EXPRESSION_OK = 1200 - XPATH_NUMBER_ERROR = 1201 # 1201 - XPATH_UNFINISHED_LITERAL_ERROR = 1202 # 1202 - XPATH_START_LITERAL_ERROR = 1203 # 1203 - XPATH_VARIABLE_REF_ERROR = 1204 # 1204 - XPATH_UNDEF_VARIABLE_ERROR = 1205 # 1205 - XPATH_INVALID_PREDICATE_ERROR = 1206 # 1206 - XPATH_EXPR_ERROR = 1207 # 1207 - XPATH_UNCLOSED_ERROR = 1208 # 1208 - XPATH_UNKNOWN_FUNC_ERROR = 1209 # 1209 - XPATH_INVALID_OPERAND = 1210 # 1210 - XPATH_INVALID_TYPE = 1211 # 1211 - XPATH_INVALID_ARITY = 1212 # 1212 - XPATH_INVALID_CTXT_SIZE = 1213 # 1213 - XPATH_INVALID_CTXT_POSITION = 1214 # 1214 - XPATH_MEMORY_ERROR = 1215 # 1215 - XPTR_SYNTAX_ERROR = 1216 # 1216 - XPTR_RESOURCE_ERROR = 1217 # 1217 - XPTR_SUB_RESOURCE_ERROR = 1218 # 1218 - XPATH_UNDEF_PREFIX_ERROR = 1219 # 1219 - XPATH_ENCODING_ERROR = 1220 # 1220 - XPATH_INVALID_CHAR_ERROR = 1221 # 1221 - TREE_INVALID_HEX = 1300 - TREE_INVALID_DEC = 1301 # 1301 - TREE_UNTERMINATED_ENTITY = 1302 # 1302 - SAVE_NOT_UTF8 = 1400 - SAVE_CHAR_INVALID = 1401 # 1401 - SAVE_NO_DOCTYPE = 1402 # 1402 - SAVE_UNKNOWN_ENCODING = 1403 # 1403 - REGEXP_COMPILE_ERROR = 1450 - IO_UNKNOWN = 1500 - IO_EACCES = 1501 # 1501 - IO_EAGAIN = 1502 # 1502 - IO_EBADF = 1503 # 1503 - IO_EBADMSG = 1504 # 1504 - IO_EBUSY = 1505 # 1505 - IO_ECANCELED = 1506 # 1506 - IO_ECHILD = 1507 # 1507 - IO_EDEADLK = 1508 # 1508 - IO_EDOM = 1509 # 1509 - IO_EEXIST = 1510 # 1510 - IO_EFAULT = 1511 # 1511 - IO_EFBIG = 1512 # 1512 - IO_EINPROGRESS = 1513 # 1513 - IO_EINTR = 1514 # 1514 - IO_EINVAL = 1515 # 1515 - IO_EIO = 1516 # 1516 - IO_EISDIR = 1517 # 1517 - IO_EMFILE = 1518 # 1518 - IO_EMLINK = 1519 # 1519 - IO_EMSGSIZE = 1520 # 1520 - IO_ENAMETOOLONG = 1521 # 1521 - IO_ENFILE = 1522 # 1522 - IO_ENODEV = 1523 # 1523 - IO_ENOENT = 1524 # 1524 - IO_ENOEXEC = 1525 # 1525 - IO_ENOLCK = 1526 # 1526 - IO_ENOMEM = 1527 # 1527 - IO_ENOSPC = 1528 # 1528 - IO_ENOSYS = 1529 # 1529 - IO_ENOTDIR = 1530 # 1530 - IO_ENOTEMPTY = 1531 # 1531 - IO_ENOTSUP = 1532 # 
1532 - IO_ENOTTY = 1533 # 1533 - IO_ENXIO = 1534 # 1534 - IO_EPERM = 1535 # 1535 - IO_EPIPE = 1536 # 1536 - IO_ERANGE = 1537 # 1537 - IO_EROFS = 1538 # 1538 - IO_ESPIPE = 1539 # 1539 - IO_ESRCH = 1540 # 1540 - IO_ETIMEDOUT = 1541 # 1541 - IO_EXDEV = 1542 # 1542 - IO_NETWORK_ATTEMPT = 1543 # 1543 - IO_ENCODER = 1544 # 1544 - IO_FLUSH = 1545 # 1545 - IO_WRITE = 1546 # 1546 - IO_NO_INPUT = 1547 # 1547 - IO_BUFFER_FULL = 1548 # 1548 - IO_LOAD_ERROR = 1549 # 1549 - IO_ENOTSOCK = 1550 # 1550 - IO_EISCONN = 1551 # 1551 - IO_ECONNREFUSED = 1552 # 1552 - IO_ENETUNREACH = 1553 # 1553 - IO_EADDRINUSE = 1554 # 1554 - IO_EALREADY = 1555 # 1555 - IO_EAFNOSUPPORT = 1556 # 1556 - XINCLUDE_RECURSION = 1600 - XINCLUDE_PARSE_VALUE = 1601 # 1601 - XINCLUDE_ENTITY_DEF_MISMATCH = 1602 # 1602 - XINCLUDE_NO_HREF = 1603 # 1603 - XINCLUDE_NO_FALLBACK = 1604 # 1604 - XINCLUDE_HREF_URI = 1605 # 1605 - XINCLUDE_TEXT_FRAGMENT = 1606 # 1606 - XINCLUDE_TEXT_DOCUMENT = 1607 # 1607 - XINCLUDE_INVALID_CHAR = 1608 # 1608 - XINCLUDE_BUILD_FAILED = 1609 # 1609 - XINCLUDE_UNKNOWN_ENCODING = 1610 # 1610 - XINCLUDE_MULTIPLE_ROOT = 1611 # 1611 - XINCLUDE_XPTR_FAILED = 1612 # 1612 - XINCLUDE_XPTR_RESULT = 1613 # 1613 - XINCLUDE_INCLUDE_IN_INCLUDE = 1614 # 1614 - XINCLUDE_FALLBACKS_IN_INCLUDE = 1615 # 1615 - XINCLUDE_FALLBACK_NOT_IN_INCLUDE = 1616 # 1616 - XINCLUDE_DEPRECATED_NS = 1617 # 1617 - XINCLUDE_FRAGMENT_ID = 1618 # 1618 - CATALOG_MISSING_ATTR = 1650 - CATALOG_ENTRY_BROKEN = 1651 # 1651 - CATALOG_PREFER_VALUE = 1652 # 1652 - CATALOG_NOT_CATALOG = 1653 # 1653 - CATALOG_RECURSION = 1654 # 1654 - SCHEMAP_PREFIX_UNDEFINED = 1700 - SCHEMAP_ATTRFORMDEFAULT_VALUE = 1701 # 1701 - SCHEMAP_ATTRGRP_NONAME_NOREF = 1702 # 1702 - SCHEMAP_ATTR_NONAME_NOREF = 1703 # 1703 - SCHEMAP_COMPLEXTYPE_NONAME_NOREF = 1704 # 1704 - SCHEMAP_ELEMFORMDEFAULT_VALUE = 1705 # 1705 - SCHEMAP_ELEM_NONAME_NOREF = 1706 # 1706 - SCHEMAP_EXTENSION_NO_BASE = 1707 # 1707 - SCHEMAP_FACET_NO_VALUE = 1708 # 1708 - SCHEMAP_FAILED_BUILD_IMPORT = 
1709 # 1709 - SCHEMAP_GROUP_NONAME_NOREF = 1710 # 1710 - SCHEMAP_IMPORT_NAMESPACE_NOT_URI = 1711 # 1711 - SCHEMAP_IMPORT_REDEFINE_NSNAME = 1712 # 1712 - SCHEMAP_IMPORT_SCHEMA_NOT_URI = 1713 # 1713 - SCHEMAP_INVALID_BOOLEAN = 1714 # 1714 - SCHEMAP_INVALID_ENUM = 1715 # 1715 - SCHEMAP_INVALID_FACET = 1716 # 1716 - SCHEMAP_INVALID_FACET_VALUE = 1717 # 1717 - SCHEMAP_INVALID_MAXOCCURS = 1718 # 1718 - SCHEMAP_INVALID_MINOCCURS = 1719 # 1719 - SCHEMAP_INVALID_REF_AND_SUBTYPE = 1720 # 1720 - SCHEMAP_INVALID_WHITE_SPACE = 1721 # 1721 - SCHEMAP_NOATTR_NOREF = 1722 # 1722 - SCHEMAP_NOTATION_NO_NAME = 1723 # 1723 - SCHEMAP_NOTYPE_NOREF = 1724 # 1724 - SCHEMAP_REF_AND_SUBTYPE = 1725 # 1725 - SCHEMAP_RESTRICTION_NONAME_NOREF = 1726 # 1726 - SCHEMAP_SIMPLETYPE_NONAME = 1727 # 1727 - SCHEMAP_TYPE_AND_SUBTYPE = 1728 # 1728 - SCHEMAP_UNKNOWN_ALL_CHILD = 1729 # 1729 - SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD = 1730 # 1730 - SCHEMAP_UNKNOWN_ATTR_CHILD = 1731 # 1731 - SCHEMAP_UNKNOWN_ATTRGRP_CHILD = 1732 # 1732 - SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP = 1733 # 1733 - SCHEMAP_UNKNOWN_BASE_TYPE = 1734 # 1734 - SCHEMAP_UNKNOWN_CHOICE_CHILD = 1735 # 1735 - SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD = 1736 # 1736 - SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD = 1737 # 1737 - SCHEMAP_UNKNOWN_ELEM_CHILD = 1738 # 1738 - SCHEMAP_UNKNOWN_EXTENSION_CHILD = 1739 # 1739 - SCHEMAP_UNKNOWN_FACET_CHILD = 1740 # 1740 - SCHEMAP_UNKNOWN_FACET_TYPE = 1741 # 1741 - SCHEMAP_UNKNOWN_GROUP_CHILD = 1742 # 1742 - SCHEMAP_UNKNOWN_IMPORT_CHILD = 1743 # 1743 - SCHEMAP_UNKNOWN_LIST_CHILD = 1744 # 1744 - SCHEMAP_UNKNOWN_NOTATION_CHILD = 1745 # 1745 - SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD = 1746 # 1746 - SCHEMAP_UNKNOWN_REF = 1747 # 1747 - SCHEMAP_UNKNOWN_RESTRICTION_CHILD = 1748 # 1748 - SCHEMAP_UNKNOWN_SCHEMAS_CHILD = 1749 # 1749 - SCHEMAP_UNKNOWN_SEQUENCE_CHILD = 1750 # 1750 - SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD = 1751 # 1751 - SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD = 1752 # 1752 - SCHEMAP_UNKNOWN_TYPE = 1753 # 1753 - SCHEMAP_UNKNOWN_UNION_CHILD = 
1754 # 1754 - SCHEMAP_ELEM_DEFAULT_FIXED = 1755 # 1755 - SCHEMAP_REGEXP_INVALID = 1756 # 1756 - SCHEMAP_FAILED_LOAD = 1757 # 1757 - SCHEMAP_NOTHING_TO_PARSE = 1758 # 1758 - SCHEMAP_NOROOT = 1759 # 1759 - SCHEMAP_REDEFINED_GROUP = 1760 # 1760 - SCHEMAP_REDEFINED_TYPE = 1761 # 1761 - SCHEMAP_REDEFINED_ELEMENT = 1762 # 1762 - SCHEMAP_REDEFINED_ATTRGROUP = 1763 # 1763 - SCHEMAP_REDEFINED_ATTR = 1764 # 1764 - SCHEMAP_REDEFINED_NOTATION = 1765 # 1765 - SCHEMAP_FAILED_PARSE = 1766 # 1766 - SCHEMAP_UNKNOWN_PREFIX = 1767 # 1767 - SCHEMAP_DEF_AND_PREFIX = 1768 # 1768 - SCHEMAP_UNKNOWN_INCLUDE_CHILD = 1769 # 1769 - SCHEMAP_INCLUDE_SCHEMA_NOT_URI = 1770 # 1770 - SCHEMAP_INCLUDE_SCHEMA_NO_URI = 1771 # 1771 - SCHEMAP_NOT_SCHEMA = 1772 # 1772 - SCHEMAP_UNKNOWN_MEMBER_TYPE = 1773 # 1773 - SCHEMAP_INVALID_ATTR_USE = 1774 # 1774 - SCHEMAP_RECURSIVE = 1775 # 1775 - SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE = 1776 # 1776 - SCHEMAP_INVALID_ATTR_COMBINATION = 1777 # 1777 - SCHEMAP_INVALID_ATTR_INLINE_COMBINATION = 1778 # 1778 - SCHEMAP_MISSING_SIMPLETYPE_CHILD = 1779 # 1779 - SCHEMAP_INVALID_ATTR_NAME = 1780 # 1780 - SCHEMAP_REF_AND_CONTENT = 1781 # 1781 - SCHEMAP_CT_PROPS_CORRECT_1 = 1782 # 1782 - SCHEMAP_CT_PROPS_CORRECT_2 = 1783 # 1783 - SCHEMAP_CT_PROPS_CORRECT_3 = 1784 # 1784 - SCHEMAP_CT_PROPS_CORRECT_4 = 1785 # 1785 - SCHEMAP_CT_PROPS_CORRECT_5 = 1786 # 1786 - SCHEMAP_DERIVATION_OK_RESTRICTION_1 = 1787 # 1787 - SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1 = 1788 # 1788 - SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2 = 1789 # 1789 - SCHEMAP_DERIVATION_OK_RESTRICTION_2_2 = 1790 # 1790 - SCHEMAP_DERIVATION_OK_RESTRICTION_3 = 1791 # 1791 - SCHEMAP_WILDCARD_INVALID_NS_MEMBER = 1792 # 1792 - SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE = 1793 # 1793 - SCHEMAP_UNION_NOT_EXPRESSIBLE = 1794 # 1794 - SCHEMAP_SRC_IMPORT_3_1 = 1795 # 1795 - SCHEMAP_SRC_IMPORT_3_2 = 1796 # 1796 - SCHEMAP_DERIVATION_OK_RESTRICTION_4_1 = 1797 # 1797 - SCHEMAP_DERIVATION_OK_RESTRICTION_4_2 = 1798 # 1798 - 
SCHEMAP_DERIVATION_OK_RESTRICTION_4_3 = 1799 # 1799 - SCHEMAP_COS_CT_EXTENDS_1_3 = 1800 # 1800 - SCHEMAV_NOROOT = 1801 - SCHEMAV_UNDECLAREDELEM = 1802 # 1802 - SCHEMAV_NOTTOPLEVEL = 1803 # 1803 - SCHEMAV_MISSING = 1804 # 1804 - SCHEMAV_WRONGELEM = 1805 # 1805 - SCHEMAV_NOTYPE = 1806 # 1806 - SCHEMAV_NOROLLBACK = 1807 # 1807 - SCHEMAV_ISABSTRACT = 1808 # 1808 - SCHEMAV_NOTEMPTY = 1809 # 1809 - SCHEMAV_ELEMCONT = 1810 # 1810 - SCHEMAV_HAVEDEFAULT = 1811 # 1811 - SCHEMAV_NOTNILLABLE = 1812 # 1812 - SCHEMAV_EXTRACONTENT = 1813 # 1813 - SCHEMAV_INVALIDATTR = 1814 # 1814 - SCHEMAV_INVALIDELEM = 1815 # 1815 - SCHEMAV_NOTDETERMINIST = 1816 # 1816 - SCHEMAV_CONSTRUCT = 1817 # 1817 - SCHEMAV_INTERNAL = 1818 # 1818 - SCHEMAV_NOTSIMPLE = 1819 # 1819 - SCHEMAV_ATTRUNKNOWN = 1820 # 1820 - SCHEMAV_ATTRINVALID = 1821 # 1821 - SCHEMAV_VALUE = 1822 # 1822 - SCHEMAV_FACET = 1823 # 1823 - SCHEMAV_CVC_DATATYPE_VALID_1_2_1 = 1824 # 1824 - SCHEMAV_CVC_DATATYPE_VALID_1_2_2 = 1825 # 1825 - SCHEMAV_CVC_DATATYPE_VALID_1_2_3 = 1826 # 1826 - SCHEMAV_CVC_TYPE_3_1_1 = 1827 # 1827 - SCHEMAV_CVC_TYPE_3_1_2 = 1828 # 1828 - SCHEMAV_CVC_FACET_VALID = 1829 # 1829 - SCHEMAV_CVC_LENGTH_VALID = 1830 # 1830 - SCHEMAV_CVC_MINLENGTH_VALID = 1831 # 1831 - SCHEMAV_CVC_MAXLENGTH_VALID = 1832 # 1832 - SCHEMAV_CVC_MININCLUSIVE_VALID = 1833 # 1833 - SCHEMAV_CVC_MAXINCLUSIVE_VALID = 1834 # 1834 - SCHEMAV_CVC_MINEXCLUSIVE_VALID = 1835 # 1835 - SCHEMAV_CVC_MAXEXCLUSIVE_VALID = 1836 # 1836 - SCHEMAV_CVC_TOTALDIGITS_VALID = 1837 # 1837 - SCHEMAV_CVC_FRACTIONDIGITS_VALID = 1838 # 1838 - SCHEMAV_CVC_PATTERN_VALID = 1839 # 1839 - SCHEMAV_CVC_ENUMERATION_VALID = 1840 # 1840 - SCHEMAV_CVC_COMPLEX_TYPE_2_1 = 1841 # 1841 - SCHEMAV_CVC_COMPLEX_TYPE_2_2 = 1842 # 1842 - SCHEMAV_CVC_COMPLEX_TYPE_2_3 = 1843 # 1843 - SCHEMAV_CVC_COMPLEX_TYPE_2_4 = 1844 # 1844 - SCHEMAV_CVC_ELT_1 = 1845 # 1845 - SCHEMAV_CVC_ELT_2 = 1846 # 1846 - SCHEMAV_CVC_ELT_3_1 = 1847 # 1847 - SCHEMAV_CVC_ELT_3_2_1 = 1848 # 1848 - SCHEMAV_CVC_ELT_3_2_2 = 1849 # 
1849 - SCHEMAV_CVC_ELT_4_1 = 1850 # 1850 - SCHEMAV_CVC_ELT_4_2 = 1851 # 1851 - SCHEMAV_CVC_ELT_4_3 = 1852 # 1852 - SCHEMAV_CVC_ELT_5_1_1 = 1853 # 1853 - SCHEMAV_CVC_ELT_5_1_2 = 1854 # 1854 - SCHEMAV_CVC_ELT_5_2_1 = 1855 # 1855 - SCHEMAV_CVC_ELT_5_2_2_1 = 1856 # 1856 - SCHEMAV_CVC_ELT_5_2_2_2_1 = 1857 # 1857 - SCHEMAV_CVC_ELT_5_2_2_2_2 = 1858 # 1858 - SCHEMAV_CVC_ELT_6 = 1859 # 1859 - SCHEMAV_CVC_ELT_7 = 1860 # 1860 - SCHEMAV_CVC_ATTRIBUTE_1 = 1861 # 1861 - SCHEMAV_CVC_ATTRIBUTE_2 = 1862 # 1862 - SCHEMAV_CVC_ATTRIBUTE_3 = 1863 # 1863 - SCHEMAV_CVC_ATTRIBUTE_4 = 1864 # 1864 - SCHEMAV_CVC_COMPLEX_TYPE_3_1 = 1865 # 1865 - SCHEMAV_CVC_COMPLEX_TYPE_3_2_1 = 1866 # 1866 - SCHEMAV_CVC_COMPLEX_TYPE_3_2_2 = 1867 # 1867 - SCHEMAV_CVC_COMPLEX_TYPE_4 = 1868 # 1868 - SCHEMAV_CVC_COMPLEX_TYPE_5_1 = 1869 # 1869 - SCHEMAV_CVC_COMPLEX_TYPE_5_2 = 1870 # 1870 - SCHEMAV_ELEMENT_CONTENT = 1871 # 1871 - SCHEMAV_DOCUMENT_ELEMENT_MISSING = 1872 # 1872 - SCHEMAV_CVC_COMPLEX_TYPE_1 = 1873 # 1873 - SCHEMAV_CVC_AU = 1874 # 1874 - SCHEMAV_CVC_TYPE_1 = 1875 # 1875 - SCHEMAV_CVC_TYPE_2 = 1876 # 1876 - XPTR_UNKNOWN_SCHEME = 1900 - XPTR_CHILDSEQ_START = 1901 # 1901 - XPTR_EVAL_FAILED = 1902 # 1902 - XPTR_EXTRA_OBJECTS = 1903 # 1903 - C14N_CREATE_CTXT = 1950 - C14N_REQUIRES_UTF8 = 1951 # 1951 - C14N_CREATE_STACK = 1952 # 1952 - C14N_INVALID_NODE = 1953 # 1953 - FTP_PASV_ANSWER = 2000 - FTP_EPSV_ANSWER = 2001 # 2001 - FTP_ACCNT = 2002 # 2002 - HTTP_URL_SYNTAX = 2020 - HTTP_USE_IP = 2021 # 2021 - HTTP_UNKNOWN_HOST = 2022 # 2022 - SCHEMAP_SRC_SIMPLE_TYPE_1 = 3000 - SCHEMAP_SRC_SIMPLE_TYPE_2 = 3001 # 3001 - SCHEMAP_SRC_SIMPLE_TYPE_3 = 3002 # 3002 - SCHEMAP_SRC_SIMPLE_TYPE_4 = 3003 # 3003 - SCHEMAP_SRC_RESOLVE = 3004 # 3004 - SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE = 3005 # 3005 - SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE = 3006 # 3006 - SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES = 3007 # 3007 - SCHEMAP_ST_PROPS_CORRECT_1 = 3008 # 3008 - SCHEMAP_ST_PROPS_CORRECT_2 = 3009 # 3009 - 
SCHEMAP_ST_PROPS_CORRECT_3 = 3010 # 3010 - SCHEMAP_COS_ST_RESTRICTS_1_1 = 3011 # 3011 - SCHEMAP_COS_ST_RESTRICTS_1_2 = 3012 # 3012 - SCHEMAP_COS_ST_RESTRICTS_1_3_1 = 3013 # 3013 - SCHEMAP_COS_ST_RESTRICTS_1_3_2 = 3014 # 3014 - SCHEMAP_COS_ST_RESTRICTS_2_1 = 3015 # 3015 - SCHEMAP_COS_ST_RESTRICTS_2_3_1_1 = 3016 # 3016 - SCHEMAP_COS_ST_RESTRICTS_2_3_1_2 = 3017 # 3017 - SCHEMAP_COS_ST_RESTRICTS_2_3_2_1 = 3018 # 3018 - SCHEMAP_COS_ST_RESTRICTS_2_3_2_2 = 3019 # 3019 - SCHEMAP_COS_ST_RESTRICTS_2_3_2_3 = 3020 # 3020 - SCHEMAP_COS_ST_RESTRICTS_2_3_2_4 = 3021 # 3021 - SCHEMAP_COS_ST_RESTRICTS_2_3_2_5 = 3022 # 3022 - SCHEMAP_COS_ST_RESTRICTS_3_1 = 3023 # 3023 - SCHEMAP_COS_ST_RESTRICTS_3_3_1 = 3024 # 3024 - SCHEMAP_COS_ST_RESTRICTS_3_3_1_2 = 3025 # 3025 - SCHEMAP_COS_ST_RESTRICTS_3_3_2_2 = 3026 # 3026 - SCHEMAP_COS_ST_RESTRICTS_3_3_2_1 = 3027 # 3027 - SCHEMAP_COS_ST_RESTRICTS_3_3_2_3 = 3028 # 3028 - SCHEMAP_COS_ST_RESTRICTS_3_3_2_4 = 3029 # 3029 - SCHEMAP_COS_ST_RESTRICTS_3_3_2_5 = 3030 # 3030 - SCHEMAP_COS_ST_DERIVED_OK_2_1 = 3031 # 3031 - SCHEMAP_COS_ST_DERIVED_OK_2_2 = 3032 # 3032 - SCHEMAP_S4S_ELEM_NOT_ALLOWED = 3033 # 3033 - SCHEMAP_S4S_ELEM_MISSING = 3034 # 3034 - SCHEMAP_S4S_ATTR_NOT_ALLOWED = 3035 # 3035 - SCHEMAP_S4S_ATTR_MISSING = 3036 # 3036 - SCHEMAP_S4S_ATTR_INVALID_VALUE = 3037 # 3037 - SCHEMAP_SRC_ELEMENT_1 = 3038 # 3038 - SCHEMAP_SRC_ELEMENT_2_1 = 3039 # 3039 - SCHEMAP_SRC_ELEMENT_2_2 = 3040 # 3040 - SCHEMAP_SRC_ELEMENT_3 = 3041 # 3041 - SCHEMAP_P_PROPS_CORRECT_1 = 3042 # 3042 - SCHEMAP_P_PROPS_CORRECT_2_1 = 3043 # 3043 - SCHEMAP_P_PROPS_CORRECT_2_2 = 3044 # 3044 - SCHEMAP_E_PROPS_CORRECT_2 = 3045 # 3045 - SCHEMAP_E_PROPS_CORRECT_3 = 3046 # 3046 - SCHEMAP_E_PROPS_CORRECT_4 = 3047 # 3047 - SCHEMAP_E_PROPS_CORRECT_5 = 3048 # 3048 - SCHEMAP_E_PROPS_CORRECT_6 = 3049 # 3049 - SCHEMAP_SRC_INCLUDE = 3050 # 3050 - SCHEMAP_SRC_ATTRIBUTE_1 = 3051 # 3051 - SCHEMAP_SRC_ATTRIBUTE_2 = 3052 # 3052 - SCHEMAP_SRC_ATTRIBUTE_3_1 = 3053 # 3053 - SCHEMAP_SRC_ATTRIBUTE_3_2 = 3054 
# 3054 - SCHEMAP_SRC_ATTRIBUTE_4 = 3055 # 3055 - SCHEMAP_NO_XMLNS = 3056 # 3056 - SCHEMAP_NO_XSI = 3057 # 3057 - SCHEMAP_COS_VALID_DEFAULT_1 = 3058 # 3058 - SCHEMAP_COS_VALID_DEFAULT_2_1 = 3059 # 3059 - SCHEMAP_COS_VALID_DEFAULT_2_2_1 = 3060 # 3060 - SCHEMAP_COS_VALID_DEFAULT_2_2_2 = 3061 # 3061 - SCHEMAP_CVC_SIMPLE_TYPE = 3062 # 3062 - SCHEMAP_COS_CT_EXTENDS_1_1 = 3063 # 3063 - SCHEMAP_SRC_IMPORT_1_1 = 3064 # 3064 - SCHEMAP_SRC_IMPORT_1_2 = 3065 # 3065 - SCHEMAP_SRC_IMPORT_2 = 3066 # 3066 - SCHEMAP_SRC_IMPORT_2_1 = 3067 # 3067 - SCHEMAP_SRC_IMPORT_2_2 = 3068 # 3068 - SCHEMAP_INTERNAL = 3069 # 3069 non-W3C - SCHEMAP_NOT_DETERMINISTIC = 3070 # 3070 non-W3C - SCHEMAP_SRC_ATTRIBUTE_GROUP_1 = 3071 # 3071 - SCHEMAP_SRC_ATTRIBUTE_GROUP_2 = 3072 # 3072 - SCHEMAP_SRC_ATTRIBUTE_GROUP_3 = 3073 # 3073 - SCHEMAP_MG_PROPS_CORRECT_1 = 3074 # 3074 - SCHEMAP_MG_PROPS_CORRECT_2 = 3075 # 3075 - SCHEMAP_SRC_CT_1 = 3076 # 3076 - SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077 # 3077 - SCHEMAP_AU_PROPS_CORRECT_2 = 3078 # 3078 - SCHEMAP_A_PROPS_CORRECT_2 = 3079 # 3079 - MODULE_OPEN = 4900 # 4900 - MODULE_CLOSE = 4901 # 4901 - CHECK_FOUND_ELEMENT = 5000 - CHECK_FOUND_ATTRIBUTE = 5001 # 5001 - CHECK_FOUND_TEXT = 5002 # 5002 - CHECK_FOUND_CDATA = 5003 # 5003 - CHECK_FOUND_ENTITYREF = 5004 # 5004 - CHECK_FOUND_ENTITY = 5005 # 5005 - CHECK_FOUND_PI = 5006 # 5006 - CHECK_FOUND_COMMENT = 5007 # 5007 - CHECK_FOUND_DOCTYPE = 5008 # 5008 - CHECK_FOUND_FRAGMENT = 5009 # 5009 - CHECK_FOUND_NOTATION = 5010 # 5010 - CHECK_UNKNOWN_NODE = 5011 # 5011 - CHECK_ENTITY_TYPE = 5012 # 5012 - CHECK_NO_PARENT = 5013 # 5013 - CHECK_NO_DOC = 5014 # 5014 - CHECK_NO_NAME = 5015 # 5015 - CHECK_NO_ELEM = 5016 # 5016 - CHECK_WRONG_DOC = 5017 # 5017 - CHECK_NO_PREV = 5018 # 5018 - CHECK_WRONG_PREV = 5019 # 5019 - CHECK_NO_NEXT = 5020 # 5020 - CHECK_WRONG_NEXT = 5021 # 5021 - CHECK_NOT_DTD = 5022 # 5022 - CHECK_NOT_ATTR = 5023 # 5023 - CHECK_NOT_ATTR_DECL = 5024 # 5024 - CHECK_NOT_ELEM_DECL = 5025 # 5025 - 
CHECK_NOT_ENTITY_DECL = 5026 # 5026 - CHECK_NOT_NS_DECL = 5027 # 5027 - CHECK_NO_HREF = 5028 # 5028 - CHECK_WRONG_PARENT = 5029 # 5029 - CHECK_NS_SCOPE = 5030 # 5030 - CHECK_NS_ANCESTOR = 5031 # 5031 - CHECK_NOT_UTF8 = 5032 # 5032 - CHECK_NO_DICT = 5033 # 5033 - CHECK_NOT_NCNAME = 5034 # 5034 - CHECK_OUTSIDE_DICT = 5035 # 5035 - CHECK_WRONG_NAME = 5036 # 5036 - CHECK_NAME_NOT_NULL = 5037 # 5037 - CHECK_ = 5038 # 5033 - CHECK_X = 5039 # 503 - -cdef object __names -__names = ErrorLevels._names -for name, value in vars(ErrorLevels).iteritems(): - python.PyDict_SetItem(__names, value, name) - -__names = ErrorDomains._names -for name, value in vars(ErrorDomains).iteritems(): - python.PyDict_SetItem(__names, value, name) - -__names = ErrorTypes._names -for name, value in vars(ErrorTypes).iteritems(): - python.PyDict_SetItem(__names, value, name) + +cdef object __ERROR_TYPES +__ERROR_TYPES = """ + XML_ERR_OK = 0 + XML_ERR_INTERNAL_ERROR = 1 : 1 + XML_ERR_NO_MEMORY = 2 : 2 + XML_ERR_DOCUMENT_START = 3 : 3 + XML_ERR_DOCUMENT_EMPTY = 4 : 4 + XML_ERR_DOCUMENT_END = 5 : 5 + XML_ERR_INVALID_HEX_CHARREF = 6 : 6 + XML_ERR_INVALID_DEC_CHARREF = 7 : 7 + XML_ERR_INVALID_CHARREF = 8 : 8 + XML_ERR_INVALID_CHAR = 9 : 9 + XML_ERR_CHARREF_AT_EOF = 10 : 10 + XML_ERR_CHARREF_IN_PROLOG = 11 : 11 + XML_ERR_CHARREF_IN_EPILOG = 12 : 12 + XML_ERR_CHARREF_IN_DTD = 13 : 13 + XML_ERR_ENTITYREF_AT_EOF = 14 : 14 + XML_ERR_ENTITYREF_IN_PROLOG = 15 : 15 + XML_ERR_ENTITYREF_IN_EPILOG = 16 : 16 + XML_ERR_ENTITYREF_IN_DTD = 17 : 17 + XML_ERR_PEREF_AT_EOF = 18 : 18 + XML_ERR_PEREF_IN_PROLOG = 19 : 19 + XML_ERR_PEREF_IN_EPILOG = 20 : 20 + XML_ERR_PEREF_IN_INT_SUBSET = 21 : 21 + XML_ERR_ENTITYREF_NO_NAME = 22 : 22 + XML_ERR_ENTITYREF_SEMICOL_MISSING = 23 : 23 + XML_ERR_PEREF_NO_NAME = 24 : 24 + XML_ERR_PEREF_SEMICOL_MISSING = 25 : 25 + XML_ERR_UNDECLARED_ENTITY = 26 : 26 + XML_WAR_UNDECLARED_ENTITY = 27 : 27 + XML_ERR_UNPARSED_ENTITY = 28 : 28 + XML_ERR_ENTITY_IS_EXTERNAL = 29 : 29 + 
XML_ERR_ENTITY_IS_PARAMETER = 30 : 30 + XML_ERR_UNKNOWN_ENCODING = 31 : 31 + XML_ERR_UNSUPPORTED_ENCODING = 32 : 32 + XML_ERR_STRING_NOT_STARTED = 33 : 33 + XML_ERR_STRING_NOT_CLOSED = 34 : 34 + XML_ERR_NS_DECL_ERROR = 35 : 35 + XML_ERR_ENTITY_NOT_STARTED = 36 : 36 + XML_ERR_ENTITY_NOT_FINISHED = 37 : 37 + XML_ERR_LT_IN_ATTRIBUTE = 38 : 38 + XML_ERR_ATTRIBUTE_NOT_STARTED = 39 : 39 + XML_ERR_ATTRIBUTE_NOT_FINISHED = 40 : 40 + XML_ERR_ATTRIBUTE_WITHOUT_VALUE = 41 : 41 + XML_ERR_ATTRIBUTE_REDEFINED = 42 : 42 + XML_ERR_LITERAL_NOT_STARTED = 43 : 43 + XML_ERR_LITERAL_NOT_FINISHED = 44 : 44 + XML_ERR_COMMENT_NOT_FINISHED = 45 : 45 + XML_ERR_PI_NOT_STARTED = 46 : 46 + XML_ERR_PI_NOT_FINISHED = 47 : 47 + XML_ERR_NOTATION_NOT_STARTED = 48 : 48 + XML_ERR_NOTATION_NOT_FINISHED = 49 : 49 + XML_ERR_ATTLIST_NOT_STARTED = 50 : 50 + XML_ERR_ATTLIST_NOT_FINISHED = 51 : 51 + XML_ERR_MIXED_NOT_STARTED = 52 : 52 + XML_ERR_MIXED_NOT_FINISHED = 53 : 53 + XML_ERR_ELEMCONTENT_NOT_STARTED = 54 : 54 + XML_ERR_ELEMCONTENT_NOT_FINISHED = 55 : 55 + XML_ERR_XMLDECL_NOT_STARTED = 56 : 56 + XML_ERR_XMLDECL_NOT_FINISHED = 57 : 57 + XML_ERR_CONDSEC_NOT_STARTED = 58 : 58 + XML_ERR_CONDSEC_NOT_FINISHED = 59 : 59 + XML_ERR_EXT_SUBSET_NOT_FINISHED = 60 : 60 + XML_ERR_DOCTYPE_NOT_FINISHED = 61 : 61 + XML_ERR_MISPLACED_CDATA_END = 62 : 62 + XML_ERR_CDATA_NOT_FINISHED = 63 : 63 + XML_ERR_RESERVED_XML_NAME = 64 : 64 + XML_ERR_SPACE_REQUIRED = 65 : 65 + XML_ERR_SEPARATOR_REQUIRED = 66 : 66 + XML_ERR_NMTOKEN_REQUIRED = 67 : 67 + XML_ERR_NAME_REQUIRED = 68 : 68 + XML_ERR_PCDATA_REQUIRED = 69 : 69 + XML_ERR_URI_REQUIRED = 70 : 70 + XML_ERR_PUBID_REQUIRED = 71 : 71 + XML_ERR_LT_REQUIRED = 72 : 72 + XML_ERR_GT_REQUIRED = 73 : 73 + XML_ERR_LTSLASH_REQUIRED = 74 : 74 + XML_ERR_EQUAL_REQUIRED = 75 : 75 + XML_ERR_TAG_NAME_MISMATCH = 76 : 76 + XML_ERR_TAG_NOT_FINISHED = 77 : 77 + XML_ERR_STANDALONE_VALUE = 78 : 78 + XML_ERR_ENCODING_NAME = 79 : 79 + XML_ERR_HYPHEN_IN_COMMENT = 80 : 80 + XML_ERR_INVALID_ENCODING = 81 
: 81 + XML_ERR_EXT_ENTITY_STANDALONE = 82 : 82 + XML_ERR_CONDSEC_INVALID = 83 : 83 + XML_ERR_VALUE_REQUIRED = 84 : 84 + XML_ERR_NOT_WELL_BALANCED = 85 : 85 + XML_ERR_EXTRA_CONTENT = 86 : 86 + XML_ERR_ENTITY_CHAR_ERROR = 87 : 87 + XML_ERR_ENTITY_PE_INTERNAL = 88 : 88 + XML_ERR_ENTITY_LOOP = 89 : 89 + XML_ERR_ENTITY_BOUNDARY = 90 : 90 + XML_ERR_INVALID_URI = 91 : 91 + XML_ERR_URI_FRAGMENT = 92 : 92 + XML_WAR_CATALOG_PI = 93 : 93 + XML_ERR_NO_DTD = 94 : 94 + XML_ERR_CONDSEC_INVALID_KEYWORD = 95 : 95 + XML_ERR_VERSION_MISSING = 96 : 96 + XML_WAR_UNKNOWN_VERSION = 97 : 97 + XML_WAR_LANG_VALUE = 98 : 98 + XML_WAR_NS_URI = 99 : 99 + XML_WAR_NS_URI_RELATIVE = 100 : 100 + XML_ERR_MISSING_ENCODING = 101 : 101 + XML_WAR_SPACE_VALUE = 102 : 102 + XML_ERR_NOT_STANDALONE = 103 : 103 + XML_ERR_ENTITY_PROCESSING = 104 : 104 + XML_ERR_NOTATION_PROCESSING = 105 : 105 + XML_WAR_NS_COLUMN = 106 : 106 + XML_WAR_ENTITY_REDEFINED = 107 : 107 + XML_NS_ERR_XML_NAMESPACE = 200 + XML_NS_ERR_UNDEFINED_NAMESPACE = 201 : 201 + XML_NS_ERR_QNAME = 202 : 202 + XML_NS_ERR_ATTRIBUTE_REDEFINED = 203 : 203 + XML_NS_ERR_EMPTY = 204 : 204 + XML_DTD_ATTRIBUTE_DEFAULT = 500 + XML_DTD_ATTRIBUTE_REDEFINED = 501 : 501 + XML_DTD_ATTRIBUTE_VALUE = 502 : 502 + XML_DTD_CONTENT_ERROR = 503 : 503 + XML_DTD_CONTENT_MODEL = 504 : 504 + XML_DTD_CONTENT_NOT_DETERMINIST = 505 : 505 + XML_DTD_DIFFERENT_PREFIX = 506 : 506 + XML_DTD_ELEM_DEFAULT_NAMESPACE = 507 : 507 + XML_DTD_ELEM_NAMESPACE = 508 : 508 + XML_DTD_ELEM_REDEFINED = 509 : 509 + XML_DTD_EMPTY_NOTATION = 510 : 510 + XML_DTD_ENTITY_TYPE = 511 : 511 + XML_DTD_ID_FIXED = 512 : 512 + XML_DTD_ID_REDEFINED = 513 : 513 + XML_DTD_ID_SUBSET = 514 : 514 + XML_DTD_INVALID_CHILD = 515 : 515 + XML_DTD_INVALID_DEFAULT = 516 : 516 + XML_DTD_LOAD_ERROR = 517 : 517 + XML_DTD_MISSING_ATTRIBUTE = 518 : 518 + XML_DTD_MIXED_CORRUPT = 519 : 519 + XML_DTD_MULTIPLE_ID = 520 : 520 + XML_DTD_NO_DOC = 521 : 521 + XML_DTD_NO_DTD = 522 : 522 + XML_DTD_NO_ELEM_NAME = 523 : 523 + 
XML_DTD_NO_PREFIX = 524 : 524 + XML_DTD_NO_ROOT = 525 : 525 + XML_DTD_NOTATION_REDEFINED = 526 : 526 + XML_DTD_NOTATION_VALUE = 527 : 527 + XML_DTD_NOT_EMPTY = 528 : 528 + XML_DTD_NOT_PCDATA = 529 : 529 + XML_DTD_NOT_STANDALONE = 530 : 530 + XML_DTD_ROOT_NAME = 531 : 531 + XML_DTD_STANDALONE_WHITE_SPACE = 532 : 532 + XML_DTD_UNKNOWN_ATTRIBUTE = 533 : 533 + XML_DTD_UNKNOWN_ELEM = 534 : 534 + XML_DTD_UNKNOWN_ENTITY = 535 : 535 + XML_DTD_UNKNOWN_ID = 536 : 536 + XML_DTD_UNKNOWN_NOTATION = 537 : 537 + XML_DTD_STANDALONE_DEFAULTED = 538 : 538 + XML_DTD_XMLID_VALUE = 539 : 539 + XML_DTD_XMLID_TYPE = 540 : 540 + XML_HTML_STRUCURE_ERROR = 800 + XML_HTML_UNKNOWN_TAG = 801 : 801 + XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000 + XML_RNGP_ATTR_CONFLICT = 1001 : 1001 + XML_RNGP_ATTRIBUTE_CHILDREN = 1002 : 1002 + XML_RNGP_ATTRIBUTE_CONTENT = 1003 : 1003 + XML_RNGP_ATTRIBUTE_EMPTY = 1004 : 1004 + XML_RNGP_ATTRIBUTE_NOOP = 1005 : 1005 + XML_RNGP_CHOICE_CONTENT = 1006 : 1006 + XML_RNGP_CHOICE_EMPTY = 1007 : 1007 + XML_RNGP_CREATE_FAILURE = 1008 : 1008 + XML_RNGP_DATA_CONTENT = 1009 : 1009 + XML_RNGP_DEF_CHOICE_AND_INTERLEAVE = 1010 : 1010 + XML_RNGP_DEFINE_CREATE_FAILED = 1011 : 1011 + XML_RNGP_DEFINE_EMPTY = 1012 : 1012 + XML_RNGP_DEFINE_MISSING = 1013 : 1013 + XML_RNGP_DEFINE_NAME_MISSING = 1014 : 1014 + XML_RNGP_ELEM_CONTENT_EMPTY = 1015 : 1015 + XML_RNGP_ELEM_CONTENT_ERROR = 1016 : 1016 + XML_RNGP_ELEMENT_EMPTY = 1017 : 1017 + XML_RNGP_ELEMENT_CONTENT = 1018 : 1018 + XML_RNGP_ELEMENT_NAME = 1019 : 1019 + XML_RNGP_ELEMENT_NO_CONTENT = 1020 : 1020 + XML_RNGP_ELEM_TEXT_CONFLICT = 1021 : 1021 + XML_RNGP_EMPTY = 1022 : 1022 + XML_RNGP_EMPTY_CONSTRUCT = 1023 : 1023 + XML_RNGP_EMPTY_CONTENT = 1024 : 1024 + XML_RNGP_EMPTY_NOT_EMPTY = 1025 : 1025 + XML_RNGP_ERROR_TYPE_LIB = 1026 : 1026 + XML_RNGP_EXCEPT_EMPTY = 1027 : 1027 + XML_RNGP_EXCEPT_MISSING = 1028 : 1028 + XML_RNGP_EXCEPT_MULTIPLE = 1029 : 1029 + XML_RNGP_EXCEPT_NO_CONTENT = 1030 : 1030 + XML_RNGP_EXTERNALREF_EMTPY = 1031 : 1031 + 
XML_RNGP_EXTERNAL_REF_FAILURE = 1032 : 1032 + XML_RNGP_EXTERNALREF_RECURSE = 1033 : 1033 + XML_RNGP_FORBIDDEN_ATTRIBUTE = 1034 : 1034 + XML_RNGP_FOREIGN_ELEMENT = 1035 : 1035 + XML_RNGP_GRAMMAR_CONTENT = 1036 : 1036 + XML_RNGP_GRAMMAR_EMPTY = 1037 : 1037 + XML_RNGP_GRAMMAR_MISSING = 1038 : 1038 + XML_RNGP_GRAMMAR_NO_START = 1039 : 1039 + XML_RNGP_GROUP_ATTR_CONFLICT = 1040 : 1040 + XML_RNGP_HREF_ERROR = 1041 : 1041 + XML_RNGP_INCLUDE_EMPTY = 1042 : 1042 + XML_RNGP_INCLUDE_FAILURE = 1043 : 1043 + XML_RNGP_INCLUDE_RECURSE = 1044 : 1044 + XML_RNGP_INTERLEAVE_ADD = 1045 : 1045 + XML_RNGP_INTERLEAVE_CREATE_FAILED = 1046 : 1046 + XML_RNGP_INTERLEAVE_EMPTY = 1047 : 1047 + XML_RNGP_INTERLEAVE_NO_CONTENT = 1048 : 1048 + XML_RNGP_INVALID_DEFINE_NAME = 1049 : 1049 + XML_RNGP_INVALID_URI = 1050 : 1050 + XML_RNGP_INVALID_VALUE = 1051 : 1051 + XML_RNGP_MISSING_HREF = 1052 : 1052 + XML_RNGP_NAME_MISSING = 1053 : 1053 + XML_RNGP_NEED_COMBINE = 1054 : 1054 + XML_RNGP_NOTALLOWED_NOT_EMPTY = 1055 : 1055 + XML_RNGP_NSNAME_ATTR_ANCESTOR = 1056 : 1056 + XML_RNGP_NSNAME_NO_NS = 1057 : 1057 + XML_RNGP_PARAM_FORBIDDEN = 1058 : 1058 + XML_RNGP_PARAM_NAME_MISSING = 1059 : 1059 + XML_RNGP_PARENTREF_CREATE_FAILED = 1060 : 1060 + XML_RNGP_PARENTREF_NAME_INVALID = 1061 : 1061 + XML_RNGP_PARENTREF_NO_NAME = 1062 : 1062 + XML_RNGP_PARENTREF_NO_PARENT = 1063 : 1063 + XML_RNGP_PARENTREF_NOT_EMPTY = 1064 : 1064 + XML_RNGP_PARSE_ERROR = 1065 : 1065 + XML_RNGP_PAT_ANYNAME_EXCEPT_ANYNAME = 1066 : 1066 + XML_RNGP_PAT_ATTR_ATTR = 1067 : 1067 + XML_RNGP_PAT_ATTR_ELEM = 1068 : 1068 + XML_RNGP_PAT_DATA_EXCEPT_ATTR = 1069 : 1069 + XML_RNGP_PAT_DATA_EXCEPT_ELEM = 1070 : 1070 + XML_RNGP_PAT_DATA_EXCEPT_EMPTY = 1071 : 1071 + XML_RNGP_PAT_DATA_EXCEPT_GROUP = 1072 : 1072 + XML_RNGP_PAT_DATA_EXCEPT_INTERLEAVE = 1073 : 1073 + XML_RNGP_PAT_DATA_EXCEPT_LIST = 1074 : 1074 + XML_RNGP_PAT_DATA_EXCEPT_ONEMORE = 1075 : 1075 + XML_RNGP_PAT_DATA_EXCEPT_REF = 1076 : 1076 + XML_RNGP_PAT_DATA_EXCEPT_TEXT = 1077 : 1077 + 
XML_RNGP_PAT_LIST_ATTR = 1078 : 1078 + XML_RNGP_PAT_LIST_ELEM = 1079 : 1079 + XML_RNGP_PAT_LIST_INTERLEAVE = 1080 : 1080 + XML_RNGP_PAT_LIST_LIST = 1081 : 1081 + XML_RNGP_PAT_LIST_REF = 1082 : 1082 + XML_RNGP_PAT_LIST_TEXT = 1083 : 1083 + XML_RNGP_PAT_NSNAME_EXCEPT_ANYNAME = 1084 : 1084 + XML_RNGP_PAT_NSNAME_EXCEPT_NSNAME = 1085 : 1085 + XML_RNGP_PAT_ONEMORE_GROUP_ATTR = 1086 : 1086 + XML_RNGP_PAT_ONEMORE_INTERLEAVE_ATTR = 1087 : 1087 + XML_RNGP_PAT_START_ATTR = 1088 : 1088 + XML_RNGP_PAT_START_DATA = 1089 : 1089 + XML_RNGP_PAT_START_EMPTY = 1090 : 1090 + XML_RNGP_PAT_START_GROUP = 1091 : 1091 + XML_RNGP_PAT_START_INTERLEAVE = 1092 : 1092 + XML_RNGP_PAT_START_LIST = 1093 : 1093 + XML_RNGP_PAT_START_ONEMORE = 1094 : 1094 + XML_RNGP_PAT_START_TEXT = 1095 : 1095 + XML_RNGP_PAT_START_VALUE = 1096 : 1096 + XML_RNGP_PREFIX_UNDEFINED = 1097 : 1097 + XML_RNGP_REF_CREATE_FAILED = 1098 : 1098 + XML_RNGP_REF_CYCLE = 1099 : 1099 + XML_RNGP_REF_NAME_INVALID = 1100 : 1100 + XML_RNGP_REF_NO_DEF = 1101 : 1101 + XML_RNGP_REF_NO_NAME = 1102 : 1102 + XML_RNGP_REF_NOT_EMPTY = 1103 : 1103 + XML_RNGP_START_CHOICE_AND_INTERLEAVE = 1104 : 1104 + XML_RNGP_START_CONTENT = 1105 : 1105 + XML_RNGP_START_EMPTY = 1106 : 1106 + XML_RNGP_START_MISSING = 1107 : 1107 + XML_RNGP_TEXT_EXPECTED = 1108 : 1108 + XML_RNGP_TEXT_HAS_CHILD = 1109 : 1109 + XML_RNGP_TYPE_MISSING = 1110 : 1110 + XML_RNGP_TYPE_NOT_FOUND = 1111 : 1111 + XML_RNGP_TYPE_VALUE = 1112 : 1112 + XML_RNGP_UNKNOWN_ATTRIBUTE = 1113 : 1113 + XML_RNGP_UNKNOWN_COMBINE = 1114 : 1114 + XML_RNGP_UNKNOWN_CONSTRUCT = 1115 : 1115 + XML_RNGP_UNKNOWN_TYPE_LIB = 1116 : 1116 + XML_RNGP_URI_FRAGMENT = 1117 : 1117 + XML_RNGP_URI_NOT_ABSOLUTE = 1118 : 1118 + XML_RNGP_VALUE_EMPTY = 1119 : 1119 + XML_RNGP_VALUE_NO_CONTENT = 1120 : 1120 + XML_RNGP_XMLNS_NAME = 1121 : 1121 + XML_RNGP_XML_NS = 1122 : 1122 + XML_XPATH_EXPRESSION_OK = 1200 + XML_XPATH_NUMBER_ERROR = 1201 : 1201 + XML_XPATH_UNFINISHED_LITERAL_ERROR = 1202 : 1202 + XML_XPATH_START_LITERAL_ERROR = 
1203 : 1203 + XML_XPATH_VARIABLE_REF_ERROR = 1204 : 1204 + XML_XPATH_UNDEF_VARIABLE_ERROR = 1205 : 1205 + XML_XPATH_INVALID_PREDICATE_ERROR = 1206 : 1206 + XML_XPATH_EXPR_ERROR = 1207 : 1207 + XML_XPATH_UNCLOSED_ERROR = 1208 : 1208 + XML_XPATH_UNKNOWN_FUNC_ERROR = 1209 : 1209 + XML_XPATH_INVALID_OPERAND = 1210 : 1210 + XML_XPATH_INVALID_TYPE = 1211 : 1211 + XML_XPATH_INVALID_ARITY = 1212 : 1212 + XML_XPATH_INVALID_CTXT_SIZE = 1213 : 1213 + XML_XPATH_INVALID_CTXT_POSITION = 1214 : 1214 + XML_XPATH_MEMORY_ERROR = 1215 : 1215 + XML_XPTR_SYNTAX_ERROR = 1216 : 1216 + XML_XPTR_RESOURCE_ERROR = 1217 : 1217 + XML_XPTR_SUB_RESOURCE_ERROR = 1218 : 1218 + XML_XPATH_UNDEF_PREFIX_ERROR = 1219 : 1219 + XML_XPATH_ENCODING_ERROR = 1220 : 1220 + XML_XPATH_INVALID_CHAR_ERROR = 1221 : 1221 + XML_TREE_INVALID_HEX = 1300 + XML_TREE_INVALID_DEC = 1301 : 1301 + XML_TREE_UNTERMINATED_ENTITY = 1302 : 1302 + XML_SAVE_NOT_UTF8 = 1400 + XML_SAVE_CHAR_INVALID = 1401 : 1401 + XML_SAVE_NO_DOCTYPE = 1402 : 1402 + XML_SAVE_UNKNOWN_ENCODING = 1403 : 1403 + XML_REGEXP_COMPILE_ERROR = 1450 + XML_IO_UNKNOWN = 1500 + XML_IO_EACCES = 1501 : 1501 + XML_IO_EAGAIN = 1502 : 1502 + XML_IO_EBADF = 1503 : 1503 + XML_IO_EBADMSG = 1504 : 1504 + XML_IO_EBUSY = 1505 : 1505 + XML_IO_ECANCELED = 1506 : 1506 + XML_IO_ECHILD = 1507 : 1507 + XML_IO_EDEADLK = 1508 : 1508 + XML_IO_EDOM = 1509 : 1509 + XML_IO_EEXIST = 1510 : 1510 + XML_IO_EFAULT = 1511 : 1511 + XML_IO_EFBIG = 1512 : 1512 + XML_IO_EINPROGRESS = 1513 : 1513 + XML_IO_EINTR = 1514 : 1514 + XML_IO_EINVAL = 1515 : 1515 + XML_IO_EIO = 1516 : 1516 + XML_IO_EISDIR = 1517 : 1517 + XML_IO_EMFILE = 1518 : 1518 + XML_IO_EMLINK = 1519 : 1519 + XML_IO_EMSGSIZE = 1520 : 1520 + XML_IO_ENAMETOOLONG = 1521 : 1521 + XML_IO_ENFILE = 1522 : 1522 + XML_IO_ENODEV = 1523 : 1523 + XML_IO_ENOENT = 1524 : 1524 + XML_IO_ENOEXEC = 1525 : 1525 + XML_IO_ENOLCK = 1526 : 1526 + XML_IO_ENOMEM = 1527 : 1527 + XML_IO_ENOSPC = 1528 : 1528 + XML_IO_ENOSYS = 1529 : 1529 + XML_IO_ENOTDIR = 1530 
: 1530 + XML_IO_ENOTEMPTY = 1531 : 1531 + XML_IO_ENOTSUP = 1532 : 1532 + XML_IO_ENOTTY = 1533 : 1533 + XML_IO_ENXIO = 1534 : 1534 + XML_IO_EPERM = 1535 : 1535 + XML_IO_EPIPE = 1536 : 1536 + XML_IO_ERANGE = 1537 : 1537 + XML_IO_EROFS = 1538 : 1538 + XML_IO_ESPIPE = 1539 : 1539 + XML_IO_ESRCH = 1540 : 1540 + XML_IO_ETIMEDOUT = 1541 : 1541 + XML_IO_EXDEV = 1542 : 1542 + XML_IO_NETWORK_ATTEMPT = 1543 : 1543 + XML_IO_ENCODER = 1544 : 1544 + XML_IO_FLUSH = 1545 : 1545 + XML_IO_WRITE = 1546 : 1546 + XML_IO_NO_INPUT = 1547 : 1547 + XML_IO_BUFFER_FULL = 1548 : 1548 + XML_IO_LOAD_ERROR = 1549 : 1549 + XML_IO_ENOTSOCK = 1550 : 1550 + XML_IO_EISCONN = 1551 : 1551 + XML_IO_ECONNREFUSED = 1552 : 1552 + XML_IO_ENETUNREACH = 1553 : 1553 + XML_IO_EADDRINUSE = 1554 : 1554 + XML_IO_EALREADY = 1555 : 1555 + XML_IO_EAFNOSUPPORT = 1556 : 1556 + XML_XINCLUDE_RECURSION = 1600 + XML_XINCLUDE_PARSE_VALUE = 1601 : 1601 + XML_XINCLUDE_ENTITY_DEF_MISMATCH = 1602 : 1602 + XML_XINCLUDE_NO_HREF = 1603 : 1603 + XML_XINCLUDE_NO_FALLBACK = 1604 : 1604 + XML_XINCLUDE_HREF_URI = 1605 : 1605 + XML_XINCLUDE_TEXT_FRAGMENT = 1606 : 1606 + XML_XINCLUDE_TEXT_DOCUMENT = 1607 : 1607 + XML_XINCLUDE_INVALID_CHAR = 1608 : 1608 + XML_XINCLUDE_BUILD_FAILED = 1609 : 1609 + XML_XINCLUDE_UNKNOWN_ENCODING = 1610 : 1610 + XML_XINCLUDE_MULTIPLE_ROOT = 1611 : 1611 + XML_XINCLUDE_XPTR_FAILED = 1612 : 1612 + XML_XINCLUDE_XPTR_RESULT = 1613 : 1613 + XML_XINCLUDE_INCLUDE_IN_INCLUDE = 1614 : 1614 + XML_XINCLUDE_FALLBACKS_IN_INCLUDE = 1615 : 1615 + XML_XINCLUDE_FALLBACK_NOT_IN_INCLUDE = 1616 : 1616 + XML_XINCLUDE_DEPRECATED_NS = 1617 : 1617 + XML_XINCLUDE_FRAGMENT_ID = 1618 : 1618 + XML_CATALOG_MISSING_ATTR = 1650 + XML_CATALOG_ENTRY_BROKEN = 1651 : 1651 + XML_CATALOG_PREFER_VALUE = 1652 : 1652 + XML_CATALOG_NOT_CATALOG = 1653 : 1653 + XML_CATALOG_RECURSION = 1654 : 1654 + XML_SCHEMAP_PREFIX_UNDEFINED = 1700 + XML_SCHEMAP_ATTRFORMDEFAULT_VALUE = 1701 : 1701 + XML_SCHEMAP_ATTRGRP_NONAME_NOREF = 1702 : 1702 + 
XML_SCHEMAP_ATTR_NONAME_NOREF = 1703 : 1703 + XML_SCHEMAP_COMPLEXTYPE_NONAME_NOREF = 1704 : 1704 + XML_SCHEMAP_ELEMFORMDEFAULT_VALUE = 1705 : 1705 + XML_SCHEMAP_ELEM_NONAME_NOREF = 1706 : 1706 + XML_SCHEMAP_EXTENSION_NO_BASE = 1707 : 1707 + XML_SCHEMAP_FACET_NO_VALUE = 1708 : 1708 + XML_SCHEMAP_FAILED_BUILD_IMPORT = 1709 : 1709 + XML_SCHEMAP_GROUP_NONAME_NOREF = 1710 : 1710 + XML_SCHEMAP_IMPORT_NAMESPACE_NOT_URI = 1711 : 1711 + XML_SCHEMAP_IMPORT_REDEFINE_NSNAME = 1712 : 1712 + XML_SCHEMAP_IMPORT_SCHEMA_NOT_URI = 1713 : 1713 + XML_SCHEMAP_INVALID_BOOLEAN = 1714 : 1714 + XML_SCHEMAP_INVALID_ENUM = 1715 : 1715 + XML_SCHEMAP_INVALID_FACET = 1716 : 1716 + XML_SCHEMAP_INVALID_FACET_VALUE = 1717 : 1717 + XML_SCHEMAP_INVALID_MAXOCCURS = 1718 : 1718 + XML_SCHEMAP_INVALID_MINOCCURS = 1719 : 1719 + XML_SCHEMAP_INVALID_REF_AND_SUBTYPE = 1720 : 1720 + XML_SCHEMAP_INVALID_WHITE_SPACE = 1721 : 1721 + XML_SCHEMAP_NOATTR_NOREF = 1722 : 1722 + XML_SCHEMAP_NOTATION_NO_NAME = 1723 : 1723 + XML_SCHEMAP_NOTYPE_NOREF = 1724 : 1724 + XML_SCHEMAP_REF_AND_SUBTYPE = 1725 : 1725 + XML_SCHEMAP_RESTRICTION_NONAME_NOREF = 1726 : 1726 + XML_SCHEMAP_SIMPLETYPE_NONAME = 1727 : 1727 + XML_SCHEMAP_TYPE_AND_SUBTYPE = 1728 : 1728 + XML_SCHEMAP_UNKNOWN_ALL_CHILD = 1729 : 1729 + XML_SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD = 1730 : 1730 + XML_SCHEMAP_UNKNOWN_ATTR_CHILD = 1731 : 1731 + XML_SCHEMAP_UNKNOWN_ATTRGRP_CHILD = 1732 : 1732 + XML_SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP = 1733 : 1733 + XML_SCHEMAP_UNKNOWN_BASE_TYPE = 1734 : 1734 + XML_SCHEMAP_UNKNOWN_CHOICE_CHILD = 1735 : 1735 + XML_SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD = 1736 : 1736 + XML_SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD = 1737 : 1737 + XML_SCHEMAP_UNKNOWN_ELEM_CHILD = 1738 : 1738 + XML_SCHEMAP_UNKNOWN_EXTENSION_CHILD = 1739 : 1739 + XML_SCHEMAP_UNKNOWN_FACET_CHILD = 1740 : 1740 + XML_SCHEMAP_UNKNOWN_FACET_TYPE = 1741 : 1741 + XML_SCHEMAP_UNKNOWN_GROUP_CHILD = 1742 : 1742 + XML_SCHEMAP_UNKNOWN_IMPORT_CHILD = 1743 : 1743 + XML_SCHEMAP_UNKNOWN_LIST_CHILD = 1744 
: 1744 + XML_SCHEMAP_UNKNOWN_NOTATION_CHILD = 1745 : 1745 + XML_SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD = 1746 : 1746 + XML_SCHEMAP_UNKNOWN_REF = 1747 : 1747 + XML_SCHEMAP_UNKNOWN_RESTRICTION_CHILD = 1748 : 1748 + XML_SCHEMAP_UNKNOWN_SCHEMAS_CHILD = 1749 : 1749 + XML_SCHEMAP_UNKNOWN_SEQUENCE_CHILD = 1750 : 1750 + XML_SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD = 1751 : 1751 + XML_SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD = 1752 : 1752 + XML_SCHEMAP_UNKNOWN_TYPE = 1753 : 1753 + XML_SCHEMAP_UNKNOWN_UNION_CHILD = 1754 : 1754 + XML_SCHEMAP_ELEM_DEFAULT_FIXED = 1755 : 1755 + XML_SCHEMAP_REGEXP_INVALID = 1756 : 1756 + XML_SCHEMAP_FAILED_LOAD = 1757 : 1757 + XML_SCHEMAP_NOTHING_TO_PARSE = 1758 : 1758 + XML_SCHEMAP_NOROOT = 1759 : 1759 + XML_SCHEMAP_REDEFINED_GROUP = 1760 : 1760 + XML_SCHEMAP_REDEFINED_TYPE = 1761 : 1761 + XML_SCHEMAP_REDEFINED_ELEMENT = 1762 : 1762 + XML_SCHEMAP_REDEFINED_ATTRGROUP = 1763 : 1763 + XML_SCHEMAP_REDEFINED_ATTR = 1764 : 1764 + XML_SCHEMAP_REDEFINED_NOTATION = 1765 : 1765 + XML_SCHEMAP_FAILED_PARSE = 1766 : 1766 + XML_SCHEMAP_UNKNOWN_PREFIX = 1767 : 1767 + XML_SCHEMAP_DEF_AND_PREFIX = 1768 : 1768 + XML_SCHEMAP_UNKNOWN_INCLUDE_CHILD = 1769 : 1769 + XML_SCHEMAP_INCLUDE_SCHEMA_NOT_URI = 1770 : 1770 + XML_SCHEMAP_INCLUDE_SCHEMA_NO_URI = 1771 : 1771 + XML_SCHEMAP_NOT_SCHEMA = 1772 : 1772 + XML_SCHEMAP_UNKNOWN_MEMBER_TYPE = 1773 : 1773 + XML_SCHEMAP_INVALID_ATTR_USE = 1774 : 1774 + XML_SCHEMAP_RECURSIVE = 1775 : 1775 + XML_SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE = 1776 : 1776 + XML_SCHEMAP_INVALID_ATTR_COMBINATION = 1777 : 1777 + XML_SCHEMAP_INVALID_ATTR_INLINE_COMBINATION = 1778 : 1778 + XML_SCHEMAP_MISSING_SIMPLETYPE_CHILD = 1779 : 1779 + XML_SCHEMAP_INVALID_ATTR_NAME = 1780 : 1780 + XML_SCHEMAP_REF_AND_CONTENT = 1781 : 1781 + XML_SCHEMAP_CT_PROPS_CORRECT_1 = 1782 : 1782 + XML_SCHEMAP_CT_PROPS_CORRECT_2 = 1783 : 1783 + XML_SCHEMAP_CT_PROPS_CORRECT_3 = 1784 : 1784 + XML_SCHEMAP_CT_PROPS_CORRECT_4 = 1785 : 1785 + XML_SCHEMAP_CT_PROPS_CORRECT_5 = 1786 : 1786 + 
XML_SCHEMAP_DERIVATION_OK_RESTRICTION_1 = 1787 : 1787 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1 = 1788 : 1788 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2 = 1789 : 1789 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_2 = 1790 : 1790 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_3 = 1791 : 1791 + XML_SCHEMAP_WILDCARD_INVALID_NS_MEMBER = 1792 : 1792 + XML_SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE = 1793 : 1793 + XML_SCHEMAP_UNION_NOT_EXPRESSIBLE = 1794 : 1794 + XML_SCHEMAP_SRC_IMPORT_3_1 = 1795 : 1795 + XML_SCHEMAP_SRC_IMPORT_3_2 = 1796 : 1796 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_1 = 1797 : 1797 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_2 = 1798 : 1798 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_3 = 1799 : 1799 + XML_SCHEMAP_COS_CT_EXTENDS_1_3 = 1800 : 1800 + XML_SCHEMAV_NOROOT = 1801 + XML_SCHEMAV_UNDECLAREDELEM = 1802 : 1802 + XML_SCHEMAV_NOTTOPLEVEL = 1803 : 1803 + XML_SCHEMAV_MISSING = 1804 : 1804 + XML_SCHEMAV_WRONGELEM = 1805 : 1805 + XML_SCHEMAV_NOTYPE = 1806 : 1806 + XML_SCHEMAV_NOROLLBACK = 1807 : 1807 + XML_SCHEMAV_ISABSTRACT = 1808 : 1808 + XML_SCHEMAV_NOTEMPTY = 1809 : 1809 + XML_SCHEMAV_ELEMCONT = 1810 : 1810 + XML_SCHEMAV_HAVEDEFAULT = 1811 : 1811 + XML_SCHEMAV_NOTNILLABLE = 1812 : 1812 + XML_SCHEMAV_EXTRACONTENT = 1813 : 1813 + XML_SCHEMAV_INVALIDATTR = 1814 : 1814 + XML_SCHEMAV_INVALIDELEM = 1815 : 1815 + XML_SCHEMAV_NOTDETERMINIST = 1816 : 1816 + XML_SCHEMAV_CONSTRUCT = 1817 : 1817 + XML_SCHEMAV_INTERNAL = 1818 : 1818 + XML_SCHEMAV_NOTSIMPLE = 1819 : 1819 + XML_SCHEMAV_ATTRUNKNOWN = 1820 : 1820 + XML_SCHEMAV_ATTRINVALID = 1821 : 1821 + XML_SCHEMAV_VALUE = 1822 : 1822 + XML_SCHEMAV_FACET = 1823 : 1823 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_1 = 1824 : 1824 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_2 = 1825 : 1825 + XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_3 = 1826 : 1826 + XML_SCHEMAV_CVC_TYPE_3_1_1 = 1827 : 1827 + XML_SCHEMAV_CVC_TYPE_3_1_2 = 1828 : 1828 + XML_SCHEMAV_CVC_FACET_VALID = 1829 : 1829 + XML_SCHEMAV_CVC_LENGTH_VALID = 1830 : 1830 + 
XML_SCHEMAV_CVC_MINLENGTH_VALID = 1831 : 1831 + XML_SCHEMAV_CVC_MAXLENGTH_VALID = 1832 : 1832 + XML_SCHEMAV_CVC_MININCLUSIVE_VALID = 1833 : 1833 + XML_SCHEMAV_CVC_MAXINCLUSIVE_VALID = 1834 : 1834 + XML_SCHEMAV_CVC_MINEXCLUSIVE_VALID = 1835 : 1835 + XML_SCHEMAV_CVC_MAXEXCLUSIVE_VALID = 1836 : 1836 + XML_SCHEMAV_CVC_TOTALDIGITS_VALID = 1837 : 1837 + XML_SCHEMAV_CVC_FRACTIONDIGITS_VALID = 1838 : 1838 + XML_SCHEMAV_CVC_PATTERN_VALID = 1839 : 1839 + XML_SCHEMAV_CVC_ENUMERATION_VALID = 1840 : 1840 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_1 = 1841 : 1841 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_2 = 1842 : 1842 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_3 = 1843 : 1843 + XML_SCHEMAV_CVC_COMPLEX_TYPE_2_4 = 1844 : 1844 + XML_SCHEMAV_CVC_ELT_1 = 1845 : 1845 + XML_SCHEMAV_CVC_ELT_2 = 1846 : 1846 + XML_SCHEMAV_CVC_ELT_3_1 = 1847 : 1847 + XML_SCHEMAV_CVC_ELT_3_2_1 = 1848 : 1848 + XML_SCHEMAV_CVC_ELT_3_2_2 = 1849 : 1849 + XML_SCHEMAV_CVC_ELT_4_1 = 1850 : 1850 + XML_SCHEMAV_CVC_ELT_4_2 = 1851 : 1851 + XML_SCHEMAV_CVC_ELT_4_3 = 1852 : 1852 + XML_SCHEMAV_CVC_ELT_5_1_1 = 1853 : 1853 + XML_SCHEMAV_CVC_ELT_5_1_2 = 1854 : 1854 + XML_SCHEMAV_CVC_ELT_5_2_1 = 1855 : 1855 + XML_SCHEMAV_CVC_ELT_5_2_2_1 = 1856 : 1856 + XML_SCHEMAV_CVC_ELT_5_2_2_2_1 = 1857 : 1857 + XML_SCHEMAV_CVC_ELT_5_2_2_2_2 = 1858 : 1858 + XML_SCHEMAV_CVC_ELT_6 = 1859 : 1859 + XML_SCHEMAV_CVC_ELT_7 = 1860 : 1860 + XML_SCHEMAV_CVC_ATTRIBUTE_1 = 1861 : 1861 + XML_SCHEMAV_CVC_ATTRIBUTE_2 = 1862 : 1862 + XML_SCHEMAV_CVC_ATTRIBUTE_3 = 1863 : 1863 + XML_SCHEMAV_CVC_ATTRIBUTE_4 = 1864 : 1864 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_1 = 1865 : 1865 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_1 = 1866 : 1866 + XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_2 = 1867 : 1867 + XML_SCHEMAV_CVC_COMPLEX_TYPE_4 = 1868 : 1868 + XML_SCHEMAV_CVC_COMPLEX_TYPE_5_1 = 1869 : 1869 + XML_SCHEMAV_CVC_COMPLEX_TYPE_5_2 = 1870 : 1870 + XML_SCHEMAV_ELEMENT_CONTENT = 1871 : 1871 + XML_SCHEMAV_DOCUMENT_ELEMENT_MISSING = 1872 : 1872 + XML_SCHEMAV_CVC_COMPLEX_TYPE_1 = 1873 : 1873 + XML_SCHEMAV_CVC_AU = 1874 : 
1874 + XML_SCHEMAV_CVC_TYPE_1 = 1875 : 1875 + XML_SCHEMAV_CVC_TYPE_2 = 1876 : 1876 + XML_SCHEMAV_CVC_IDC = 1877 : 1877 + XML_SCHEMAV_CVC_WILDCARD = 1878 : 1878 + XML_XPTR_UNKNOWN_SCHEME = 1900 + XML_XPTR_CHILDSEQ_START = 1901 : 1901 + XML_XPTR_EVAL_FAILED = 1902 : 1902 + XML_XPTR_EXTRA_OBJECTS = 1903 : 1903 + XML_C14N_CREATE_CTXT = 1950 + XML_C14N_REQUIRES_UTF8 = 1951 : 1951 + XML_C14N_CREATE_STACK = 1952 : 1952 + XML_C14N_INVALID_NODE = 1953 : 1953 + XML_C14N_UNKNOW_NODE = 1954 : 1954 + XML_C14N_RELATIVE_NAMESPACE = 1955 : 1955 + XML_FTP_PASV_ANSWER = 2000 + XML_FTP_EPSV_ANSWER = 2001 : 2001 + XML_FTP_ACCNT = 2002 : 2002 + XML_FTP_URL_SYNTAX = 2003 : 2003 + XML_HTTP_URL_SYNTAX = 2020 + XML_HTTP_USE_IP = 2021 : 2021 + XML_HTTP_UNKNOWN_HOST = 2022 : 2022 + XML_SCHEMAP_SRC_SIMPLE_TYPE_1 = 3000 + XML_SCHEMAP_SRC_SIMPLE_TYPE_2 = 3001 : 3001 + XML_SCHEMAP_SRC_SIMPLE_TYPE_3 = 3002 : 3002 + XML_SCHEMAP_SRC_SIMPLE_TYPE_4 = 3003 : 3003 + XML_SCHEMAP_SRC_RESOLVE = 3004 : 3004 + XML_SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE = 3005 : 3005 + XML_SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE = 3006 : 3006 + XML_SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES = 3007 : 3007 + XML_SCHEMAP_ST_PROPS_CORRECT_1 = 3008 : 3008 + XML_SCHEMAP_ST_PROPS_CORRECT_2 = 3009 : 3009 + XML_SCHEMAP_ST_PROPS_CORRECT_3 = 3010 : 3010 + XML_SCHEMAP_COS_ST_RESTRICTS_1_1 = 3011 : 3011 + XML_SCHEMAP_COS_ST_RESTRICTS_1_2 = 3012 : 3012 + XML_SCHEMAP_COS_ST_RESTRICTS_1_3_1 = 3013 : 3013 + XML_SCHEMAP_COS_ST_RESTRICTS_1_3_2 = 3014 : 3014 + XML_SCHEMAP_COS_ST_RESTRICTS_2_1 = 3015 : 3015 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_1 = 3016 : 3016 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_2 = 3017 : 3017 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_1 = 3018 : 3018 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_2 = 3019 : 3019 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_3 = 3020 : 3020 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_4 = 3021 : 3021 + XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_5 = 3022 : 3022 + XML_SCHEMAP_COS_ST_RESTRICTS_3_1 = 3023 : 3023 + 
XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1 = 3024 : 3024 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1_2 = 3025 : 3025 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_2 = 3026 : 3026 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_1 = 3027 : 3027 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_3 = 3028 : 3028 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_4 = 3029 : 3029 + XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_5 = 3030 : 3030 + XML_SCHEMAP_COS_ST_DERIVED_OK_2_1 = 3031 : 3031 + XML_SCHEMAP_COS_ST_DERIVED_OK_2_2 = 3032 : 3032 + XML_SCHEMAP_S4S_ELEM_NOT_ALLOWED = 3033 : 3033 + XML_SCHEMAP_S4S_ELEM_MISSING = 3034 : 3034 + XML_SCHEMAP_S4S_ATTR_NOT_ALLOWED = 3035 : 3035 + XML_SCHEMAP_S4S_ATTR_MISSING = 3036 : 3036 + XML_SCHEMAP_S4S_ATTR_INVALID_VALUE = 3037 : 3037 + XML_SCHEMAP_SRC_ELEMENT_1 = 3038 : 3038 + XML_SCHEMAP_SRC_ELEMENT_2_1 = 3039 : 3039 + XML_SCHEMAP_SRC_ELEMENT_2_2 = 3040 : 3040 + XML_SCHEMAP_SRC_ELEMENT_3 = 3041 : 3041 + XML_SCHEMAP_P_PROPS_CORRECT_1 = 3042 : 3042 + XML_SCHEMAP_P_PROPS_CORRECT_2_1 = 3043 : 3043 + XML_SCHEMAP_P_PROPS_CORRECT_2_2 = 3044 : 3044 + XML_SCHEMAP_E_PROPS_CORRECT_2 = 3045 : 3045 + XML_SCHEMAP_E_PROPS_CORRECT_3 = 3046 : 3046 + XML_SCHEMAP_E_PROPS_CORRECT_4 = 3047 : 3047 + XML_SCHEMAP_E_PROPS_CORRECT_5 = 3048 : 3048 + XML_SCHEMAP_E_PROPS_CORRECT_6 = 3049 : 3049 + XML_SCHEMAP_SRC_INCLUDE = 3050 : 3050 + XML_SCHEMAP_SRC_ATTRIBUTE_1 = 3051 : 3051 + XML_SCHEMAP_SRC_ATTRIBUTE_2 = 3052 : 3052 + XML_SCHEMAP_SRC_ATTRIBUTE_3_1 = 3053 : 3053 + XML_SCHEMAP_SRC_ATTRIBUTE_3_2 = 3054 : 3054 + XML_SCHEMAP_SRC_ATTRIBUTE_4 = 3055 : 3055 + XML_SCHEMAP_NO_XMLNS = 3056 : 3056 + XML_SCHEMAP_NO_XSI = 3057 : 3057 + XML_SCHEMAP_COS_VALID_DEFAULT_1 = 3058 : 3058 + XML_SCHEMAP_COS_VALID_DEFAULT_2_1 = 3059 : 3059 + XML_SCHEMAP_COS_VALID_DEFAULT_2_2_1 = 3060 : 3060 + XML_SCHEMAP_COS_VALID_DEFAULT_2_2_2 = 3061 : 3061 + XML_SCHEMAP_CVC_SIMPLE_TYPE = 3062 : 3062 + XML_SCHEMAP_COS_CT_EXTENDS_1_1 = 3063 : 3063 + XML_SCHEMAP_SRC_IMPORT_1_1 = 3064 : 3064 + XML_SCHEMAP_SRC_IMPORT_1_2 = 3065 : 3065 + XML_SCHEMAP_SRC_IMPORT_2 = 3066 
: 3066 + XML_SCHEMAP_SRC_IMPORT_2_1 = 3067 : 3067 + XML_SCHEMAP_SRC_IMPORT_2_2 = 3068 : 3068 + XML_SCHEMAP_INTERNAL = 3069 : 3069 non-W3C + XML_SCHEMAP_NOT_DETERMINISTIC = 3070 : 3070 non-W3C + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_1 = 3071 : 3071 + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_2 = 3072 : 3072 + XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_3 = 3073 : 3073 + XML_SCHEMAP_MG_PROPS_CORRECT_1 = 3074 : 3074 + XML_SCHEMAP_MG_PROPS_CORRECT_2 = 3075 : 3075 + XML_SCHEMAP_SRC_CT_1 = 3076 : 3076 + XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077 : 3077 + XML_SCHEMAP_AU_PROPS_CORRECT_2 = 3078 : 3078 + XML_SCHEMAP_A_PROPS_CORRECT_2 = 3079 : 3079 + XML_SCHEMAP_C_PROPS_CORRECT = 3080 : 3080 + XML_SCHEMAP_SRC_REDEFINE = 3081 : 3081 + XML_SCHEMAP_SRC_IMPORT = 3082 : 3082 + XML_SCHEMAP_WARN_SKIP_SCHEMA = 3083 : 3083 + XML_SCHEMAP_WARN_UNLOCATED_SCHEMA = 3084 : 3084 + XML_SCHEMAP_WARN_ATTR_REDECL_PROH = 3085 : 3085 + XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH = 3086 : 3085 + XML_SCHEMAP_AG_PROPS_CORRECT = 3087 : 3086 + XML_SCHEMAP_COS_CT_EXTENDS_1_2 = 3088 : 3087 + XML_SCHEMAP_AU_PROPS_CORRECT = 3089 : 3088 + XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 : 3089 + XML_SCHEMAP_COS_ALL_LIMITED = 3091 : 3090 + XML_MODULE_OPEN = 4900 : 4900 + XML_MODULE_CLOSE = 4901 : 4901 + XML_CHECK_FOUND_ELEMENT = 5000 + XML_CHECK_FOUND_ATTRIBUTE = 5001 : 5001 + XML_CHECK_FOUND_TEXT = 5002 : 5002 + XML_CHECK_FOUND_CDATA = 5003 : 5003 + XML_CHECK_FOUND_ENTITYREF = 5004 : 5004 + XML_CHECK_FOUND_ENTITY = 5005 : 5005 + XML_CHECK_FOUND_PI = 5006 : 5006 + XML_CHECK_FOUND_COMMENT = 5007 : 5007 + XML_CHECK_FOUND_DOCTYPE = 5008 : 5008 + XML_CHECK_FOUND_FRAGMENT = 5009 : 5009 + XML_CHECK_FOUND_NOTATION = 5010 : 5010 + XML_CHECK_UNKNOWN_NODE = 5011 : 5011 + XML_CHECK_ENTITY_TYPE = 5012 : 5012 + XML_CHECK_NO_PARENT = 5013 : 5013 + XML_CHECK_NO_DOC = 5014 : 5014 + XML_CHECK_NO_NAME = 5015 : 5015 + XML_CHECK_NO_ELEM = 5016 : 5016 + XML_CHECK_WRONG_DOC = 5017 : 5017 + XML_CHECK_NO_PREV = 5018 : 5018 + XML_CHECK_WRONG_PREV = 5019 : 5019 + 
XML_CHECK_NO_NEXT = 5020 : 5020 + XML_CHECK_WRONG_NEXT = 5021 : 5021 + XML_CHECK_NOT_DTD = 5022 : 5022 + XML_CHECK_NOT_ATTR = 5023 : 5023 + XML_CHECK_NOT_ATTR_DECL = 5024 : 5024 + XML_CHECK_NOT_ELEM_DECL = 5025 : 5025 + XML_CHECK_NOT_ENTITY_DECL = 5026 : 5026 + XML_CHECK_NOT_NS_DECL = 5027 : 5027 + XML_CHECK_NO_HREF = 5028 : 5028 + XML_CHECK_WRONG_PARENT = 5029 : 5029 + XML_CHECK_NS_SCOPE = 5030 : 5030 + XML_CHECK_NS_ANCESTOR = 5031 : 5031 + XML_CHECK_NOT_UTF8 = 5032 : 5032 + XML_CHECK_NO_DICT = 5033 : 5033 + XML_CHECK_NOT_NCNAME = 5034 : 5034 + XML_CHECK_OUTSIDE_DICT = 5035 : 5035 + XML_CHECK_WRONG_NAME = 5036 : 5036 + XML_CHECK_NAME_NOT_NULL = 5037 : 5037 + XML_I18N_NO_NAME = 6000 + XML_I18N_NO_HANDLER = 6001 : 6001 + XML_I18N_EXCESS_HANDLER = 6002 : 6002 + XML_I18N_CONV_FAILED = 6003 : 6003 + XML_I18N_NO_OUTPUT = 6004 : 6004 + XML_CHECK_ = 6005 : 5033 + XML_CHECK_X = 6006 : 503 +""" + +__initErrorConstants() From scoder at codespeak.net Mon May 29 10:49:13 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 10:49:13 +0200 (CEST) Subject: [Lxml-checkins] r27825 - lxml/trunk Message-ID: <20060529084913.9A32A10053@code0.codespeak.net> Author: scoder Date: Mon May 29 10:49:12 2006 New Revision: 27825 Modified: lxml/trunk/CREDITS.txt Log: credits for noah Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Mon May 29 10:49:12 2006 @@ -32,6 +32,8 @@ David Sankel - building statically on Windows +Noah Slater - lots of bug squeezing + Duncan Booth - bugfixing Dean Pavlekovic - bug reporting From scoder at codespeak.net Mon May 29 11:26:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 11:26:04 +0200 (CEST) Subject: [Lxml-checkins] r27827 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20060529092604.3B11910057@code0.codespeak.net> Author: scoder Date: Mon May 29 11:25:59 2006 New Revision: 27827 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/c14n.pxd lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/tests/test_elementtree.py Log: fixed resetting element namespace, rewrite of C14N handling to use file-like objects and provide error messages Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon May 29 11:25:59 2006 @@ -7,6 +7,8 @@ Features added -------------- +* Writing C14N no longer serializes in memory (reduced memory footprint) + * PyErrorLog for error logging through the Python ``logging`` module * ``element.getroottree()`` returns an ElementTree for the root node of the @@ -22,6 +24,9 @@ Bugs fixed ---------- +* Setting namespace-less tag names on namespaced elements ('{ns}t' -> 't') + didn't reset the namespace + * Unknown constants from newer libxml2 versions could raise exceptions in the error handlers Modified: lxml/trunk/src/lxml/c14n.pxd ============================================================================== --- lxml/trunk/src/lxml/c14n.pxd (original) +++ lxml/trunk/src/lxml/c14n.pxd Mon May 29 11:25:59 2006 @@ -1,4 +1,4 @@ -from tree cimport xmlDoc +from tree cimport xmlDoc, xmlOutputBuffer from xpath cimport xmlNodeSet cdef extern from "libxml/c14n.h": @@ -8,4 +8,19 @@ char** inclusive_ns_prefixes, int with_comments, char** doc_txt_ptr) + + cdef int xmlC14NDocSave(xmlDoc* doc, + xmlNodeSet* nodes, + int exclusive, + char** inclusive_ns_prefixes, + int with_comments, + char* filename, + int compression) + + cdef int xmlC14NDocSaveTo(xmlDoc* doc, + xmlNodeSet* nodes, + int exclusive, + char** inclusive_ns_prefixes, + int with_comments, + xmlOutputBuffer* buffer) Modified: 
lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 29 11:25:59 2006 @@ -244,6 +244,8 @@ object node_ns_utf, object nsmap): """Lookup current namespace prefixes, then set namespace structure for node and register new ns-prefix mappings. + + This only works for a newly created node! """ cdef xmlNs* c_ns cdef xmlDoc* c_doc @@ -251,7 +253,7 @@ cdef char* c_href if not nsmap: if node_ns_utf is not None: - self._setNodeNs(c_node, node_ns_utf) + self._setNodeNs(c_node, _cstr(node_ns_utf)) return c_doc = self._c_doc @@ -272,7 +274,7 @@ node_ns_utf = None if node_ns_utf is not None: - self._setNodeNs(c_node, node_ns_utf) + self._setNodeNs(c_node, _cstr(node_ns_utf)) cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser): cdef _Document result @@ -510,7 +512,7 @@ self._assertHasRoot() schema = XMLSchema(xmlschema) return schema.validate(self) - + def xinclude(self): """Process this document, including using XInclude. """ @@ -525,30 +527,13 @@ result = xinclude.xmlXIncludeProcessTree(self._context_node._c_node) if result == -1: raise XIncludeError, "XInclude processing failed" - + def write_c14n(self, file): """C14N write of document. Always writes UTF-8. 
""" - cdef xmlDoc* c_base_doc - cdef xmlDoc* c_doc - cdef char* data - cdef int bytes self._assertHasRoot() - c_base_doc = self._context_node._doc._c_doc + _tofilelikeC14N(file, self._context_node) - c_doc = _fakeRootDoc(c_base_doc, self._context_node._c_node) - bytes = c14n.xmlC14NDocDumpMemory(c_doc, NULL, 0, NULL, 1, &data) - _destroyFakeDoc(c_base_doc, c_doc) - - if bytes < 0: - raise C14NError, "C14N failed" - try: - if not hasattr(file, 'write'): - file = open(file, 'wb') - file.write(data) - finally: - tree.xmlFree(data) - cdef _ElementTree _elementTreeFactory(_Document doc, _NodeBase context_node): return _newElementTree(doc, context_node, _ElementTree) @@ -707,13 +692,13 @@ return self._tag def __set__(self, value): - cdef xmlNs* c_ns ns, text = _getNsTag(value) self._tag = value tree.xmlNodeSetName(self._c_node, _cstr(text)) if ns is None: - return - self._doc._setNodeNs(self._c_node, _cstr(ns)) + self._c_node.ns = NULL + else: + self._doc._setNodeNs(self._c_node, _cstr(ns)) # not in ElementTree, read-only property prefix: @@ -982,7 +967,6 @@ else: assert 0, "Unknown node type: %s" % c_node.type result = element_class() - result._tag = None result._doc = doc result._c_node = c_node result._proxy_type = PROXY_ELEMENT Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 11:25:59 2006 @@ -4,6 +4,9 @@ cimport htmlparser from xmlparser cimport xmlParserCtxt, xmlDict +# initialize parser (and threading) +xmlparser.xmlInitParser() + class XMLSyntaxError(LxmlSyntaxError): pass @@ -28,11 +31,6 @@ if self._c_dict is not NULL: xmlparser.xmlDictFree(self._c_dict) - cdef void _initParser(self): - if not self._initialized: - xmlparser.xmlInitParser() - self._initialized = 1 - cdef void _initParserDict(self, xmlParserCtxt* pctxt): "Assure we always use the same string dictionary." 
if self._c_dict is NULL or self._c_dict is pctxt.dict: @@ -596,7 +594,6 @@ cdef Py_ssize_t c_len if parser is None: parser = __DEFAULT_PARSER - __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL else: @@ -611,7 +608,6 @@ cdef xmlDoc* _parseDocFromFile(filename, _BaseParser parser) except NULL: if parser is None: parser = __DEFAULT_PARSER - __GLOBAL_PARSER_CONTEXT._initParser() return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename)) cdef xmlDoc* _parseDocFromFilelike(source, filename, @@ -619,7 +615,6 @@ cdef char* c_filename if parser is None: parser = __DEFAULT_PARSER - __GLOBAL_PARSER_CONTEXT._initParser() if not filename: c_filename = NULL else: Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Mon May 29 11:25:59 2006 @@ -100,12 +100,14 @@ cdef class _FilelikeWriter: cdef object _filelike cdef _ExceptionContext _exc_context + cdef _ErrorLog error_log def __init__(self, filelike, exc_context=None): self._filelike = filelike if exc_context is None: self._exc_context = _ExceptionContext() else: self._exc_context = exc_context + self.error_log = _ErrorLog() cdef tree.xmlOutputBuffer* _createOutputBuffer( self, tree.xmlCharEncodingHandler* enchandler) except NULL: @@ -173,6 +175,42 @@ if writer is not None: writer._exc_context._raise_if_stored() +cdef _tofilelikeC14N(f, _NodeBase element): + cdef _FilelikeWriter writer + cdef tree.xmlOutputBuffer* c_buffer + cdef xmlDoc* c_base_doc + cdef xmlDoc* c_doc + cdef int bytes + + c_base_doc = element._c_node.doc + c_doc = _fakeRootDoc(c_base_doc, element._c_node) + try: + if python.PyString_Check(f) or python.PyUnicode_Check(f): + filename = _utf8(f) + bytes = c14n.xmlC14NDocSave(c_doc, NULL, 0, NULL, 1, + _cstr(filename), 0) + elif hasattr(f, 'write'): + writer = _FilelikeWriter(f) + c_buffer = writer._createOutputBuffer(NULL) + 
writer.error_log.connect() + bytes = c14n.xmlC14NDocSaveTo(c_doc, NULL, 0, NULL, 1, c_buffer) + writer.error_log.disconnect() + tree.xmlOutputBufferClose(c_buffer) + else: + raise TypeError, "File or filename expected, got '%s'" % type(f) + finally: + _destroyFakeDoc(c_base_doc, c_doc) + + if writer is not None: + writer._exc_context._raise_if_stored() + + if bytes < 0: + if writer is not None and len(writer.error_log): + message = writer.error_log[0].message + else: + message = "C14N failed" + raise C14NError, message + # dump node to file (mainly for debug) cdef _dumpToFile(f, xmlNode* c_node, int pretty_print): Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 29 11:25:59 2006 @@ -823,7 +823,7 @@ self.assertXML( 'C2', a) - + def test_tag_write(self): Element = self.etree.Element SubElement = self.etree.SubElement @@ -841,6 +841,43 @@ '', a) + def test_tag_reset_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + tostring = self.etree.tostring + + a = Element('{a}a') + b1 = SubElement(a, '{a}b') + b2 = SubElement(a, '{b}b') + + self.assertEquals('{a}b', b1.tag) + + b1.tag = 'c' + + # can't use C14N here! + self.assertEquals('c', b1.tag) + self.assertEquals(' Author: scoder Date: Mon May 29 11:44:16 2006 New Revision: 27830 Modified: lxml/trunk/doc/FAQ.txt lxml/trunk/src/lxml/parser.pxi Log: cleanup, extended FAQ section on multi-threading Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon May 29 11:44:16 2006 @@ -64,21 +64,26 @@ #) Can I use threads to concurrently access the lxml API? You should be able to use lxml in a multi-threaded environment, although - this is not very well tested. 
Note that lxml does not provide any - thread-safety by itself (mainly for performance reasons), so you have to - take care when you use parts of the API concurrently. Most importantly, - you must not forget to call ``etree.initThread()`` from each newly - generated thread to initialize lxml and libxml2 for the new thread context. - If you call API functions from a thread without having called this function - first, lxml can easily crash your program. + support is limited and not very well tested. Note that lxml does not + provide any thread-safety by itself (mainly for performance reasons), so + you have to take care when you use parts of the API concurrently. Most + importantly, you must not forget to call ``etree.initThread()`` from each + newly generated thread to initialize lxml and libxml2 for the new thread + context. If you call API functions from a thread without having called + this function first, lxml can easily crash your program. + + Tree modification is not thread-safe, so you must take care to properly + serialize modifications. Reading from a tree concurrently should not + produce any problems (otherwise it is a bug). Basically none of the API classes is thread-safe, including parsers, XPath, - XSLT and the validators. You cannot use such an object concurrently. - However, it is perfectly viable to create independent instances for each - thread. This is a cheap thing to do for parsers, but more expensive for - XSLT and validators, which have to compile trees recursively. So you might - want to consider a thread pool approach or threaded processing chains to - reduce the overhead if you require threading here. + XSLT and the validators. Each of them represents a stateful object that + cannot be used concurrently. However, it is perfectly viable to create + independent instances for each thread. This is a cheap thing to do for + parsers, but more expensive for XSLT and validators, which have to compile + trees recursively. 
So you might want to consider a thread pool approach or + threaded processing chains to reduce the overhead if you require threading + here. #) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 11:44:16 2006 @@ -21,11 +21,8 @@ """Global parser context to share the string dictionary. """ cdef xmlDict* _c_dict - cdef int _initialized - def __init__(self): self._c_dict = NULL - self._initialized = 0 def __dealloc__(self): if self._c_dict is not NULL: From scoder at codespeak.net Mon May 29 12:29:08 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 12:29:08 +0200 (CEST) Subject: [Lxml-checkins] r27835 - lxml/trunk/doc Message-ID: <20060529102908.456131006D@code0.codespeak.net> Author: scoder Date: Mon May 29 12:29:06 2006 New Revision: 27835 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ entry on pretty printing, more on threading Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon May 29 12:29:06 2006 @@ -64,17 +64,18 @@ #) Can I use threads to concurrently access the lxml API? You should be able to use lxml in a multi-threaded environment, although - support is limited and not very well tested. Note that lxml does not - provide any thread-safety by itself (mainly for performance reasons), so - you have to take care when you use parts of the API concurrently. Most - importantly, you must not forget to call ``etree.initThread()`` from each - newly generated thread to initialize lxml and libxml2 for the new thread - context. If you call API functions from a thread without having called - this function first, lxml can easily crash your program. + this is not very well tested. 
For performance reasons, lxml.etree provides + only very limited thread-safety by itself, so you have to take care when + you use parts of the API concurrently. Most importantly, you must not + forget to call ``etree.initThread()`` from each newly generated thread to + initialize lxml and libxml2 for the new thread context. If you call API + functions from a thread without having called this function first, lxml can + behave unexpectedly and even crash your program. This is not considered a + bug in lxml, it is a bug in your code. Tree modification is not thread-safe, so you must take care to properly - serialize modifications. Reading from a tree concurrently should not - produce any problems (otherwise it is a bug). + serialize modifications. Reading and traversing a tree concurrently should + not produce any problems (otherwise it is a bug). Basically none of the API classes is thread-safe, including parsers, XPath, XSLT and the validators. Each of them represents a stateful object that @@ -85,6 +86,32 @@ threaded processing chains to reduce the overhead if you require threading here. + This said, if you have problems with thread support or ideas how to improve + it, we would like to hear about it through the mailing list. + + +#) Why doesn't the ``pretty_print`` option reformat my XML output? + + Pretty printing (or formatting) an XML document means adding white space to + the content. These modifications are harmless if they only impact elements + in the document that do not carry (text) data. They corrupt your data if + they impact elements that contain data. The only way to distinguish + between harmless and harmful modification is structural information about + the document. + + If lxml cannot distinguish between whitespace and data, it will not alter + your data. 
The best way to tell lxml where whitespace can be safely added + and removed is allowing the parser to load the DTD (which obviously + requires the DTD to be accessible):: + + >>> tree = etree.parse(file, etree.XMLParser(load_dtd=True)) + + This will allow the parser to drop so-called 'ignorable whitespace' that is + not considered data (i.e. not part of the XML infoset). If you now call a + serialization function to pretty print this tree, it will use the + structural information it has to determine the correct places where it can + add whitespace to the XML tree. + #) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? From scoder at codespeak.net Mon May 29 15:50:48 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 15:50:48 +0200 (CEST) Subject: [Lxml-checkins] r27854 - lxml/trunk/doc Message-ID: <20060529135048.A5D2C10036@code0.codespeak.net> Author: scoder Date: Mon May 29 15:50:46 2006 New Revision: 27854 Modified: lxml/trunk/doc/FAQ.txt Log: rewrite of FAQ threading section to say: doesn't work Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon May 29 15:50:46 2006 @@ -63,31 +63,13 @@ #) Can I use threads to concurrently access the lxml API? - You should be able to use lxml in a multi-threaded environment, although - this is not very well tested. For performance reasons, lxml.etree provides - only very limited thread-safety by itself, so you have to take care when - you use parts of the API concurrently. Most importantly, you must not - forget to call ``etree.initThread()`` from each newly generated thread to - initialize lxml and libxml2 for the new thread context. If you call API - functions from a thread without having called this function first, lxml can - behave unexpectedly and even crash your program. This is not considered a - bug in lxml, it is a bug in your code. 
- - Tree modification is not thread-safe, so you must take care to properly - serialize modifications. Reading and traversing a tree concurrently should - not produce any problems (otherwise it is a bug). - - Basically none of the API classes is thread-safe, including parsers, XPath, - XSLT and the validators. Each of them represents a stateful object that - cannot be used concurrently. However, it is perfectly viable to create - independent instances for each thread. This is a cheap thing to do for - parsers, but more expensive for XSLT and validators, which have to compile - trees recursively. So you might want to consider a thread pool approach or - threaded processing chains to reduce the overhead if you require threading - here. + Short answer: No. - This said, if you have problems with thread support or ideas how to improve - it, we would like to hear about it through the mailing list. + Long answer: lxml does not currently release the GIL (Python's global + interpreter lock) internally, so you will not benefit from any performance + improvements by using threads. It is also not trivial to free the GIL, as + lxml calls back into Python in many places during XML processing: extension + functions, Python resolvers, error reporting, etc. #) Why doesn't the ``pretty_print`` option reformat my XML output? 
From scoder at codespeak.net Mon May 29 16:33:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 16:33:40 +0200 (CEST) Subject: [Lxml-checkins] r27857 - lxml/trunk/doc Message-ID: <20060529143340.18DB010053@code0.codespeak.net> Author: scoder Date: Mon May 29 16:33:38 2006 New Revision: 27857 Modified: lxml/trunk/doc/FAQ.txt Log: rewrote pretty print section in FAQ Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon May 29 16:33:38 2006 @@ -77,22 +77,19 @@ Pretty printing (or formatting) an XML document means adding white space to the content. These modifications are harmless if they only impact elements in the document that do not carry (text) data. They corrupt your data if - they impact elements that contain data. The only way to distinguish - between harmless and harmful modification is structural information about - the document. - - If lxml cannot distinguish between whitespace and data, it will not alter - your data. The best way to tell lxml where whitespace can be safely added - and removed is allowing the parser to load the DTD (which obviously - requires the DTD to be accessible):: - - >>> tree = etree.parse(file, etree.XMLParser(load_dtd=True)) - - This will allow the parser to drop so-called 'ignorable whitespace' that is - not considered data (i.e. not part of the XML infoset). If you now call a - serialization function to pretty print this tree, it will use the - structural information it has to determine the correct places where it can - add whitespace to the XML tree. + they impact elements that contain data. If lxml cannot distinguish between + whitespace and data, it will not alter your data. Whitespace is therefore + only added between nodes that do not contain data. This is always the case + for trees constructed element-by-element, so no problems should be expected + here. 
For parsed trees, a good way to assure that no conflicting + whitespace is left in the tree is the ``ignore_blanks`` option:: + + >>> parser = etree.XMLParser(ignore_blanks=True) + >>> tree = etree.parse(file, parser) + + This will allow the parser to drop blank text nodes when constructing the + tree. If you now call a serialization function to pretty print this tree, + lxml can add fresh whitespace to the XML tree to indent it. #) What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? From scoder at codespeak.net Mon May 29 16:39:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 16:39:54 +0200 (CEST) Subject: [Lxml-checkins] r27858 - lxml/trunk/src/lxml Message-ID: <20060529143954.49CFE10053@code0.codespeak.net> Author: scoder Date: Mon May 29 16:39:50 2006 New Revision: 27858 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tree.pxd lxml/trunk/src/lxml/xmlerror.pxd lxml/trunk/src/lxml/xmlerror.pxi Log: restructuring in thread setup, use thread default values for configuration, no longer uses KeepBlanksDefault (more XML compliant) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon May 29 16:39:50 2006 @@ -40,10 +40,21 @@ # make the compiled-in debug state publicly available DEBUG = __DEBUG +# global per-thread setup +tree.xmlThrDefIndentTreeOutput(1) +tree.xmlThrDefLineNumbersDefaultValue(1) + +_initThreadLogging() + +# initialize parser (and threading) +xmlparser.xmlInitParser() + def initThread(): - "Call this method to set up the library from within a new thread." 
- _initThreadLogging() - tree.xmlKeepBlanksDefault(0) + """Must be called by each newly created thread before calling any API + functions.""" + #_initThreadLogging() + pass + # Error superclass for ElementTree compatibility class Error(Exception): @@ -1258,6 +1269,7 @@ cdef void _prepareNextNode(self): cdef _NodeBase node cdef xmlNode* c_node + cdef xmlNode* c_next_node cdef xmlNode* c_parent # find in descendants node = self._next_node @@ -1287,11 +1299,12 @@ # we are at a sibling, so set c_parent to our parent c_parent = c_parent.parent - self._next_node = _elementFactory(node._doc, c_node) + c_next_node = c_node # fix depth counter by looking up path to original parent while c_node is not c_parent: self._depth = self._depth + 1 c_node = c_node.parent + self._next_node = _elementFactory(node._doc, c_next_node) cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node @@ -1523,6 +1536,3 @@ include "relaxng.pxi" # RelaxNG include "xmlschema.pxi" # XMLSchema - -# configure main thread -initThread() Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 16:39:50 2006 @@ -4,9 +4,6 @@ cimport htmlparser from xmlparser cimport xmlParserCtxt, xmlDict -# initialize parser (and threading) -xmlparser.xmlInitParser() - class XMLSyntaxError(LxmlSyntaxError): pass @@ -449,13 +446,14 @@ * no_network - prevent network access * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML + * ignore_blanks - discard blank text nodes Note that you must not share parsers between threads. This applies also to the default parser. 
""" def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, - recover=False): + recover=False, ignore_blanks=False): cdef int parse_options _BaseParser.__init__(self) @@ -474,6 +472,8 @@ parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN if recover: parse_options = parse_options | xmlparser.XML_PARSE_RECOVER + if ignore_blanks: + parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS self._parse_options = parse_options Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Mon May 29 16:39:50 2006 @@ -228,6 +228,7 @@ FILE* file, xmlCharEncodingHandler* encoder) cdef xmlOutputBuffer* xmlOutputBufferCreateFilename( char* URI, xmlCharEncodingHandler* encoder, int compression) + cdef extern from "libxml/xmlsave.h": ctypedef struct xmlSaveCtxt: pass @@ -236,6 +237,11 @@ int options) cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) cdef int xmlSaveClose(xmlSaveCtxt* ctxt) + +cdef extern from "libxml/globals.h": + cdef int xmlThrDefKeepBlanksDefaultValue(int onoff) + cdef int xmlThrDefLineNumbersDefaultValue(int onoff) + cdef int xmlThrDefIndentTreeOutput(int onoff) cdef extern from "libxml/xmlstring.h": cdef char* xmlStrdup(char* cur) Modified: lxml/trunk/src/lxml/xmlerror.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxd (original) +++ lxml/trunk/src/lxml/xmlerror.pxd Mon May 29 16:39:50 2006 @@ -14,10 +14,11 @@ char* file int line - cdef void xmlSetGenericErrorFunc(void* ctxt, - void (*handler)(void* ctxt, char* msg, ...)) - cdef void xmlSetStructuredErrorFunc(void* ctxt, - void (*handler)(void* userData, xmlError* error)) + ctypedef void (*xmlGenericErrorFunc)(void* ctxt, char* msg, ...) 
+ ctypedef void (*xmlStructuredErrorFunc)(void* userData, xmlError* error) + + cdef void xmlSetGenericErrorFunc(void* ctxt, xmlGenericErrorFunc func) + cdef void xmlSetStructuredErrorFunc(void* ctxt, xmlStructuredErrorFunc func) ctypedef enum xmlErrorDomain: XML_FROM_NONE = 0 @@ -775,3 +776,9 @@ XML_I18N_NO_OUTPUT = 6004 # 6004 XML_CHECK_ = 6005 # 5033 XML_CHECK_X = 6006 # 503 + +cdef extern from "libxml/globals.h": + cdef void xmlThrDefSetGenericErrorFunc(void* ctx, + xmlGenericErrorFunc handler) + cdef void xmlThrDefSetStructuredErrorFunc(void* ctx, + xmlStructuredErrorFunc handler) Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Mon May 29 16:39:50 2006 @@ -14,7 +14,6 @@ cdef void _initThreadLogging(): "Setup logging for the current thread. Called from etree.initThread()." # switch on line number reporting - xmlparser.xmlLineNumbersDefault(1) _logLibxmlErrors() try: _logLibxsltErrors() @@ -372,7 +371,10 @@ # setup for global log: cdef void _logLibxmlErrors(): + xmlerror.xmlThrDefSetGenericErrorFunc(NULL, _nullGenericErrorFunc) xmlerror.xmlSetGenericErrorFunc(NULL, _nullGenericErrorFunc) + + xmlerror.xmlThrDefSetStructuredErrorFunc(NULL, _receiveError) xmlerror.xmlSetStructuredErrorFunc(NULL, _receiveError) ################################################################################ From scoder at codespeak.net Mon May 29 16:56:02 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 16:56:02 +0200 (CEST) Subject: [Lxml-checkins] r27859 - in lxml/trunk: doc src/lxml Message-ID: <20060529145602.3B59810057@code0.codespeak.net> Author: scoder Date: Mon May 29 16:56:00 2006 New Revision: 27859 Modified: lxml/trunk/doc/FAQ.txt lxml/trunk/src/lxml/parser.pxi Log: renamed ignore_blanks option in XMLParser as remove_blank_text, as it is called in HTMLParser Modified: 
lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon May 29 16:56:00 2006 @@ -82,9 +82,9 @@ only added between nodes that do not contain data. This is always the case for trees constructed element-by-element, so no problems should be expected here. For parsed trees, a good way to assure that no conflicting - whitespace is left in the tree is the ``ignore_blanks`` option:: + whitespace is left in the tree is the ``remove_blank_text`` option:: - >>> parser = etree.XMLParser(ignore_blanks=True) + >>> parser = etree.XMLParser(remove_blank_text=True) >>> tree = etree.parse(file, parser) This will allow the parser to drop blank text nodes when constructing the Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 16:56:00 2006 @@ -446,14 +446,14 @@ * no_network - prevent network access * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML - * ignore_blanks - discard blank text nodes + * remove_blank_text - discard blank text nodes Note that you must not share parsers between threads. This applies also to the default parser. 
""" def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, - recover=False, ignore_blanks=False): + recover=False, remove_blank_text=False): cdef int parse_options _BaseParser.__init__(self) From scoder at codespeak.net Mon May 29 16:56:40 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 16:56:40 +0200 (CEST) Subject: [Lxml-checkins] r27860 - lxml/trunk/src/lxml Message-ID: <20060529145640.032CD10057@code0.codespeak.net> Author: scoder Date: Mon May 29 16:56:39 2006 New Revision: 27860 Modified: lxml/trunk/src/lxml/parser.pxi Log: doc update Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 16:56:39 2006 @@ -558,7 +558,7 @@ Available keyword arguments: * recover - try hard to parse through broken HTML (default: True) * no_network - prevent network access - * remove_blank_text - clean up empty text nodes + * remove_blank_text - discard empty text nodes Note that you must not share parsers between threads. 
""" From scoder at codespeak.net Mon May 29 17:42:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 17:42:04 +0200 (CEST) Subject: [Lxml-checkins] r27861 - lxml/trunk/src/lxml Message-ID: <20060529154204.44CB210041@code0.codespeak.net> Author: scoder Date: Mon May 29 17:42:02 2006 New Revision: 27861 Modified: lxml/trunk/src/lxml/parser.pxi Log: fix reference to previously renamed variable Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon May 29 17:42:02 2006 @@ -472,7 +472,7 @@ parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN if recover: parse_options = parse_options | xmlparser.XML_PARSE_RECOVER - if ignore_blanks: + if remove_blank_text: parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS self._parse_options = parse_options From scoder at codespeak.net Mon May 29 18:11:05 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 29 May 2006 18:11:05 +0200 (CEST) Subject: [Lxml-checkins] r27863 - lxml/trunk/src/lxml/tests Message-ID: <20060529161105.BC0BE10053@code0.codespeak.net> Author: scoder Date: Mon May 29 18:11:04 2006 New Revision: 27863 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: added test case by Noah: appending element with xml:id attribute fails (in both etree and ET) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon May 29 18:11:04 2006 @@ -1663,7 +1663,27 @@ # as namespace is not moved along with it del one self.assertEquals('{http://a.b.c}baz', baz.tag) - + + def test_attribute_xmlns_move(self): + Element = self.etree.Element + + root = etree.Element('element') + + subelement = etree.Element('subelement') + 
subelement.set("{http://www.w3.org/XML/1998/namespace}id", "foo") + self.assertEqual(1, len(subelement.attrib)) + self.assertEquals( + "foo", + subelement.get("{http://www.w3.org/XML/1998/namespace}id")) + + root.append(subelement) + self.assertEqual(1, len(subelement.attrib)) + self.assertEquals({"{http://www.w3.org/XML/1998/namespace}id" : "foo"}, + subelement.attrib) + self.assertEquals( + "foo", + subelement.get("{http://www.w3.org/XML/1998/namespace}id")) + def test_tostring(self): tostring = self.etree.tostring Element = self.etree.Element From faassen at codespeak.net Mon May 29 18:20:14 2006 From: faassen at codespeak.net (faassen at codespeak.net) Date: Mon, 29 May 2006 18:20:14 +0200 (CEST) Subject: [Lxml-checkins] r27865 - lxml/trunk Message-ID: <20060529162014.2ED3E10053@code0.codespeak.net> Author: faassen Date: Mon May 29 18:20:13 2006 New Revision: 27865 Modified: lxml/trunk/CREDITS.txt Log: update credits. :) Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Mon May 29 18:20:13 2006 @@ -1,10 +1,10 @@ Credits ------- -Martijn Faassen - initial main developer - Stefan Behnel - main developer and maintainer +Martijn Faassen - creator of lxml and initial main developer + Marc-Antoine Parent - XPath extension function help and patches Olivier Grisel - improved (c)ElementTree compatibility patches, From scoder at codespeak.net Tue May 30 06:56:43 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 06:56:43 +0200 (CEST) Subject: [Lxml-checkins] r27887 - lxml/trunk/src/lxml/tests Message-ID: <20060530045643.CB17910063@code0.codespeak.net> Author: scoder Date: Tue May 30 06:56:39 2006 New Revision: 27887 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test case cleanup Modified: lxml/trunk/src/lxml/tests/test_elementtree.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue May 30 06:56:39 2006 @@ -1667,9 +1667,9 @@ def test_attribute_xmlns_move(self): Element = self.etree.Element - root = etree.Element('element') + root = Element('element') - subelement = etree.Element('subelement') + subelement = Element('subelement') subelement.set("{http://www.w3.org/XML/1998/namespace}id", "foo") self.assertEqual(1, len(subelement.attrib)) self.assertEquals( From scoder at codespeak.net Tue May 30 07:39:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 07:39:39 +0200 (CEST) Subject: [Lxml-checkins] r27888 - lxml/trunk/src/lxml Message-ID: <20060530053939.816EF10053@code0.codespeak.net> Author: scoder Date: Tue May 30 07:39:37 2006 New Revision: 27888 Modified: lxml/trunk/src/lxml/etree.pyx Log: whitespace Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 30 07:39:37 2006 @@ -633,7 +633,7 @@ def set(self, key, value): _setAttributeValue(self, key, value) - + def append(self, _Element element not None): cdef xmlNode* c_next cdef xmlNode* c_node From scoder at codespeak.net Tue May 30 08:26:07 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 08:26:07 +0200 (CEST) Subject: [Lxml-checkins] r27889 - lxml/trunk/src/lxml Message-ID: <20060530062607.B78DB1005A@code0.codespeak.net> Author: scoder Date: Tue May 30 08:26:05 2006 New Revision: 27889 Modified: lxml/trunk/src/lxml/proxy.pxi Log: small performance improvement in deallocation code: faster handling of common case where elements are deallocated but not their parents (SubElement etc.) 
Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Tue May 30 08:26:05 2006 @@ -151,21 +151,23 @@ cdef xmlNode* c_current cdef xmlNode* c_top #print "trying to do deallocating:", c_node.type + if c_node._private is not NULL: + #print "Not freeing: proxies still exist" + return NULL c_current = c_node.parent c_top = c_node while c_current is not NULL: #print "checking:", c_current.type - # if we're still attached to the document, don't deallocate if c_current.type == tree.XML_DOCUMENT_NODE or \ c_current.type == tree.XML_HTML_DOCUMENT_NODE: #print "not freeing: still in doc" return NULL + # if we're still attached to the document, don't deallocate + if c_current._private is not NULL: + #print "Not freeing: proxies still exist" + return NULL c_top = c_current c_current = c_current.parent - # cannot free a top which has proxies pointing to it - if c_top._private is not NULL: - #print "Not freeing: proxies still exist" - return NULL # see whether we have children to deallocate if canDeallocateChildren(c_top): return c_top From scoder at codespeak.net Tue May 30 08:33:54 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 08:33:54 +0200 (CEST) Subject: [Lxml-checkins] r27890 - in lxml/trunk: . 
doc Message-ID: <20060530063354.1DEC41005A@code0.codespeak.net> Author: scoder Date: Tue May 30 08:33:52 2006 New Revision: 27890 Modified: lxml/trunk/bench.py lxml/trunk/doc/performance.txt Log: benchmark for creating Elements, compare to makeelement/SubElement Modified: lxml/trunk/bench.py ============================================================================== --- lxml/trunk/bench.py (original) +++ lxml/trunk/bench.py Tue May 30 08:33:52 2006 @@ -347,6 +347,11 @@ for child in root: child.makeelement('{test}test', empty_attrib) + def bench_create_elements(self, root): + Element = self.etree.Element + for child in root: + Element('{test}test') + def bench_replace_children_element(self, root): Element = self.etree.Element for child in root: @@ -684,6 +689,9 @@ result = run_bench(bench, *benchmark_setup) except SkippedTest: print "skipped" + except KeyboardInterrupt: + print "interrupted by user" + sys.exit(1) except Exception, e: print "failed: %s: %s" % (e.__class__.__name__, e) else: Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Tue May 30 08:33:52 2006 @@ -13,7 +13,8 @@ The statements made here are backed by the benchmark script `bench.py`_ that comes with the lxml source distribution. The timings cited below compare lxml -1.0, ElementTree 1.2.6 and cElementTree 1.0.5 under CPython 2.4.2. +1.0, ElementTree 1.2.6 and cElementTree 1.0.5 under CPython 2.4.2 on an AMD64 +machine. .. _`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py @@ -40,10 +41,10 @@ Parsing and Serialising ----------------------- -This is one of the areas where lxml excels. The reason is that both parts are -executed entirely at the C level, without major interaction with Python code. -The results are rather impressive. 
Compared to cElementTree, lxml is about 20 -to 40 times faster on serialisation:: +These are areas where lxml excels. The reason is that both parts are executed +entirely at the C level, without major interaction with Python code. The +results are rather impressive. Compared to cElementTree, lxml is about 20 to +40 times faster on serialisation:: lxe: tostring_utf16 (SA T2) 30.9846 msec/pass cET: tostring_utf16 (SA T2) 715.5002 msec/pass @@ -110,12 +111,32 @@ three times faster than lxml here. One of the reasons is that lxml must additionally discard the created Python elements after their use, when they are no longer referenced. ET and cET represent the tree itself through these -objects, which reduces their overhead in creating them. +objects, which reduces the overhead in creating them. + +As opposed to ET, libxml2 has a notion of documents that each element must be +in. This results in a major performance difference for creating independent +Elements that end up in independently created documents:: + + lxe: create_elements (-- T2 ) 22.0083 msec/pass + cET: create_elements (-- T2 ) 0.3920 msec/pass + ET : create_elements (-- T2 ) 3.0865 msec/pass + +Therefore, it is always preferable to create Elements for the document they +are supposed to end up in, either as SubElements of an Element or using the +explicit ``Element.makeelement()`` call:: + + lxe: makeelement (-- T2 ) 4.3003 msec/pass + cET: makeelement (-- T2 ) 0.5520 msec/pass + ET : makeelement (-- T2 ) 3.8092 msec/pass + + lxe: create_subelements (-- T2 ) 3.9673 msec/pass + cET: create_subelements (-- T2 ) 0.5666 msec/pass + ET : create_subelements (-- T2 ) 6.4613 msec/pass So, if the main performance bottleneck of an application is creating large XML trees in memory through calls to Element and SubElement, cET is the best choice. Note, however, that the serialisation performance may even out this -advantage. +advantage, especially for smaller trees and trees with many attributes. 
A critical action for lxml is moving elements between document contexts. It requires lxml to do recursive adaptations throughout the moved tree structure. @@ -170,7 +191,8 @@ Another area where lxml is very fast is iteration for tree traversal. If your algorithms can benefit from step-by-step traversal of the XML tree and -especially if few elements are of interest, lxml is a good choice:: +especially if few elements are of interest or the element tag name is known, +lxml is a good choice:: lxe: getiterator_all (-- T2 ) 31.2719 msec/pass cET: getiterator_all (-- T2 ) 36.3687 msec/pass From scoder at codespeak.net Tue May 30 10:14:07 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 10:14:07 +0200 (CEST) Subject: [Lxml-checkins] r27892 - lxml/trunk/src/lxml Message-ID: <20060530081407.9532010057@code0.codespeak.net> Author: scoder Date: Tue May 30 10:14:06 2006 New Revision: 27892 Modified: lxml/trunk/src/lxml/etree.pyx Log: moved _init() method from _NodeBase down to _Element where it is actually used Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue May 30 10:14:06 2006 @@ -347,11 +347,6 @@ unregisterProxy(self) attemptDeallocation(self._c_node) - def _init(self): - """Called after object initialisation. Subclasses may override - this if they recursively call _init() in the superclasses. - """ - cdef class _ElementTree: cdef _Document _doc cdef _NodeBase _context_node @@ -562,6 +557,11 @@ cdef class _Element(_NodeBase): cdef object _tag + def _init(self): + """Called after object initialisation. Custom subclasses may override + this if they recursively call _init() in the superclasses. 
+ """ + # MANIPULATORS def __setitem__(self, Py_ssize_t index, _NodeBase element not None): From scoder at codespeak.net Tue May 30 10:14:19 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 10:14:19 +0200 (CEST) Subject: [Lxml-checkins] r27893 - lxml/trunk/src/lxml Message-ID: <20060530081419.1B7A410057@code0.codespeak.net> Author: scoder Date: Tue May 30 10:14:17 2006 New Revision: 27893 Modified: lxml/trunk/src/lxml/nsclasses.pxi Log: doc updates Modified: lxml/trunk/src/lxml/nsclasses.pxi ============================================================================== --- lxml/trunk/src/lxml/nsclasses.pxi (original) +++ lxml/trunk/src/lxml/nsclasses.pxi Tue May 30 10:14:17 2006 @@ -5,10 +5,13 @@ cdef class ElementBase(_Element): """All classes in namespace implementations must inherit from this one. + Note that subclasses *must not* override __init__ or __new__ as it is absolutely undefined when these objects will be created or destroyed. All - persistent state of elements must be stored in the underlying XML.""" - pass + persistent state of elements must be stored in the underlying XML. If you + really need to initialize the object after creation, you can implement an + ``_init(self)`` method that will be called after object creation. 
+ """ cdef object __NAMESPACE_REGISTRIES __NAMESPACE_REGISTRIES = {} From scoder at codespeak.net Tue May 30 10:45:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 10:45:37 +0200 (CEST) Subject: [Lxml-checkins] r27894 - lxml/trunk/doc Message-ID: <20060530084537.648A010057@code0.codespeak.net> Author: scoder Date: Tue May 30 10:45:35 2006 New Revision: 27894 Modified: lxml/trunk/doc/namespace_extensions.txt Log: major rewrite of documentation in doc/namespace_extensions.txt Modified: lxml/trunk/doc/namespace_extensions.txt ============================================================================== --- lxml/trunk/doc/namespace_extensions.txt (original) +++ lxml/trunk/doc/namespace_extensions.txt Tue May 30 10:45:35 2006 @@ -2,118 +2,111 @@ Implementing namespaces with the Namespace class ================================================ -Also see `extensions`_. +lxml allows you to implement namespaces, in a rather literal sense. You can +write your own classes for Elements and have lxml use them for a specific tag +name in a specific namespace. -.. _`extensions`: extensions.html +Custom Elements must inherit from the ``etree.ElementBase`` class, which +provides the Element interface for subclasses:: -Imagine, you have a namespace called 'http://hui.de/honk' and have to -treat all of its elements in a specific way, say, to find out if they -are really honking. You could provide a function called 'is_honking' -that handles that:: - - >>> def is_honking(honk_element): - ... return honk_element.get('honking') == 'true' - -Then you can use it:: + >>> from lxml import etree + >>> class HonkElement(etree.ElementBase): + ... def honking(self): + ... return self.get('honking') == 'true' + ... honking = property(honking) - >>> from lxml.etree import XML - >>> honk_element = XML('') - >>> print is_honking(honk_element) - True +This defines a new Element class ``HonkElement`` with a property ``honking``. -Not too bad, right? 
Now, imagine, you only want to do that to certain -elements from that namespace and prevent others from being passed to -is_honking. You can add a check to is_honking to test the tag name -before doing anything else. - -After a while, however, you remember what you heard at school about -object oriented programming. You start wondering if there isn't a -nicer way to do that. -- And there is! +Note that you cannot (or rather must not) instantiate this class yourself. +lxml.etree will do that for you through its normal ElementTree API. To let +lxml know about it, you must register it with a namespace. The Namespace class -=================== - -lxml allows you to implement namespaces, in a rather literal -sense. You can do the above like this:: - - >>> from lxml.etree import Namespace, ElementBase - >>> class HonkElement(ElementBase): - ... def honking(self): - ... return self.get('honking') == 'true' - ... honking = property(honking) +------------------- -Now you can build the new namespace by calling the Namespace class:: +You can build a new namespace (or retrieve an existing one) by calling the +Namespace class:: - >>> namespace = Namespace('http://hui.de/honk') + >>> namespace = etree.Namespace('http://hui.de/honk') -and then register the new element type with that namespace:: +and then register the new element type with that namespace, say, under the tag +name ``honk``:: >>> namespace['honk'] = HonkElement -After this, you create and use your XML elements:: +After this, you create and use your XML elements through the normal API of +lxml:: - >>> honk_element = XML('') + >>> xml = '' + >>> honk_element = etree.XML(xml) >>> print honk_element.honking True The same works when creating elements by hand:: - >>> from lxml.etree import Element - >>> honk_element = Element('{http://hui.de/honk}honk', honking='true') + >>> honk_element = etree.Element('{http://hui.de/honk}honk', + ... 
honking='true') >>> print honk_element.honking True -Essentially, what this allows you to do, is giving elements a specific -API based on their namespace and element name. +Essentially, what this allows you to do, is to give elements a custom API +based on their namespace and tag name. + +A somewhat related topic are `extension functions`_ which use a similar +mechanism for registering extension functions in XPath and XSLT. + +.. _`extension functions`: extensions.html Element initialization ---------------------- -There is one thing to remember. Element classes *must not* have a -constructor, neither must there be any internal state (except for -their XML representation). Element instances are created and garbage +There is one thing to remember. Element classes *must not* have a +constructor, neither must there be any internal state (except for the data +stored in the underlying XML tree). Element instances are created and garbage collected at need, so there is no way to predict when and how often a -constructor would be called. Even worse, when the ``__init__`` method -is called, the object may not even be initialized yet to represent the -XML tag, so there is not much use in providing an ``__init__`` method -in subclasses. - -However, there is one possible way to do things on element -initialization. Element classes have an ``_init()`` method that can be -overridden. It can be used to modify the XML tree, e.g. to construct +constructor would be called. Even worse, when the ``__init__`` method is +called, the object may not even be initialized yet to represent the XML tag, +so there is not much use in providing an ``__init__`` method in subclasses. + +However, there is one possible way to do things on element initialization, if +you really need to. ElementBase classes have an ``_init()`` method that can +be overridden. It can be used to modify the XML tree, e.g. to construct special children or verify and update attributes. 
The semantics of ``_init()`` are as follows: -* It is called at least once on element instantiation time. That is, - when a Python representation of the element is created. At that - time, the element object is completely initialized to represent a - specific XML element within the tree. - -* The method has complete access to the XML structure. Modifications - can be done in exactly the same way as anywhere else in the program. - -* It may be called multiple times. The _init() code provided by - subclasses must take special care by itself that multiple executions - either are harmless or that they are prevented by some kind of flag - in the XML tree. The latter can be achieved by modifying an - attribute value or by removing or adding a specific child node and - then verifying this before running through the init process. +* It is called at least once on element instantiation time. That is, when a + Python representation of the element is created by lxml. At that time, the + element object is completely initialized to represent a specific XML element + within the tree. + +* The method has complete access to the XML tree. Modifications can be done + in exactly the same way as anywhere else in the program. + +* Python representations of elements may be created multiple times during the + lifetime of an XML element in the underlying tree. The ``_init()`` code + provided by subclasses must take special care by itself that multiple + executions either are harmless or that they are prevented by some kind of + flag in the XML tree. The latter can be achieved by modifying an attribute + value or by removing or adding a specific child node and then verifying this + before running through the init process. + +* Any exceptions raised in ``_init()`` will be propagated through the API + call that led to the creation of the Element. So be careful with the code + you write here as its exceptions may turn up in various unexpected places. 
Default implementations ----------------------- -There is a slight difference between the Namespace example and the -simple 'is_honking' method above. We associated the HonkElement class -only with the 'honk' element. If you have other elements in the same -namespace, they do not pick up the same implementation. - -Example:: +In the Namespace example above, we associated the HonkElement class only with +the 'honk' element. If an XML tree contains different elements in the same +namespace, they do not pick up the same implementation:: - >>> honk_element = XML('') + >>> xml = '' + >>> honk_element = etree.XML(xml) >>> print honk_element.honking True >>> print honk_element[0].honking @@ -122,18 +115,18 @@ AttributeError: 'etree._Element' object has no attribute 'honking' You can therefore provide one implementation per element name in each -namespace and have lxml select the right one on the fly. If you want -one element implementation per namespace (ignoring the element name) -or prefer having a common class for most elements except a few, you -can specify a default implementation for an entire namespace by -registering that class with the empty element name (None). - -You may consider following an object oriented approach. If you build -a class hierarchy of element classes, you can also implement a base -class for a namespace, that is used if no specific element class is -provided. Again, you only have to pass None as an element name:: +namespace and have lxml select the right one on the fly. If you want one +element implementation per namespace (ignoring the element name) or prefer +having a common class for most elements except a few, you can specify a +default implementation for an entire namespace by registering that class with +the empty element name (None). + +You may consider following an object oriented approach here. 
If you build a +class hierarchy of element classes, you can also implement a base class for a +namespace that is used if no specific element class is provided. Again, you +can just pass None as an element name:: - >>> class HonkNSElement(ElementBase): + >>> class HonkNSElement(etree.ElementBase): ... def honk(self): ... return "HONK" >>> namespace[None] = HonkNSElement @@ -144,9 +137,15 @@ ... honking = property(honking) >>> namespace['honk'] = HonkElement -Now you can use your new namespace:: +Now you can rely on lxml to always return objects of type HonkNSElement or its +subclasses for elements of this namespace:: + + >>> xml = '' + >>> honk_element = etree.XML(xml) + + >>> print type(honk_element), type(honk_element[0]) + - >>> honk_element = XML('') >>> print honk_element.honking True >>> print honk_element.honk() From scoder at codespeak.net Tue May 30 10:45:55 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 10:45:55 +0200 (CEST) Subject: [Lxml-checkins] r27895 - lxml/trunk/doc Message-ID: <20060530084555.D003710057@code0.codespeak.net> Author: scoder Date: Tue May 30 10:45:52 2006 New Revision: 27895 Modified: lxml/trunk/doc/api.txt Log: cleanup in doc/api.txt Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Tue May 30 10:45:52 2006 @@ -241,8 +241,26 @@ Optionally, you can provide a ``namespaces`` keyword argument, which should be a dictionary mapping the namespace prefixes used in the XPath expression to -namespace URIs. The optional ``extensions`` argument is used to define -`extension functions`_ in Python. +namespace URIs:: + + >>> f = StringIO('''\ + ... + ... Text + ... + ... ''') + >>> doc = etree.parse(f) + >>> r = doc.xpath('/t:foo/b:bar', {'t': 'http://codespeak.net/ns/test1', + ... 
'b': 'http://codespeak.net/ns/test2'}) + >>> len(r) + 1 + >>> r[0].tag + '{http://codespeak.net/ns/test2}bar' + >>> r[0].text + 'Text' + +There is also an optional ``extensions`` argument which is used to define +`extension functions`_ in Python that are local to this evaluation. .. _`extension functions`: extensions.html @@ -261,34 +279,6 @@ contain a comment, the result contains a string as well, inside ```` markers. -Example:: - - >>> f = StringIO('') - >>> doc = etree.parse(f) - >>> r = doc.xpath('/foo/bar') - >>> len(r) - 1 - >>> r[0].tag - 'bar' - -Example of using namespace prefixes:: - - >>> f = StringIO('''\ - ... - ... Text - ... - ... ''') - >>> doc = etree.parse(f) - >>> r = doc.xpath('/t:foo/b:bar', {'t': 'http://codespeak.net/ns/test1', - ... 'b': 'http://codespeak.net/ns/test2'}) - >>> len(r) - 1 - >>> r[0].tag - '{http://codespeak.net/ns/test2}bar' - >>> r[0].text - 'Text' - A related convenience method of ElementTree objects is ``getpath(element)``, which returns a structural, absolute XPath expression to find that element:: From scoder at codespeak.net Tue May 30 12:18:06 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 12:18:06 +0200 (CEST) Subject: [Lxml-checkins] r27901 - lxml/trunk/src/lxml/tests Message-ID: <20060530101806.D3D3110057@code0.codespeak.net> Author: scoder Date: Tue May 30 12:18:04 2006 New Revision: 27901 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: clean up in test case Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue May 30 12:18:04 2006 @@ -1669,15 +1669,15 @@ root = Element('element') - subelement = Element('subelement') - subelement.set("{http://www.w3.org/XML/1998/namespace}id", "foo") - self.assertEqual(1, len(subelement.attrib)) + subelement = Element('subelement', + 
{"{http://www.w3.org/XML/1998/namespace}id": "foo"}) + self.assertEquals(1, len(subelement.attrib)) self.assertEquals( "foo", subelement.get("{http://www.w3.org/XML/1998/namespace}id")) root.append(subelement) - self.assertEqual(1, len(subelement.attrib)) + self.assertEquals(1, len(subelement.attrib)) self.assertEquals({"{http://www.w3.org/XML/1998/namespace}id" : "foo"}, subelement.attrib) self.assertEquals( From scoder at codespeak.net Tue May 30 14:18:06 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 14:18:06 +0200 (CEST) Subject: [Lxml-checkins] r27911 - lxml/trunk/src/lxml Message-ID: <20060530121806.07F3210057@code0.codespeak.net> Author: scoder Date: Tue May 30 14:18:04 2006 New Revision: 27911 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: iterative rewrite of _findDepthFirstInFollowing Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue May 30 14:18:04 2006 @@ -267,18 +267,35 @@ 2) its descendents 3) its following siblings. 
""" - cdef xmlNode* c_child + cdef xmlNode* c_next + cdef xmlNode* c_start_parent if c_name is NULL: # always match return c_node + if c_node is NULL: + return NULL + c_start_parent = c_node.parent while c_node is not NULL: - if _tagMatches(c_node, c_href, c_name): - return c_node - if c_node.children is not NULL: - c_child = _findDepthFirstInFollowing(c_node.children, c_href, c_name) - if c_child is not NULL: - return c_child - c_node = _nextElement(c_node) + if _isElement(c_node): + if _tagMatches(c_node, c_href, c_name): + return c_node + # walk through children + c_next = c_node.children + if c_next is NULL: + c_next = _nextElement(c_node) + elif not _isElement(c_next): + c_next = _nextElement(c_next) + if c_next is NULL: + c_next = _nextElement(c_node) + else: + c_next = _nextElement(c_node) + # back off through parents + while c_next is NULL: + c_node = c_node.parent + if c_node is c_start_parent: + return NULL + c_next = _nextElement(c_node) + c_node = c_next return NULL cdef int _tagMatches(xmlNode* c_node, char* c_href, char* c_name): From scoder at codespeak.net Tue May 30 14:38:37 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 14:38:37 +0200 (CEST) Subject: [Lxml-checkins] r27915 - lxml/trunk/src/lxml Message-ID: <20060530123837.4E58810057@code0.codespeak.net> Author: scoder Date: Tue May 30 14:38:34 2006 New Revision: 27915 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: cleanup in _findDepthFirstInFollowing Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue May 30 14:38:34 2006 @@ -276,19 +276,18 @@ return NULL c_start_parent = c_node.parent while c_node is not NULL: - if _isElement(c_node): - if _tagMatches(c_node, c_href, c_name): - return c_node - # walk through children - c_next = c_node.children + if _tagMatches(c_node, c_href, c_name): + return 
c_node + # walk through children + c_next = c_node.children + if c_next is NULL: + # sibling? + c_next = _nextElement(c_node) + elif not _isElement(c_next): + # we need an element + c_next = _nextElement(c_next) if c_next is NULL: c_next = _nextElement(c_node) - elif not _isElement(c_next): - c_next = _nextElement(c_next) - if c_next is NULL: - c_next = _nextElement(c_node) - else: - c_next = _nextElement(c_node) # back off through parents while c_next is NULL: c_node = c_node.parent From scoder at codespeak.net Tue May 30 16:12:39 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 16:12:39 +0200 (CEST) Subject: [Lxml-checkins] r27921 - lxml/trunk/src/lxml Message-ID: <20060530141239.1311110053@code0.codespeak.net> Author: scoder Date: Tue May 30 16:12:37 2006 New Revision: 27921 Modified: lxml/trunk/src/lxml/serializer.pxi Log: fixed a memory access bug found by valgrind: xmlOutputBufferClose free the encoding handler, we must not call xmlCharEncCloseFunc Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Tue May 30 16:12:37 2006 @@ -38,7 +38,6 @@ tree.xmlBufferLength(c_result_buffer)) finally: tree.xmlOutputBufferClose(c_buffer) - tree.xmlCharEncCloseFunc(enchandler) return result cdef _tounicode(_NodeBase element, int pretty_print): From scoder at codespeak.net Tue May 30 19:15:15 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 19:15:15 +0200 (CEST) Subject: [Lxml-checkins] r27935 - lxml/trunk/src/lxml Message-ID: <20060530171515.2528B10057@code0.codespeak.net> Author: scoder Date: Tue May 30 19:15:13 2006 New Revision: 27935 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi 
(original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 30 19:15:13 2006 @@ -52,12 +52,12 @@ if result.dict is NULL: result.dict = xmlparser.xmlDictCreate() self._c_dict = result.dict - xmlparser.xmlDictReference(result.dict) + xmlparser.xmlDictReference(self._c_dict) elif result.dict != self._c_dict: if result.dict is not NULL: xmlparser.xmlDictFree(result.dict) result.dict = self._c_dict - xmlparser.xmlDictReference(self._c_dict) + xmlparser.xmlDictReference(result.dict) cdef _ParserContext __GLOBAL_PARSER_CONTEXT __GLOBAL_PARSER_CONTEXT = _ParserContext() From scoder at codespeak.net Tue May 30 19:18:34 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 19:18:34 +0200 (CEST) Subject: [Lxml-checkins] r27936 - lxml/trunk/src/lxml Message-ID: <20060530171834.B2A0410057@code0.codespeak.net> Author: scoder Date: Tue May 30 19:18:33 2006 New Revision: 27936 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue May 30 19:18:33 2006 @@ -79,7 +79,7 @@ cdef Py_ssize_t l cdef char* buffer cdef char* enc - utext = unicode("") + utext = python.PyUnicode_DecodeUTF8("", 7, 'strict') l = python.PyUnicode_GET_DATA_SIZE(utext) buffer = python.PyUnicode_AS_DATA(utext) enc = _findEncodingName(buffer, l) From scoder at codespeak.net Tue May 30 21:25:15 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 21:25:15 +0200 (CEST) Subject: [Lxml-checkins] r27940 - lxml/trunk/src/lxml/tests Message-ID: <20060530192515.42AC510057@code0.codespeak.net> Author: scoder Date: Tue May 30 21:25:03 2006 New Revision: 27940 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: fix test case Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- 
lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue May 30 21:25:03 2006 @@ -1678,8 +1678,9 @@ root.append(subelement) self.assertEquals(1, len(subelement.attrib)) - self.assertEquals({"{http://www.w3.org/XML/1998/namespace}id" : "foo"}, - subelement.attrib) + self.assertEquals( + {"{http://www.w3.org/XML/1998/namespace}id" : "foo"}.items(), + subelement.attrib.items()) self.assertEquals( "foo", subelement.get("{http://www.w3.org/XML/1998/namespace}id")) From scoder at codespeak.net Tue May 30 21:25:42 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 21:25:42 +0200 (CEST) Subject: [Lxml-checkins] r27941 - lxml/trunk/src/lxml Message-ID: <20060530192542.AB54210057@code0.codespeak.net> Author: scoder Date: Tue May 30 21:25:41 2006 New Revision: 27941 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: fixed stupid, stupid bug with namespace reconciliation: free the document /after/ fixing namespaces Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue May 30 21:25:41 2006 @@ -434,9 +434,9 @@ tree below (including the current node). It also reconciliates namespaces so they're correct inside the new environment. """ + tree.xmlReconciliateNs(doc._c_doc, node._c_node) if node._doc is not doc: changeDocumentBelow(node._c_node, doc) - tree.xmlReconciliateNs(doc._c_doc, node._c_node) cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): """Update the Python references in the tree below the node. 
From scoder at codespeak.net Tue May 30 21:42:32 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 30 May 2006 21:42:32 +0200 (CEST) Subject: [Lxml-checkins] r27942 - lxml/trunk Message-ID: <20060530194232.7F1A91005A@code0.codespeak.net> Author: scoder Date: Tue May 30 21:42:31 2006 New Revision: 27942 Modified: lxml/trunk/CHANGES.txt Log: mark namespace bug as fixed Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue May 30 21:42:31 2006 @@ -24,6 +24,9 @@ Bugs fixed ---------- +* Namespace fixing after moving elements between documents could fail if the + source document was freed too early + * Setting namespace-less tag names on namespaced elements ('{ns}t' -> 't') didn't reset the namespace From scoder at codespeak.net Wed May 31 08:47:52 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 08:47:52 +0200 (CEST) Subject: [Lxml-checkins] r27946 - lxml/trunk/src/lxml Message-ID: <20060531064752.95B7310060@code0.codespeak.net> Author: scoder Date: Wed May 31 08:47:37 2006 New Revision: 27946 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/proxy.pxi Log: made _Attrib a plain Python object on top of an _Element, allowed for major code cleanup and simplification in proxy code Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 31 08:47:37 2006 @@ -80,7 +80,7 @@ else: return None -cdef object _attributeValue(xmlNode* c_element, xmlNode* c_attrib_node): +cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node): cdef char* value if c_attrib_node.ns is NULL or c_attrib_node.ns.href is NULL: value = tree.xmlGetNoNsProp(c_element, c_attrib_node.name) @@ -425,37 +425,3 
@@ if hasattr(source, 'geturl'): return source.geturl() return None - -cdef void moveNodeToDocument(_NodeBase node, _Document doc): - """For a node and all nodes below, change document. - - A node can change document in certain operations as an XML - subtree can move. This updates all possible proxies in the - tree below (including the current node). It also reconciliates - namespaces so they're correct inside the new environment. - """ - tree.xmlReconciliateNs(doc._c_doc, node._c_node) - if node._doc is not doc: - changeDocumentBelow(node._c_node, doc) - -cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): - """Update the Python references in the tree below the node. - - Note that we expect C pointers to the document to be updated already by - libxml2. - """ - cdef ProxyRef* ref - cdef xmlNode* c_current - cdef _NodeBase proxy - # adjust all children recursively - c_current = c_node.children - while c_current is not NULL: - changeDocumentBelow(c_current, doc) - c_current = c_current.next - - # adjust Python references of current node - ref = c_node._private - while ref is not NULL: - proxy = <_NodeBase>ref.proxy - proxy._doc = doc - ref = ref.next Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 31 08:47:37 2006 @@ -23,10 +23,6 @@ # any non-public function/class is prefixed with an underscore # instance creation is always through factories -ctypedef enum LXML_PROXY_TYPE: - PROXY_ELEMENT - PROXY_ATTRIB - # what to do with libxml2/libxslt error messages? 
# 0 : drop # 1 : use log @@ -173,7 +169,8 @@ # the document #print "freeing document:", self._c_doc #displayNode(self._c_doc, 0) - #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict + #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict + #print self._c_doc, canDeallocateChildNodes(self._c_doc) tree.xmlFreeDoc(self._c_doc) cdef getroot(self): @@ -338,7 +335,6 @@ """ cdef _Document _doc cdef xmlNode* _c_node - cdef int _proxy_type def __dealloc__(self): #print "trying to free node:", self._c_node @@ -556,7 +552,6 @@ cdef class _Element(_NodeBase): cdef object _tag - def _init(self): """Called after object initialisation. Custom subclasses may override this if they recursively call _init() in the superclasses. @@ -721,8 +716,11 @@ property attrib: def __get__(self): - return _attribFactory(self._doc, self._c_node) - + # do *NOT* keep a reference here to prevent cyclic dependencies + # this would free the element in the Cyclic GC, which might let + # Python deallocate the document before the element! 
+ return _Attrib(self) + property text: def __get__(self): return _collectText(self._c_node.children) @@ -962,7 +960,7 @@ cdef _Element _elementFactory(_Document doc, xmlNode* c_node): cdef _Element result cdef char* c_ns_href - result = getProxy(c_node, PROXY_ELEMENT) + result = getProxy(c_node) if result is not None: return result if c_node is NULL: @@ -980,8 +978,7 @@ result = element_class() result._doc = doc result._c_node = c_node - result._proxy_type = PROXY_ELEMENT - registerProxy(result, PROXY_ELEMENT) + registerProxy(result) result._init() return result @@ -1038,7 +1035,7 @@ cdef _Comment _commentFactory(_Document doc, xmlNode* c_node): cdef _Comment result - result = getProxy(c_node, PROXY_ELEMENT) + result = getProxy(c_node) if result is not None: return result if c_node is NULL: @@ -1046,24 +1043,29 @@ result = _Comment() result._doc = doc result._c_node = c_node - result._proxy_type = PROXY_ELEMENT - registerProxy(result, PROXY_ELEMENT) + registerProxy(result) return result -cdef class _Attrib(_NodeBase): +cdef class _Attrib: + cdef _NodeBase _element + def __init__(self, _NodeBase element not None): + self._element = element + # MANIPULATORS def __setitem__(self, key, value): - _setAttributeValue(self, key, value) + _setAttributeValue(self._element, key, value) def __delitem__(self, key): + cdef xmlNode* c_node cdef xmlAttr* c_attr cdef char* c_tag ns, tag = _getNsTag(key) c_tag = _cstr(tag) + c_node = self._element._c_node if ns is None: - c_attr = tree.xmlHasProp(self._c_node, c_tag) + c_attr = tree.xmlHasProp(c_node, c_tag) else: - c_attr = tree.xmlHasNsProp(self._c_node, c_tag, _cstr(ns)) + c_attr = tree.xmlHasNsProp(c_node, c_tag, _cstr(ns)) if c_attr is NULL: # XXX free namespace that is not in use..? 
raise KeyError, key @@ -1077,43 +1079,46 @@ return repr(result) def __getitem__(self, key): - result = _getAttributeValue(self, key, None) + result = _getAttributeValue(self._element, key, None) if result is None: raise KeyError, key else: return result def __nonzero__(self): - cdef xmlNode* c_node - c_node = (self._c_node.properties) - while c_node is not NULL: - if c_node.type == tree.XML_ATTRIBUTE_NODE: + cdef xmlAttr* c_attr + c_attr = self._element._c_node.properties + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: return 1 - c_node = c_node.next + c_attr = c_attr.next return 0 def __len__(self): + cdef xmlAttr* c_attr cdef Py_ssize_t c - cdef xmlNode* c_node c = 0 - c_node = (self._c_node.properties) - while c_node is not NULL: - if c_node.type == tree.XML_ATTRIBUTE_NODE: + c_attr = self._element._c_node.properties + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: c = c + 1 - c_node = c_node.next + c_attr = c_attr.next return c def get(self, key, default=None): - return _getAttributeValue(self, key, default) + return _getAttributeValue(self._element, key, default) def keys(self): - result = [] cdef xmlNode* c_node - c_node = (self._c_node.properties) - while c_node is not NULL: - if c_node.type == tree.XML_ATTRIBUTE_NODE: - python.PyList_Append(result, _namespacedName(c_node)) - c_node = c_node.next + cdef xmlAttr* c_attr + c_node = self._element._c_node + c_attr = c_node.properties + result = [] + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: + python.PyList_Append( + result, _namespacedName(c_attr)) + c_attr = c_attr.next return result def __iter__(self): @@ -1124,13 +1129,15 @@ def values(self): cdef xmlNode* c_node + cdef xmlAttr* c_attr + c_node = self._element._c_node + c_attr = c_node.properties result = [] - c_node = (self._c_node.properties) - while c_node is not NULL: - if c_node.type == tree.XML_ATTRIBUTE_NODE: + while c_attr is not NULL: + if c_attr.type == 
tree.XML_ATTRIBUTE_NODE: python.PyList_Append( - result, _attributeValue(self._c_node, c_node)) - c_node = c_node.next + result, _attributeValue(c_node, c_attr)) + c_attr = c_attr.next return result def itervalues(self): @@ -1139,14 +1146,16 @@ def items(self): result = [] cdef xmlNode* c_node - c_node = (self._c_node.properties) - while c_node is not NULL: - if c_node.type == tree.XML_ATTRIBUTE_NODE: + cdef xmlAttr* c_attr + c_node = self._element._c_node + c_attr = c_node.properties + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: python.PyList_Append(result, ( - _namespacedName(c_node), - _attributeValue(self._c_node, c_node) + _namespacedName(c_attr), + _attributeValue(c_node, c_attr) )) - c_node = c_node.next + c_attr = c_attr.next return result def iteritems(self): @@ -1159,32 +1168,22 @@ return False def __contains__(self, key): + cdef xmlNode* c_node cdef char* c_result cdef char* c_tag ns, tag = _getNsTag(key) c_tag = _cstr(tag) + c_node = self._element._c_node if ns is None: - c_result = tree.xmlGetNoNsProp(self._c_node, c_tag) + c_result = tree.xmlGetNoNsProp(c_node, c_tag) else: - c_result = tree.xmlGetNsProp(self._c_node, c_tag, _cstr(ns)) + c_result = tree.xmlGetNsProp(c_node, c_tag, _cstr(ns)) if c_result is NULL: return 0 else: tree.xmlFree(c_result) return 1 -cdef _Attrib _attribFactory(_Document doc, xmlNode* c_node): - cdef _Attrib result - result = getProxy(c_node, PROXY_ATTRIB) - if result is not None: - return result - result = _Attrib() - result._doc = doc - result._c_node = c_node - result._proxy_type = PROXY_ATTRIB - registerProxy(result, PROXY_ATTRIB) - return result - ctypedef xmlNode* (*_node_to_node_function)(xmlNode*) cdef class ElementChildIterator: Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Wed May 31 08:47:37 2006 @@ -4,76 +4,37 @@ # structure of the 
respective node to avoid multiple instantiation of # the Python class -cdef struct _ProxyRef - -cdef struct _ProxyRef: - python.PyObject* proxy - LXML_PROXY_TYPE type - _ProxyRef* next - -ctypedef _ProxyRef ProxyRef - -cdef _NodeBase getProxy(xmlNode* c_node, int proxy_type): - """Get a proxy for a given node and node type. +cdef _NodeBase getProxy(xmlNode* c_node): + """Get a proxy for a given node. """ - cdef ProxyRef* ref #print "getProxy for:", c_node - if c_node is NULL: + if c_node is not NULL and c_node._private is not NULL: + return <_NodeBase>c_node._private + else: return None - ref = c_node._private - while ref is not NULL: - if ref.type == proxy_type: - return <_NodeBase>ref.proxy - ref = ref.next - return None cdef int hasProxy(xmlNode* c_node): return c_node._private is not NULL -cdef void registerProxy(_NodeBase proxy, int proxy_type): +cdef registerProxy(_NodeBase proxy): """Register a proxy and type for the node it's proxying for. """ cdef xmlNode* c_node - cdef ProxyRef* ref # cannot register for NULL c_node = proxy._c_node if c_node is NULL: return - # XXX should we check whether we ran into proxy_type before? #print "registering for:", proxy._c_node - ref = python.PyMem_Malloc(sizeof(ProxyRef)) - ref.proxy = proxy - ref.type = proxy_type - ref.next = c_node._private - c_node._private = ref # prepend + assert c_node._private is NULL, "double registering proxy!" + c_node._private = proxy -cdef void unregisterProxy(_NodeBase proxy): +cdef unregisterProxy(_NodeBase proxy): """Unregister a proxy for the node it's proxying for. 
""" - cdef python.PyObject* proxy_ref - cdef ProxyRef* ref - cdef ProxyRef* prev_ref cdef xmlNode* c_node - proxy_ref = proxy c_node = proxy._c_node - ref = c_node._private - if ref.proxy == proxy_ref: - c_node._private = ref.next - python.PyMem_Free(ref) - return - prev_ref = ref - #print "First registered is:", ref.type - ref = ref.next - while ref is not NULL: - #print "Registered is:", ref.type - if ref.proxy == proxy_ref: - prev_ref.next = ref.next - python.PyMem_Free(ref) - return - prev_ref = ref - ref = ref.next - #print "Proxy:", proxy, "Proxy type:", proxy_type - assert 0, "Tried to unregister unknown proxy" + assert c_node._private is proxy, "Tried to unregister unknown proxy" + c_node._private = NULL ################################################################################ # temporarily make a node the root node of its document @@ -169,7 +130,7 @@ c_top = c_current c_current = c_current.parent # see whether we have children to deallocate - if canDeallocateChildren(c_top): + if canDeallocateChildNodes(c_top): return c_top else: return NULL @@ -178,38 +139,43 @@ cdef xmlNode* c_current c_current = c_node.children while c_current is not NULL: - if c_current._private is not NULL: - return 0 - if not canDeallocateChildren(c_current): - return 0 + if _isElement(c_current): + if c_current._private is not NULL: + return 0 + if not canDeallocateChildNodes(c_current): + return 0 c_current = c_current.next return 1 -cdef int canDeallocateAttributes(xmlNode* c_node): - cdef xmlAttr* c_current - c_current = c_node.properties +################################################################################ +# change _Document references when a node changes documents + +cdef void moveNodeToDocument(_NodeBase node, _Document doc): + """For a node and all nodes below, change document. + + A node can change document in certain operations as an XML + subtree can move. This updates all possible proxies in the + tree below (including the current node). 
It also reconciliates + namespaces so they're correct inside the new environment. + """ + tree.xmlReconciliateNs(doc._c_doc, node._c_node) + if node._doc is not doc: + changeDocumentBelow(node._c_node, doc) + +cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): + """Update the Python references in the tree below the node. + + Note that we expect C pointers to the document to be updated already by + libxml2. + """ + cdef xmlNode* c_current + # adjust all children recursively + c_current = c_node.children while c_current is not NULL: - if c_current._private is not NULL: - return 0 - # only check child nodes, don't try checking properties as - # attribute has none - if not canDeallocateChildNodes(c_current): - return 0 + if _isElement(c_current): + changeDocumentBelow(c_current, doc) c_current = c_current.next - # apparently we can deallocate all subnodes - return 1 - -cdef int canDeallocateChildren(xmlNode* c_node): - # the current implementation is inefficient as it does a - # tree traversal to find out whether there are any node proxies - # we could improve this by a smarter datastructure - # check children - if not canDeallocateChildNodes(c_node): - return 0 - # check any attributes - if (c_node.type == tree.XML_ELEMENT_NODE and - not canDeallocateAttributes(c_node)): - return 0 - # apparently we can deallocate all subnodes - return 1 + # adjust Python reference of current node + if c_node._private is not NULL: + (<_NodeBase>c_node._private)._doc = doc From scoder at codespeak.net Wed May 31 09:38:31 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 09:38:31 +0200 (CEST) Subject: [Lxml-checkins] r27947 - in lxml/trunk: doc src/lxml Message-ID: <20060531073831.BFE6810053@code0.codespeak.net> Author: scoder Date: Wed May 31 09:38:15 2006 New Revision: 27947 Modified: lxml/trunk/doc/performance.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.h lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/tree.pxd Log: 
C macro implementation of an iterative tree walker: reduces code duplication between various functions and speeds up tree walking operations by up to 30% (deallocation, iteration, etc.) Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Wed May 31 09:38:15 2006 @@ -194,37 +194,37 @@ especially if few elements are of interest or the element tag name is known, lxml is a good choice:: - lxe: getiterator_all (-- T2 ) 31.2719 msec/pass + lxe: getiterator_all (-- T2 ) 23.0440 msec/pass cET: getiterator_all (-- T2 ) 36.3687 msec/pass ET : getiterator_all (-- T2 ) 46.2846 msec/pass - lxe: getiterator_islice (-- T2 ) 2.8503 msec/pass + lxe: getiterator_islice (-- T2 ) 2.0699 msec/pass cET: getiterator_islice (-- T2 ) 0.3299 msec/pass ET : getiterator_islice (-- T2 ) 44.5898 msec/pass - lxe: getiterator_tag (-- T2 ) 3.0983 msec/pass + lxe: getiterator_tag (-- T2 ) 1.9176 msec/pass cET: getiterator_tag (-- T2 ) 11.2861 msec/pass ET : getiterator_tag (-- T2 ) 37.5661 msec/pass - lxe: getiterator_tag_all (-- T2 ) 4.9760 msec/pass + lxe: getiterator_tag_all (-- T2 ) 4.5722 msec/pass cET: getiterator_tag_all (-- T2 ) 33.2602 msec/pass ET : getiterator_tag_all (-- T2 ) 37.6200 msec/pass This similarly shows in ``Element.findall()``:: - lxe: findall (-- T2 ) 36.4730 msec/pass + lxe: findall (-- T2 ) 27.3874 msec/pass cET: findall (-- T2 ) 38.8718 msec/pass ET : findall (-- T2 ) 50.9692 msec/pass - lxe: findall (-- T3 ) 4.3956 msec/pass + lxe: findall (-- T3 ) 3.8227 msec/pass cET: findall (-- T3 ) 11.8051 msec/pass ET : findall (-- T3 ) 11.2570 msec/pass - lxe: findall_tag (-- T2 ) 4.3950 msec/pass + lxe: findall_tag (-- T2 ) 4.5549 msec/pass cET: findall_tag (-- T2 ) 31.3107 msec/pass ET : findall_tag (-- T2 ) 36.7813 msec/pass - lxe: findall_tag (-- T3 ) 0.5946 msec/pass + lxe: findall_tag (-- T3 ) 0.5643 msec/pass cET: findall_tag 
(-- T3 ) 7.4491 msec/pass ET : findall_tag (-- T3 ) 9.2943 msec/pass Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 31 09:38:15 2006 @@ -267,34 +267,13 @@ 2) its descendents 3) its following siblings. """ - cdef xmlNode* c_next - cdef xmlNode* c_start_parent if c_name is NULL: # always match return c_node - if c_node is NULL: - return NULL - c_start_parent = c_node.parent - while c_node is not NULL: - if _tagMatches(c_node, c_href, c_name): - return c_node - # walk through children - c_next = c_node.children - if c_next is NULL: - # sibling? - c_next = _nextElement(c_node) - elif not _isElement(c_next): - # we need an element - c_next = _nextElement(c_next) - if c_next is NULL: - c_next = _nextElement(c_node) - # back off through parents - while c_next is NULL: - c_node = c_node.parent - if c_node is c_start_parent: - return NULL - c_next = _nextElement(c_node) - c_node = c_next + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) + if _tagMatches(c_node, c_href, c_name): + return c_node + tree.END_FOR_EACH_ELEMENT_FROM(c_node) return NULL cdef int _tagMatches(xmlNode* c_node, char* c_href, char* c_name): Modified: lxml/trunk/src/lxml/etree.h ============================================================================== --- lxml/trunk/src/lxml/etree.h (original) +++ lxml/trunk/src/lxml/etree.h Wed May 31 09:38:15 2006 @@ -1,6 +1,10 @@ #ifndef HAS_ETREE_H #define HAS_ETREE_H +/* v_arg functions */ +#define va_int(ap) va_arg(ap, int) +#define va_charptr(ap) va_arg(ap, char *) + /* Py_ssize_t support was added in Python 2.5 */ #if PY_VERSION_HEX < 0x02050000 #ifndef PY_SSIZE_T_MAX /* patched Pyrex? 
*/ @@ -19,12 +23,61 @@ #define str(o) PyObject_Str(o) #define iter(o) PyObject_GetIter(o) #define _cstr(s) PyString_AS_STRING(s) + #define _isElement(c_node) \ ((c_node)->type == XML_ELEMENT_NODE || \ (c_node)->type == XML_COMMENT_NODE) -/* v_arg functions */ -#define va_int(ap) va_arg(ap, int) -#define va_charptr(ap) va_arg(ap, char *) +/* Macro set implementation of a depth first tree walker + * + * Calls the code block between the BEGIN and END macros + * 1) for the start element (or the first 'element' sibling) + * 2) for all children (recursively) + * 3) all siblings (recursively) + * + * Usage in Pyrex: + * cdef xmlNode* some_node + * some_node = parent_node.children + * BEGIN_FOR_EACH_ELEMENT_FROM(some_node) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) + * + * NOTE: 'some_node' MUST be a plain 'xmlNode*' ! + * NOTE: parent modification during the walk will segfault ! + */ + +#define BEGIN_FOR_EACH_ELEMENT_FROM(c_node) \ +{ \ + while ((c_node != 0) && (!_isElement(c_node))) \ + c_node = c_node->next; \ + if (c_node != 0) { \ + xmlNode* ___start_parent = c_node->parent; \ + xmlNode* ___next; \ + while (c_node != 0) { + /* here goes the code to be run for each element */ +#define END_FOR_EACH_ELEMENT_FROM(c_node) \ + /* walk through children */ \ + ___next = c_node->children; \ + while ((___next != 0) && (!_isElement(___next))) \ + ___next = ___next->next; \ + if (___next == 0) { \ + /* try siblings */ \ + ___next = c_node->next; \ + while ((___next != 0) && (!_isElement(___next))) \ + ___next = ___next->next; \ + } \ + /* back off through parents */ \ + while (___next == 0) { \ + c_node = c_node->parent; \ + if (c_node == ___start_parent) \ + break; \ + ___next = c_node->next; \ + while ((___next != 0) && (!_isElement(___next))) \ + ___next = ___next->next; \ + } \ + c_node = ___next; \ + } \ + } \ +} #endif /*HAS_ETREE_H*/ Modified: lxml/trunk/src/lxml/proxy.pxi 
============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Wed May 31 09:38:15 2006 @@ -136,15 +136,11 @@ return NULL cdef int canDeallocateChildNodes(xmlNode* c_node): - cdef xmlNode* c_current - c_current = c_node.children - while c_current is not NULL: - if _isElement(c_current): - if c_current._private is not NULL: - return 0 - if not canDeallocateChildNodes(c_current): - return 0 - c_current = c_current.next + c_node = c_node.children + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) + if c_node._private is not NULL: + return 0 + tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 ################################################################################ @@ -160,22 +156,18 @@ """ tree.xmlReconciliateNs(doc._c_doc, node._c_node) if node._doc is not doc: + node._doc = doc changeDocumentBelow(node._c_node, doc) cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): """Update the Python references in the tree below the node. + Does not update the node itself. Note that we expect C pointers to the document to be updated already by libxml2. 
""" - cdef xmlNode* c_current - # adjust all children recursively - c_current = c_node.children - while c_current is not NULL: - if _isElement(c_current): - changeDocumentBelow(c_current, doc) - c_current = c_current.next - - # adjust Python reference of current node + c_node = c_node.children + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) if c_node._private is not NULL: (<_NodeBase>c_node._private)._doc = doc + tree.END_FOR_EACH_ELEMENT_FROM(c_node) Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Wed May 31 09:38:15 2006 @@ -248,3 +248,5 @@ cdef extern from "etree.h": cdef int _isElement(xmlNode* node) + cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* node) + cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* node) From scoder at codespeak.net Wed May 31 09:54:00 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 09:54:00 +0200 (CEST) Subject: [Lxml-checkins] r27948 - lxml/trunk/doc Message-ID: <20060531075400.4EF5510053@code0.codespeak.net> Author: scoder Date: Wed May 31 09:53:48 2006 New Revision: 27948 Modified: lxml/trunk/doc/performance.txt Log: updated benchmark results Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Wed May 31 09:53:48 2006 @@ -90,10 +90,10 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.1360 0.1236 0.1241 0.1243 0.1261 0.1254 - T2: 0.1281 0.1282 0.1299 0.1381 0.1389 0.1395 - T3: 0.0366 0.0300 0.0290 0.0850 0.0851 0.0893 - T4: 0.0010 0.0006 0.0006 0.0018 0.0018 0.0019 + T1: 0.1360 0.1214 0.1214 0.1217 0.1232 0.1226 + T2: 0.1258 0.1257 0.1250 0.1348 0.1359 0.1358 + T3: 0.0354 0.0282 0.0288 0.0850 0.0860 0.0862 + T4: 0.0006 0.0006 0.0006 0.0019 0.0018 0.0019 cET: -- S- U- -A SA UA T1: 0.0417 0.0409 0.0403 0.0410 0.0410 
0.0415 @@ -125,13 +125,13 @@ are supposed to end up in, either as SubElements of an Element or using the explicit ``Element.makeelement()`` call:: - lxe: makeelement (-- T2 ) 4.3003 msec/pass - cET: makeelement (-- T2 ) 0.5520 msec/pass - ET : makeelement (-- T2 ) 3.8092 msec/pass - - lxe: create_subelements (-- T2 ) 3.9673 msec/pass - cET: create_subelements (-- T2 ) 0.5666 msec/pass - ET : create_subelements (-- T2 ) 6.4613 msec/pass + lxe: makeelement (-- T2 ) 4.2658 msec/pass + cET: makeelement (-- T2 ) 0.5658 msec/pass + ET : makeelement (-- T2 ) 3.7136 msec/pass + + lxe: create_subelements (-- T2 ) 3.7640 msec/pass + cET: create_subelements (-- T2 ) 0.5332 msec/pass + ET : create_subelements (-- T2 ) 6.5937 msec/pass So, if the main performance bottleneck of an application is creating large XML trees in memory through calls to Element and SubElement, cET is the best @@ -148,9 +148,9 @@ cET: append_from_document (-- T1,T2) 0.4673 msec/pass ET : append_from_document (-- T1,T2) 2.0460 msec/pass - lxe: append_from_document (-- T3,T4) 0.2017 msec/pass - cET: append_from_document (-- T3,T4) 0.0227 msec/pass - ET : append_from_document (-- T3,T4) 0.1563 msec/pass + lxe: append_from_document (-- T3,T4) 0.1582 msec/pass + cET: append_from_document (-- T3,T4) 0.0224 msec/pass + ET : append_from_document (-- T3,T4) 0.1618 msec/pass Although these are fairly small numbers compared to parsing, this easily shows the different performance classes for lxml and (c)ET. 
Where the latter do not @@ -161,9 +161,9 @@ This difference is not always as visible, but applies to most parts of the API, like inserting newly created elements:: - lxe: insert_from_document (-- T1,T2) 16.4772 msec/pass - cET: insert_from_document (-- T1,T2) 1.1874 msec/pass - ET : insert_from_document (-- T1,T2) 3.5447 msec/pass + lxe: insert_from_document (-- T1,T2) 16.2342 msec/pass + cET: insert_from_document (-- T1,T2) 1.1786 msec/pass + ET : insert_from_document (-- T1,T2) 3.6107 msec/pass Or replacing the child slice by a new element:: From scoder at codespeak.net Wed May 31 09:55:04 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 09:55:04 +0200 (CEST) Subject: [Lxml-checkins] r27949 - lxml/trunk/doc Message-ID: <20060531075504.1DD7C10053@code0.codespeak.net> Author: scoder Date: Wed May 31 09:54:52 2006 New Revision: 27949 Modified: lxml/trunk/doc/performance.txt Log: doc updates Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Wed May 31 09:54:52 2006 @@ -13,8 +13,8 @@ The statements made here are backed by the benchmark script `bench.py`_ that comes with the lxml source distribution. The timings cited below compare lxml -1.0, ElementTree 1.2.6 and cElementTree 1.0.5 under CPython 2.4.2 on an AMD64 -machine. +1.0, ElementTree 1.2.6 and cElementTree 1.0.5 under CPython 2.4.2 on a 1.6GHz +AMD64 machine. .. 
_`bench.py`: http://codespeak.net/svn/lxml/trunk/bench.py From scoder at codespeak.net Wed May 31 10:35:29 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 10:35:29 +0200 (CEST) Subject: [Lxml-checkins] r27951 - lxml/trunk/src/lxml Message-ID: <20060531083529.BA1491005A@code0.codespeak.net> Author: scoder Date: Wed May 31 10:35:28 2006 New Revision: 27951 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: cleanup Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 31 10:35:28 2006 @@ -249,16 +249,13 @@ c_node = c_node.children if c_node is NULL: return NULL - if not _isElement(c_node): - c_node = _nextElement(c_node) return _findDepthFirstInFollowing(c_node, c_href, c_name) cdef xmlNode* _findDepthFirstInFollowingSiblings(xmlNode* c_node, char* c_href, char* c_name): if c_node is NULL: return NULL - c_node = _nextElement(c_node) - return _findDepthFirstInFollowing(c_node, c_href, c_name) + return _findDepthFirstInFollowing(c_node.next, c_href, c_name) cdef xmlNode* _findDepthFirstInFollowing(xmlNode* c_node, char* c_href, char* c_name): @@ -267,9 +264,6 @@ 2) its descendents 3) its following siblings. 
""" - if c_name is NULL: - # always match - return c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) if _tagMatches(c_node, c_href, c_name): return c_node From scoder at codespeak.net Wed May 31 10:38:27 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 10:38:27 +0200 (CEST) Subject: [Lxml-checkins] r27952 - lxml/trunk Message-ID: <20060531083827.9A46F1005A@code0.codespeak.net> Author: scoder Date: Wed May 31 10:38:25 2006 New Revision: 27952 Modified: lxml/trunk/CHANGES.txt Log: updated CHANGES.txt for speedups Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 31 10:38:25 2006 @@ -7,6 +7,10 @@ Features added -------------- +* Another speedup in tree iteration code + +* General speedup of Python Element object creation and deallocation + * Writing C14N no longer serializes in memory (reduced memory footprint) * PyErrorLog for error logging through the Python ``logging`` module From scoder at codespeak.net Wed May 31 15:17:09 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 15:17:09 +0200 (CEST) Subject: [Lxml-checkins] r27980 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20060531131709.7F3401005A@code0.codespeak.net> Author: scoder Date: Wed May 31 15:17:05 2006 New Revision: 27980 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/performance.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.h lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: generalized tree walker to merge code also with ElementDepthFirstIterator, support '{ns}*' in filter Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 31 15:17:05 2006 @@ -7,6 +7,9 @@ Features added -------------- +* Element.getiterator() supports iterating through namespaces with the tag + expression '{namespace}*' + * Another speedup in tree iteration code * General speedup of Python Element object creation and deallocation @@ -28,6 +31,8 @@ Bugs fixed ---------- +* Element now raises ValueError on empty tag names + * Namespace fixing after moving elements between documents could fail if the source document was freed too early Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Wed May 31 15:17:05 2006 @@ -194,38 +194,38 @@ especially if few elements are of interest or the element tag name is known, lxml is a good choice:: - lxe: getiterator_all (-- T2 ) 23.0440 msec/pass - cET: getiterator_all (-- T2 ) 36.3687 msec/pass + lxe: getiterator_all (-- T2 ) 22.5847 msec/pass + cET: getiterator_all (-- T2 ) 36.8212 msec/pass ET : getiterator_all (-- T2 ) 46.2846 msec/pass - lxe: getiterator_islice (-- T2 ) 2.0699 msec/pass - cET: getiterator_islice (-- T2 ) 0.3299 msec/pass + lxe: getiterator_islice (-- T2 ) 2.0421 msec/pass + cET: getiterator_islice (-- T2 ) 0.3343 msec/pass ET : 
getiterator_islice (-- T2 ) 44.5898 msec/pass - lxe: getiterator_tag (-- T2 ) 1.9176 msec/pass - cET: getiterator_tag (-- T2 ) 11.2861 msec/pass + lxe: getiterator_tag (-- T2 ) 1.9593 msec/pass + cET: getiterator_tag (-- T2 ) 11.7767 msec/pass ET : getiterator_tag (-- T2 ) 37.5661 msec/pass - lxe: getiterator_tag_all (-- T2 ) 4.5722 msec/pass - cET: getiterator_tag_all (-- T2 ) 33.2602 msec/pass + lxe: getiterator_tag_all (-- T2 ) 4.5667 msec/pass + cET: getiterator_tag_all (-- T2 ) 33.5681 msec/pass ET : getiterator_tag_all (-- T2 ) 37.6200 msec/pass This similarly shows in ``Element.findall()``:: - lxe: findall (-- T2 ) 27.3874 msec/pass - cET: findall (-- T2 ) 38.8718 msec/pass + lxe: findall (-- T2 ) 26.9907 msec/pass + cET: findall (-- T2 ) 39.1728 msec/pass ET : findall (-- T2 ) 50.9692 msec/pass - lxe: findall (-- T3 ) 3.8227 msec/pass - cET: findall (-- T3 ) 11.8051 msec/pass + lxe: findall (-- T3 ) 3.6452 msec/pass + cET: findall (-- T3 ) 12.0210 msec/pass ET : findall (-- T3 ) 11.2570 msec/pass - lxe: findall_tag (-- T2 ) 4.5549 msec/pass - cET: findall_tag (-- T2 ) 31.3107 msec/pass + lxe: findall_tag (-- T2 ) 4.6065 msec/pass + cET: findall_tag (-- T2 ) 34.0267 msec/pass ET : findall_tag (-- T2 ) 36.7813 msec/pass - lxe: findall_tag (-- T3 ) 0.5643 msec/pass - cET: findall_tag (-- T3 ) 7.4491 msec/pass + lxe: findall_tag (-- T3 ) 0.5884 msec/pass + cET: findall_tag (-- T3 ) 7.6307 msec/pass ET : findall_tag (-- T3 ) 9.2943 msec/pass Note that all three libraries currently use the same Python implementation for Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed May 31 15:17:05 2006 @@ -242,39 +242,16 @@ c_node = c_node.prev return NULL -cdef xmlNode* _findDepthFirstInDescendents(xmlNode* c_node, - char* c_href, char* c_name): - if c_node is NULL: - return NULL - c_node = c_node.children - if 
c_node is NULL: - return NULL - return _findDepthFirstInFollowing(c_node, c_href, c_name) - -cdef xmlNode* _findDepthFirstInFollowingSiblings(xmlNode* c_node, - char* c_href, char* c_name): - if c_node is NULL: - return NULL - return _findDepthFirstInFollowing(c_node.next, c_href, c_name) - -cdef xmlNode* _findDepthFirstInFollowing(xmlNode* c_node, - char* c_href, char* c_name): - """Find the next matching node by traversing: - 1) the node itself - 2) its descendents - 3) its following siblings. - """ - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) - if _tagMatches(c_node, c_href, c_name): - return c_node - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - return NULL - cdef int _tagMatches(xmlNode* c_node, char* c_href, char* c_name): if c_name is NULL: - # always match - return 1 - if c_href is NULL: + if c_href is NULL: + # always match + return 1 + elif c_node.ns is NULL or c_node.ns.href is NULL: + return 0 + else: + return cstd.strcmp(c_node.ns.href, c_href) == 0 + elif c_href is NULL: if c_node.ns is not NULL and c_node.ns.href is not NULL: return 0 return cstd.strcmp(c_node.name, c_name) == 0 @@ -363,10 +340,11 @@ raise ValueError, "Invalid tag name" nslen = c_ns_end - c_tag taglen = python.PyString_GET_SIZE(tag) - nslen - 2 - ns = python.PyString_FromStringAndSize(c_tag, nslen) + if taglen == 0: + raise ValueError, "Empty tag name" + if nslen > 0: + ns = python.PyString_FromStringAndSize(c_tag, nslen) tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen) - else: - ns = None return ns, tag cdef object _namespacedName(xmlNode* c_node): Modified: lxml/trunk/src/lxml/etree.h ============================================================================== --- lxml/trunk/src/lxml/etree.h (original) +++ lxml/trunk/src/lxml/etree.h Wed May 31 15:17:05 2006 @@ -28,56 +28,109 @@ ((c_node)->type == XML_ELEMENT_NODE || \ (c_node)->type == XML_COMMENT_NODE) -/* Macro set implementation of a depth first tree walker +/* Macro pair implementation of a depth first tree walker * - * 
Calls the code block between the BEGIN and END macros - * 1) for the start element (or the first 'element' sibling) - * 2) for all children (recursively) - * 3) all siblings (recursively) + * Calls the code block between the BEGIN and END macros for all elements + * below c_tree_top (exclusively), starting at c_node (inclusively iff + * 'inclusive' is 1). + * + * To traverse the node and all of its children and siblings in Pyrex, call + * cdef xmlNode* some_node + * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) * - * Usage in Pyrex: + * To traverse only the children and siblings of a node, call + * cdef xmlNode* some_node + * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0) + * # do something with some_node + * END_FOR_EACH_ELEMENT_FROM(some_node) + * + * To traverse only the children, do: * cdef xmlNode* some_node * some_node = parent_node.children - * BEGIN_FOR_EACH_ELEMENT_FROM(some_node) + * BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1) * # do something with some_node * END_FOR_EACH_ELEMENT_FROM(some_node) * * NOTE: 'some_node' MUST be a plain 'xmlNode*' ! - * NOTE: parent modification during the walk will segfault ! + * + * NOTE: parent modification during the walk can divert the iterator, but + * should not segfault ! 
*/ -#define BEGIN_FOR_EACH_ELEMENT_FROM(c_node) \ -{ \ - while ((c_node != 0) && (!_isElement(c_node))) \ - c_node = c_node->next; \ - if (c_node != 0) { \ - xmlNode* ___start_parent = c_node->parent; \ - xmlNode* ___next; \ - while (c_node != 0) { +#define ADVANCE_TO_NEXT_ELEMENT(c_node) \ + while ((c_node != 0) && (!_isElement(c_node))) \ + c_node = c_node->next; + +#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \ +{ \ + xmlNode* ___next; \ + const xmlNode* ___tree_top = (c_tree_top); \ + /* make sure we have an element or NULL */ \ + if (c_node != 0) { \ + if (!_isElement(c_node)) { \ + /* we skip the node, so 'inclusive' is irrelevant */ \ + if (c_node == ___tree_top) \ + c_node = 0; /* nothing to traverse */ \ + else { \ + c_node = c_node->next; \ + ADVANCE_TO_NEXT_ELEMENT(c_node) \ + } \ + } else if (! (inclusive)) { \ + /* duplicated for speed: find the second node */ \ + /* walk through children */ \ + ___next = c_node->children; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + if ((___next == 0) && (c_node != ___tree_top)) { \ + /* try siblings */ \ + ___next = c_node->next; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + /* back off through parents */ \ + while (___next == 0) { \ + c_node = c_node->parent; \ + if (c_node == 0) \ + break; \ + if (c_node == ___tree_top) \ + break; \ + if (!_isElement(c_node)) \ + break; \ + ___next = c_node->next; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + } \ + } \ + c_node = ___next; \ + } \ + \ + /* now run the user code on the elements we find */ \ + while (c_node != 0) { \ /* here goes the code to be run for each element */ -#define END_FOR_EACH_ELEMENT_FROM(c_node) \ - /* walk through children */ \ - ___next = c_node->children; \ - while ((___next != 0) && (!_isElement(___next))) \ - ___next = ___next->next; \ - if (___next == 0) { \ - /* try siblings */ \ - ___next = c_node->next; \ - while ((___next != 0) && (!_isElement(___next))) \ - ___next = ___next->next; \ - } \ - /* back off through parents */ \ - while 
(___next == 0) { \ - c_node = c_node->parent; \ - if (c_node == ___start_parent) \ - break; \ - ___next = c_node->next; \ - while ((___next != 0) && (!_isElement(___next))) \ - ___next = ___next->next; \ - } \ - c_node = ___next; \ - } \ - } \ + +#define END_FOR_EACH_ELEMENT_FROM(c_node) \ + /* walk through children */ \ + ___next = c_node->children; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + if ((___next == 0) && (c_node != ___tree_top)) { \ + /* try siblings */ \ + ___next = c_node->next; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + /* back off through parents */ \ + while (___next == 0) { \ + c_node = c_node->parent; \ + if (c_node == 0) \ + break; \ + if (c_node == ___tree_top) \ + break; \ + if (!_isElement(c_node)) \ + break; \ + ___next = c_node->next; \ + ADVANCE_TO_NEXT_ELEMENT(___next) \ + } \ + } \ + c_node = ___next; \ + } \ + } \ } + #endif /*HAS_ETREE_H*/ Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed May 31 15:17:05 2006 @@ -1230,15 +1230,13 @@ # we keep Python references here to control GC # keep next node to return and a depth counter in the tree cdef _NodeBase _next_node - cdef Py_ssize_t _depth + cdef _NodeBase _top_node cdef object _pystrings cdef char* _href cdef char* _name def __init__(self, _NodeBase node not None, tag=None): + self._top_node = node self._next_node = node - self._depth = 0 - if tag == '*': - tag = None if tag is None: self._href = NULL self._name = NULL @@ -1249,10 +1247,11 @@ else: self._href = _cstr(self._pystrings[0]) self._name = _cstr(self._pystrings[1]) - - if not _tagMatches(node._c_node, self._href, self._name): - # this cannot raise StopIteration, self._next_node != None - self.next() + if cstd.strcmp(self._name, '*') == 0: + self._name = NULL + if not _tagMatches(node._c_node, self._href, self._name): + # this cannot raise StopIteration, self._next_node != None + 
self.next() def __iter__(self): return self @@ -1262,48 +1261,30 @@ current_node = self._next_node if current_node is None: raise StopIteration - self._prepareNextNode() + if self._name is NULL and self._href is NULL: + self._prepareNextNodeAnyTag() + else: + self._prepareNextNodeMatchTag() return current_node - cdef void _prepareNextNode(self): - cdef _NodeBase node + cdef void _prepareNextNodeAnyTag(self): + cdef xmlNode* c_node + c_node = self._next_node._c_node + tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0) + self._next_node = _elementFactory(self._next_node._doc, c_node) + return + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + self._next_node = None + + cdef void _prepareNextNodeMatchTag(self): cdef xmlNode* c_node - cdef xmlNode* c_next_node - cdef xmlNode* c_parent - # find in descendants - node = self._next_node - c_parent = node._c_node - c_node = _findDepthFirstInDescendents(c_parent, self._href, self._name) - if c_node is NULL: - if self._depth < 1: - # nothing left to traverse - self._next_node = None - return - # try siblings - c_node = _findDepthFirstInFollowingSiblings( - c_parent, self._href, self._name) - - while c_node is NULL and self._depth > 1: - # walk up the parent pointers and continue with their siblings - c_parent = c_parent.parent - self._depth = self._depth - 1 - if c_parent is NULL or not _isElement(c_parent): - break - c_node = _findDepthFirstInFollowingSiblings( - c_parent, self._href, self._name) - - if c_node is NULL or not _isElement(c_parent): - self._next_node = None - return # all found, nothing left - # we are at a sibling, so set c_parent to our parent - c_parent = c_parent.parent - - c_next_node = c_node - # fix depth counter by looking up path to original parent - while c_node is not c_parent: - self._depth = self._depth + 1 - c_node = c_node.parent - self._next_node = _elementFactory(node._doc, c_next_node) + c_node = self._next_node._c_node + tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, 
c_node, 0) + if _tagMatches(c_node, self._href, self._name): + self._next_node = _elementFactory(self._next_node._doc, c_node) + return + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + self._next_node = None cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Wed May 31 15:17:05 2006 @@ -135,9 +135,10 @@ else: return NULL -cdef int canDeallocateChildNodes(xmlNode* c_node): - c_node = c_node.children - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) +cdef int canDeallocateChildNodes(xmlNode* c_parent): + cdef xmlNode* c_node + c_node = c_parent.children + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_parent, c_node, 1) if c_node._private is not NULL: return 0 tree.END_FOR_EACH_ELEMENT_FROM(c_node) @@ -159,15 +160,16 @@ node._doc = doc changeDocumentBelow(node._c_node, doc) -cdef void changeDocumentBelow(xmlNode* c_node, _Document doc): +cdef void changeDocumentBelow(xmlNode* c_parent, _Document doc): """Update the Python references in the tree below the node. Does not update the node itself. Note that we expect C pointers to the document to be updated already by libxml2. 
""" - c_node = c_node.children - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node) + cdef xmlNode* c_node + c_node = c_parent.children + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_parent, c_node, 1) if c_node._private is not NULL: (<_NodeBase>c_node._private)._doc = doc tree.END_FOR_EACH_ELEMENT_FROM(c_node) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 31 15:17:05 2006 @@ -28,6 +28,16 @@ self.assert_(etree.__version__.startswith( str(etree.LXML_VERSION[0]))) + def test_element_names(self): + Element = self.etree.Element + + el = Element('name') + self.assertEquals(el.tag, 'name') + el = Element('{}name') + self.assertEquals(el.tag, 'name') + self.assertRaises(ValueError, Element, '{test}') + self.assertRaises(ValueError, setattr, el, 'tag', '{test}') + def test_parse_error(self): parse = self.etree.parse # from StringIO @@ -436,6 +446,33 @@ '', self._writeElement(e)) + def test_getiterator_filter_namespace(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('{a}a') + b = SubElement(a, '{a}b') + c = SubElement(a, '{a}c') + d = SubElement(b, '{b}d') + e = SubElement(c, '{a}e') + f = SubElement(c, '{b}f') + + self.assertEquals( + [a], + list(a.getiterator('{a}a'))) + self.assertEquals( + [], + list(a.getiterator('{b}a'))) + self.assertEquals( + [], + list(a.getiterator('a'))) + self.assertEquals( + [f], + list(c.getiterator('{b}*'))) + self.assertEquals( + [d, f], + list(a.getiterator('{b}*'))) + def test_index(self): etree = self.etree e = etree.Element('foo') Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Wed May 31 15:17:05 2006 @@ -248,5 +248,6 @@ cdef extern from "etree.h": cdef int _isElement(xmlNode* node) 
- cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* node) - cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* node) + cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* tree_top, + xmlNode* start_node, int inclusive) + cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* start_node) From scoder at codespeak.net Wed May 31 15:57:12 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 15:57:12 +0200 (CEST) Subject: [Lxml-checkins] r27989 - lxml/www Message-ID: <20060531135712.42A6110060@code0.codespeak.net> Author: scoder Date: Wed May 31 15:57:10 2006 New Revision: 27989 Modified: lxml/www/publish.py Log: new doc files: performance.txt, resolvers.txt Modified: lxml/www/publish.py ============================================================================== --- lxml/www/publish.py (original) +++ lxml/www/publish.py Wed May 31 15:57:10 2006 @@ -9,7 +9,7 @@ for name in ['main.txt', 'intro.txt', 'api.txt', 'compatibility.txt', 'extensions.txt', 'namespace_extensions.txt', 'sax.txt', - 'build.txt']: + 'build.txt', 'performance.txt', 'resolvers.txt']: path = os.path.join(lxml_path, 'doc', name) outname = os.path.splitext(name)[0] + '.html' outpath = os.path.join(dirname, outname) From scoder at codespeak.net Wed May 31 16:00:36 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 16:00:36 +0200 (CEST) Subject: [Lxml-checkins] r27990 - lxml/trunk Message-ID: <20060531140036.6F54510060@code0.codespeak.net> Author: scoder Date: Wed May 31 16:00:35 2006 New Revision: 27990 Modified: lxml/trunk/CHANGES.txt lxml/trunk/version.txt Log: rest fixes and version bump to 1.0 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 31 16:00:35 2006 @@ -1,14 +1,14 @@ lxml changelog ============== -current -======= +1.0 (2006-06-01) +================ Features added -------------- * Element.getiterator() supports 
iterating through namespaces with the tag - expression '{namespace}*' + expression ``{namespace}*`` * Another speedup in tree iteration code @@ -53,7 +53,7 @@ * Running absolute XPath expressions on an Element now evaluates against the root tree -* Evaluating absolute XPath expressions ('/*') on an ElementTree could fail +* Evaluating absolute XPath expressions (``/*``) on an ElementTree could fail * Crashes when calling XSLT, RelaxNG, etc. with uninitialized ElementTree objects Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Wed May 31 16:00:35 2006 @@ -1 +1 @@ -1.0.beta +1.0 From scoder at codespeak.net Wed May 31 18:25:17 2006 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 31 May 2006 18:25:17 +0200 (CEST) Subject: [Lxml-checkins] r28019 - in lxml/trunk: . src/lxml/tests Message-ID: <20060531162517.1AD9A1005A@code0.codespeak.net> Author: scoder Date: Wed May 31 18:25:15 2006 New Revision: 28019 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_etree.py Log: test case for '{namespace}*' pattern in findall() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed May 31 18:25:15 2006 @@ -7,8 +7,8 @@ Features added -------------- -* Element.getiterator() supports iterating through namespaces with the tag - expression ``{namespace}*`` +* Element.getiterator() and the findall() methods support finding arbitrary + elements from a namespace (pattern ``{namespace}*``) * Another speedup in tree iteration code Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed May 31 18:25:15 2006 @@ -473,6 +473,13 @@ [d, f], 
list(a.getiterator('{b}*'))) + def test_findall_ns(self): + XML = self.etree.XML + root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>') + self.assertEquals(len(root.findall(".//{X}b")), 2) + self.assertEquals(len(root.findall(".//{X}*")), 2) + self.assertEquals(len(root.findall(".//b")), 3) + def test_index(self): etree = self.etree e = etree.Element('foo')