From scoder at codespeak.net Thu Jan 3 18:22:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 3 Jan 2008 18:22:39 +0100 (CET) Subject: [Lxml-checkins] r50291 - in lxml/trunk: . doc Message-ID: <20080103172239.A8CD21684D3@codespeak.net> Author: scoder Date: Thu Jan 3 18:22:38 2008 New Revision: 50291 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r3203 at delle: sbehnel | 2008-01-03 18:22:27 +0100 FAQ update Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Jan 3 18:22:38 2008 @@ -42,6 +42,7 @@ 6.2 Why can't lxml parse my XML from unicode strings? 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? 6.4 Why can't I just delete parents or clear the root node in iterparse()? + 6.5 How do I output null bytes in XML text? 7 XPath and Document Traversal 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? 7.2 Why doesn't ``findall()`` support full XPath expressions? @@ -608,6 +609,15 @@ .. _`iterparse section`: api.html#iterparse-and-iterwalk +How do I output null bytes in XML text? +--------------------------------------- + +Don't. What you would produce is not well-formed XML. XML parsers +will refuse to parse a document that contains null bytes. The right +way to embed binary data in XML is using a text encoding such as +uuencode or base64. + + XPath and Document Traversal ============================ From scoder at codespeak.net Fri Jan 4 19:22:01 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Jan 2008 19:22:01 +0100 (CET) Subject: [Lxml-checkins] r50334 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080104182201.E79E816843F@codespeak.net> Author: scoder Date: Fri Jan 4 19:22:01 2008 New Revision: 50334 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: r3205 at delle: sbehnel | 2008-01-04 19:21:48 +0100 check entity/character references in Entity() factory Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 4 19:22:01 2008 @@ -8,6 +8,9 @@ Features added -------------- +* Invalid entity names and character references will now be rejected + by the ``Entity()`` factory. + * ``entity.text`` now returns the textual representation of the entity, e.g. ``&amp;``. Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Jan 4 19:22:01 2008 @@ -1043,22 +1043,41 @@ c_name = c_name + 1 return 1 +cdef bint _characterReferenceIsValid(char* c_name): + cdef bint is_hex + if c_name[0] == c'x': + c_name += 1 + is_hex = 1 + else: + is_hex = 0 + if c_name[0] == c'\0': + return 0 + while c_name[0] != c'\0': + if c_name[0] < c'0' or c_name[0] > c'9': + if not is_hex: + return 0 + if not (c_name[0] >= c'a' and c_name[0] <= c'f'): + if not (c_name[0] >= c'A' and c_name[0] <= c'F'): + return 0 + c_name += 1 + return 1 + cdef int _tagValidOrRaise(tag_utf) except -1: if not _pyXmlNameIsValid(tag_utf): - raise ValueError, "Invalid tag name %r" % \ - python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict') + raise ValueError("Invalid tag name %r" % \ + python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')) return 0 cdef int _htmlTagValidOrRaise(tag_utf) except -1: if not _pyHtmlNameIsValid(tag_utf): - raise ValueError, "Invalid 
HTML tag name %r" % \ - python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict') + raise ValueError("Invalid HTML tag name %r" % \ + python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')) return 0 cdef int _attributeValidOrRaise(name_utf) except -1: if not _pyXmlNameIsValid(name_utf): - raise ValueError, "Invalid attribute name %r" % \ - python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict') + raise ValueError("Invalid attribute name %r" % \ + python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict')) return 0 cdef object _namespacedName(xmlNode* c_node): Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Jan 4 19:22:01 2008 @@ -2110,18 +2110,26 @@ PI = ProcessingInstruction def Entity(name): - """Entity factory. This factory function creates a special element that - will be serialized as an XML entity. Note, however, that the entity will - not be automatically declared in the document. A document that uses - entities requires a DTD. + """Entity factory. This factory function creates a special element + that will be serialized as an XML entity reference or character + reference. Note, however, that entities will not be automatically + declared in the document. A document that uses entity references + requires a DTD to define the entities. 
""" cdef _Document doc cdef xmlNode* c_node cdef xmlDoc* c_doc - name = _utf8(name) + cdef char* c_name + name_utf = _utf8(name) + c_name = _cstr(name_utf) + if c_name[0] == c'#': + if not _characterReferenceIsValid(c_name + 1): + raise ValueError("Invalid character reference: '%s'" % name) + elif not _xmlNameIsValid(c_name): + raise ValueError("Invalid entity reference: '%s'" % name) c_doc = _newDoc() doc = _documentFactory(c_doc, None) - c_node = _createEntity(c_doc, _cstr(name)) + c_node = _createEntity(c_doc, c_name) tree.xmlAddChild(c_doc, c_node) return _elementFactory(doc, c_node) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jan 4 19:22:01 2008 @@ -605,6 +605,21 @@ self.assertEquals('&test;', tostring(root)) + def test_entity_values(self): + Entity = self.etree.Entity + self.assertEquals(Entity("test").text, '&test;') + self.assertEquals(Entity("#17683").text, '䔓') + self.assertEquals(Entity("#x1768").text, 'ᝨ') + self.assertEquals(Entity("#x98AF").text, '颯') + + def test_entity_error(self): + Entity = self.etree.Entity + self.assertRaises(ValueError, Entity, 'a b c') + self.assertRaises(ValueError, Entity, 'a,b') + self.assertRaises(AssertionError, Entity, 'a\0b') + self.assertRaises(ValueError, Entity, '#abc') + self.assertRaises(ValueError, Entity, '#xxyz') + # TypeError in etree, AssertionError in ElementTree; def test_setitem_assert(self): Element = self.etree.Element From scoder at codespeak.net Wed Jan 9 19:31:24 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:31:24 +0100 (CET) Subject: [Lxml-checkins] r50461 - in lxml/trunk: . 
src/lxml Message-ID: <20080109183124.8BBAF168510@codespeak.net> Author: scoder Date: Wed Jan 9 19:31:23 2008 New Revision: 50461 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xslt.pxi Log: r3207 at delle: sbehnel | 2008-01-05 21:57:09 +0100 factored XSLT.__copy__() into a separate C function Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed Jan 9 19:31:23 2008 @@ -341,24 +341,7 @@ return self.__copy__() def __copy__(self): - cdef XSLT new_xslt - cdef xmlDoc* c_doc - new_xslt = NEW_XSLT(XSLT) # without calling __init__() - new_xslt._access_control = self._access_control - new_xslt._error_log = _ErrorLog() - new_xslt._context = self._context._copy() - - new_xslt._xslt_resolver_context = self._xslt_resolver_context._copy() - new_xslt._xslt_resolver_context._c_style_doc = _copyDoc( - self._xslt_resolver_context._c_style_doc, 1) - - c_doc = _copyDoc(self._c_style.doc, 1) - new_xslt._c_style = xslt.xsltParseStylesheetDoc(c_doc) - if new_xslt._c_style is NULL: - tree.xmlFreeDoc(c_doc) - python.PyErr_NoMemory() - - return new_xslt + return _copyXSLT(self) def __call__(self, _input, *, profile_run=False, **_kw): cdef _XSLTContext context @@ -491,6 +474,26 @@ # macro call to 't->tp_new()' for instantiation without calling __init__() cdef XSLT NEW_XSLT "PY_NEW" (object t) +cdef XSLT _copyXSLT(XSLT stylesheet): + cdef XSLT new_xslt + cdef xmlDoc* c_doc + new_xslt = NEW_XSLT(XSLT) # without calling __init__() + new_xslt._access_control = stylesheet._access_control + new_xslt._error_log = _ErrorLog() + new_xslt._context = stylesheet._context._copy() + + new_xslt._xslt_resolver_context = stylesheet._xslt_resolver_context._copy() + new_xslt._xslt_resolver_context._c_style_doc = _copyDoc( + stylesheet._xslt_resolver_context._c_style_doc, 1) + + c_doc = _copyDoc(stylesheet._c_style.doc, 1) + new_xslt._c_style = 
xslt.xsltParseStylesheetDoc(c_doc) + if new_xslt._c_style is NULL: + tree.xmlFreeDoc(c_doc) + python.PyErr_NoMemory() + + return new_xslt + cdef class _XSLTResultTree(_ElementTree): cdef XSLT _xslt cdef _Document _profile From scoder at codespeak.net Wed Jan 9 19:31:29 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:31:29 +0100 (CET) Subject: [Lxml-checkins] r50462 - in lxml/trunk: . src/lxml Message-ID: <20080109183129.C9FB5168511@codespeak.net> Author: scoder Date: Wed Jan 9 19:31:29 2008 New Revision: 50462 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/xslt.pxi Log: r3208 at delle: sbehnel | 2008-01-05 21:59:36 +0100 copy XSLT into current thread instead of raising a 'not usable' exception Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Jan 9 19:31:29 2008 @@ -8,6 +8,9 @@ Features added -------------- +* ``XSLT`` objects are now usable in any thread - at the cost of a + deep copy if they were not created in that thread. + * Invalid entity names and character references will now be rejected by the ``Entity()`` factory. Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Wed Jan 9 19:31:29 2008 @@ -132,8 +132,8 @@ """Check that c_dict is either the local thread dictionary or the global parent dictionary. 
""" - if __GLOBAL_PARSER_CONTEXT._c_dict is c_dict: - return 1 # main thread + #if __GLOBAL_PARSER_CONTEXT._c_dict is c_dict: + # return 1 # main thread if __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL) is c_dict: return 1 # local thread dict return 0 Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed Jan 9 19:31:29 2008 @@ -356,7 +356,8 @@ cdef xmlDoc* c_doc if not _checkThreadDict(self._c_style.doc.dict): - raise RuntimeError, "stylesheet is not usable in this thread" + _kw['profile_run'] = profile_run + return _copyXSLT(self)(_input, **_kw) input_doc = _documentOrRaise(_input) root_node = _rootNodeOrRaise(_input) From scoder at codespeak.net Wed Jan 9 19:31:33 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:31:33 +0100 (CET) Subject: [Lxml-checkins] r50463 - in lxml/trunk: . src/lxml Message-ID: <20080109183133.D1217168512@codespeak.net> Author: scoder Date: Wed Jan 9 19:31:33 2008 New Revision: 50463 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xslt.pxi Log: r3209 at delle: sbehnel | 2008-01-05 22:39:37 +0100 cleanup Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Wed Jan 9 19:31:33 2008 @@ -356,7 +356,8 @@ cdef xmlDoc* c_doc if not _checkThreadDict(self._c_style.doc.dict): - _kw['profile_run'] = profile_run + if profile_run is not False: + _kw['profile_run'] = profile_run return _copyXSLT(self)(_input, **_kw) input_doc = _documentOrRaise(_input) From scoder at codespeak.net Wed Jan 9 19:31:38 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:31:38 +0100 (CET) Subject: [Lxml-checkins] r50464 - in lxml/trunk: . 
src/lxml Message-ID: <20080109183138.A69B7168511@codespeak.net> Author: scoder Date: Wed Jan 9 19:31:38 2008 New Revision: 50464 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/python.pxd Log: r3211 at delle: sbehnel | 2008-01-08 21:02:16 +0100 return a string subclass for XPath string results that points to the source Element Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Wed Jan 9 19:31:38 2008 @@ -522,12 +522,9 @@ # XSLT: can it leak when merging trees from multiple sources? c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) value = _elementFactory(doc, c_node) - elif c_node.type == tree.XML_TEXT_NODE: - value = funicode(c_node.content) - elif c_node.type == tree.XML_ATTRIBUTE_NODE: - s = tree.xmlNodeGetContent(c_node) - value = funicode(s) - tree.xmlFree(s) + elif c_node.type == tree.XML_TEXT_NODE or \ + c_node.type == tree.XML_ATTRIBUTE_NODE: + value = _newElementStringResult(c_node, doc) elif c_node.type == tree.XML_NAMESPACE_DECL: s = (c_node).href if s is NULL: @@ -561,6 +558,56 @@ xpath.xmlXPathFreeObject(xpathObj) ################################################################################ +# special str/unicode subclasses + +cdef class _ElementStringResult(python.unicode): + cdef _Element parent + cdef readonly object is_tail + cdef readonly object is_text + cdef readonly object is_attribute + + def getparent(self): + return self.parent + +cdef object _newElementStringResult(xmlNode* c_node, _Document doc): + cdef _ElementStringResult element_string + cdef xmlNode* c_element + cdef char* s + cdef bint is_attribute, is_tail + + if c_node.type == tree.XML_ATTRIBUTE_NODE: + is_attribute = 1 + is_tail = 0 + s = tree.xmlNodeGetContent(c_node) + value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) + tree.xmlFree(s) + c_element = NULL + 
else: + #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" + is_attribute = 0 + # tail text? + value = python.PyUnicode_DecodeUTF8( + c_node.content, cstd.strlen(c_node.content), NULL) + c_element = _previousElement(c_node) + is_tail = c_element is not NULL + + if c_element is NULL: + # non-tail text or attribute text + c_element = c_node.parent + while c_element is not NULL and not _isElement(c_element): + c_element = c_element.parent + + if c_element is NULL: + return value + + element_string = _ElementStringResult(value) + element_string.parent = _elementFactory(doc, c_element) + element_string.is_attribute = is_attribute + element_string.is_tail = is_tail + element_string.is_text = not (is_tail or is_attribute) + return element_string + +################################################################################ # callbacks for XPath/XSLT extension functions cdef void _extension_function_call(_BaseContext context, function, Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Wed Jan 9 19:31:38 2008 @@ -16,6 +16,9 @@ cdef object stop cdef object step + ctypedef class __builtin__.unicode [object PyUnicodeObject]: + pass + cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) cdef object PyFile_Name(object p) From scoder at codespeak.net Wed Jan 9 19:31:44 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:31:44 +0100 (CET) Subject: [Lxml-checkins] r50465 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080109183144.11317168510@codespeak.net> Author: scoder Date: Wed Jan 9 19:31:43 2008 New Revision: 50465 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: r3212 at delle: sbehnel | 2008-01-09 19:31:08 +0100 cleanups and bug fixes: while a document is 'fake-rooted', we must take care that we do not propagate the fake-root into Python space but the original element instead Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Wed Jan 9 19:31:43 2008 @@ -515,16 +515,16 @@ for i from 0 <= i < xpathObj.nodesetval.nodeNr: c_node = xpathObj.nodesetval.nodeTab[i] if _isElement(c_node): - if c_node.doc != doc._c_doc: + if c_node.doc != doc._c_doc and c_node.doc._private is NULL: # XXX: works, but maybe not always the right thing to do? # XPath: only runs when extensions create or copy trees # -> we store Python refs to these, so that is OK # XSLT: can it leak when merging trees from multiple sources? 
c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) - value = _elementFactory(doc, c_node) + value = _fakeDocElementFactory(doc, c_node) elif c_node.type == tree.XML_TEXT_NODE or \ c_node.type == tree.XML_ATTRIBUTE_NODE: - value = _newElementStringResult(c_node, doc) + value = _newElementStringResult(doc, c_node) elif c_node.type == tree.XML_NAMESPACE_DECL: s = (c_node).href if s is NULL: @@ -569,7 +569,7 @@ def getparent(self): return self.parent -cdef object _newElementStringResult(xmlNode* c_node, _Document doc): +cdef object _newElementStringResult(_Document doc, xmlNode* c_node): cdef _ElementStringResult element_string cdef xmlNode* c_element cdef char* s @@ -579,8 +579,10 @@ is_attribute = 1 is_tail = 0 s = tree.xmlNodeGetContent(c_node) - value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) - tree.xmlFree(s) + try: + value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) + finally: + tree.xmlFree(s) c_element = NULL else: #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" @@ -601,7 +603,7 @@ return value element_string = _ElementStringResult(value) - element_string.parent = _elementFactory(doc, c_element) + element_string.parent = _fakeDocElementFactory(doc, c_element) element_string.is_attribute = is_attribute element_string.is_tail = is_tail element_string.is_text = not (is_tail or is_attribute) Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Wed Jan 9 19:31:43 2008 @@ -66,11 +66,10 @@ c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! 
tree.xmlDocSetRootElement(c_doc, c_new_root) _copyParentNamespaces(c_node, c_new_root) - _copyParentNamespaces(c_node, c_root) c_new_root.children = c_node.children c_new_root.last = c_node.last - c_new_root.next = c_new_root.prev = c_new_root.parent = NULL + c_new_root.next = c_new_root.prev = NULL # store original node c_doc._private = c_node @@ -89,19 +88,35 @@ cdef xmlNode* c_child cdef xmlNode* c_parent cdef xmlNode* c_root - if c_doc != c_base_doc: - c_root = tree.xmlDocGetRootElement(c_doc) + if c_doc is c_base_doc: + return + c_root = tree.xmlDocGetRootElement(c_doc) + + # restore parent pointers of children + c_parent = c_doc._private + c_child = c_root.children + while c_child is not NULL: + c_child.parent = c_parent + c_child = c_child.next - # restore parent pointers of children - c_parent = c_doc._private - c_child = c_root.children - while c_child is not NULL: - c_child.parent = c_parent - c_child = c_child.next - - # prevent recursive removal of children - c_root.children = c_root.last = NULL - tree.xmlFreeDoc(c_doc) + # prevent recursive removal of children + c_root.children = c_root.last = NULL + tree.xmlFreeDoc(c_doc) + +cdef _Element _fakeDocElementFactory(_Document doc, xmlNode* c_element): + """Special element factory for cases where we need to create a fake + root document, but still need to instantiate arbitrary nodes from + it. If we instantiate the fake root node, things will turn bad + when it's destroyed. + + Instead, if we are asked to instantiate the fake root node, we + instantiate the original node instead. 
+ """ + if c_element.doc is not doc._c_doc: + if c_element.doc._private is not NULL: + if c_element is c_element.doc.children: + c_element = c_element.doc._private + return _elementFactory(doc, c_element) ################################################################################ # support for freeing tree elements when proxy objects are destroyed Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Wed Jan 9 19:31:43 2008 @@ -72,6 +72,13 @@ self.assertEquals(['B'], tree.xpath('/a/@b')) + def test_xpath_list_attribute_parent(self): + tree = self.parse('') + results = tree.xpath('/a/@c') + self.assertEquals(1, len(results)) + self.assertEquals('C', results[0]) + self.assertEquals(tree.getroot().tag, results[0].getparent().tag) + def test_xpath_list_comment(self): tree = self.parse('') self.assertEquals([''], @@ -182,6 +189,21 @@ [root[0]], e.evaluate('c')) + def test_xpath_evaluator_tree_absolute(self): + tree = self.parse('') + child_tree = etree.ElementTree(tree.getroot()[0]) + e = etree.XPathEvaluator(child_tree) + self.assertEquals( + [], + e.evaluate('/a')) + root = child_tree.getroot() + self.assertEquals( + [root], + e.evaluate('/b')) + self.assertEquals( + [], + e.evaluate('/c')) + def test_xpath_evaluator_element(self): tree = self.parse('') root = tree.getroot() From scoder at codespeak.net Wed Jan 9 19:41:41 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 9 Jan 2008 19:41:41 +0100 (CET) Subject: [Lxml-checkins] r50466 - in lxml/trunk: . 
src/lxml Message-ID: <20080109184141.5F9F1168513@codespeak.net> Author: scoder Date: Wed Jan 9 19:41:40 2008 New Revision: 50466 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi Log: r3218 at delle: sbehnel | 2008-01-09 19:41:29 +0100 disabled ElementStringResult patch until it works reliably Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Wed Jan 9 19:41:40 2008 @@ -599,8 +599,8 @@ while c_element is not NULL and not _isElement(c_element): c_element = c_element.parent - if c_element is NULL: - return value + #if c_element is NULL: + return value element_string = _ElementStringResult(value) element_string.parent = _fakeDocElementFactory(doc, c_element) From scoder at codespeak.net Fri Jan 11 09:50:51 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:50:51 +0100 (CET) Subject: [Lxml-checkins] r50504 - in lxml/trunk: . 
src/lxml Message-ID: <20080111085051.CC3C616850A@codespeak.net> Author: scoder Date: Fri Jan 11 09:50:50 2008 New Revision: 50504 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/lxml.pyclasslookup.pyx lxml/trunk/src/lxml/python.pxd Log: r3220 at delle: sbehnel | 2008-01-10 00:00:38 +0100 removed unused stuff Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Fri Jan 11 09:50:50 2008 @@ -94,14 +94,7 @@ #endif /* Redefinition of some Python builtins as C functions */ -#define isinstance(o,c) PyObject_IsInstance(o,c) -#define issubclass(c,csuper) PyObject_IsSubclass(c,csuper) -#define hasattr(o,a) PyObject_HasAttr(o,a) -#define getattr(o,a) PyObject_GetAttr(o,a) #define callable(o) PyCallable_Check(o) -#define str(o) PyObject_Str(o) -#define repr(o) PyObject_Repr(o) -#define iter(o) PyObject_GetIter(o) #define _cstr(s) PyString_AS_STRING(s) #define _fqtypename(o) (((PyTypeObject*)o)->ob_type->tp_name) Modified: lxml/trunk/src/lxml/lxml.pyclasslookup.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.pyclasslookup.pyx (original) +++ lxml/trunk/src/lxml/lxml.pyclasslookup.pyx Fri Jan 11 09:50:50 2008 @@ -1,7 +1,6 @@ from etreepublic cimport _Document, _Element, ElementBase from etreepublic cimport ElementClassLookup, FallbackElementClassLookup from etreepublic cimport elementFactory, import_lxml__etree -from python cimport str, repr, isinstance, issubclass, iter from python cimport _cstr cimport etreepublic as cetree cimport python Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Jan 11 09:50:50 2008 @@ -116,14 +116,7 @@ cdef extern from "etree_defs.h": # 
redefines some functions as macros cdef int _isString(object obj) - cdef int isinstance(object instance, object classes) - cdef int issubclass(object derived, object superclasses) cdef char* _fqtypename(object t) - cdef int hasattr(object obj, object attr) - cdef object getattr(object obj, object attr) cdef int callable(object obj) - cdef object str(object obj) - cdef object repr(object obj) - cdef object iter(object obj) cdef char* _cstr(object s) cdef object PY_NEW(object t) From scoder at codespeak.net Fri Jan 11 09:50:55 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:50:55 +0100 (CET) Subject: [Lxml-checkins] r50505 - in lxml/trunk: . src/lxml Message-ID: <20080111085055.5D8551684F2@codespeak.net> Author: scoder Date: Fri Jan 11 09:50:54 2008 New Revision: 50505 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/proxy.pxi Log: r3221 at delle: sbehnel | 2008-01-10 00:01:35 +0100 keep an assert just in case Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri Jan 11 09:50:54 2008 @@ -116,6 +116,7 @@ if c_element.doc._private is not NULL: if c_element is c_element.doc.children: c_element = c_element.doc._private + #assert c_element.type == tree.XML_ELEMENT_NODE return _elementFactory(doc, c_element) ################################################################################ From scoder at codespeak.net Fri Jan 11 09:50:59 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:50:59 +0100 (CET) Subject: [Lxml-checkins] r50506 - in lxml/trunk: . 
src/lxml Message-ID: <20080111085059.BF49C16850A@codespeak.net> Author: scoder Date: Fri Jan 11 09:50:59 2008 New Revision: 50506 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/python.pxd Log: r3222 at delle: sbehnel | 2008-01-10 00:09:11 +0100 separate ElementStringResult implementations for str and unicode values, requires Cython > 0.9.6.10b Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Jan 11 09:50:59 2008 @@ -560,7 +560,16 @@ ################################################################################ # special str/unicode subclasses -cdef class _ElementStringResult(python.unicode): +cdef class _ElementUnicodeResult(python.unicode): + cdef _Element parent + cdef readonly object is_tail + cdef readonly object is_text + cdef readonly object is_attribute + + def getparent(self): + return self.parent + +cdef class _ElementStringResult(python.str): cdef _Element parent cdef readonly object is_tail cdef readonly object is_text @@ -570,17 +579,22 @@ return self.parent cdef object _newElementStringResult(_Document doc, xmlNode* c_node): - cdef _ElementStringResult element_string + cdef _ElementUnicodeResult element_unicode + cdef _ElementStringResult element_str cdef xmlNode* c_element cdef char* s - cdef bint is_attribute, is_tail + cdef bint is_attribute, is_tail, is_utf8 if c_node.type == tree.XML_ATTRIBUTE_NODE: is_attribute = 1 is_tail = 0 s = tree.xmlNodeGetContent(c_node) + is_utf8 = isutf8(s) try: - value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) + if is_utf8: + value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) + else: + value = s finally: tree.xmlFree(s) c_element = NULL @@ -588,8 +602,12 @@ #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" is_attribute = 0 # tail text? 
- value = python.PyUnicode_DecodeUTF8( - c_node.content, cstd.strlen(c_node.content), NULL) + is_utf8 = isutf8(c_node.content) + if is_utf8: + value = python.PyUnicode_DecodeUTF8( + c_node.content, cstd.strlen(c_node.content), NULL) + else: + value = c_node.content c_element = _previousElement(c_node) is_tail = c_element is not NULL @@ -599,15 +617,23 @@ while c_element is not NULL and not _isElement(c_element): c_element = c_element.parent - #if c_element is NULL: - return value + if c_element is NULL: + return value - element_string = _ElementStringResult(value) - element_string.parent = _fakeDocElementFactory(doc, c_element) - element_string.is_attribute = is_attribute - element_string.is_tail = is_tail - element_string.is_text = not (is_tail or is_attribute) - return element_string + if is_utf8: + element_unicode = _ElementUnicodeResult(value) + element_unicode.parent = _fakeDocElementFactory(doc, c_element) + element_unicode.is_attribute = is_attribute + element_unicode.is_tail = is_tail + element_unicode.is_text = not (is_tail or is_attribute) + return element_unicode + else: + element_str = _ElementStringResult(value) + element_str.parent = _fakeDocElementFactory(doc, c_element) + element_str.is_attribute = is_attribute + element_str.is_tail = is_tail + element_str.is_text = not (is_tail or is_attribute) + return element_str ################################################################################ # callbacks for XPath/XSLT extension functions Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Jan 11 09:50:59 2008 @@ -19,6 +19,9 @@ ctypedef class __builtin__.unicode [object PyUnicodeObject]: pass + ctypedef class __builtin__.str [object PyStringObject]: + pass + cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) cdef object PyFile_Name(object p) From scoder at codespeak.net Fri Jan 11 
09:51:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:51:03 +0100 (CET) Subject: [Lxml-checkins] r50507 - in lxml/trunk: . doc Message-ID: <20080111085103.47DCA168514@codespeak.net> Author: scoder Date: Fri Jan 11 09:51:02 2008 New Revision: 50507 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/main.txt Log: r3223 at delle: sbehnel | 2008-01-10 13:04:21 +0100 prepare release of 2.0beta1 Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Jan 11 09:51:02 2008 @@ -138,8 +138,8 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0alpha6`_, released 2007-12-19 -(`changes for 2.0alpha6`_). `Older versions`_ are listed below. +The latest version is `lxml 2.0beta1`_, released 2008-01-11 +(`changes for 2.0beta1`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -199,6 +199,8 @@ Old Versions ------------ +* `lxml 2.0alpha6`_, released 2007-12-19 (`changes for 2.0alpha6`_) + * `lxml 2.0alpha5`_, released 2007-11-24 (`changes for 2.0alpha5`_) * `lxml 2.0alpha4`_, released 2007-10-07 (`changes for 2.0alpha4`_) @@ -259,6 +261,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.0beta1`: lxml-2.0beta1.tgz .. _`lxml 2.0alpha6`: lxml-2.0alpha6.tgz .. _`lxml 2.0alpha5`: lxml-2.0alpha5.tgz .. _`lxml 2.0alpha4`: lxml-2.0alpha4.tgz @@ -290,6 +293,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.0beta1`: changes-2.0beta1.html .. _`changes for 2.0alpha6`: changes-2.0alpha6.html .. _`changes for 2.0alpha5`: changes-2.0alpha5.html .. 
_`changes for 2.0alpha4`: changes-2.0alpha4.html From scoder at codespeak.net Fri Jan 11 09:51:06 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:51:06 +0100 (CET) Subject: [Lxml-checkins] r50508 - in lxml/trunk: . doc Message-ID: <20080111085106.B5A61168514@codespeak.net> Author: scoder Date: Fri Jan 11 09:51:06 2008 New Revision: 50508 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/build.txt Log: r3224 at delle: sbehnel | 2008-01-10 13:04:54 +0100 require Cython 0.9.6.11 Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Fri Jan 11 09:51:06 2008 @@ -33,11 +33,11 @@ be an lxml developer, you do need a working Cython installation. You can use EasyInstall_ to install it:: - easy_install Cython==0.9.6.10 + easy_install Cython==0.9.6.11 .. _EasyInstall: http://peak.telecommunity.com/DevCenter/EasyInstall -lxml currently requires at least Cython 0.9.6.10, but later versions +lxml currently requires at least Cython 0.9.6.11, but later versions should work. From scoder at codespeak.net Fri Jan 11 09:51:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:51:09 +0100 (CET) Subject: [Lxml-checkins] r50509 - lxml/trunk Message-ID: <20080111085109.09DC0168513@codespeak.net> Author: scoder Date: Fri Jan 11 09:51:09 2008 New Revision: 50509 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3225 at delle: sbehnel | 2008-01-10 13:05:13 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 11 09:51:09 2008 @@ -8,6 +8,10 @@ Features added -------------- +* XPath string results of the ``text()`` function and attribute + selection make their Element container accessible through a + ``getparent()`` method. 
+ * ``XSLT`` objects are now usable in any thread - at the cost of a deep copy if they were not created in that thread. @@ -20,6 +24,11 @@ Bugs fixed ---------- +* XPath on ElementTrees could crash when selecting the virtual root + node of the ElementTree. + +* Compilation ``--without-threading`` was buggy in alpha5/6. + Other changes ------------- From scoder at codespeak.net Fri Jan 11 09:51:17 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:51:17 +0100 (CET) Subject: [Lxml-checkins] r50510 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20080111085117.F3CF016850A@codespeak.net> Author: scoder Date: Fri Jan 11 09:51:17 2008 New Revision: 50510 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/validation.txt lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/common_imports.py lxml/trunk/src/lxml/tests/test_dtd.py lxml/trunk/src/lxml/tests/test_xmlschema.py lxml/trunk/src/lxml/xmlparser.pxd lxml/trunk/src/lxml/xmlschema.pxd lxml/trunk/src/lxml/xmlschema.pxi Log: r3226 at delle: sbehnel | 2008-01-10 20:28:46 +0100 on-the-fly XML schema validation in the parser Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 11 09:51:17 2008 @@ -8,6 +8,8 @@ Features added -------------- +* Parse-time XML schema validation (``schema`` parser keyword). + * XPath string results of the ``text()`` function and attribute selection make their Element container accessible through a ``getparent()`` method. Modified: lxml/trunk/doc/validation.txt ============================================================================== --- lxml/trunk/doc/validation.txt (original) +++ lxml/trunk/doc/validation.txt Fri Jan 11 09:51:17 2008 @@ -13,16 +13,17 @@ There is also initial support for Schematron_. 
However, it does not currently support error reporting in the validation phase due to insufficiencies in the -implementation as of libxml2 2.6.29. +implementation as of libxml2 2.6.30. .. _Schematron: http://www.ascc.net/xml/schematron .. contents:: .. - 1 DTD - 2 RelaxNG - 3 XMLSchema - 4 Schematron + 1 Validation at parse time + 2 DTD + 3 RelaxNG + 4 XMLSchema + 5 Schematron The usual setup procedure:: @@ -30,20 +31,59 @@ >>> from StringIO import StringIO +Validation at parse time +------------------------ + +The parser in lxml can do on-the-fly validation of a document against +a DTD or an XML schema. The DTD is retrieved automatically based on +the DOCTYPE of the parsed document. All you have to do is use a +parser that has DTD validation enabled:: + + >>> parser = etree.XMLParser(dtd_validation=True) + +Obviously, a request for validation enables the DTD loading feature. +There are two other options that enable loading the DTD, but that do +not perform any validation. The first is the ``load_dtd`` keyword +option, which simply loads the DTD into the parser and makes it +available to the document as external subset. You can retrieve the +DTD from the parsed document using the ``docinfo`` property of the +result ElementTree object. The internal subset is available as +``internalDTD``, the external subset is provided as ``externalDTD``. + +The third way to activate DTD loading is with the +``attribute_defaults`` option, which loads the DTD and weaves +attribute default values into the document. Again, no validation is +performed unless explicitly requested. + +XML schema is supported in a similar way, but requires an explicit +schema to be provided:: + + >>> schema_root = etree.XML('''\ + ... + ... + ... + ... 
''') + >>> schema = etree.XMLSchema(schema_root) + + >>> parser = etree.XMLParser(schema = schema) + >>> root = etree.fromstring("5", parser) + +If the validation fails (be it for a DTD or an XML schema), the parser +will raise an exception:: + + >>> root = etree.fromstring("not int", parser) + Traceback (most recent call last): + XMLSyntaxError: Element 'a': 'not int' is not a valid value of the atomic type 'xs:integer'. + + DTD --- -There are two places in lxml where DTDs are supported: parsers and the DTD -class. If you pass a keyword option to a parser that requires DTD loading, -lxml will automatically include the DTD in the parsing process. If you pass -the keyword for DTD validation, lxml (or rather libxml2) will use this DTD -right inside the parser and report failure or success when parsing terminates. - -The parser support for DTDs depends on internal or external subsets of the XML -file. This means that the XML file itself must either contain a DTD or must -reference a DTD to make this work. If you want to validate an XML document -against a DTD that is not referenced by the document itself, you can use the -``DTD`` class. +As described above, the parser support for DTDs depends on internal or +external subsets of the XML file. This means that the XML file itself +must either contain a DTD or must reference a DTD to make this work. +If you want to validate an XML document against a DTD that is not +referenced by the document itself, you can use the ``DTD`` class. 
To use the ``DTD`` class, you must first pass a filename or file-like object into the constructor to parse a DTD:: Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Jan 11 09:51:17 2008 @@ -272,14 +272,15 @@ Other keyword arguments: * encoding - override the document encoding + * schema - an XMLSchema to validate against """ cdef object _source cdef readonly object root - def __init__(self, source, events=("end",), tag=None, + def __init__(self, source, events=("end",), *, tag=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, remove_blank_text=False, remove_comments=False, remove_pis=False, encoding=None, - html=False): + html=False, XMLSchema schema=None): cdef _IterparseContext context cdef char* c_encoding cdef int parse_options @@ -318,7 +319,7 @@ if remove_blank_text: parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS - _BaseParser.__init__(self, parse_options, html, + _BaseParser.__init__(self, parse_options, html, schema, remove_comments, remove_pis, None, filename, encoding) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Jan 11 09:51:17 2008 @@ -375,9 +375,13 @@ cdef class _ParserContext(_ResolverContext) cdef class _SaxParserContext(_ParserContext) cdef class _TargetParserContext(_SaxParserContext) +cdef class _ParserSchemaValidationContext +cdef class _Validator +cdef class XMLSchema(_Validator) cdef class _ParserContext(_ResolverContext): cdef _ErrorLog _error_log + cdef _ParserSchemaValidationContext _validator cdef xmlparser.xmlParserCtxt* _c_ctxt cdef python.PyThread_type_lock _lock @@ -390,6 +394,7 @@ cdef _ParserContext _copy(self): cdef _ParserContext context context = self.__class__() 
+ context._validator = self._validator.copy() _initParserContext(context, self._resolvers._copy(), NULL) return context @@ -414,11 +419,15 @@ if result == 0: raise ParserError, "parser locking failed" self._error_log.connect() + if self._validator is not None: + self._validator.connect(self._c_ctxt) return 0 cdef int cleanup(self) except -1: self._resetParserContext() self.clear() + if self._validator is not None: + self._validator.disconnect() self._error_log.disconnect() if config.ENABLE_THREADING and self._lock is not NULL: python.PyThread_release_lock(self._lock) @@ -487,7 +496,10 @@ c_ctxt.myDoc = NULL if result is not NULL: - if recover or (c_ctxt.wellFormed and \ + if context._validator is not None and \ + not context._validator.isvalid(): + well_formed = 0 # actually not 'valid', but anyway ... + elif recover or (c_ctxt.wellFormed and \ c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): well_formed = 1 elif not c_ctxt.replaceEntities and not c_ctxt.validate \ @@ -535,16 +547,15 @@ cdef bint _for_html cdef bint _remove_comments cdef bint _remove_pis + cdef XMLSchema _schema cdef object _filename cdef object _target cdef object _default_encoding cdef int _default_encoding_int - def __init__(self, int parse_options, bint for_html, - remove_comments, remove_pis, - target, filename, encoding): + def __init__(self, int parse_options, bint for_html, XMLSchema schema, + remove_comments, remove_pis, target, filename, encoding): cdef int c_encoding - cdef xmlparser.xmlParserCtxt* pctxt if not isinstance(self, HTMLParser) and \ not isinstance(self, XMLParser) and \ not isinstance(self, iterparse): @@ -556,6 +567,7 @@ self._for_html = for_html self._remove_comments = remove_comments self._remove_pis = remove_pis + self._schema = schema self._resolvers = _ResolverRegistry() @@ -575,6 +587,9 @@ cdef xmlparser.xmlParserCtxt* pctxt if self._parser_context is None: self._parser_context = self._createContext(self._target) + if self._schema is not None: + 
self._parser_context._validator = \ + self._schema._newSaxValidator() pctxt = self._newParserCtxt() if pctxt is NULL: python.PyErr_NoMemory() @@ -591,6 +606,9 @@ cdef xmlparser.xmlParserCtxt* pctxt if self._push_parser_context is None: self._push_parser_context = self._createContext(self._target) + if self._schema is not None: + self._push_parser_context._validator = \ + self._schema._newSaxValidator() pctxt = self._newPushParserCtxt() if pctxt is NULL: python.PyErr_NoMemory() @@ -1439,6 +1457,7 @@ Other keyword arguments: * encoding - override the document encoding * target - a parser target object that will receive the parse events + * schema - an XMLSchema to validate against Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. This does not @@ -1448,7 +1467,8 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, target=None, encoding=None): + remove_pis=False, target=None, encoding=None, + XMLSchema schema=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -1472,7 +1492,7 @@ if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT - _BaseParser.__init__(self, parse_options, 0, + _BaseParser.__init__(self, parse_options, 0, schema, remove_comments, remove_pis, target, None, encoding) @@ -1487,7 +1507,7 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True, target=None, encoding=None): + remove_pis=True, target=None, encoding=None, schema=None): XMLParser.__init__(self, attribute_defaults=attribute_defaults, dtd_validation=dtd_validation, @@ -1501,7 +1521,8 @@ remove_comments=remove_comments, remove_pis=remove_pis, target=target, - encoding=encoding) + encoding=encoding, + schema=schema) cdef 
XMLParser __DEFAULT_XML_PARSER @@ -1561,13 +1582,15 @@ Other keyword arguments: * encoding - override the document encoding * target - a parser target object that will receive the parse events + * schema - an XMLSchema to validate against Note that you should avoid sharing parsers between threads for performance reasons. """ - def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True, remove_comments=False, remove_pis=False, - target=None, encoding=None): + def __init__(self, *, recover=True, no_network=True, + remove_blank_text=False, compact=True, remove_comments=False, + remove_pis=False, target=None, encoding=None, + XMLSchema schema=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: @@ -1579,7 +1602,7 @@ if not compact: parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT - _BaseParser.__init__(self, parse_options, 1, + _BaseParser.__init__(self, parse_options, 1, schema, remove_comments, remove_pis, target, None, encoding) Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Fri Jan 11 09:51:17 2008 @@ -53,9 +53,9 @@ def tearDown(self): gc.collect() - def parse(self, text): + def parse(self, text, parser=None): f = StringIO(text) - return etree.parse(f) + return etree.parse(f, parser=parser) def _rootstring(self, tree): return etree.tostring(tree.getroot()).replace(' ', '').replace('\n', '') Modified: lxml/trunk/src/lxml/tests/test_dtd.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_dtd.py (original) +++ lxml/trunk/src/lxml/tests/test_dtd.py Fri Jan 11 09:51:17 2008 @@ -26,6 +26,13 @@ dtd = etree.DTD(StringIO("")) self.assert_(dtd.validate(root)) + def test_dtd_parse_invalid(self): + fromstring = etree.fromstring + parser = 
etree.XMLParser(dtd_validation=True) + xml = '' % fileInTestDir("test.dtd") + self.assertRaises(etree.XMLSyntaxError, + fromstring, xml, parser=parser) + def test_dtd_invalid(self): root = etree.XML("") dtd = etree.DTD(StringIO("")) Modified: lxml/trunk/src/lxml/tests/test_xmlschema.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xmlschema.py (original) +++ lxml/trunk/src/lxml/tests/test_xmlschema.py Fri Jan 11 09:51:17 2008 @@ -26,6 +26,26 @@ self.assert_(schema.validate(tree_valid)) self.assert_(not schema.validate(tree_invalid)) + def test_xmlschema_parse(self): + schema = self.parse(''' + + + + + + + + +''') + schema = etree.XMLSchema(schema) + parser = etree.XMLParser(schema=schema) + + tree_valid = self.parse('', parser=parser) + self.assertEquals('a', tree_valid.getroot().tag) + + self.assertRaises(etree.XMLSyntaxError, + self.parse, '', parser=parser) + def test_xmlschema_elementtree_error(self): self.assertRaises(ValueError, etree.XMLSchema, etree.ElementTree()) Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Fri Jan 11 09:51:17 2008 @@ -91,6 +91,7 @@ xmlError lastError xmlNode* node xmlSAXHandler* sax + void* userData int* spaceTab int spaceMax bint html Modified: lxml/trunk/src/lxml/xmlschema.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxd (original) +++ lxml/trunk/src/lxml/xmlschema.pxd Fri Jan 11 09:51:17 2008 @@ -1,10 +1,11 @@ -cimport tree +from xmlparser cimport xmlSAXHandler from tree cimport xmlDoc cdef extern from "libxml/xmlschemas.h": ctypedef struct xmlSchema ctypedef struct xmlSchemaParserCtxt + ctypedef struct xmlSchemaSAXPlugStruct ctypedef struct xmlSchemaValidCtxt cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil @@ 
-15,3 +16,9 @@ cdef void xmlSchemaFree(xmlSchema* schema) nogil cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil + + cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt, + xmlSAXHandler** sax, + void** data) nogil + cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug) + cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt) Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Fri Jan 11 09:51:17 2008 @@ -105,8 +105,53 @@ self._error_log.disconnect() if ret == -1: - raise XMLSchemaValidateError, "Internal error in XML Schema validation." + raise XMLSchemaValidateError( + "Internal error in XML Schema validation.") if ret == 0: return True else: return False + + cdef _ParserSchemaValidationContext _newSaxValidator(self): + cdef _ParserSchemaValidationContext context + context = NEW_SCHEMA_CONTEXT(_ParserSchemaValidationContext) + context._schema = self + context._valid_ctxt = NULL + context._sax_plug = NULL + return context + +cdef class _ParserSchemaValidationContext: + cdef XMLSchema _schema + cdef xmlschema.xmlSchemaValidCtxt* _valid_ctxt + cdef xmlschema.xmlSchemaSAXPlugStruct* _sax_plug + + def __dealloc__(self): + if self._sax_plug: + self.disconnect() + if self._valid_ctxt: + xmlschema.xmlSchemaFreeValidCtxt(self._valid_ctxt) + + cdef _ParserSchemaValidationContext copy(self): + return self._schema._newSaxValidator() + + cdef int connect(self, xmlparser.xmlParserCtxt* c_ctxt) except -1: + if self._valid_ctxt is NULL: + self._valid_ctxt = xmlschema.xmlSchemaNewValidCtxt( + self._schema._c_schema) + if self._valid_ctxt is NULL: + raise XMLSchemaError, "Failed to create validation context" + self._sax_plug = xmlschema.xmlSchemaSAXPlug( + self._valid_ctxt, &c_ctxt.sax, &c_ctxt.userData) + + cdef void 
disconnect(self): + xmlschema.xmlSchemaSAXUnplug(self._sax_plug) + self._sax_plug = NULL + + cdef bint isvalid(self): + if self._valid_ctxt is NULL: + return 1 # valid + return xmlschema.xmlSchemaIsValid(self._valid_ctxt) + +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _ParserSchemaValidationContext NEW_SCHEMA_CONTEXT "PY_NEW" (object t) From scoder at codespeak.net Fri Jan 11 09:51:21 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 09:51:21 +0100 (CET) Subject: [Lxml-checkins] r50511 - lxml/trunk Message-ID: <20080111085121.0ECF316850A@codespeak.net> Author: scoder Date: Fri Jan 11 09:51:20 2008 New Revision: 50511 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3227 at delle: sbehnel | 2008-01-10 21:14:50 +0100 changelog cleanup and 2.0beta1 release date Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 11 09:51:20 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.0beta1 (2008-01-11) +===================== Features added -------------- @@ -14,14 +14,14 @@ selection make their Element container accessible through a ``getparent()`` method. -* ``XSLT`` objects are now usable in any thread - at the cost of a - deep copy if they were not created in that thread. +* ``XSLT`` objects are usable in any thread - at the cost of a deep + copy if they were not created in that thread. -* Invalid entity names and character references will now be rejected - by the ``Entity()`` factory. +* Invalid entity names and character references will be rejected by + the ``Entity()`` factory. -* ``entity.text`` now returns the textual representation of the - entity, e.g. ``&``. +* ``entity.text`` returns the textual representation of the entity, + e.g. ``&``. 
Bugs fixed ---------- From scoder at codespeak.net Fri Jan 11 11:55:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 11:55:09 +0100 (CET) Subject: [Lxml-checkins] r50512 - in lxml/trunk: . doc src/lxml src/lxml/tests Message-ID: <20080111105509.6BF3D1684D7@codespeak.net> Author: scoder Date: Fri Jan 11 11:55:07 2008 New Revision: 50512 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/tutorial.txt lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: r3237 at delle: sbehnel | 2008-01-11 11:54:56 +0100 subtyping PyStringObject does not work in Cython/Pyrex, so XPath string results will just have to be unicode Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 11 11:55:07 2008 @@ -12,7 +12,8 @@ * XPath string results of the ``text()`` function and attribute selection make their Element container accessible through a - ``getparent()`` method. + ``getparent()`` method. As a side-effect, they are now always + unicode objects (even ASCII strings). * ``XSLT`` objects are usable in any thread - at the cost of a deep copy if they were not created in that thread. Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Fri Jan 11 11:55:07 2008 @@ -281,13 +281,39 @@ >>> print html.xpath("string()") # lxml.etree only! TEXTTAIL >>> print html.xpath("//text()") # lxml.etree only! - ['TEXT', 'TAIL'] + [u'TEXT', u'TAIL'] If you want to use this more often, you can wrap it in a function:: >>> build_text_list = etree.XPath("//text()") # lxml.etree only! 
>>> print build_text_list(html) - ['TEXT', 'TAIL'] + [u'TEXT', u'TAIL'] + +Note that the ``text()`` function in XPath always returns unicode +strings. This is because it is actually a special object that knows +about its origins. You can ask it where it came from through its +``getparent()`` method, just as you would with Elements:: + + >>> texts = build_text_list(html) + >>> print texts[0] + TEXT + >>> parent = texts[0].getparent() + >>> print parent.tag + body + + >>> print texts[1] + TAIL + >>> print texts[1].getparent().tag + br + +You can also find out if it's normal text content or tail text:: + + >>> print texts[0].is_text + True + >>> print texts[1].is_text + False + >>> print texts[1].is_tail + True .. _XPath: xpathxslt.html#xpath Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Jan 11 11:55:07 2008 @@ -560,16 +560,7 @@ ################################################################################ # special str/unicode subclasses -cdef class _ElementUnicodeResult(python.unicode): - cdef _Element parent - cdef readonly object is_tail - cdef readonly object is_text - cdef readonly object is_attribute - - def getparent(self): - return self.parent - -cdef class _ElementStringResult(python.str): +cdef class _ElementStringResult(python.unicode): cdef _Element parent cdef readonly object is_tail cdef readonly object is_text @@ -579,22 +570,17 @@ return self.parent cdef object _newElementStringResult(_Document doc, xmlNode* c_node): - cdef _ElementUnicodeResult element_unicode - cdef _ElementStringResult element_str + cdef _ElementStringResult result cdef xmlNode* c_element cdef char* s - cdef bint is_attribute, is_tail, is_utf8 + cdef bint is_attribute, is_tail if c_node.type == tree.XML_ATTRIBUTE_NODE: is_attribute = 1 is_tail = 0 s = tree.xmlNodeGetContent(c_node) - is_utf8 = isutf8(s) try: - 
if is_utf8: - value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) - else: - value = s + value = python.PyUnicode_DecodeUTF8(s, cstd.strlen(s), NULL) finally: tree.xmlFree(s) c_element = NULL @@ -602,12 +588,8 @@ #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" is_attribute = 0 # tail text? - is_utf8 = isutf8(c_node.content) - if is_utf8: - value = python.PyUnicode_DecodeUTF8( - c_node.content, cstd.strlen(c_node.content), NULL) - else: - value = c_node.content + value = python.PyUnicode_DecodeUTF8( + c_node.content, cstd.strlen(c_node.content), NULL) c_element = _previousElement(c_node) is_tail = c_element is not NULL @@ -620,20 +602,12 @@ if c_element is NULL: return value - if is_utf8: - element_unicode = _ElementUnicodeResult(value) - element_unicode.parent = _fakeDocElementFactory(doc, c_element) - element_unicode.is_attribute = is_attribute - element_unicode.is_tail = is_tail - element_unicode.is_text = not (is_tail or is_attribute) - return element_unicode - else: - element_str = _ElementStringResult(value) - element_str.parent = _fakeDocElementFactory(doc, c_element) - element_str.is_attribute = is_attribute - element_str.is_tail = is_tail - element_str.is_text = not (is_tail or is_attribute) - return element_str + result = _ElementStringResult(value) + result.parent = _fakeDocElementFactory(doc, c_element) + result.is_attribute = is_attribute + result.is_tail = is_tail + result.is_text = not (is_tail or is_attribute) + return result ################################################################################ # callbacks for XPath/XSLT extension functions Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Jan 11 11:55:07 2008 @@ -19,9 +19,6 @@ ctypedef class __builtin__.unicode [object PyUnicodeObject]: pass - ctypedef class __builtin__.str [object PyStringObject]: - pass - cdef 
FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) cdef object PyFile_Name(object p) Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Fri Jan 11 11:55:07 2008 @@ -67,16 +67,33 @@ self.assertEquals(['Foo', 'Bar'], tree.xpath('/a/b/text()')) + def test_xpath_list_text_parent(self): + tree = self.parse('FooBarBarFoo') + root = tree.getroot() + self.assertEquals(['FooBar', 'BarFoo'], + tree.xpath('/a/b/text()')) + self.assertEquals([root[0], root[1]], + [r.getparent() for r in tree.xpath('/a/b/text()')]) + + def test_xpath_list_unicode_text_parent(self): + xml = u'FooBar\u0680\u3120BarFoo\u0680\u3120' + tree = self.parse(xml.encode('utf-8')) + root = tree.getroot() + self.assertEquals([u'FooBar\u0680\u3120', u'BarFoo\u0680\u3120'], + tree.xpath('/a/b/text()')) + self.assertEquals([root[0], root[1]], + [r.getparent() for r in tree.xpath('/a/b/text()')]) + def test_xpath_list_attribute(self): tree = self.parse('') self.assertEquals(['B'], tree.xpath('/a/@b')) def test_xpath_list_attribute_parent(self): - tree = self.parse('') + tree = self.parse('') results = tree.xpath('/a/@c') self.assertEquals(1, len(results)) - self.assertEquals('C', results[0]) + self.assertEquals('CqWeRtZuI', results[0]) self.assertEquals(tree.getroot().tag, results[0].getparent().tag) def test_xpath_list_comment(self): From scoder at codespeak.net Fri Jan 11 15:18:57 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 15:18:57 +0100 (CET) Subject: [Lxml-checkins] r50515 - in lxml/trunk: . 
doc Message-ID: <20080111141857.146FE1684C7@codespeak.net> Author: scoder Date: Fri Jan 11 15:18:56 2008 New Revision: 50515 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/api.txt Log: r3239 at delle: sbehnel | 2008-01-11 15:17:27 +0100 fix doctest for libxml2 2.6.31 Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri Jan 11 15:18:56 2008 @@ -308,10 +308,10 @@ >>> notxml = etree.tostring(unicode_root, encoding="UTF-16LE", ... xml_declaration=False) - >>> root = etree.XML(notxml) + >>> root = etree.XML(notxml) #doctest: +ELLIPSIS Traceback (most recent call last): ... - XMLSyntaxError: StartTag: invalid element name, line 1, column 2 + XMLSyntaxError: ... XInclude and ElementInclude From scoder at codespeak.net Fri Jan 11 15:20:45 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 15:20:45 +0100 (CET) Subject: [Lxml-checkins] r50516 - lxml/trunk Message-ID: <20080111142045.2F9131684C7@codespeak.net> Author: scoder Date: Fri Jan 11 15:20:44 2008 New Revision: 50516 Modified: lxml/trunk/ (props changed) lxml/trunk/update-error-constants.py Log: r3241 at delle: sbehnel | 2008-01-11 15:20:34 +0100 API usage fix Modified: lxml/trunk/update-error-constants.py ============================================================================== --- lxml/trunk/update-error-constants.py (original) +++ lxml/trunk/update-error-constants.py Fri Jan 11 15:20:44 2008 @@ -65,7 +65,8 @@ PARSE_ENUM_NAME = re.compile('\s*enum\s+(\w+)\s*{', re.I).match PARSE_ENUM_VALUE = re.compile('\s*=\s+([0-9]+)\s*(?::\s*(.*))?').match tree = etree.parse(html_file) - xpath = etree.XPathEvaluator(tree, {'html' : 'http://www.w3.org/1999/xhtml'}) + xpath = etree.XPathEvaluator( + tree, namespaces={'html' : 'http://www.w3.org/1999/xhtml'}) enum_dict = {} enums = xpath.evaluate("//html:pre[@class = 'programlisting' and contains(text(), 'Enum') 
and html:a[@name]]") From scoder at codespeak.net Fri Jan 11 16:21:31 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 16:21:31 +0100 (CET) Subject: [Lxml-checkins] r50519 - in lxml/trunk: . src/lxml/html/tests src/lxml/tests Message-ID: <20080111152131.B850F1684C7@codespeak.net> Author: scoder Date: Fri Jan 11 16:21:30 2008 New Revision: 50519 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_clean.py lxml/trunk/src/lxml/tests/test_etree.py Log: r3243 at delle: sbehnel | 2008-01-11 16:21:15 +0100 test fixes Modified: lxml/trunk/src/lxml/html/tests/test_clean.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_clean.py (original) +++ lxml/trunk/src/lxml/html/tests/test_clean.py Fri Jan 11 16:21:30 2008 @@ -5,6 +5,6 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([doctest.DocFileSuite('test_clean.txt')]) - if LIBXML_VERSION <= (2,6,28): + if LIBXML_VERSION <= (2,6,28) or LIBXML_VERSION >= (2,6,31): suite.addTests([doctest.DocFileSuite('test_clean_embed.txt')]) return suite Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jan 11 16:21:30 2008 @@ -1353,8 +1353,7 @@ '' % ns_href, self.etree.tostring(two)) - def _test_namespaces_after_serialize(self): - # FIXME: this currently fails - fix serializer.pxi! 
+ def test_namespaces_after_serialize(self): parse = self.etree.parse tostring = self.etree.tostring @@ -1363,9 +1362,7 @@ StringIO('' % ns_href)) baz = one.getroot()[0][0] - print tostring(baz) parsed = parse(StringIO( tostring(baz) )).getroot() - self.assertEquals('{%s}baz' % ns_href, parsed.tag) def test_element_nsmap(self): From scoder at codespeak.net Fri Jan 11 16:23:05 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 16:23:05 +0100 (CET) Subject: [Lxml-checkins] r50520 - lxml/tag/lxml-2.0beta1 Message-ID: <20080111152305.571C31684C7@codespeak.net> Author: scoder Date: Fri Jan 11 16:23:04 2008 New Revision: 50520 Added: lxml/tag/lxml-2.0beta1/ - copied from r50518, lxml/trunk/ Log: tag for lxml 2.0beta1 From scoder at codespeak.net Fri Jan 11 16:26:31 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 16:26:31 +0100 (CET) Subject: [Lxml-checkins] r50521 - lxml/tag/lxml-2.0alpha6 Message-ID: <20080111152631.8B8291684C7@codespeak.net> Author: scoder Date: Fri Jan 11 16:26:31 2008 New Revision: 50521 Added: lxml/tag/lxml-2.0alpha6/ - copied from r49929, lxml/trunk/ Log: tag for lxml 2.0alpha6 From scoder at codespeak.net Fri Jan 11 16:36:31 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Jan 2008 16:36:31 +0100 (CET) Subject: [Lxml-checkins] r50522 - in lxml/trunk: . doc Message-ID: <20080111153631.245271684C7@codespeak.net> Author: scoder Date: Fri Jan 11 16:36:30 2008 New Revision: 50522 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/main.txt Log: r3245 at delle: sbehnel | 2008-01-11 16:36:16 +0100 doc update Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Jan 11 16:36:30 2008 @@ -131,9 +131,11 @@ Download -------- -The best way to download binary versions is to visit `lxml at the Python -Package Index`_. 
It has the source, eggs and installers for various platforms. -The source distribution is signed with `this key`_. +The best way to download lxml is to visit `lxml at the Python Package +Index`_ (PyPI). It has the source that compiles on various platforms. +The source distribution is signed with `this key`_. Binary builds for +MS Windows usually become available through PyPI a few days after a +source release. .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc From scoder at codespeak.net Sat Jan 12 11:37:42 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Jan 2008 11:37:42 +0100 (CET) Subject: [Lxml-checkins] r50527 - in lxml/trunk: . doc Message-ID: <20080112103742.1DBEB168548@codespeak.net> Author: scoder Date: Sat Jan 12 11:37:40 2008 New Revision: 50527 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/main.txt Log: r3249 at delle: sbehnel | 2008-01-11 18:19:02 +0100 doc update Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sat Jan 12 11:37:40 2008 @@ -135,7 +135,8 @@ Index`_ (PyPI). It has the source that compiles on various platforms. The source distribution is signed with `this key`_. Binary builds for MS Windows usually become available through PyPI a few days after a -source release. +source release. If you can't wait, consider trying a less recent +version first. .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc From scoder at codespeak.net Sat Jan 12 11:37:46 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Jan 2008 11:37:46 +0100 (CET) Subject: [Lxml-checkins] r50528 - in lxml/trunk: . 
src/lxml/html/tests Message-ID: <20080112103746.2C389168549@codespeak.net> Author: scoder Date: Sat Jan 12 11:37:45 2008 New Revision: 50528 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_basic.py Log: r3250 at delle: sbehnel | 2008-01-12 11:37:28 +0100 run doctests from lxmlhtml.txt Modified: lxml/trunk/src/lxml/html/tests/test_basic.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_basic.py (original) +++ lxml/trunk/src/lxml/html/tests/test_basic.py Sat Jan 12 11:37:45 2008 @@ -4,6 +4,7 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([doctest.DocFileSuite('test_basic.txt')]) + suite.addTests([doctest.DocFileSuite('../../../../doc/lxmlhtml.txt')]) return suite if __name__ == '__main__': From scoder at codespeak.net Sat Jan 12 19:41:33 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Jan 2008 19:41:33 +0100 (CET) Subject: [Lxml-checkins] r50533 - in lxml/trunk: . src/lxml/html Message-ID: <20080112184133.67C63168549@codespeak.net> Author: scoder Date: Sat Jan 12 19:41:32 2008 New Revision: 50533 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/clean.py Log: r3254 at delle: sbehnel | 2008-01-12 19:41:17 +0100 code cleanup Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Sat Jan 12 19:41:32 2008 @@ -44,7 +44,7 @@ # execution: _javascript_scheme_re = re.compile( r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I) -_whitespace_re = re.compile(r'\s+') +_substitute_whitespace = re.compile(r'\s+').sub # FIXME: should data: be blocked? 
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx @@ -57,15 +57,6 @@ _find_external_links = etree.XPath( "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']") -def clean_html(html, **kw): - """ - Like clean(), but takes a text input document, and returns a text - document. - """ - doc = fromstring(html) - clean(doc, **kw) - return tostring(doc) - class Cleaner(object): """ Instances cleans the document of each of the possible offending @@ -205,7 +196,7 @@ doc = doc.getroot() # Normalize a case that IE treats <image> like <img>, and that # can confuse either this step or later steps. - for el in doc.getiterator('image'): + for el in doc.iter('image'): el.tag = 'img' if not self.comments: # Of course, if we were going to kill comments anyway, we don't @@ -221,7 +212,7 @@ kill_tags.add('script') if self.safe_attrs_only: safe_attrs = set(defs.safe_attrs) - for el in doc.getiterator(): + for el in doc.iter(): attrib = el.attrib for aname in attrib.keys(): if aname not in safe_attrs: @@ -229,7 +220,7 @@ if self.javascript: if not self.safe_attrs_only: # safe_attrs handles events attributes itself - for el in doc.getiterator(): + for el in doc.iter(): attrib = el.attrib for aname in attrib.keys(): if aname.startswith('on'): @@ -248,7 +239,7 @@ del el.attrib['style'] elif new != old: el.set('style', new) - for el in list(doc.getiterator('style')): + for el in list(doc.iter('style')): if el.get('type', '').lower().strip() == 'text/javascript': el.drop_tree() continue @@ -277,7 +268,7 @@ elif self.style or self.javascript: # We must get rid of included stylesheets if Javascript is not # allowed, as you can put Javascript in them - for el in list(doc.getiterator('link')): + for el in list(doc.iter('link')): if 'stylesheet' in el.get('rel', '').lower(): # Note this kills alternate stylesheets as well el.drop_tree() @@ -289,7 +280,7 @@ # FIXME: is really embedded? 
# We should get rid of any <param> tags not inside <applet>; # These are not really valid anyway. - for el in list(doc.getiterator('param')): + for el in list(doc.iter('param')): found_parent = False parent = el.getparent() while parent is not None and parent.tag not in ('applet', 'object'): @@ -312,7 +303,7 @@ _remove = [] _kill = [] - for el in doc.getiterator(): + for el in doc.iter(): if el.tag in kill_tags: if self.allow_element(el): continue @@ -349,7 +340,7 @@ allow_tags = set(defs.tags) if allow_tags: bad = [] - for el in doc.getiterator(): + for el in doc.iter(): if el.tag not in allow_tags: bad.append(el) for el in bad: @@ -408,7 +399,7 @@ def _kill_elements(self, doc, condition, iterate=None): bad = [] - for el in doc.getiterator(iterate): + for el in doc.iter(iterate): if condition(el): bad.append(el) for el in bad: @@ -416,13 +407,13 @@ def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE - new = _whitespace_re.sub('', link) + new = _substitute_whitespace('', link) if _javascript_scheme_re.search(new): # FIXME: should this be None to delete? return '' return link - _decomment_re = re.compile(r'/\*.*?\*/', re.S) + _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub def _has_sneaky_javascript(self, style): """ @@ -435,9 +426,9 @@ that and remove only the Javascript from the style; this catches more sneaky attempts. """ - style = self._decomment_re.sub('', style) + style = self._substitute_comments('', style) style = style.replace('\\', '') - style = _whitespace_re.sub('', style) + style = _substitute_whitespace('', style) style = style.lower() if 'javascript:' in style: return True From scoder at codespeak.net Sat Jan 12 20:03:41 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Jan 2008 20:03:41 +0100 (CET) Subject: [Lxml-checkins] r50534 - in lxml/trunk: . 
src/lxml Message-ID: <20080112190341.83449168544@codespeak.net> Author: scoder Date: Sat Jan 12 20:03:41 2008 New Revision: 50534 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/doctestcompare.py Log: r3256 at delle: sbehnel | 2008-01-12 19:47:21 +0100 do not use recovering HTML parser in doctestcompare Modified: lxml/trunk/src/lxml/doctestcompare.py ============================================================================== --- lxml/trunk/src/lxml/doctestcompare.py (original) +++ lxml/trunk/src/lxml/doctestcompare.py Sat Jan 12 20:03:41 2008 @@ -28,7 +28,6 @@ """ from lxml import etree -from lxml.html import document_fromstring import re import doctest import cgi @@ -51,6 +50,11 @@ def norm_whitespace(v): return _norm_whitespace_re.sub(' ', v) +_html_parser = etree.HTMLParser(recover=False) + +def html_fromstring(html): + return etree.fromstring(html, _html_parser) + # We use this to distinguish repr()s from elements: _repr_re = re.compile(r'^<[^>]+ (at|object) ') _norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') @@ -90,12 +94,12 @@ if NOPARSE_MARKUP & optionflags: return None if PARSE_HTML & optionflags: - parser = document_fromstring + parser = html_fromstring elif PARSE_XML & optionflags: parser = etree.XML elif (want.strip().lower().startswith(' Author: scoder Date: Sat Jan 12 20:03:44 2008 New Revision: 50535 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/doctestcompare.py Log: r3257 at delle: sbehnel | 2008-01-12 20:03:30 +0100 remove blank text in HTML doctest parsing Modified: lxml/trunk/src/lxml/doctestcompare.py ============================================================================== --- lxml/trunk/src/lxml/doctestcompare.py (original) +++ lxml/trunk/src/lxml/doctestcompare.py Sat Jan 12 20:03:44 2008 @@ -50,7 +50,7 @@ def norm_whitespace(v): return _norm_whitespace_re.sub(' ', v) -_html_parser = etree.HTMLParser(recover=False) +_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True) def 
html_fromstring(html): return etree.fromstring(html, _html_parser) From scoder at codespeak.net Mon Jan 14 19:54:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Jan 2008 19:54:20 +0100 (CET) Subject: [Lxml-checkins] r50612 - in lxml/trunk: . src/lxml/html/tests Message-ID: <20080114185420.86285168564@codespeak.net> Author: scoder Date: Mon Jan 14 19:54:19 2008 New Revision: 50612 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_forms.txt Log: r3260 at delle: sbehnel | 2008-01-14 07:22:48 +0100 doctest fixes Modified: lxml/trunk/src/lxml/html/tests/test_forms.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_forms.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_forms.txt Mon Jan 14 19:54:19 2008 @@ -39,7 +39,7 @@ 'http://example.org/test' >>> f.method 'GET' ->>> f.inputs +>>> f.inputs # doctest:+NOPARSE_MARKUP >>> hidden = f.inputs['hidden_field'] >>> hidden.checkable @@ -68,10 +68,10 @@ >>> checkbox2.value 'good' >>> group = f.inputs['check_group'] ->>> group.value +>>> group.value # doctest:+NOPARSE_MARKUP >>> group.value.add('1') ->>> group.value +>>> group.value # doctest:+NOPARSE_MARKUP >>> print tostring(group[0]) @@ -110,7 +110,7 @@ >>> select.value_options [None, '', '1'] >>> select = f.inputs['select2'] ->>> select.value +>>> select.value # doctest:+NOPARSE_MARKUP >>> select.value.update(['2', '3']) >>> select.value.remove('3') @@ -124,7 +124,7 @@ >>> print urllib.urlencode(f.form_values()) hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=&select2=1&select2=2&select2=3 >>> fields = f.fields ->>> fields +>>> fields # doctest:+NOPARSE_MARKUP >>> for name, value in fields.items(): ... 
print '%s: %r' % (name, value) From scoder at codespeak.net Mon Jan 14 19:54:30 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Jan 2008 19:54:30 +0100 (CET) Subject: [Lxml-checkins] r50613 - in lxml/trunk: . src/lxml/html/tests Message-ID: <20080114185430.DF91E168564@codespeak.net> Author: scoder Date: Mon Jan 14 19:54:30 2008 New Revision: 50613 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_basic.py Log: r3261 at delle: sbehnel | 2008-01-14 07:23:28 +0100 lxmlhtml.txt doesn't work as doctest Modified: lxml/trunk/src/lxml/html/tests/test_basic.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_basic.py (original) +++ lxml/trunk/src/lxml/html/tests/test_basic.py Mon Jan 14 19:54:30 2008 @@ -4,7 +4,6 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([doctest.DocFileSuite('test_basic.txt')]) - suite.addTests([doctest.DocFileSuite('../../../../doc/lxmlhtml.txt')]) return suite if __name__ == '__main__': From scoder at codespeak.net Fri Jan 18 15:57:16 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 18 Jan 2008 15:57:16 +0100 (CET) Subject: [Lxml-checkins] r50751 - in lxml/trunk: . doc Message-ID: <20080118145716.35129169E1E@codespeak.net> Author: scoder Date: Fri Jan 18 15:57:15 2008 New Revision: 50751 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/lxml2.txt Log: r3264 at delle: sbehnel | 2008-01-16 10:43:18 +0100 doc update Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Fri Jan 18 15:57:15 2008 @@ -21,6 +21,17 @@ extensions. Wherever possible, lxml 1.3 comes close to the semantics of lxml 2.0, so that migrating should be easier for code that currently runs with 1.3. 
+One of the important internal changes was the switch from the Pyrex_ +compiler to Cython_, which provides better optimisation and improved +support for newer Python language features. This allows the code of +lxml to become more Python-like again, while the performance improves +as Cython continues its own development. The code simplification, +which will continue throughout the 2.x series, will hopefully make it +even easier for users to contribute. + +.. _Cython: http://www.cython.org/ +.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ + Changes in etree and objectify ============================== From scoder at codespeak.net Fri Jan 18 15:57:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 18 Jan 2008 15:57:20 +0100 (CET) Subject: [Lxml-checkins] r50752 - in lxml/trunk: . src/lxml Message-ID: <20080118145720.08F80169E1D@codespeak.net> Author: scoder Date: Fri Jan 18 15:57:19 2008 New Revision: 50752 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/lxml.etree.pyx Log: r3265 at delle: sbehnel | 2008-01-18 00:20:37 +0100 error reporting fixes Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Jan 18 15:57:19 2008 @@ -728,7 +728,7 @@ if seqlength != slicelength: raise ValueError( "attempt to assign sequence of size %d " - "to extended slice of size %d" % (seqlength, c)) + "to extended slice of size %d" % (seqlength, slicelength)) if c_node is NULL: # no children yet => add all elements straight away Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Jan 18 15:57:19 2008 @@ -575,7 +575,7 @@ """ cdef xmlNode* c_node cdef xmlNode* c_next - cdef Py_ssize_t 
index, step, slicelength + cdef Py_ssize_t step, slicelength if python.PySlice_Check(x): # slice deletion if _isFullSlice(x): @@ -594,7 +594,7 @@ # item deletion c_node = _findChild(self._c_node, x) if c_node is NULL: - raise IndexError, index + raise IndexError("index out of range: %d" % x) _removeText(c_node.next) _removeNode(self._doc, c_node) From scoder at codespeak.net Sat Jan 19 14:22:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Jan 2008 14:22:10 +0100 (CET) Subject: [Lxml-checkins] r50778 - lxml/trunk Message-ID: <20080119132210.6AB2F16851D@codespeak.net> Author: scoder Date: Sat Jan 19 14:22:09 2008 New Revision: 50778 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r3268 at delle: sbehnel | 2008-01-19 14:21:46 +0100 do not use close_fds in Popen() as it is not portable Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Sat Jan 19 14:22:09 2008 @@ -135,7 +135,7 @@ _, rf, ef = os.popen3(cmd) else: # Python 2.4+ - p = subprocess.Popen(cmd, shell=True, close_fds=True, + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) rf, ef = p.stdout, p.stderr errors = ef.read() From scoder at codespeak.net Sat Jan 19 14:36:33 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Jan 2008 14:36:33 +0100 (CET) Subject: [Lxml-checkins] r50779 - lxml/trunk Message-ID: <20080119133633.C6D9616850C@codespeak.net> Author: scoder Date: Sat Jan 19 14:36:31 2008 New Revision: 50779 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/Makefile Log: r3270 at delle: sbehnel | 2008-01-19 14:25:11 +0100 do not remove generated .c files in 'make clean', use 'make realclean' instead Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ 
lxml/trunk/CHANGES.txt Sat Jan 19 14:36:31 2008 @@ -8,6 +8,22 @@ Features added -------------- +Bugs fixed +---------- + +Other changes +------------- + +* ``make clean`` no longer removes the .c files (use ``make + realclean`` instead) + + +2.0beta1 (2008-01-11) +===================== + +Features added +-------------- + * Parse-time XML schema validation (``schema`` parser keyword). * XPath string results of the ``text()`` function and attribute Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Sat Jan 19 14:36:31 2008 @@ -52,9 +52,10 @@ ftest: ftest_inplace clean: - find . \( -name '*.o' -o -name '*.c' -o -name '*.so' -o -name '*.py[cod]' -o -name '*.dll' \) -exec rm -f {} \; + find . \( -name '*.o' -o -name '*.so' -o -name '*.py[cod]' -o -name '*.dll' \) -exec rm -f {} \; rm -rf build realclean: clean + find . -name '*.c' -exec rm -f {} \; rm -f TAGS $(PYTHON) setup.py clean -a From scoder at codespeak.net Sun Jan 20 12:56:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 20 Jan 2008 12:56:28 +0100 (CET) Subject: [Lxml-checkins] r50796 - in lxml/trunk: . doc Message-ID: <20080120115628.4595D168559@codespeak.net> Author: scoder Date: Sun Jan 20 12:56:26 2008 New Revision: 50796 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r3272 at delle: sbehnel | 2008-01-20 12:04:35 +0100 FAQ fix Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Jan 20 12:56:26 2008 @@ -42,7 +42,7 @@ 6.2 Why can't lxml parse my XML from unicode strings? 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? 6.4 Why can't I just delete parents or clear the root node in iterparse()? - 6.5 How do I output null bytes in XML text? + 6.5 How do I output null characters in XML text? 
7 XPath and Document Traversal 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? 7.2 Why doesn't ``findall()`` support full XPath expressions? @@ -609,12 +609,12 @@ .. _`iterparse section`: api.html#iterparse-and-iterwalk -How do I output null bytes in XML text? +How do I output null characters in XML text? -------------------------------------------- Don't. What you would produce is not well-formed XML. XML parsers -will refuse to parse a document that contains null bytes. The right -way to embed binary data in XML is using a text encoding such as +will refuse to parse a document that contains null characters. The +right way to embed binary data in XML is using a text encoding such as uuencode or base64. From scoder at codespeak.net Mon Jan 21 19:39:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 21 Jan 2008 19:39:20 +0100 (CET) Subject: [Lxml-checkins] r50848 - in lxml/trunk: . src/lxml/html Message-ID: <20080121183920.2C42E16856B@codespeak.net> Author: scoder Date: Mon Jan 21 19:39:18 2008 New Revision: 50848 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/diff.py Log: r3274 at delle: sbehnel | 2008-01-21 11:24:48 +0100 fix Py2.4-isms Modified: lxml/trunk/src/lxml/html/diff.py ============================================================================== --- lxml/trunk/src/lxml/html/diff.py (original) +++ lxml/trunk/src/lxml/html/diff.py Mon Jan 21 19:39:18 2008 @@ -320,7 +320,7 @@ name, pos, tag = tag_stack.pop() balanced[pos] = tag elif tag_stack: - start.extend(tag for name, pos, tag in tag_stack) + start.extend([tag for name, pos, tag in tag_stack]) tag_stack = [] end.append(chunk) else: @@ -702,8 +702,8 @@ The text representation of the start tag for a tag. 
""" return '<%s%s>' % ( - el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True)) - for name, value in el.attrib.items())) + el.tag, ''.join([' %s="%s"' % (name, cgi.escape(value, True)) + for name, value in el.attrib.items())]) def end_tag(el): """ The text representation of an end tag for a tag. Includes From scoder at codespeak.net Mon Jan 21 19:39:26 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 21 Jan 2008 19:39:26 +0100 (CET) Subject: [Lxml-checkins] r50849 - in lxml/trunk: . src/lxml Message-ID: <20080121183926.730F316856D@codespeak.net> Author: scoder Date: Mon Jan 21 19:39:25 2008 New Revision: 50849 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/python.pxd Log: r3275 at delle: sbehnel | 2008-01-21 11:29:56 +0100 fix Py2.4-isms Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jan 21 19:39:25 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Some Python 2.4-isms slipped through in beta1. 
+ Other changes ------------- Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Mon Jan 21 19:39:25 2008 @@ -846,7 +846,7 @@ prefix = None else: prefix = funicode(c_ns.prefix) - if not python.PyDict_Contains(nsmap, prefix): + if not python.PyDict_GetItem(nsmap, prefix): python.PyDict_SetItem( nsmap, prefix, funicode(c_ns.href)) c_ns = c_ns.next Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Mon Jan 21 19:39:25 2008 @@ -66,7 +66,7 @@ cdef void PyDict_Clear(object d) cdef object PyDict_Copy(object d) cdef object PyDictProxy_New(object d) - cdef int PyDict_Contains(object d, object key) except -1 + # cdef int PyDict_Contains(object d, object key) except -1 # Python 2.4+ cdef Py_ssize_t PyDict_Size(object d) cdef object PySequence_List(object o) cdef object PySequence_Tuple(object o) From scoder at codespeak.net Mon Jan 21 19:39:45 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 21 Jan 2008 19:39:45 +0100 (CET) Subject: [Lxml-checkins] r50850 - in lxml/trunk: . 
src/lxml/html src/lxml/html/tests src/lxml/tests Message-ID: <20080121183945.8B55E16856B@codespeak.net> Author: scoder Date: Mon Jan 21 19:39:44 2008 New Revision: 50850 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/diff.py lxml/trunk/src/lxml/html/tests/test_autolink.py lxml/trunk/src/lxml/html/tests/test_basic.py lxml/trunk/src/lxml/html/tests/test_clean.py lxml/trunk/src/lxml/html/tests/test_diff.py lxml/trunk/src/lxml/html/tests/test_elementsoup.py lxml/trunk/src/lxml/html/tests/test_feedparser_data.py lxml/trunk/src/lxml/html/tests/test_formfill.py lxml/trunk/src/lxml/html/tests/test_forms.py lxml/trunk/src/lxml/html/tests/test_rewritelinks.py lxml/trunk/src/lxml/tests/test_css.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_objectify.py lxml/trunk/src/lxml/tests/test_xpathevaluator.py Log: r3276 at delle: sbehnel | 2008-01-21 14:51:38 +0100 run HTML doctests only under Python 2.4+, fix some 2.4-isms in the tests Modified: lxml/trunk/src/lxml/html/diff.py ============================================================================== --- lxml/trunk/src/lxml/html/diff.py (original) +++ lxml/trunk/src/lxml/html/diff.py Mon Jan 21 19:39:44 2008 @@ -703,7 +703,7 @@ """ return '<%s%s>' % ( el.tag, ''.join([' %s="%s"' % (name, cgi.escape(value, True)) - for name, value in el.attrib.items())]) + for name, value in el.attrib.items()])) def end_tag(el): """ The text representation of an end tag for a tag. 
Includes Modified: lxml/trunk/src/lxml/html/tests/test_autolink.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_autolink.py (original) +++ lxml/trunk/src/lxml/html/tests/test_autolink.py Mon Jan 21 19:39:44 2008 @@ -1,9 +1,10 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_autolink.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_autolink.txt')]) return suite if __name__ == '__main__': Modified: lxml/trunk/src/lxml/html/tests/test_basic.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_basic.py (original) +++ lxml/trunk/src/lxml/html/tests/test_basic.py Mon Jan 21 19:39:44 2008 @@ -1,9 +1,10 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_basic.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_basic.txt')]) return suite if __name__ == '__main__': Modified: lxml/trunk/src/lxml/html/tests/test_clean.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_clean.py (original) +++ lxml/trunk/src/lxml/html/tests/test_clean.py Mon Jan 21 19:39:44 2008 @@ -1,10 +1,11 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest from lxml.etree import LIBXML_VERSION def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_clean.txt')]) - if LIBXML_VERSION <= (2,6,28) or LIBXML_VERSION >= (2,6,31): - suite.addTests([doctest.DocFileSuite('test_clean_embed.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_clean.txt')]) + if LIBXML_VERSION <= 
(2,6,28) or LIBXML_VERSION >= (2,6,31): + suite.addTests([doctest.DocFileSuite('test_clean_embed.txt')]) return suite Modified: lxml/trunk/src/lxml/html/tests/test_diff.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_diff.py (original) +++ lxml/trunk/src/lxml/html/tests/test_diff.py Mon Jan 21 19:39:44 2008 @@ -1,12 +1,13 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest from lxml.html import diff def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_diff.txt'), - doctest.DocTestSuite(diff)]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_diff.txt'), + doctest.DocTestSuite(diff)]) return suite if __name__ == '__main__': Modified: lxml/trunk/src/lxml/html/tests/test_elementsoup.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_elementsoup.py (original) +++ lxml/trunk/src/lxml/html/tests/test_elementsoup.py Mon Jan 21 19:39:44 2008 @@ -1,4 +1,4 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest try: @@ -10,8 +10,9 @@ def test_suite(): suite = unittest.TestSuite() - if BS_INSTALLED: - suite.addTests([doctest.DocFileSuite('../../../../doc/elementsoup.txt')]) + if sys.version_info >= (2,4): + if BS_INSTALLED: + suite.addTests([doctest.DocFileSuite('../../../../doc/elementsoup.txt')]) return suite if __name__ == '__main__': Modified: lxml/trunk/src/lxml/html/tests/test_feedparser_data.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_feedparser_data.py (original) +++ lxml/trunk/src/lxml/html/tests/test_feedparser_data.py Mon Jan 21 19:39:44 2008 @@ -1,9 +1,11 @@ +import sys import os import re import rfc822 import unittest from lxml.tests.common_imports import doctest -from lxml.doctestcompare import 
LHTMLOutputChecker +if sys.version_info >= (2,4): + from lxml.doctestcompare import LHTMLOutputChecker from lxml.html.clean import clean, Cleaner @@ -75,15 +77,16 @@ def test_suite(): suite = unittest.TestSuite() - for dir in feed_dirs: - for fn in os.listdir(dir): - fn = os.path.join(dir, fn) - if fn.endswith('.data'): - case = FeedTestCase(fn) - suite.addTests([case]) - # This is my lazy way of stopping on first error: - try: - case.runTest() - except: - break + if sys.version_info >= (2,4): + for dir in feed_dirs: + for fn in os.listdir(dir): + fn = os.path.join(dir, fn) + if fn.endswith('.data'): + case = FeedTestCase(fn) + suite.addTests([case]) + # This is my lazy way of stopping on first error: + try: + case.runTest() + except: + break return suite Modified: lxml/trunk/src/lxml/html/tests/test_formfill.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_formfill.py (original) +++ lxml/trunk/src/lxml/html/tests/test_formfill.py Mon Jan 21 19:39:44 2008 @@ -1,7 +1,8 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_formfill.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_formfill.txt')]) return suite Modified: lxml/trunk/src/lxml/html/tests/test_forms.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_forms.py (original) +++ lxml/trunk/src/lxml/html/tests/test_forms.py Mon Jan 21 19:39:44 2008 @@ -1,9 +1,10 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_forms.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_forms.txt')]) return suite if __name__ == '__main__': Modified: 
lxml/trunk/src/lxml/html/tests/test_rewritelinks.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_rewritelinks.py (original) +++ lxml/trunk/src/lxml/html/tests/test_rewritelinks.py Mon Jan 21 19:39:44 2008 @@ -1,9 +1,10 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_rewritelinks.txt')]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_rewritelinks.txt')]) return suite if __name__ == '__main__': Modified: lxml/trunk/src/lxml/tests/test_css.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_css.py (original) +++ lxml/trunk/src/lxml/tests/test_css.py Mon Jan 21 19:39:44 2008 @@ -1,4 +1,4 @@ -import unittest +import unittest, sys from lxml.tests.common_imports import doctest from lxml import html from lxml import cssselect @@ -61,10 +61,10 @@ self.index = index unittest.TestCase.__init__(self) - @classmethod def all(cls): for i in range(len(cls.selectors)): yield cls(i) + all = classmethod(all) def runTest(self): f = open(doc_fn, 'rb') Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jan 21 19:39:44 2008 @@ -1736,9 +1736,8 @@ def test_sourceline_iterparse_end(self): iterparse = self.etree.iterparse - lines = list( - el.sourceline for (event, el) in - iterparse(fileInTestDir('include/test_xinclude.xml'))) + lines = [ el.sourceline for (event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml')) ] self.assertEquals( [2, 3, 1], @@ -1746,10 +1745,9 @@ def test_sourceline_iterparse_start(self): iterparse = self.etree.iterparse - lines = list( - el.sourceline for (event, el) in - 
iterparse(fileInTestDir('include/test_xinclude.xml'), - events=("start",))) + lines = [ el.sourceline for (event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml'), + events=("start",)) ] self.assertEquals( [1, 2, 3], Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Jan 21 19:39:44 2008 @@ -37,8 +37,8 @@ # None: xsi:nil="true" } -xsitype2objclass = dict(( (v, k) for k in objectclass2xsitype - for v in objectclass2xsitype[k] )) +xsitype2objclass = dict([ (v, k) for k in objectclass2xsitype + for v in objectclass2xsitype[k] ]) objectclass2pytype = { # objectify built-in @@ -50,7 +50,8 @@ # None: xsi:nil="true" } -pytype2objclass = dict(( (objectclass2pytype[k], k) for k in objectclass2pytype)) +pytype2objclass = dict([ (objectclass2pytype[k], k) + for k in objectclass2pytype]) xml_str = '''\ Modified: lxml/trunk/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/trunk/src/lxml/tests/test_xpathevaluator.py Mon Jan 21 19:39:44 2008 @@ -4,10 +4,10 @@ Test cases related to XPath evaluation and the XPath class """ -import unittest, doctest +import unittest from StringIO import StringIO -from common_imports import etree, HelperTestCase +from common_imports import etree, HelperTestCase, doctest class ETreeXPathTestCase(HelperTestCase): """XPath tests etree""" From scoder at codespeak.net Mon Jan 21 19:40:00 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 21 Jan 2008 19:40:00 +0100 (CET) Subject: [Lxml-checkins] r50851 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20080121184000.BA6D416856B@codespeak.net> Author: scoder Date: Mon Jan 21 19:39:59 2008 New Revision: 50851 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_css.py lxml/trunk/src/lxml/tests/test_objectify.py Log: r3277 at delle: sbehnel | 2008-01-21 16:40:29 +0100 switch off some more doctests under Python 2.3 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jan 21 19:39:59 2008 @@ -11,7 +11,8 @@ Bugs fixed ---------- -* Some Python 2.4-isms slipped through in beta1. +* Some Python 2.4-isms prevented lxml from building/running under + Python 2.3. Other changes ------------- @@ -19,6 +20,8 @@ * ``make clean`` no longer removes the .c files (use ``make realclean`` instead) +* The test suite now skips most doctests under Python 2.3. + 2.0beta1 (2008-01-11) ===================== Modified: lxml/trunk/src/lxml/tests/test_css.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_css.py (original) +++ lxml/trunk/src/lxml/tests/test_css.py Mon Jan 21 19:39:59 2008 @@ -112,7 +112,8 @@ def test_suite(): suite = unittest.TestSuite() - for fn in 'test_css.txt', 'test_css_select.txt': - suite.addTests([doctest.DocFileSuite(fn)]) + if sys.version_info >= (2,4): + suite.addTests([doctest.DocFileSuite('test_css_select.txt')]) + suite.addTests([doctest.DocFileSuite('test_css.txt')]) suite.addTests(list(CSSTestCase.all())) return suite Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Jan 21 19:39:59 2008 @@ -5,7 +5,7 @@ """ -import unittest, operator +import unittest, operator, sys from common_imports import etree, StringIO, 
HelperTestCase, fileInTestDir from common_imports import SillyFileLike, canonicalize, doctest @@ -2071,8 +2071,9 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ObjectifyTestCase)]) - suite.addTests( - [doctest.DocFileSuite('../../../doc/objectify.txt')]) + if sys.version_info >= (2,4): + suite.addTests( + [doctest.DocFileSuite('../../../doc/objectify.txt')]) return suite if __name__ == '__main__': From scoder at codespeak.net Wed Jan 23 11:25:01 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 11:25:01 +0100 (CET) Subject: [Lxml-checkins] r50902 - in lxml/trunk: . src/lxml/html/tests Message-ID: <20080123102501.5405E1684D9@codespeak.net> Author: scoder Date: Wed Jan 23 11:24:59 2008 New Revision: 50902 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_clean.txt Log: r3282 at delle: sbehnel | 2008-01-21 22:23:44 +0100 fix doctests Modified: lxml/trunk/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_clean.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_clean.txt Wed Jan 23 11:24:59 2008 @@ -3,29 +3,28 @@ >>> from lxml.html import usedoctest >>> doc = ''' -... -... -... -... -... -... -... -... a link -... another link -...

a paragraph

-...
secret EVIL!
-... of EVIL! -... -...
-... Password: -...
-... annoying EVIL! -... spam spam SPAM! -... -... +... +... +... +... +... +... +... +... a link +... another link +...

a paragraph

+...
secret EVIL!
+... of EVIL! +... +...
+... Password: +...
+... spam spam SPAM! +... +... ... ''' >>> print doc @@ -49,9 +48,8 @@
Password:
- annoying EVIL! spam spam SPAM! - + @@ -76,9 +74,8 @@
Password:
- annoying EVIL! spam spam SPAM! - + @@ -94,7 +91,6 @@
secret EVIL!
of EVIL! Password: - annoying EVIL! spam spam SPAM! @@ -112,7 +108,6 @@
secret EVIL!
of EVIL! Password: - annoying EVIL! spam spam SPAM! From scoder at codespeak.net Wed Jan 23 11:25:04 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 11:25:04 +0100 (CET) Subject: [Lxml-checkins] r50903 - lxml/trunk Message-ID: <20080123102504.4F08F1684DA@codespeak.net> Author: scoder Date: Wed Jan 23 11:25:03 2008 New Revision: 50903 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3283 at delle: sbehnel | 2008-01-21 22:24:06 +0100 changelog fix Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Jan 23 11:25:03 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -2.0beta1 (2008-01-11) -===================== +Under development +================= Features added -------------- From scoder at codespeak.net Wed Jan 23 11:25:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 11:25:08 +0100 (CET) Subject: [Lxml-checkins] r50904 - in lxml/trunk: . 
src/lxml/html Message-ID: <20080123102508.4600A1684DB@codespeak.net> Author: scoder Date: Wed Jan 23 11:25:07 2008 New Revision: 50904 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/clean.py Log: r3284 at delle: sbehnel | 2008-01-22 08:44:43 +0100 missing import Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Wed Jan 23 11:25:07 2008 @@ -1,4 +1,5 @@ import re +import copy import urlparse from lxml import etree from lxml.html import defs From scoder at codespeak.net Wed Jan 23 11:25:12 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 11:25:12 +0100 (CET) Subject: [Lxml-checkins] r50905 - lxml/trunk Message-ID: <20080123102512.EDC1C1684DE@codespeak.net> Author: scoder Date: Wed Jan 23 11:25:12 2008 New Revision: 50905 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3285 at delle: sbehnel | 2008-01-22 09:28:55 +0100 changelog cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Jan 23 11:25:12 2008 @@ -11,17 +11,19 @@ Bugs fixed ---------- +* Missing import in ``lxml.html.clean``. + * Some Python 2.4-isms prevented lxml from building/running under Python 2.3. Other changes ------------- +* The test suite now skips most doctests under Python 2.3. + * ``make clean`` no longer removes the .c files (use ``make realclean`` instead) -* The test suite now skips most doctests under Python 2.3. - 2.0beta1 (2008-01-11) ===================== From scoder at codespeak.net Wed Jan 23 16:11:50 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 16:11:50 +0100 (CET) Subject: [Lxml-checkins] r50924 - in lxml/trunk: . 
src/lxml Message-ID: <20080123151150.EB9DC16847A@codespeak.net> Author: scoder Date: Wed Jan 23 16:11:49 2008 New Revision: 50924 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/xmlschema.pxi Log: r3290 at delle: sbehnel | 2008-01-23 13:03:09 +0100 keyword-only arguments in validators Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Wed Jan 23 16:11:49 2008 @@ -27,7 +27,7 @@ catalog. """ cdef tree.xmlDtd* _c_dtd - def __init__(self, file=None, external_id=None): + def __init__(self, file=None, *, external_id=None): self._c_dtd = NULL if file is not None: if python._isString(file): Modified: lxml/trunk/src/lxml/relaxng.pxi ============================================================================== --- lxml/trunk/src/lxml/relaxng.pxi (original) +++ lxml/trunk/src/lxml/relaxng.pxi Wed Jan 23 16:11:49 2008 @@ -21,10 +21,12 @@ cdef class RelaxNG(_Validator): """Turn a document into a Relax NG validator. - Can also load from filesystem directly given file object or filename. + + Either pass a schema as Element or ElementTree, or pass a file or + filename through the ``file`` keyword argument. """ cdef relaxng.xmlRelaxNG* _c_schema - def __init__(self, etree=None, file=None): + def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlNode* c_node Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Wed Jan 23 16:11:49 2008 @@ -21,9 +21,12 @@ cdef class XMLSchema(_Validator): """Turn a document into an XML Schema validator. + + Either pass a schema as Element or ElementTree, or pass a file or + filename through the ``file`` keyword argument. 
""" cdef xmlschema.xmlSchema* _c_schema - def __init__(self, etree=None, file=None): + def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlDoc* fake_c_doc From scoder at codespeak.net Wed Jan 23 17:10:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Jan 2008 17:10:03 +0100 (CET) Subject: [Lxml-checkins] r50930 - in lxml/trunk: . src/lxml Message-ID: <20080123161003.7311B168469@codespeak.net> Author: scoder Date: Wed Jan 23 17:10:02 2008 New Revision: 50930 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/xmlschema.pxi Log: r3292 at delle: sbehnel | 2008-01-23 16:39:27 +0100 cleanup in validation code, paste local error log into exceptions during schema parsing Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Wed Jan 23 17:10:02 2008 @@ -29,21 +29,27 @@ cdef tree.xmlDtd* _c_dtd def __init__(self, file=None, *, external_id=None): self._c_dtd = NULL + _Validator.__init__(self) if file is not None: if python._isString(file): + self._error_log.connect() self._c_dtd = xmlparser.xmlParseDTD(NULL, _cstr(file)) + self._error_log.disconnect() elif hasattr(file, 'read'): self._c_dtd = _parseDtdFromFilelike(file) else: - raise DTDParseError, "parsing from file objects is not supported" + raise DTDParseError("file must be a filename or file-like object") elif external_id is not None: + self._error_log.connect() self._c_dtd = xmlparser.xmlParseDTD(external_id, NULL) + self._error_log.disconnect() else: - raise DTDParseError, "either filename or external ID required" + raise DTDParseError("either filename or external ID required") if self._c_dtd is NULL: - raise DTDParseError, "error parsing DTD" - _Validator.__init__(self) + raise DTDParseError( + 
self._error_log._buildExceptionMessage("error parsing DTD"), + error_log=self._error_log) def __dealloc__(self): tree.xmlFreeDtd(self._c_dtd) @@ -77,7 +83,7 @@ self._error_log.disconnect() if ret == -1: - raise DTDValidateError, "Internal error in DTD validation" + raise DTDValidateError("Internal error in DTD validation") if ret == 1: return True else: @@ -87,15 +93,19 @@ cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL: cdef _ExceptionContext exc_context cdef _FileReaderContext dtd_parser + cdef _ErrorLog error_log cdef tree.xmlDtd* c_dtd exc_context = _ExceptionContext() dtd_parser = _FileReaderContext(file, exc_context, None, None) + error_log = _ErrorLog() + error_log.connect() c_dtd = dtd_parser._readDtd() + error_log.disconnect() exc_context._raise_if_stored() if c_dtd is NULL: - raise DTDParseError, "error parsing DTD" + raise DTDParseError("error parsing DTD", error_log=error_log) return c_dtd cdef extern from "etree_defs.h": Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Wed Jan 23 17:10:02 2008 @@ -93,9 +93,12 @@ """Main exception base class for lxml. All other exceptions inherit from this one. """ - def __ini