From ianb at codespeak.net Sat Apr 5 17:39:15 2008 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 5 Apr 2008 17:39:15 +0200 (CEST) Subject: [Lxml-checkins] r53392 - in lxml/trunk: . src/lxml/html Message-ID: <20080405153915.7AE2316AA5A@codespeak.net> Author: ianb Date: Sat Apr 5 17:39:13 2008 New Revision: 53392 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/__init__.py Log: fix cssselect method Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Apr 5 17:39:13 2008 @@ -15,6 +15,8 @@ * lxml.etree accepted non well-formed namespace prefix names. +* HTML elements' ``.cssselect()`` method was broken. + Other changes ------------- Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Sat Apr 5 17:39:13 2008 @@ -194,7 +194,7 @@ that pre-compiling the expression can provide a substantial speedup. """ - return cssselect.CSSSelect(expr)(self) + return cssselect.CSSSelector(expr)(self) ######################################## ## Link functions From lxml-checkins at codespeak.net Mon Apr 7 09:05:51 2008 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Mon, 7 Apr 2008 09:05:51 +0200 (CEST) Subject: [Lxml-checkins] Gucci 48757 Message-ID: <20080407100449.2614.qmail@adsl-dyn230.78-99-169.t-com.sk> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20080407/86e58e97/attachment-0001.htm From jholg at codespeak.net Mon Apr 7 15:29:27 2008 From: jholg at codespeak.net (jholg at codespeak.net) Date: Mon, 7 Apr 2008 15:29:27 +0200 (CEST) Subject: [Lxml-checkins] r53527 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20080407132927.D18F916A273@codespeak.net> Author: jholg Date: Mon Apr 7 15:29:27 2008 New Revision: 53527 Modified: lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: Added __int__, __long__, __float__, __complex__ methods to StringElement, plus tests. Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Mon Apr 7 15:29:27 2008 @@ -773,6 +773,18 @@ def __mod__(self, other): return _strValueOf(self) % other + def __int__(self): + return int(textOf(self._c_node)) + + def __long__(self): + return long(textOf(self._c_node)) + + def __float__(self): + return float(textOf(self._c_node)) + + def __complex__(self): + return complex(textOf(self._c_node)) + cdef class NoneElement(ObjectifiedDataElement): def __str__(self): return "None" Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Apr 7 15:29:27 2008 @@ -815,6 +815,26 @@ el = objectify.DataElement(s) val = 5 self.assertRaises(TypeError, el.__mod__, val) + + def test_type_str_as_int(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(int(el), 1) + + def test_type_str_as_long(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(long(el), 1) + + def test_type_str_as_float(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(float(el), 1) + + def test_type_str_as_complex(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(complex(el), 1) def test_type_str_mod_data_elements(self): s = "%d %f %s %r" From scoder at codespeak.net Tue Apr 8 19:05:14 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 8 Apr 2008 19:05:14 +0200 
(CEST) Subject: [Lxml-checkins] r53591 - in lxml/branch/lxml-2.0: . src/lxml/html Message-ID: <20080408170514.1032916A968@codespeak.net> Author: scoder Date: Tue Apr 8 19:05:13 2008 New Revision: 53591 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/html/__init__.py Log: merged in cssselect() bug fix by Ian Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Tue Apr 8 19:05:13 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* HTML elements' ``.cssselect()`` method was broken. + * ``ElementTree.find*()`` didn't accept QName objects. Other changes Modified: lxml/branch/lxml-2.0/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/html/__init__.py (original) +++ lxml/branch/lxml-2.0/src/lxml/html/__init__.py Tue Apr 8 19:05:13 2008 @@ -194,7 +194,7 @@ that pre-compiling the expression can provide a substantial speedup. """ - return cssselect.CSSSelect(expr)(self) + return cssselect.CSSSelector(expr)(self) ######################################## ## Link functions From scoder at codespeak.net Thu Apr 10 08:00:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:39 +0200 (CEST) Subject: [Lxml-checkins] r53639 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080410060039.EFDE42A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:37 2008 New Revision: 53639 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: r3916 at delle: sbehnel | 2008-04-08 09:45:15 +0200 objectify: let BoolElement inherit from IntElement (as in Python), lots of cleanup in 'special methods' Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Thu Apr 10 08:00:37 2008 @@ -618,41 +618,44 @@ cetree.setNodeText(self._c_node, s) cdef class NumberElement(ObjectifiedDataElement): - cdef object _type + cdef object _parse_value def _setValueParser(self, function): - "Set the function that parses the Python value from a string." - self._type = function + """Set the function that parses the Python value from a string. - cdef _value(self): - return self._type(textOf(self._c_node)) + Do not use this unless you know what you are doing. 
+ """ + self._parse_value = function property pyval: def __get__(self): - return self._value() + return _parseNumber(self) def __int__(self): - return int(textOf(self._c_node)) + return int(_parseNumber(self)) def __long__(self): - return long(textOf(self._c_node)) + return long(_parseNumber(self)) def __float__(self): - return float(textOf(self._c_node)) + return float(_parseNumber(self)) + + def __complex__(self): + return complex(_parseNumber(self)) def __str__(self): - return str(self._type(textOf(self._c_node))) + return str(_parseNumber(self)) def __repr__(self): - return repr(self._type(textOf(self._c_node))) + return repr(_parseNumber(self)) + + def __oct__(self): + return oct(_parseNumber(self)) -# def __oct__(self): -# def __hex__(self): + def __hex__(self): + return hex(_parseNumber(self)) def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _numericValueOf(self), other, op) + return _richcmpPyvals(self, other, op) def __add__(self, other): return _numericValueOf(self) + _numericValueOf(other) @@ -710,15 +713,15 @@ cdef class IntElement(NumberElement): def _init(self): - self._type = int + self._parse_value = int cdef class LongElement(NumberElement): def _init(self): - self._type = long + self._parse_value = long cdef class FloatElement(NumberElement): def _init(self): - self._type = float + self._parse_value = float cdef class StringElement(ObjectifiedDataElement): """String data class. @@ -748,10 +751,7 @@ return len(text) > 0 def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _strValueOf(self), other, op) + return _richcmpPyvals(self, other, op) def __add__(self, other): text = _strValueOf(self) @@ -807,61 +807,64 @@ def __get__(self): return None -cdef class BoolElement(ObjectifiedDataElement): +cdef class BoolElement(IntElement): """Boolean type base on string values: 'true' or 'false'. 
+ + Note that this inherits from IntElement to mimic the behaviour of + Python's bool type. """ - cdef int _boolval(self) except -1: - cdef char* c_str - text = textOf(self._c_node) - if text is None: - return 0 - c_str = _cstr(text) - if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == c'F': - if c_str[1] == c'\0' or text == "false" or text.lower() == "false": - # '0' or 'f' or 'false' - return 0 - elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': - if c_str[1] == c'\0' or text == "true" or text.lower() == "true": - # '1' or 't' or 'true' - return 1 - raise ValueError("Invalid boolean value: '%s'" % text) + def _init(self): + self._parse_value = __parseBool def __nonzero__(self): - if self._boolval(): - return True - else: - return False + return __parseBool(textOf(self._c_node)) def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - if hasattr(self, 'pyval'): - self_val = self.pyval - else: - self_val = bool(self) - return python.PyObject_RichCompare(self_val, other, op) + return _richcmpPyvals(self, other, op) def __str__(self): - if self._boolval(): - return "True" - else: - return "False" + return str(__parseBool(textOf(self._c_node))) def __repr__(self): - if self._boolval(): - return "True" - else: - return "False" + return repr(__parseBool(textOf(self._c_node))) property pyval: def __get__(self): - return self.__nonzero__() + return __parseBool(textOf(self._c_node)) def __checkBool(s): - if s != 'true' and s != 'false' and s != '1' and s != '0': + cdef int value = -1 + if s is not None: + value = __parseBoolAsInt(s) + if value == -1: raise ValueError -cdef object _strValueOf(obj): +cpdef __parseBool(s): + cdef int value + if s is None: + return False + value = __parseBoolAsInt(s) + if value == -1: + raise ValueError("Invalid boolean value: '%s'" % s) + return value + +cdef inline int __parseBoolAsInt(text): + cdef char* c_str + c_str = _cstr(text) + if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == 
c'F': + if c_str[1] == c'\0' or text == "false" or text.lower() == "false": + # '0' or 'f' or 'false' + return 0 + elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': + if c_str[1] == c'\0' or text == "true" or text.lower() == "true": + # '1' or 't' or 'true' + return 1 + return -1 + +cdef inline _parseNumber(NumberElement element): + return element._parse_value(textOf(element._c_node)) + +cdef inline object _strValueOf(obj): if python._isString(obj): return obj if isinstance(obj, _Element): @@ -870,15 +873,20 @@ return '' return str(obj) -cdef object _numericValueOf(obj): +cdef inline object _numericValueOf(obj): if isinstance(obj, NumberElement): - return (obj)._type( - textOf((obj)._c_node)) + return _parseNumber(obj) elif hasattr(obj, 'pyval'): # not always numeric, but Python will raise the right exception return obj.pyval return obj +cdef inline _richcmpPyvals(left, right, int op): + left = getattr3(left, 'pyval', left) + right = getattr3(right, 'pyval', right) + return python.PyObject_RichCompare(left, right, op) + + ################################################################################ # Python type registry Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Thu Apr 10 08:00:37 2008 @@ -737,10 +737,20 @@ root = Element("{objectified}root") root.bool = True self.assertEquals(root.bool, True) + self.assertEquals(root.bool + root.bool, True + True) + self.assertEquals(True + root.bool, True + root.bool) + self.assertEquals(root.bool * root.bool, True * True) + self.assertEquals(int(root.bool), int(True)) + self.assertEquals(complex(root.bool), complex(True)) self.assert_(isinstance(root.bool, objectify.BoolElement)) root.bool = False self.assertEquals(root.bool, False) + self.assertEquals(root.bool + root.bool, False + False) + self.assertEquals(False + 
root.bool, False + root.bool) + self.assertEquals(root.bool * root.bool, False * False) + self.assertEquals(int(root.bool), int(False)) + self.assertEquals(complex(root.bool), complex(False)) self.assert_(isinstance(root.bool, objectify.BoolElement)) def test_data_element_bool(self): From scoder at codespeak.net Thu Apr 10 08:00:44 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:44 +0200 (CEST) Subject: [Lxml-checkins] r53640 - in lxml/trunk: . doc Message-ID: <20080410060044.C03302A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:43 2008 New Revision: 53640 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r3917 at delle: sbehnel | 2008-04-09 15:18:10 +0200 more 'who uses lxml' entries Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Apr 10 08:00:43 2008 @@ -154,12 +154,14 @@ * lwebstring_, an XML template engine * OpenXMLlib_, a library for handling OpenXML document meta data * Pycoon_, a WSGI web development framework based on XML pipelines -* rfadict_, an RDFa parser with a simple dictionary-like interface. +* Rambler_, a meta search engine that aggregates different data sources +* rdfadict_, an RDFa parser with a simple dictionary-like interface. Zope3 and some of its extensions have good support for lxml: * gocept.lxml_, Zope3 interface bindings for lxml * z3c.rml_, an implementation of ReportLab's RML format +* zif.sedna_, an XQuery based interface to the Sedna OpenSource XML database And don't miss the quotes by our generally happy_ users_, and other `sites that link to lxml`_. @@ -172,8 +174,10 @@ .. _lwebstring: http://pypi.python.org/pypi/lwebstring .. _OpenXMLlib: http://permalink.gmane.org/gmane.comp.python.lxml.devel/3250 .. _Pycoon: http://pypi.python.org/pypi/pycoon -.. _rfadict: http://pypi.python.org/pypi/rdfadict +.. 
_Rambler: http://beta.rambler.ru/srch?query=python+lxml&searchtype=web +.. _rdfadict: http://pypi.python.org/pypi/rdfadict .. _z3c.rml: http://pypi.python.org/pypi/z3c.rml +.. _zif.sedna: http://pypi.python.org/pypi/zif.sedna .. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244 .. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 From scoder at codespeak.net Thu Apr 10 08:00:49 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:49 +0200 (CEST) Subject: [Lxml-checkins] r53641 - in lxml/trunk: . src/lxml Message-ID: <20080410060049.C8A3F2A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:48 2008 New Revision: 53641 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xmlerror.pxd lxml/trunk/src/lxml/xmlerror.pxi Log: r3918 at delle: sbehnel | 2008-04-10 07:58:58 +0200 new error constants as of libxml2 2.6.32 Modified: lxml/trunk/src/lxml/xmlerror.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxd (original) +++ lxml/trunk/src/lxml/xmlerror.pxd Thu Apr 10 08:00:48 2008 @@ -39,6 +39,7 @@ XML_FROM_WRITER = 25 # The xmlwriter module XML_FROM_MODULE = 26 # The dynamically loaded module modul XML_FROM_I18N = 27 # The module handling character conversion + XML_FROM_SCHEMATRONV = 28 # The Schematron validator module ctypedef enum xmlParserErrors: XML_ERR_OK = 0 @@ -345,6 +346,7 @@ XML_TREE_INVALID_HEX = 1300 XML_TREE_INVALID_DEC = 1301 # 1301 XML_TREE_UNTERMINATED_ENTITY = 1302 # 1302 + XML_TREE_NOT_UTF8 = 1303 # 1303 XML_SAVE_NOT_UTF8 = 1400 XML_SAVE_CHAR_INVALID = 1401 # 1401 XML_SAVE_NO_DOCTYPE = 1402 # 1402 @@ -720,6 +722,8 @@ XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088 XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089 XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090 + XML_SCHEMATRONV_ASSERT = 4000 # 4000 + XML_SCHEMATRONV_REPORT = 4001 XML_MODULE_OPEN = 4900 # 4900 XML_MODULE_CLOSE = 4901 # 4901 XML_CHECK_FOUND_ELEMENT = 
5000 Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Thu Apr 10 08:00:48 2008 @@ -584,6 +584,7 @@ WRITER=25 MODULE=26 I18N=27 +SCHEMATRONV=28 """,) cdef object __PARSER_ERROR_TYPES @@ -898,13 +899,14 @@ TREE_INVALID_HEX=1300 TREE_INVALID_DEC=1301 TREE_UNTERMINATED_ENTITY=1302 +TREE_NOT_UTF8=1303 SAVE_NOT_UTF8=1400 SAVE_CHAR_INVALID=1401 SAVE_NO_DOCTYPE=1402 SAVE_UNKNOWN_ENCODING=1403 -REGEXP_COMPILE_ERROR=1450 """, """\ +REGEXP_COMPILE_ERROR=1450 IO_UNKNOWN=1500 IO_EACCES=1501 IO_EAGAIN=1502 @@ -996,9 +998,9 @@ SCHEMAP_EXTENSION_NO_BASE=1707 SCHEMAP_FACET_NO_VALUE=1708 SCHEMAP_FAILED_BUILD_IMPORT=1709 -SCHEMAP_GROUP_NONAME_NOREF=1710 """, """\ +SCHEMAP_GROUP_NONAME_NOREF=1710 SCHEMAP_IMPORT_NAMESPACE_NOT_URI=1711 SCHEMAP_IMPORT_REDEFINE_NSNAME=1712 SCHEMAP_IMPORT_SCHEMA_NOT_URI=1713 @@ -1059,9 +1061,9 @@ SCHEMAP_DEF_AND_PREFIX=1768 SCHEMAP_UNKNOWN_INCLUDE_CHILD=1769 SCHEMAP_INCLUDE_SCHEMA_NOT_URI=1770 -SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 """, """\ +SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 SCHEMAP_NOT_SCHEMA=1772 SCHEMAP_UNKNOWN_MEMBER_TYPE=1773 SCHEMAP_INVALID_ATTR_USE=1774 @@ -1124,9 +1126,9 @@ SCHEMAV_CVC_MINLENGTH_VALID=1831 SCHEMAV_CVC_MAXLENGTH_VALID=1832 SCHEMAV_CVC_MININCLUSIVE_VALID=1833 -SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 """, """\ +SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 SCHEMAV_CVC_MINEXCLUSIVE_VALID=1835 SCHEMAV_CVC_MAXEXCLUSIVE_VALID=1836 SCHEMAV_CVC_TOTALDIGITS_VALID=1837 @@ -1195,9 +1197,9 @@ SCHEMAP_SRC_SIMPLE_TYPE_4=3003 SCHEMAP_SRC_RESOLVE=3004 SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE=3005 -SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 """, """\ +SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES=3007 SCHEMAP_ST_PROPS_CORRECT_1=3008 SCHEMAP_ST_PROPS_CORRECT_2=3009 @@ -1256,9 +1258,9 @@ SCHEMAP_CVC_SIMPLE_TYPE=3062 SCHEMAP_COS_CT_EXTENDS_1_1=3063 
SCHEMAP_SRC_IMPORT_1_1=3064 -SCHEMAP_SRC_IMPORT_1_2=3065 """, """\ +SCHEMAP_SRC_IMPORT_1_2=3065 SCHEMAP_SRC_IMPORT_2=3066 SCHEMAP_SRC_IMPORT_2_1=3067 SCHEMAP_SRC_IMPORT_2_2=3068 @@ -1285,6 +1287,8 @@ SCHEMAP_AU_PROPS_CORRECT=3089 SCHEMAP_A_PROPS_CORRECT_3=3090 SCHEMAP_COS_ALL_LIMITED=3091 +SCHEMATRONV_ASSERT=4000 +SCHEMATRONV_REPORT=4001 MODULE_OPEN=4900 MODULE_CLOSE=4901 CHECK_FOUND_ELEMENT=5000 From scoder at codespeak.net Thu Apr 10 08:02:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:02:09 +0200 (CEST) Subject: [Lxml-checkins] r53642 - lxml/branch/lxml-2.0/src/lxml Message-ID: <20080410060209.88A04168031@codespeak.net> Author: scoder Date: Thu Apr 10 08:02:08 2008 New Revision: 53642 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi Log: trunk merge -c 53641: new error constants as of libxml2 2.6.32 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd (original) +++ lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd Thu Apr 10 08:02:08 2008 @@ -39,6 +39,7 @@ XML_FROM_WRITER = 25 # The xmlwriter module XML_FROM_MODULE = 26 # The dynamically loaded module modul XML_FROM_I18N = 27 # The module handling character conversion + XML_FROM_SCHEMATRONV = 28 # The Schematron validator module ctypedef enum xmlParserErrors: XML_ERR_OK = 0 @@ -345,6 +346,7 @@ XML_TREE_INVALID_HEX = 1300 XML_TREE_INVALID_DEC = 1301 # 1301 XML_TREE_UNTERMINATED_ENTITY = 1302 # 1302 + XML_TREE_NOT_UTF8 = 1303 # 1303 XML_SAVE_NOT_UTF8 = 1400 XML_SAVE_CHAR_INVALID = 1401 # 1401 XML_SAVE_NO_DOCTYPE = 1402 # 1402 @@ -720,6 +722,8 @@ XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088 XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089 XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090 + XML_SCHEMATRONV_ASSERT = 4000 # 4000 + XML_SCHEMATRONV_REPORT = 4001 XML_MODULE_OPEN = 4900 # 4900 XML_MODULE_CLOSE = 4901 # 4901 
XML_CHECK_FOUND_ELEMENT = 5000 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi Thu Apr 10 08:02:08 2008 @@ -603,6 +603,7 @@ WRITER=25 MODULE=26 I18N=27 +SCHEMATRONV=28 """,) cdef object __PARSER_ERROR_TYPES @@ -917,13 +918,14 @@ TREE_INVALID_HEX=1300 TREE_INVALID_DEC=1301 TREE_UNTERMINATED_ENTITY=1302 +TREE_NOT_UTF8=1303 SAVE_NOT_UTF8=1400 SAVE_CHAR_INVALID=1401 SAVE_NO_DOCTYPE=1402 SAVE_UNKNOWN_ENCODING=1403 -REGEXP_COMPILE_ERROR=1450 """, """\ +REGEXP_COMPILE_ERROR=1450 IO_UNKNOWN=1500 IO_EACCES=1501 IO_EAGAIN=1502 @@ -1015,9 +1017,9 @@ SCHEMAP_EXTENSION_NO_BASE=1707 SCHEMAP_FACET_NO_VALUE=1708 SCHEMAP_FAILED_BUILD_IMPORT=1709 -SCHEMAP_GROUP_NONAME_NOREF=1710 """, """\ +SCHEMAP_GROUP_NONAME_NOREF=1710 SCHEMAP_IMPORT_NAMESPACE_NOT_URI=1711 SCHEMAP_IMPORT_REDEFINE_NSNAME=1712 SCHEMAP_IMPORT_SCHEMA_NOT_URI=1713 @@ -1078,9 +1080,9 @@ SCHEMAP_DEF_AND_PREFIX=1768 SCHEMAP_UNKNOWN_INCLUDE_CHILD=1769 SCHEMAP_INCLUDE_SCHEMA_NOT_URI=1770 -SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 """, """\ +SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 SCHEMAP_NOT_SCHEMA=1772 SCHEMAP_UNKNOWN_MEMBER_TYPE=1773 SCHEMAP_INVALID_ATTR_USE=1774 @@ -1143,9 +1145,9 @@ SCHEMAV_CVC_MINLENGTH_VALID=1831 SCHEMAV_CVC_MAXLENGTH_VALID=1832 SCHEMAV_CVC_MININCLUSIVE_VALID=1833 -SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 """, """\ +SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 SCHEMAV_CVC_MINEXCLUSIVE_VALID=1835 SCHEMAV_CVC_MAXEXCLUSIVE_VALID=1836 SCHEMAV_CVC_TOTALDIGITS_VALID=1837 @@ -1214,9 +1216,9 @@ SCHEMAP_SRC_SIMPLE_TYPE_4=3003 SCHEMAP_SRC_RESOLVE=3004 SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE=3005 -SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 """, """\ +SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES=3007 SCHEMAP_ST_PROPS_CORRECT_1=3008 SCHEMAP_ST_PROPS_CORRECT_2=3009 @@ -1275,9 +1277,9 @@ 
SCHEMAP_CVC_SIMPLE_TYPE=3062 SCHEMAP_COS_CT_EXTENDS_1_1=3063 SCHEMAP_SRC_IMPORT_1_1=3064 -SCHEMAP_SRC_IMPORT_1_2=3065 """, """\ +SCHEMAP_SRC_IMPORT_1_2=3065 SCHEMAP_SRC_IMPORT_2=3066 SCHEMAP_SRC_IMPORT_2_1=3067 SCHEMAP_SRC_IMPORT_2_2=3068 @@ -1304,6 +1306,8 @@ SCHEMAP_AU_PROPS_CORRECT=3089 SCHEMAP_A_PROPS_CORRECT_3=3090 SCHEMAP_COS_ALL_LIMITED=3091 +SCHEMATRONV_ASSERT=4000 +SCHEMATRONV_REPORT=4001 MODULE_OPEN=4900 MODULE_CLOSE=4901 CHECK_FOUND_ELEMENT=5000 From scoder at codespeak.net Thu Apr 10 09:00:15 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:15 +0200 (CEST) Subject: [Lxml-checkins] r53645 - in lxml/trunk: . src/lxml Message-ID: <20080410070015.603012A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:14 2008 New Revision: 53645 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/schematron.pxd lxml/trunk/src/lxml/schematron.pxi Log: r3923 at delle: sbehnel | 2008-04-10 08:43:38 +0200 untested support for schematron error reporting (only used with libxml2 2.6.32+) Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Thu Apr 10 09:00:14 2008 @@ -70,6 +70,10 @@ /* schematron was added in libxml2 2.6.21 */ #ifdef LIBXML_SCHEMATRON_ENABLED # define ENABLE_SCHEMATRON 1 +# if LIBXML_VERSION < 20632 + /* schematron error reporting was added in libxml2 2.6.32 */ +# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) +# endif #else # define ENABLE_SCHEMATRON 0 # define XML_SCHEMATRON_OUT_QUIET 0 @@ -85,8 +89,10 @@ # define xmlSchematronNewValidCtxt(schema, options) NULL # define xmlSchematronValidateDoc(ctxt, doc) 0 # define xmlSchematronFreeValidCtxt(ctxt) +# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) #endif + /* work around MSDEV 6.0 */ #if (_MSC_VER == 1200) && (WINVER < 0x0500) long 
_ftol( double ); //defined by VC6 C libs Modified: lxml/trunk/src/lxml/schematron.pxd ============================================================================== --- lxml/trunk/src/lxml/schematron.pxd (original) +++ lxml/trunk/src/lxml/schematron.pxd Thu Apr 10 09:00:14 2008 @@ -1,4 +1,4 @@ -cimport tree +cimport tree, xmlerror from tree cimport xmlDoc, xmlDtd cdef extern from "libxml/schematron.h": @@ -28,3 +28,6 @@ cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt) nogil cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt) nogil cdef void xmlSchematronFree(xmlSchematron* schema) nogil + cdef void xmlSchematronSetValidStructuredErrors( + xmlSchematronValidCtxt* ctxt, + xmlerror.xmlStructuredErrorFunc error_func, void *data) Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Thu Apr 10 09:00:14 2008 @@ -141,10 +141,12 @@ doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - options = schematron.XML_SCHEMATRON_OUT_QUIET - #if tree.LIBXML_VERSION <= 20630: # ... and later? 
- # hack to switch off stderr output - options = options | schematron.XML_SCHEMATRON_OUT_XML + if _LIBXML_VERSION_INT >= 20632: + options = schematron.XML_SCHEMATRON_OUT_ERROR + else: + options = schematron.XML_SCHEMATRON_OUT_QUIET + # hack to switch off stderr output + options = options | schematron.XML_SCHEMATRON_OUT_XML valid_ctxt = schematron.xmlSchematronNewValidCtxt( self._c_schema, options) @@ -152,6 +154,8 @@ return python.PyErr_NoMemory() self._error_log.connect() + schematron.xmlSchematronSetValidStructuredErrors( + valid_ctxt, _receiveError, self.error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Thu Apr 10 09:00:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:20 +0200 (CEST) Subject: [Lxml-checkins] r53646 - lxml/trunk Message-ID: <20080410070020.2E5982A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:19 2008 New Revision: 53646 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r3924 at delle: sbehnel | 2008-04-10 08:44:13 +0200 switch on dependency tracking for Cython 0.9.6.13+ Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Thu Apr 10 09:00:19 2008 @@ -83,8 +83,7 @@ if not CYTHON_INSTALLED: return [] from Cython.Compiler.Version import version - # currently, no official Cython release supports this ... - if True or split_version(version) <= (0,9,6,12): + if split_version(version) < (0,9,6,13): return [] package_dir = os.path.join(get_base_dir(), PACKAGE_PATH) From scoder at codespeak.net Thu Apr 10 09:00:24 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:24 +0200 (CEST) Subject: [Lxml-checkins] r53647 - in lxml/trunk: . 
src/lxml Message-ID: <20080410070024.88CC02A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:23 2008 New Revision: 53647 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/iterparse.pxi Log: r3925 at delle: sbehnel | 2008-04-10 08:53:27 +0200 fix for crash bug 211711: iterparse didn't set up parser hash table Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Thu Apr 10 09:00:23 2008 @@ -385,6 +385,7 @@ None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() + __GLOBAL_PARSER_CONTEXT.initParserDict(context._c_ctxt) context.prepare() # parser will not be unlocked - no other methods supported From scoder at codespeak.net Thu Apr 10 09:00:29 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:29 +0200 (CEST) Subject: [Lxml-checkins] r53648 - in lxml/trunk: . src/lxml/tests Message-ID: <20080410070029.758312A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:28 2008 New Revision: 53648 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3926 at delle: sbehnel | 2008-04-10 08:59:00 +0200 mark bug fixed, new test case Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Apr 10 09:00:28 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Crash bug in iterparse when moving elements into other documents. + * ``ElementTree.find*()`` didn't accept QName objects. * lxml.etree accepted non well-formed namespace prefix names. 
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Apr 10 09:00:28 2008 @@ -2696,6 +2696,19 @@ [1,2,1,4], counts) + def test_iterparse_move_elements(self): + iterparse = self.etree.iterparse + f = StringIO('<a><b>d</b><c>e</c></a>') + + for event, node in etree.iterparse(f): pass + + root = etree.Element('new_root', {}) + root[:] = node[:] + + self.assertEquals( + ['b', 'c'], + [ el.tag for el in root ]) + def test_parse_file(self): parse = self.etree.parse # from file From scoder at codespeak.net Thu Apr 10 09:04:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:04:39 +0200 (CEST) Subject: [Lxml-checkins] r53649 - in lxml/branch/lxml-2.0: . src/lxml src/lxml/tests Message-ID: <20080410070439.F07522A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:04:38 2008 New Revision: 53649 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/iterparse.pxi lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py Log: trunk merge -r 53647:53648: iterparse crash fix Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Thu Apr 10 09:04:38 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Crash bug in iterparse when moving elements into other documents. + * HTML elements' ``.cssselect()`` method was broken. * ``ElementTree.find*()`` didn't accept QName objects. 
Modified: lxml/branch/lxml-2.0/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/iterparse.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/iterparse.pxi Thu Apr 10 09:04:38 2008 @@ -325,6 +325,7 @@ None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() + __GLOBAL_PARSER_CONTEXT.initParserDict(context._c_ctxt) context._setEventFilter(events, tag) context.prepare() # parser will not be unlocked - no other methods supported Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py Thu Apr 10 09:04:38 2008 @@ -2567,6 +2567,19 @@ [1,2,1,4], counts) + def test_iterparse_move_elements(self): + iterparse = self.etree.iterparse + f = StringIO('<a><b>d</b><c>e</c></a>') + + for event, node in etree.iterparse(f): pass + + root = etree.Element('new_root', {}) + root[:] = node[:] + + self.assertEquals( + ['b', 'c'], + [ el.tag for el in root ]) + def test_parse_file(self): parse = self.etree.parse # from file From scoder at codespeak.net Fri Apr 11 19:32:59 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:32:59 +0200 (CEST) Subject: [Lxml-checkins] r53695 - in lxml/trunk: . 
doc Message-ID: <20080411173259.075FD2A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:32:55 2008 New Revision: 53695 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r3932 at delle: sbehnel | 2008-04-11 15:16:32 +0200 link to HTML benchmarks Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri Apr 11 19:32:55 2008 @@ -193,6 +193,16 @@ input documents are not considerably bigger than the output, lxml is the clear winner. +Regarding HTML parsing, Ian Bicking has done some `benchmarking on +lxml's HTML parser`_, comparing it to a number of other famous HTML +parser tools for Python. lxml wins this contest by quite a length. +To give an idea, the numbers suggest that lxml.html can run a couple +of parse-serialise cycles in the time that other tools need for +parsing alone. The comparison even shows some very favourable results +regarding memory consumption. + +.. _`benchmarking on lxml's HTML parser`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + The ElementTree API =================== From scoder at codespeak.net Fri Apr 11 19:33:02 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:02 +0200 (CEST) Subject: [Lxml-checkins] r53696 - in lxml/trunk: . 
src/lxml Message-ID: <20080411173302.AB7192A01BB@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:02 2008 New Revision: 53696 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/python.pxd Log: r3933 at delle: sbehnel | 2008-04-11 15:17:54 +0200 cimport fix Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Apr 11 19:33:02 2008 @@ -1,4 +1,4 @@ -from tree cimport FILE +from cstd cimport FILE cdef extern from "Python.h": ctypedef struct PyObject From scoder at codespeak.net Fri Apr 11 19:33:06 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:06 +0200 (CEST) Subject: [Lxml-checkins] r53697 - in lxml/trunk: . src/lxml Message-ID: <20080411173306.DF3D52A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:06 2008 New Revision: 53697 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/python.pxd Log: r3934 at delle: sbehnel | 2008-04-11 19:15:10 +0200 cleanup Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Apr 11 19:33:06 2008 @@ -120,6 +120,6 @@ NOWAIT_LOCK cdef extern from "etree_defs.h": # redefines some functions as macros - cdef int _isString(object obj) + cdef bint _isString(object obj) cdef char* _fqtypename(object t) cdef object PY_NEW(object t) From scoder at codespeak.net Fri Apr 11 19:33:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:10 +0200 (CEST) Subject: [Lxml-checkins] r53698 - in lxml/trunk: . 
src/lxml Message-ID: <20080411173310.CAB0A2A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:10 2008 New Revision: 53698 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi Log: r3935 at delle: sbehnel | 2008-04-11 19:17:10 +0200 cleanup Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Apr 11 19:33:10 2008 @@ -564,13 +564,13 @@ # special str/unicode subclasses cdef class _ElementUnicodeResult(python.unicode): - cdef _Element parent + cdef _Element _parent cdef readonly object is_tail cdef readonly object is_text cdef readonly object is_attribute def getparent(self): - return self.parent + return self._parent class _ElementStringResult(str): # we need to use a Python class here, str cannot be C-subclassed @@ -596,7 +596,7 @@ return result else: uresult = _ElementUnicodeResult(string_value) - uresult.parent = parent + uresult._parent = parent uresult.is_attribute = is_attribute uresult.is_tail = is_tail uresult.is_text = is_text From scoder at codespeak.net Fri Apr 11 19:33:18 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:18 +0200 (CEST) Subject: [Lxml-checkins] r53699 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20080411173318.9BDD52A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:18 2008 New Revision: 53699 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/saxparser.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: r3936 at delle: sbehnel | 2008-04-11 19:31:10 +0200 support for CDATA blocks: parser option and CDATA() text factory Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Apr 11 19:33:18 2008 @@ -8,6 +8,12 @@ Features added -------------- +* Parser option `strip_cdata` for normalising or keeping CDATA + sections. Defaults to ``True`` as before, thus replacing CDATA + sections by their text content. + +* ``CDATA()`` factory to wrap string content as CDATA section. + Bugs fixed ---------- Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri Apr 11 19:33:18 2008 @@ -30,15 +30,16 @@ .. contents:: .. - 1 lxml.etree - 2 Other Element APIs - 3 Trees and Documents - 4 Iteration - 5 Error handling on exceptions - 6 Error logging - 7 Serialisation - 8 XInclude and ElementInclude - 9 write_c14n on ElementTree + 1 lxml.etree + 2 Other Element APIs + 3 Trees and Documents + 4 Iteration + 5 Error handling on exceptions + 6 Error logging + 7 Serialisation + 8 CDATA + 9 XInclude and ElementInclude + 10 write_c14n on ElementTree lxml.etree @@ -352,6 +353,50 @@ XMLSyntaxError: ... +CDATA +----- + +By default, lxml's parser will strip CDATA sections from the tree and +replace them by their plain text content. 
As real applications for +CDATA are rare, this is the best way to deal with this issue. + +However, in some cases, keeping CDATA sections or creating them in a +document is required to adhere to existing XML language definitions. +For these special cases, you can instruct the parser to leave CDATA +sections in the document: + +.. sourcecode:: pycon + + >>> parser = etree.XMLParser(strip_cdata=False) + >>> root = etree.XML('<root><![CDATA[test]]></root>', parser) + >>> root.text + 'test' + + >>> etree.tostring(root) + '<root><![CDATA[test]]></root>' + +Note how the ``.text`` property does not give any indication that the +text content is wrapped by a CDATA section. If you want to make sure +your data is wrapped by a CDATA block, you can use the ``CDATA()`` +text wrapper: + +.. sourcecode:: pycon + + >>> root.text = 'test' + + >>> root.text + 'test' + >>> etree.tostring(root) + '<root>test</root>' + + >>> root.text = etree.CDATA(root.text) + + >>> root.text + 'test' + >>> etree.tostring(root) + '<root><![CDATA[test]]></root>' + + XInclude and ElementInclude --------------------------- Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Apr 11 19:33:18 2008 @@ -449,8 +449,17 @@ if value is None: return 0 # now add new text node with value at start - text = _utf8(value) - c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text)) + if python._isString(value): + text = _utf8(value) + c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text)) + elif isinstance(value, CDATA): + c_text_node = tree.xmlNewCDataBlock( + c_node.doc, _cstr((<CDATA>value)._utf8_data), + python.PyString_GET_SIZE((<CDATA>value)._utf8_data)) + else: + # this will raise the right error + _utf8(value) + return -1 if c_node.children is NULL: tree.xmlAddChild(c_node, c_text_node) else: @@ -593,6 +602,8 @@ while c_node is not NULL: if c_node.type == tree.XML_TEXT_NODE: return c_node + if c_node.type == tree.XML_CDATA_SECTION_NODE: + return c_node elif
c_node.type == tree.XML_XINCLUDE_START or \ c_node.type == tree.XML_XINCLUDE_END: c_node = c_node.next Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Apr 11 19:33:18 2008 @@ -327,6 +327,7 @@ - remove_blank_text - discard blank text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) - resolve_entities - replace entities by their text value (default: True) @@ -342,7 +343,7 @@ attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, encoding=None, + remove_pis=False, strip_cdata=True, encoding=None, html=False, XMLSchema schema=None): cdef _IterparseContext context cdef char* c_encoding @@ -381,7 +382,7 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT _BaseParser.__init__(self, parse_options, html, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Apr 11 19:33:18 2008 @@ -2264,6 +2264,20 @@ PI = ProcessingInstruction +cdef class CDATA: + """CDATA(data) + + CDATA factory. This factory creates an opaque data object that + can be used to set Element text. 
The usual way to use it is:: + + >>> from lxml import etree + >>> el = etree.Element('content') + >>> el.text = etree.CDATA('a string') + """ + cdef object _utf8_data + def __init__(self, data): + self._utf8_data = _utf8(data) + def Entity(name): """Entity(name) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Apr 11 19:33:18 2008 @@ -550,6 +550,7 @@ cdef bint _for_html cdef bint _remove_comments cdef bint _remove_pis + cdef bint _strip_cdata cdef XMLSchema _schema cdef object _filename cdef object _target @@ -557,7 +558,8 @@ cdef int _default_encoding_int def __init__(self, int parse_options, bint for_html, XMLSchema schema, - remove_comments, remove_pis, target, filename, encoding): + remove_comments, remove_pis, strip_cdata, target, + filename, encoding): cdef int c_encoding if not isinstance(self, HTMLParser) and \ not isinstance(self, XMLParser) and \ @@ -570,6 +572,7 @@ self._for_html = for_html self._remove_comments = remove_comments self._remove_pis = remove_pis + self._strip_cdata = strip_cdata self._schema = schema self._resolvers = _ResolverRegistry() @@ -601,8 +604,9 @@ pctxt.sax.comment = NULL if self._remove_pis: pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + if self._strip_cdata: + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL return self._parser_context cdef _ParserContext _getPushParserContext(self): @@ -621,8 +625,9 @@ pctxt.sax.comment = NULL if self._remove_pis: pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + if self._strip_cdata: + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL return self._push_parser_context cdef _ParserContext 
_createContext(self, target): @@ -700,6 +705,7 @@ parser._for_html = self._for_html parser._remove_comments = self._remove_comments parser._remove_pis = self._remove_pis + parser._strip_cdata = self._strip_cdata parser._filename = self._filename parser._resolvers = self._resolvers parser._target = self._target @@ -1051,6 +1057,7 @@ - remove_blank_text - discard blank text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) - resolve_entities - replace entities by their text value (default: True) @@ -1068,8 +1075,8 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, target=None, encoding=None, - XMLSchema schema=None): + remove_pis=False, strip_cdata=True, target=None, + encoding=None, XMLSchema schema=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -1092,9 +1099,11 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA _BaseParser.__init__(self, parse_options, 0, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, target, None, encoding) cdef class ETCompatXMLParser(XMLParser): @@ -1110,7 +1119,8 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True, target=None, encoding=None, schema=None): + remove_pis=True, strip_cdata=True, target=None, + encoding=None, schema=None): XMLParser.__init__(self, attribute_defaults=attribute_defaults, dtd_validation=dtd_validation, @@ -1123,6 +1133,7 @@ 
resolve_entities=resolve_entities, remove_comments=remove_comments, remove_pis=remove_pis, + strip_cdata=strip_cdata, target=target, encoding=encoding, schema=schema) @@ -1180,6 +1191,7 @@ - remove_blank_text - discard empty text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) Other keyword arguments: @@ -1193,7 +1205,7 @@ """ def __init__(self, *, recover=True, no_network=True, remove_blank_text=False, compact=True, remove_comments=False, - remove_pis=False, target=None, encoding=None, + remove_pis=False, strip_cdata=True, target=None, encoding=None, XMLSchema schema=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS @@ -1207,7 +1219,7 @@ parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT _BaseParser.__init__(self, parse_options, 1, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, target, None, encoding) cdef HTMLParser __DEFAULT_HTML_PARSER Modified: lxml/trunk/src/lxml/saxparser.pxi ============================================================================== --- lxml/trunk/src/lxml/saxparser.pxi (original) +++ lxml/trunk/src/lxml/saxparser.pxi Fri Apr 11 19:33:18 2008 @@ -37,6 +37,7 @@ cdef xmlparser.startElementSAXFunc _origSaxStartNoNs cdef xmlparser.endElementSAXFunc _origSaxEndNoNs cdef xmlparser.charactersSAXFunc _origSaxData + cdef xmlparser.cdataBlockSAXFunc _origSaxCData cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype cdef xmlparser.commentSAXFunc _origSaxComment cdef xmlparser.processingInstructionSAXFunc _origSaxPi @@ -76,10 +77,12 @@ if self._target._sax_event_propagate & SAX_EVENT_DATA: self._origSaxData = sax.characters + self._origSaxCData = sax.cdataBlock else: - self._origSaxData = sax.characters = NULL + self._origSaxData = sax.characters = sax.cdataBlock = NULL if 
self._target._sax_event_filter & SAX_EVENT_DATA: sax.characters = _handleSaxData + sax.cdataBlock = _handleSaxCData # doctype propagation is always required for entity replacement self._origSaxDoctype = sax.internalSubset @@ -249,6 +252,21 @@ except: context._handleSaxException(c_ctxt) +cdef void _handleSaxCData(void* ctxt, char* c_data, int data_len) with gil: + cdef _SaxParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_SaxParserContext>c_ctxt._private + if context._origSaxCData is not NULL: + context._origSaxCData(c_ctxt, c_data, data_len) + try: + context._target._handleSaxData( + python.PyUnicode_DecodeUTF8(c_data, data_len, NULL)) + except: + context._handleSaxException(c_ctxt) + cdef void _handleSaxDoctype(void* ctxt, char* c_name, char* c_public, char* c_system) with gil: cdef _SaxParserContext context Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Apr 11 19:33:18 2008 @@ -462,6 +462,29 @@ "data-B", "end-root", "pi-test-c"], events) + def test_parser_target_cdata(self): + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + def end(self, tag): + events.append("end-" + tag) + def data(self, data): + events.append("data-" + data) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target(), + strip_cdata=False) + + parser.feed('AB') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "data-A", "start-a", + "data-ca", "end-a", "data-B", "end-root"], + events) + def test_iterwalk_tag(self): iterwalk = self.etree.iterwalk root = self.etree.XML('') @@ -666,6 +689,55 @@ self.assertRaises(ValueError, Entity, '#abc') self.assertRaises(ValueError, Entity, '#xxyz') + def test_cdata(self): + 
CDATA = self.etree.CDATA + Element = self.etree.Element + tostring = self.etree.tostring + + root = Element("root") + root.text = CDATA('test') + + self.assertEquals('test', + root.text) + self.assertEquals('<root><![CDATA[test]]></root>', + tostring(root)) + + def test_cdata_type(self): + CDATA = self.etree.CDATA + Element = self.etree.Element + root = Element("root") + + root.text = CDATA("test") + self.assertEquals('test', root.text) + + root.text = CDATA(u"test") + self.assertEquals('test', root.text) + + self.assertRaises(TypeError, CDATA, 1) + + def test_cdata_errors(self): + CDATA = self.etree.CDATA + Element = self.etree.Element + + root = Element("root") + cdata = CDATA('test') + + self.assertRaises(TypeError, + setattr, root, 'tail', cdata) + self.assertRaises(TypeError, + root.set, 'attr', cdata) + self.assertRaises(TypeError, + operator.setitem, root.attrib, 'attr', cdata) + + def test_cdata_parser(self): + tostring = self.etree.tostring + parser = self.etree.XMLParser(strip_cdata=False) + root = self.etree.XML('<root><![CDATA[test]]></root>', parser) + + self.assertEquals('test', root.text) + self.assertEquals('<root><![CDATA[test]]></root>', + tostring(root)) + # TypeError in etree, AssertionError in ElementTree; def test_setitem_assert(self): Element = self.etree.Element Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri Apr 11 19:33:18 2008 @@ -181,6 +181,7 @@ cdef xmlNode* xmlNewDocComment(xmlDoc* doc, char* content) nogil cdef xmlNode* xmlNewDocPI(xmlDoc* doc, char* name, char* content) nogil cdef xmlNode* xmlNewReference(xmlDoc* doc, char* name) nogil + cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, char* text, int len) nogil cdef xmlNs* xmlNewNs(xmlNode* node, char* href, char* prefix) nogil cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil From scoder at
codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:29 +0200 (CEST) Subject: [Lxml-checkins] r53716 - lxml/trunk Message-ID: <20080412123329.2F26516A45F@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:27 2008 New Revision: 53716 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3943 at delle: sbehnel | 2008-04-11 19:39:06 +0200 rst fix Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Apr 12 14:33:27 2008 @@ -8,7 +8,7 @@ Features added -------------- -* Parser option `strip_cdata` for normalising or keeping CDATA +* Parser option ``strip_cdata`` for normalising or keeping CDATA sections. Defaults to ``True`` as before, thus replacing CDATA sections by their text content. From scoder at codespeak.net Sat Apr 12 14:33:34 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:34 +0200 (CEST) Subject: [Lxml-checkins] r53717 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080412123334.B38DC16A460@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:33 2008 New Revision: 53717 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: r3944 at delle: sbehnel | 2008-04-12 12:02:37 +0200 fix for CDATA parsing in iterparse() Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Sat Apr 12 14:33:33 2008 @@ -380,6 +380,8 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA _BaseParser.__init__(self, parse_options, html, schema, remove_comments, remove_pis, strip_cdata, Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Apr 12 14:33:33 2008 @@ -2709,6 +2709,16 @@ ['b', 'c'], [ el.tag for el in root ]) + def test_iterparse_cdata(self): + tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('test', + tostring(context.root)) + def test_parse_file(self): parse = self.etree.parse # from file Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Apr 12 14:33:33 2008 @@ -409,6 +409,26 @@ a = iterator.root self.assertEquals(a.text, text) + def test_iterparse_cdata(self): 
+ tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('test', + tostring(context.root)) + + def test_iterparse_keep_cdata(self): + tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f, strip_cdata=False) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('', + tostring(context.root)) + def test_parser_encoding_unknown(self): self.assertRaises( LookupError, self.etree.XMLParser, encoding="hopefully unknown") From scoder at codespeak.net Sat Apr 12 14:33:38 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:38 +0200 (CEST) Subject: [Lxml-checkins] r53718 - in lxml/trunk: . src/lxml/tests Message-ID: <20080412123338.CC8C416A463@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:38 2008 New Revision: 53718 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r3945 at delle: sbehnel | 2008-04-12 12:05:52 +0200 cleanup Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Apr 12 14:33:38 2008 @@ -409,16 +409,6 @@ a = iterator.root self.assertEquals(a.text, text) - def test_iterparse_cdata(self): - tostring = self.etree.tostring - f = StringIO('') - context = self.etree.iterparse(f) - content = [ el.text for event,el in context ] - - self.assertEquals(['test'], content) - self.assertEquals('test', - tostring(context.root)) - def test_iterparse_keep_cdata(self): tostring = self.etree.tostring f = StringIO('') From scoder at codespeak.net Sat Apr 12 14:33:43 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:43 +0200 (CEST) Subject: [Lxml-checkins] r53719 - in 
lxml/trunk: . src/lxml/tests Message-ID: <20080412123343.3EFB316A463@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:42 2008 New Revision: 53719 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3946 at delle: sbehnel | 2008-04-12 12:06:43 +0200 ET test for CDATA parsing Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Apr 12 14:33:42 2008 @@ -2770,6 +2770,14 @@ tree.getroot() ) + def test_parse_cdata(self): + tostring = self.etree.tostring + root = self.etree.XML('<root><![CDATA[test]]></root>') + + self.assertEquals('test', root.text) + self.assertEquals('<root>test</root>', + tostring(root)) + def test_parse_with_encoding(self): # this can fail in libxml2 <= 2.6.22 parse = self.etree.parse From scoder at codespeak.net Sat Apr 12 14:33:47 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:47 +0200 (CEST) Subject: [Lxml-checkins] r53720 - in lxml/trunk: .
src/lxml/tests Message-ID: <20080412123347.0495716A466@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:47 2008 New Revision: 53720 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_xslt.py Log: r3947 at delle: sbehnel | 2008-04-12 12:11:43 +0200 test case for currently lacking XSLT error on wrong stylesheet parameters Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Sat Apr 12 14:33:47 2008 @@ -299,6 +299,24 @@ ''', str(res)) + def _test_xslt_parameter_invalid(self): + tree = self.parse('BC') + style = self.parse('''\ + + + + +''') + + st = etree.XSLT(style) + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="test") + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="") + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="....") + if etree.LIBXSLT_VERSION < (1,1,18): # later versions produce no error def test_xslt_parameter_missing(self): From scoder at codespeak.net Sun Apr 13 18:30:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 18:30:08 +0200 (CEST) Subject: [Lxml-checkins] r53742 - in lxml/trunk: . doc Message-ID: <20080413163008.39FA149812A@codespeak.net> Author: scoder Date: Sun Apr 13 18:30:06 2008 New Revision: 53742 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3953 at delle: sbehnel | 2008-04-13 18:28:47 +0200 doc update, section on using lxml.html.soupparser as a fallback Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Sun Apr 13 18:30:06 2008 @@ -3,7 +3,7 @@ ==================== BeautifulSoup_ is a Python package that parses broken HTML. 
While libxml2 -(and thus lxml) can also parse broken HTML, BeautifulSoup is somewhat more +(and thus lxml) can also parse broken HTML, BeautifulSoup is a bit more forgiving and has superiour `support for encoding detection`_. .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ @@ -13,7 +13,7 @@ lxml can benefit from the parsing capabilities of BeautifulSoup through the ``lxml.html.soupparser`` module. It provides three main functions: ``fromstring()`` and ``parse()`` to parse a string or file -using BeautifulSoup, and `convert_tree()` to convert an existing +using BeautifulSoup, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. The functions ``fromstring()`` and ``parse()`` behave as known from @@ -58,6 +58,10 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. + +Entity handling +=============== + By default, the BeautifulSoup parser also replaces the entities it finds by their character equivalent. @@ -94,3 +98,45 @@ >>> tostring(body, method="html", encoding=unicode) u'\xa9\u20ac-\xf5\u01bd

' + + +Using soupparser as a fallback +============================== + +The downside of using this parser is that it is `much slower`_ than +the HTML parser of lxml. So if performance matters, you might want to +consider using ``soupparser`` only as a fallback for certain cases. + +.. _`much slower`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + +One common problem of lxml's parser is that it might not get the +encoding right in cases where the document contains a ```` tag +at the wrong place. In this case, you can exploit the fact that lxml +serialises much faster than most other HTML libraries for Python. +Just serialise the document to unicode and if that gives you an +exception, re-parse it with BeautifulSoup to see if that works +better. + +.. sourcecode:: pycon + + >>> tag_soup = '''\ + ... + ... + ... + ... Hello W\xc3\xb6rld! + ... + ... Hi all + ... ''' + + >>> import lxml.html + >>> import lxml.html.soupparser + + >>> root = lxml.html.fromstring(tag_soup) + >>> try: + ... ignore = tostring(root, encoding=unicode) + ... except UnicodeDecodeError: + ... root = lxml.html.soupparser.fromstring(tag_soup) + ... # try again, but don't catch the exception this time + ... ignore = tostring(root, encoding=unicode) + From scoder at codespeak.net Sun Apr 13 19:17:57 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 19:17:57 +0200 (CEST) Subject: [Lxml-checkins] r53744 - in lxml/trunk: . 
doc Message-ID: <20080413171757.404E7168521@codespeak.net> Author: scoder Date: Sun Apr 13 19:17:56 2008 New Revision: 53744 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3955 at delle: sbehnel | 2008-04-13 19:16:41 +0200 doc fix Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Sun Apr 13 19:17:56 2008 @@ -22,7 +22,8 @@ There is also a legacy module called ``lxml.html.ElementSoup``, which mimics the interface provided by ElementTree's own ElementSoup_ -module. +module. Note that the ``soupparser`` module was added in lxml 2.0.3. +Previous versions of lxml 2.0.x only have the ``ElementSoup`` module. Here is a document full of tag soup, similar to, but not quite like, HTML: @@ -73,7 +74,7 @@ u'\xa9\u20ac-\xf5\u01bd' If you want them back on the way out, you can just serialise with the -default encoding, which is 'US-ASCII'. The 'html' method +default encoding, which is 'US-ASCII'. .. sourcecode:: pycon From scoder at codespeak.net Sun Apr 13 19:27:22 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 19:27:22 +0200 (CEST) Subject: [Lxml-checkins] r53745 - in lxml/branch/lxml-2.0: . 
doc Message-ID: <20080413172722.453B949812A@codespeak.net> Author: scoder Date: Sun Apr 13 19:27:20 2008 New Revision: 53745 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/doc/main.txt lxml/branch/lxml-2.0/version.txt Log: prepare release of 2.0.4 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Sun Apr 13 19:27:20 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.0.4 (2008-04-13) +================== Features added -------------- Modified: lxml/branch/lxml-2.0/doc/main.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/main.txt (original) +++ lxml/branch/lxml-2.0/doc/main.txt Sun Apr 13 19:27:20 2008 @@ -145,8 +145,8 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0.3`_, released 2008-03-26 -(`changes for 2.0.3`_). `Older versions`_ are listed below. +The latest version is `lxml 2.0.4`_, released 2008-04-13 +(`changes for 2.0.4`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -206,6 +206,8 @@ Old Versions ------------ +* `lxml 2.0.3`_, released 2008-03-26 (`changes for 2.0.3`_) + * `lxml 2.0.2`_, released 2008-02-22 (`changes for 2.0.2`_) * `lxml 2.0.1`_, released 2008-02-13 (`changes for 2.0.1`_) @@ -262,6 +264,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.0.4`: lxml-2.0.4.tgz .. _`lxml 2.0.3`: lxml-2.0.3.tgz .. _`lxml 2.0.2`: lxml-2.0.2.tgz .. _`lxml 2.0.1`: lxml-2.0.1.tgz @@ -291,6 +294,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.0.4`: changes-2.0.4.html .. _`changes for 2.0.3`: changes-2.0.3.html .. _`changes for 2.0.2`: changes-2.0.2.html .. 
_`changes for 2.0.1`: changes-2.0.1.html Modified: lxml/branch/lxml-2.0/version.txt ============================================================================== --- lxml/branch/lxml-2.0/version.txt (original) +++ lxml/branch/lxml-2.0/version.txt Sun Apr 13 19:27:20 2008 @@ -1 +1 @@ -2.0.3 +2.0.4 From scoder at codespeak.net Sun Apr 13 20:28:23 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:28:23 +0200 (CEST) Subject: [Lxml-checkins] r53746 - lxml/branch/lxml-2.0/doc Message-ID: <20080413182823.406462A0187@codespeak.net> Author: scoder Date: Sun Apr 13 20:28:20 2008 New Revision: 53746 Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt Log: partial doc merge from trunk Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.0/doc/elementsoup.txt Sun Apr 13 20:28:20 2008 @@ -3,22 +3,28 @@ ==================== BeautifulSoup_ is a Python package that parses broken HTML. While libxml2 -(and thus lxml) can also parse broken HTML, BeautifulSoup is much more +(and thus lxml) can also parse broken HTML, BeautifulSoup is a bit more forgiving and has superiour `support for encoding detection`_. .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ .. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit +.. _ElementSoup: http://effbot.org/zone/element-soup.htm lxml can benefit from the parsing capabilities of BeautifulSoup through the ``lxml.html.soupparser`` module. It provides three main functions: ``fromstring()`` and ``parse()`` to parse a string or file -using BeautifulSoup, and `convert_tree()` to convert an existing +using BeautifulSoup, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. 
The functions ``fromstring()`` and ``parse()`` behave as known from ElementTree. The first returns a root Element, the latter returns an ElementTree. +There is also a legacy module called ``lxml.html.ElementSoup``, which +mimics the interface provided by ElementTree's own ElementSoup_ +module. Note that the ``soupparser`` module was added in lxml 2.0.3. +Previous versions of lxml 2.0.x only have the ``ElementSoup`` module. + Here is a document full of tag soup, similar to, but not quite like, HTML:: >>> tag_soup = 'Hello</head<body onload=crash()>Hi all<p>' @@ -47,6 +53,10 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. + +Entity handling +=============== + By default, the BeautifulSoup parser also replaces the entities it finds by their character equivalent:: @@ -83,4 +93,41 @@ mimics the interface provided by ElementTree's own ElementSoup_ module. -.. _ElementSoup: http://effbot.org/zone/element-soup.htm + +Using soupparser as a fallback +============================== + +The downside of using this parser is that it is `much slower`_ than +the HTML parser of lxml. So if performance matters, you might want to +consider using ``soupparser`` only as a fallback for certain cases. + +.. _`much slower`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + +One common problem of lxml's parser is that it might not get the +encoding right in cases where the document contains a ``<meta>`` tag +at the wrong place. In this case, you can exploit the fact that lxml +serialises much faster than most other HTML libraries for Python. +Just serialise the document to unicode and if that gives you an +exception, re-parse it with BeautifulSoup to see if that works +better:: + + >>> tag_soup = '''\ + ... <meta http-equiv="Content-Type" + ... content="text/html;charset=utf-8" /> + ... <html> + ... <head> + ... <title>Hello W\xc3\xb6rld! + ... + ... Hi all + ... 
''' + + >>> import lxml.html + >>> import lxml.html.soupparser + + >>> root = lxml.html.fromstring(tag_soup) + >>> try: + ... ignore = tostring(root, encoding=unicode) + ... except UnicodeDecodeError: + ... root = lxml.html.soupparser.fromstring(tag_soup) + ... # try again, but don't catch the exception this time + ... ignore = tostring(root, encoding=unicode) From scoder at codespeak.net Sun Apr 13 20:28:50 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:28:50 +0200 (CEST) Subject: [Lxml-checkins] r53747 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080413182850.AD1ED16844F@codespeak.net> Author: scoder Date: Sun Apr 13 20:28:50 2008 New Revision: 53747 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi Log: missing bug fix for 2.0.4 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Sun Apr 13 20:28:50 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Hanging thread in conjunction with GTK threading. + * Crash bug in iterparse when moving elements into other documents. * HTML elements' ``.cssselect()`` method was broken. Modified: lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi Sun Apr 13 20:28:50 2008 @@ -1103,20 +1103,16 @@ Returns None if not a file object. 
""" # file instances have a name attribute - try: - return source.name - except AttributeError: - pass + filename = getattr3(source, 'name', None) + if filename is not None: + return filename # gzip file instances have a filename attribute - try: - return source.filename - except AttributeError: - pass + filename = getattr3(source, 'filename', None) + if filename is not None: + return filename # urllib2 provides a geturl() method - try: - geturl = source.geturl - except AttributeError: - # can't determine filename - return None - else: + geturl = getattr3(source, 'geturl', None) + if geturl is not None: return geturl() + # can't determine filename + return None From scoder at codespeak.net Sun Apr 13 20:37:36 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:37:36 +0200 (CEST) Subject: [Lxml-checkins] r53748 - lxml/branch/lxml-2.0/doc Message-ID: <20080413183736.1FB8F168554@codespeak.net> Author: scoder Date: Sun Apr 13 20:37:35 2008 New Revision: 53748 Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt Log: remove redundant example code from docs Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.0/doc/elementsoup.txt Sun Apr 13 20:37:35 2008 @@ -129,5 +129,3 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) - ... # try again, but don't catch the exception this time - ... 
ignore = tostring(root, encoding=unicode) From scoder at codespeak.net Mon Apr 14 09:58:41 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 09:58:41 +0200 (CEST) Subject: [Lxml-checkins] r53750 - lxml/trunk Message-ID: <20080414075841.DDC8F39B594@codespeak.net> Author: scoder Date: Mon Apr 14 09:58:39 2008 New Revision: 53750 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/INSTALL.txt Log: r3957 at delle: sbehnel | 2008-04-13 20:12:13 +0200 cleanup after release of 2.0.4 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Apr 14 09:58:39 2008 @@ -17,14 +17,8 @@ Bugs fixed ---------- -* Crash bug in iterparse when moving elements into other documents. - -* ``ElementTree.find*()`` didn't accept QName objects. - * lxml.etree accepted non well-formed namespace prefix names. -* HTML elements' ``.cssselect()`` method was broken. - Other changes ------------- @@ -35,6 +29,27 @@ context. +2.0.4 (2008-04-13) +================== + +Features added +-------------- + +Bugs fixed +---------- + +* Hanging thread in conjunction with GTK threading. + +* Crash bug in iterparse when moving elements into other documents. + +* HTML elements' ``.cssselect()`` method was broken. + +* ``ElementTree.find*()`` didn't accept QName objects. + +Other changes +------------- + + 2.1alpha1 (2008-03-27) ====================== Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Mon Apr 14 09:58:39 2008 @@ -11,7 +11,8 @@ * libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html - If you want to use XPath, do not use libxml2 2.6.27. + If you want to use XPath, do not use libxml2 2.6.27. We recommend + libxml2 2.6.28 or later. * libxslt 1.1.15 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html From scoder at codespeak.net Mon Apr 14 09:58:45 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 09:58:45 +0200 (CEST) Subject: [Lxml-checkins] r53751 - in lxml/trunk: . doc Message-ID: <20080414075845.68A4D39B594@codespeak.net> Author: scoder Date: Mon Apr 14 09:58:43 2008 New Revision: 53751 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3958 at delle: sbehnel | 2008-04-13 20:35:31 +0200 remove redundant example code from docs Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Mon Apr 14 09:58:43 2008 @@ -138,6 +138,3 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) - ... # try again, but don't catch the exception this time - ... ignore = tostring(root, encoding=unicode) - From scoder at codespeak.net Mon Apr 14 13:11:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 13:11:28 +0200 (CEST) Subject: [Lxml-checkins] r53753 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080414111128.3EE1016853F@codespeak.net> Author: scoder Date: Mon Apr 14 13:11:26 2008 New Revision: 53753 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/schematron.pxi Log: memory leak in Schematron with libxml2 >= 2.6.31 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Mon Apr 14 13:11:26 2008 @@ -2,6 +2,21 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Memory leak in Schematron with libxml2 >= 2.6.31. 
+ +Other changes +------------- + + 2.0.4 (2008-04-13) ================== Modified: lxml/branch/lxml-2.0/src/lxml/schematron.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/schematron.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/schematron.pxi Mon Apr 14 13:11:26 2008 @@ -74,24 +74,26 @@ the file system. """ cdef schematron.xmlSchematron* _c_schema + cdef xmlDoc* _c_schema_doc def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlNode* c_node - cdef xmlDoc* c_doc cdef char* c_href cdef schematron.xmlSchematronParserCtxt* parser_ctxt + self._c_schema = NULL + self._c_schema_doc = NULL _Validator.__init__(self) if not config.ENABLE_SCHEMATRON: raise SchematronError( "lxml.etree was compiled without Schematron support.") - self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - c_doc = _copyDocRoot(doc._c_doc, root_node._c_node) + self._c_schema_doc = _copyDocRoot(doc._c_doc, root_node._c_node) self._error_log.connect() - parser_ctxt = schematron.xmlSchematronNewDocParserCtxt(c_doc) + parser_ctxt = schematron.xmlSchematronNewDocParserCtxt( + self._c_schema_doc) elif file is not None: filename = _getFilenameForFile(file) if filename is None: @@ -100,12 +102,14 @@ filename = _encodeFilename(filename) self._error_log.connect() parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename)) - c_doc = NULL else: raise SchematronParseError("No tree or file given") if parser_ctxt is NULL: self._error_log.disconnect() + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) + self._c_schema_doc = NULL python.PyErr_NoMemory() return @@ -114,16 +118,17 @@ schematron.xmlSchematronFreeParserCtxt(parser_ctxt) if self._c_schema is NULL: - if _LIBXML_VERSION_INT >= 20631: - # leak in older versions instead of just crashing - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise 
SchematronParseError( "Document is not a valid Schematron schema", self._error_log) def __dealloc__(self): schematron.xmlSchematronFree(self._c_schema) + if _LIBXML_VERSION_INT >= 20631: + # earlier libxml2 versions may have freed the document in + # xmlSchematronFree() already, we don't know ... + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) def __call__(self, etree): """__call__(self, etree) From scoder at codespeak.net Tue Apr 15 15:28:48 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:48 +0200 (CEST) Subject: [Lxml-checkins] r53781 - in lxml/trunk: . src/lxml Message-ID: <20080415132848.0126E169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:46 2008 New Revision: 53781 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/schematron.pxd lxml/trunk/src/lxml/schematron.pxi Log: r3965 at delle: sbehnel | 2008-04-14 12:17:48 +0200 fix for Schematron error logging on libxml2 2.6.32 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 15:28:46 2008 @@ -8,6 +8,8 @@ Features added -------------- +* Error logging in Schematron (requires libxml2 2.6.32 or later). + * Parser option ``strip_cdata`` for normalising or keeping CDATA sections. Defaults to ``True`` as before, thus replacing CDATA sections by their text content. 
Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Tue Apr 15 15:28:46 2008 @@ -73,11 +73,13 @@ # if LIBXML_VERSION < 20632 /* schematron error reporting was added in libxml2 2.6.32 */ # define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) +# define XML_SCHEMATRON_OUT_ERROR 0 # endif #else # define ENABLE_SCHEMATRON 0 # define XML_SCHEMATRON_OUT_QUIET 0 # define XML_SCHEMATRON_OUT_XML 0 +# define XML_SCHEMATRON_OUT_ERROR 0 typedef void xmlSchematron; typedef void xmlSchematronParserCtxt; typedef void xmlSchematronValidCtxt; Modified: lxml/trunk/src/lxml/schematron.pxd ============================================================================== --- lxml/trunk/src/lxml/schematron.pxd (original) +++ lxml/trunk/src/lxml/schematron.pxd Tue Apr 15 15:28:46 2008 @@ -7,12 +7,13 @@ ctypedef struct xmlSchematronValidCtxt ctypedef enum xmlSchematronValidOptions: - XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report - XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report - XML_SCHEMATRON_OUT_XML = 4 # output SVRL - XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor - XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer - XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism + XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report + XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report + XML_SCHEMATRON_OUT_XML = 4 # output SVRL + XML_SCHEMATRON_OUT_ERROR = 8 # output via xmlStructuredErrorFunc + XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor + XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer + XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt( xmlDoc* doc) nogil Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) 
+++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 15:28:46 2008 @@ -141,7 +141,8 @@ doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - if _LIBXML_VERSION_INT >= 20632: + if _LIBXML_VERSION_INT >= 20632 and \ + schematron.XML_SCHEMATRON_OUT_ERROR != 0: options = schematron.XML_SCHEMATRON_OUT_ERROR else: options = schematron.XML_SCHEMATRON_OUT_QUIET @@ -154,8 +155,9 @@ return python.PyErr_NoMemory() self._error_log.connect() - schematron.xmlSchematronSetValidStructuredErrors( - valid_ctxt, _receiveError, self.error_log) + if _LIBXML_VERSION_INT >= 20632: + schematron.xmlSchematronSetValidStructuredErrors( + valid_ctxt, _receiveError, self.error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Tue Apr 15 15:28:53 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:53 +0200 (CEST) Subject: [Lxml-checkins] r53782 - in lxml/trunk: . src/lxml Message-ID: <20080415132853.86DFF169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:52 2008 New Revision: 53782 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/schematron.pxi Log: r3966 at delle: sbehnel | 2008-04-14 13:07:21 +0200 memory leak in schematron with libxml2 >= 2.6.31 Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 15:28:52 2008 @@ -74,24 +74,26 @@ the file system. 
""" cdef schematron.xmlSchematron* _c_schema + cdef xmlDoc* _c_schema_doc def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlNode* c_node - cdef xmlDoc* c_doc cdef char* c_href cdef schematron.xmlSchematronParserCtxt* parser_ctxt + self._c_schema = NULL + self._c_schema_doc = NULL _Validator.__init__(self) if not config.ENABLE_SCHEMATRON: raise SchematronError( "lxml.etree was compiled without Schematron support.") - self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - c_doc = _copyDocRoot(doc._c_doc, root_node._c_node) + self._c_schema_doc = _copyDocRoot(doc._c_doc, root_node._c_node) self._error_log.connect() - parser_ctxt = schematron.xmlSchematronNewDocParserCtxt(c_doc) + parser_ctxt = schematron.xmlSchematronNewDocParserCtxt( + self._c_schema_doc) elif file is not None: filename = _getFilenameForFile(file) if filename is None: @@ -100,12 +102,14 @@ filename = _encodeFilename(filename) self._error_log.connect() parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename)) - c_doc = NULL else: raise SchematronParseError("No tree or file given") if parser_ctxt is NULL: self._error_log.disconnect() + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) + self._c_schema_doc = NULL python.PyErr_NoMemory() return @@ -114,16 +118,17 @@ schematron.xmlSchematronFreeParserCtxt(parser_ctxt) if self._c_schema is NULL: - if _LIBXML_VERSION_INT >= 20631: - # leak in older versions instead of just crashing - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise SchematronParseError( "Document is not a valid Schematron schema", self._error_log) def __dealloc__(self): schematron.xmlSchematronFree(self._c_schema) + if _LIBXML_VERSION_INT >= 20631: + # earlier libxml2 versions may have freed the document in + # xmlSchematronFree() already, we don't know ... 
+ if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) def __call__(self, etree): """__call__(self, etree) From scoder at codespeak.net Tue Apr 15 15:28:58 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:58 +0200 (CEST) Subject: [Lxml-checkins] r53783 - lxml/trunk Message-ID: <20080415132858.A47C2169EB0@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:57 2008 New Revision: 53783 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3967 at delle: sbehnel | 2008-04-14 13:11:16 +0200 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 15:28:57 2008 @@ -19,6 +19,8 @@ Bugs fixed ---------- +* Memory leak in Schematron with libxml2 >= 2.6.31. + * lxml.etree accepted non well-formed namespace prefix names. Other changes From scoder at codespeak.net Tue Apr 15 15:29:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:03 +0200 (CEST) Subject: [Lxml-checkins] r53784 - in lxml/trunk: . 
src/lxml Message-ID: <20080415132903.7DDE1169EB3@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:02 2008 New Revision: 53784 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xmlerror.pxi lxml/trunk/src/lxml/xslt.pxi Log: r3968 at delle: sbehnel | 2008-04-15 15:26:33 +0200 some cleanup, small fix: line count starts at 1 Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Tue Apr 15 15:29:02 2008 @@ -461,7 +461,7 @@ if cstd.strstr(msg, 'line %d'): c_error.line = cstd.va_int(args) else: - c_error.line = -1 + c_error.line = 0 if cstd.strstr(msg, 'element %s'): c_element = cstd.va_charptr(args) else: Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Tue Apr 15 15:29:02 2008 @@ -496,22 +496,24 @@ if resolver_context is not None and resolver_context._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) + c_result = NULL resolver_context._raise_if_stored() if context._exc._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) + c_result = NULL context._exc._raise_if_stored() if c_result is NULL: # last error seems to be the most accurate here error = self._error_log.last_error if error is not None and error.message: - if error.line >= 0: + if error.line > 0: message = "%s, line %d" % (error.message, error.line) else: message = error.message - elif error is not None and error.line >= 0: + elif error is not None and error.line > 0: message = "Error applying stylesheet, line %d" % error.line else: message = "Error applying stylesheet" From scoder at codespeak.net Tue Apr 15 15:29:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:08 +0200 (CEST) Subject: [Lxml-checkins] r53785 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20080415132908.9287C169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:07 2008 New Revision: 53785 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3969 at delle: sbehnel | 2008-04-15 15:26:49 +0200 test cleanup Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Apr 15 15:29:07 2008 @@ -2428,18 +2428,16 @@ def test_ns_move(self): Element = self.etree.Element - ElementTree = self.etree.ElementTree - - one = self.etree.parse( - StringIO('')) - baz = one.getroot()[0][0] + one = self.etree.fromstring( + '') + baz = one[0][0] - two = ElementTree(Element('root')) - two.getroot().append(baz) + two = Element('root') + two.append(baz) # removing the originating document could cause a crash/error before # as namespace is not moved along with it - del one - self.assertEquals('{http://a.b.c}baz', baz.tag) + del one, baz + self.assertEquals('{http://a.b.c}baz', two[0].tag) def test_ns_decl_tostring(self): tostring = self.etree.tostring From scoder at codespeak.net Tue Apr 15 15:29:13 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:13 +0200 (CEST) Subject: [Lxml-checkins] r53786 - in lxml/trunk: . 
doc Message-ID: <20080415132913.468A5169EB3@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:12 2008 New Revision: 53786 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/build.txt Log: r3970 at delle: sbehnel | 2008-04-15 15:27:19 +0200 require Cython 0.9.6.12 Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Tue Apr 15 15:29:12 2008 @@ -46,8 +46,8 @@ easy_install Cython==0.9.6.12 -lxml currently requires Cython 0.9.6.11b or 0.9.6.12, later versions -were not tested. +lxml currently requires Cython 0.9.6.12. Any 0.9.6.13 version will not +work, later versions were not tested. Subversion From scoder at codespeak.net Tue Apr 15 16:54:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 16:54:09 +0200 (CEST) Subject: [Lxml-checkins] r53787 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20080415145409.B2A5C169EA3@codespeak.net> Author: scoder Date: Tue Apr 15 16:54:07 2008 New Revision: 53787 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xslt.pxi Log: r3978 at delle: sbehnel | 2008-04-15 16:52:51 +0200 some XSLT errors could pass silently Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 16:54:07 2008 @@ -19,6 +19,9 @@ Bugs fixed ---------- +* lxml did not honour libxslt's second error state "STOPPED", which + let some XSLT errors pass silently. + * Memory leak in Schematron with libxml2 >= 2.6.31. * lxml.etree accepted non well-formed namespace prefix names. 
Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Tue Apr 15 16:54:07 2008 @@ -299,11 +299,12 @@ ''', str(res)) - def _test_xslt_parameter_invalid(self): + def test_xslt_parameter_invalid(self): tree = self.parse('BC') style = self.parse('''\ + @@ -311,8 +312,6 @@ st = etree.XSLT(style) res = self.assertRaises(etree.XSLTApplyError, - st, tree, bar="test") - res = self.assertRaises(etree.XSLTApplyError, st, tree, bar="") res = self.assertRaises(etree.XSLTApplyError, st, tree, bar="....") @@ -521,8 +520,8 @@ source = self.parse(xml) styledoc = self.parse(xslt) style = etree.XSLT(styledoc) - result = style(source) - self.assertEqual('', str(result)) + + self.assertRaises(etree.XSLTApplyError, style, source) self.assert_("TEST TEST TEST" in [entry.message for entry in style.error_log]) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Tue Apr 15 16:54:07 2008 @@ -481,6 +481,11 @@ c_result = self._run_transform( c_doc, _kw, context, transform_ctxt) + if transform_ctxt.state != xslt.XSLT_STATE_OK: + if c_result is not NULL: + tree.xmlFreeDoc(c_result) + c_result = NULL + if transform_ctxt.profile: c_profile_doc = xslt.xsltGetProfileInformation(transform_ctxt) if c_profile_doc is not NULL: From scoder at codespeak.net Tue Apr 15 17:44:13 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 17:44:13 +0200 (CEST) Subject: [Lxml-checkins] r53788 - in lxml/trunk: . 
src/lxml Message-ID: <20080415154413.643B1169F28@codespeak.net> Author: scoder Date: Tue Apr 15 17:44:10 2008 New Revision: 53788 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/proxy.pxi Log: r3980 at delle: sbehnel | 2008-04-15 17:42:57 +0200 huge cleanup in moveNodeToDocument() function Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 17:44:10 2008 @@ -29,6 +29,10 @@ Other changes ------------- +* Major cleanup in internal ``moveNodeToDocument()`` function, which + takes care of namespace cleanup when moving elements between + different namespace contexts. + * New Elements created through the ``makeelement()`` method of an HTML parser or through lxml.html now end up in a new HTML document (doctype HTML 4.01 Transitional) instead of a generic XML document. Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Tue Apr 15 17:44:10 2008 @@ -46,6 +46,18 @@ python.Py_XDECREF(proxy._gc_doc) proxy._gc_doc = NULL +cdef inline void _updateProxyDocument(xmlNode* c_node, _Document doc): + """Replace the document reference of a proxy. + + This may deallocate the original document of the proxy! 
+ """ + cdef _Element element = <_Element>c_node._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc + element._gc_doc = doc + ################################################################################ # temporarily make a node the root node of its document @@ -198,6 +210,72 @@ c_new_ns = c_new_ns.next c_parent = c_parent.parent +ctypedef struct _nscache: + xmlNs** new + xmlNs** old + cstd.size_t size + cstd.size_t last + +cdef int _growNsCache(_nscache* c_ns_cache) except -1: + cdef xmlNs** c_ns_ptr + if c_ns_cache.size == 0: + c_ns_cache.size = 20 + else: + c_ns_cache.size *= 2 + c_ns_ptr = cstd.realloc( + c_ns_cache.new, c_ns_cache.size * sizeof(xmlNs*)) + if c_ns_ptr is not NULL: + c_ns_cache.new = c_ns_ptr + c_ns_ptr = cstd.realloc( + c_ns_cache.old, c_ns_cache.size * sizeof(xmlNs*)) + if c_ns_ptr is not NULL: + c_ns_cache.old = c_ns_ptr + else: + cstd.free(c_ns_cache.new) + cstd.free(c_ns_cache.old) + python.PyErr_NoMemory() + return -1 + return 0 + +cdef inline int _appendToNsCache(_nscache* c_ns_cache, + xmlNs* c_old_ns, xmlNs* c_new_ns) except -1: + if c_ns_cache.last >= c_ns_cache.size: + _growNsCache(c_ns_cache) + c_ns_cache.old[c_ns_cache.last] = c_old_ns + c_ns_cache.new[c_ns_cache.last] = c_new_ns + c_ns_cache.last += 1 + +cdef int _stripRedundantNamespaceDeclarations( + xmlNode* c_element, _nscache* c_ns_cache, xmlNs** c_del_ns_list) except -1: + """Removes namespace declarations from an element that are already + defined in its parents. Does not free the xmlNs's, just prepends + them to the c_del_ns_list. 
+ """ + cdef xmlNs* c_ns + cdef xmlNs* c_ns_next + cdef xmlNs** c_nsdef + # use a xmlNs** to handle assignments to "c_element.nsDef" correctly + c_nsdef = &c_element.nsDef + while c_nsdef[0] is not NULL: + c_ns = tree.xmlSearchNsByHref( + c_element.doc, c_element.parent, c_nsdef[0].href) + if c_ns is NULL: + # new namespace href => keep and cache the ns declaration + _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0]) + c_nsdef = &c_nsdef[0].next + else: + # known namespace href => strip the ns + if c_ns is tree.xmlSearchNs(c_element.doc, c_element.parent, + c_ns.prefix): + # prefix is not shadowed by parents => ns is reusable + _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns) + # cut out c_nsdef.next and prepend it to garbage chain + c_ns_next = c_nsdef[0].next + c_nsdef[0].next = c_del_ns_list[0] + c_del_ns_list[0] = c_nsdef[0] + c_nsdef[0] = c_ns_next + return 0 + cdef int moveNodeToDocument(_Document doc, xmlNode* c_element) except -1: """Fix the xmlNs pointers of a node and its subtree that were moved. @@ -223,96 +301,48 @@ step 1), but freed only after the complete subtree was traversed and all occurrences were replaced by tree-internal pointers. 
""" - cdef _Element element - cdef xmlDoc* c_doc cdef xmlNode* c_start_node cdef xmlNode* c_node - cdef xmlNs** c_ns_ptr - cdef xmlNs** c_ns_new_cache - cdef xmlNs** c_ns_old_cache + cdef _nscache c_ns_cache cdef xmlNs* c_ns cdef xmlNs* c_ns_next cdef xmlNs* c_nsdef - cdef xmlNs* c_new_ns - cdef xmlNs* c_del_ns - cdef cstd.size_t i, c_cache_size, c_cache_last + cdef xmlNs* c_del_ns_list + cdef cstd.size_t i if not tree._isElementOrXInclude(c_element): return 0 - c_doc = c_element.doc c_start_node = c_element - c_ns_new_cache = NULL - c_ns_old_cache = NULL - c_cache_size = 0 - c_cache_last = 0 - c_del_ns = NULL + c_del_ns_list = NULL + + c_ns_cache.new = NULL + c_ns_cache.old = NULL + c_ns_cache.size = 0 + c_ns_cache.last = 0 while c_element is not NULL: # 1) cut out namespaces defined here that are already known by # the ancestors - c_nsdef = c_element.nsDef - if c_nsdef is not NULL: - # start with second nsdef to keep c_element.nsDef for now - while c_nsdef.next is not NULL: - if c_nsdef.next is c_element.ns: - c_nsdef = c_nsdef.next - continue - c_ns = tree.xmlSearchNsByHref( - c_element.doc, c_element.parent, c_nsdef.next.href) - if c_ns is NULL: - c_nsdef = c_nsdef.next - continue - # cut out c_nsdef.next and prepend it to garbage chain - c_ns_next = c_nsdef.next.next - c_nsdef.next.next = c_del_ns - c_del_ns = c_nsdef.next - c_nsdef.next = c_ns_next - # now handle c_element.nsDef - c_ns = tree.xmlSearchNsByHref( - c_element.doc, c_element.parent, c_element.nsDef.href) - if c_ns is not NULL: - c_ns_next = c_element.nsDef.next - c_element.nsDef.next = c_del_ns - c_del_ns = c_element.nsDef - c_element.nsDef = c_ns_next + if c_element.nsDef is not NULL: + _stripRedundantNamespaceDeclarations( + c_element, &c_ns_cache, &c_del_ns_list) - # 2) make sure the namespace of an element and its attributes - # is declared in this document (i.e. the node or its parents) + # 2) make sure the namespaces of an element and its attributes + # are declared in this document (i.e. 
on the node or its parents) c_node = c_element while c_node is not NULL: if c_node.ns is not NULL: - for i from 0 <= i < c_cache_last: - if c_node.ns is c_ns_old_cache[i]: - c_node.ns = c_ns_new_cache[i] + for i from 0 <= i < c_ns_cache.last: + if c_node.ns is c_ns_cache.old[i]: + c_node.ns = c_ns_cache.new[i] break else: # not in cache => find a replacement from this document - c_new_ns = doc._findOrBuildNodeNs( + c_ns = doc._findOrBuildNodeNs( c_element, c_node.ns.href, c_node.ns.prefix) - if c_cache_last >= c_cache_size: - # must resize cache - if c_cache_size == 0: - c_cache_size = 20 - else: - c_cache_size *= 2 - c_ns_ptr = cstd.realloc( - c_ns_new_cache, c_cache_size * sizeof(xmlNs*)) - if c_ns_ptr is not NULL: - c_ns_new_cache = c_ns_ptr - c_ns_ptr = cstd.realloc( - c_ns_old_cache, c_cache_size * sizeof(xmlNs*)) - if c_ns_ptr is not NULL: - c_ns_old_cache = c_ns_ptr - else: - cstd.free(c_ns_new_cache) - cstd.free(c_ns_old_cache) - python.PyErr_NoMemory() - return -1 - c_ns_new_cache[c_cache_last] = c_new_ns - c_ns_old_cache[c_cache_last] = c_node.ns - c_cache_last += 1 - c_node.ns = c_new_ns + _appendToNsCache(&c_ns_cache, c_node.ns, c_ns) + c_node.ns = c_ns if c_node is c_element: # after the element, continue with its attributes c_node = c_element.properties @@ -330,12 +360,7 @@ # 3) fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - element = <_Element>c_element._private - if element._doc is not doc: - python.Py_INCREF(doc) - python.Py_DECREF(element._doc) - element._doc = doc - element._gc_doc = doc + _updateProxyDocument(c_element, doc) if c_element is c_start_node: break # all done @@ -353,12 +378,7 @@ # 3) fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - element = <_Element>c_element._private - if element._doc is not doc: - python.Py_INCREF(doc) - python.Py_DECREF(element._doc) - element._doc = doc - element._gc_doc = doc + _updateProxyDocument(c_element, doc) if c_element is c_start_node: break @@ -372,13 +392,13 @@ c_element = c_node # free now unused namespace declarations - if c_del_ns is not NULL: - tree.xmlFreeNsList(c_del_ns) + if c_del_ns_list is not NULL: + tree.xmlFreeNsList(c_del_ns_list) # cleanup - if c_ns_new_cache is not NULL: - cstd.free(c_ns_new_cache) - if c_ns_old_cache is not NULL: - cstd.free(c_ns_old_cache) + if c_ns_cache.new is not NULL: + cstd.free(c_ns_cache.new) + if c_ns_cache.old is not NULL: + cstd.free(c_ns_cache.old) return 0 From scoder at codespeak.net Tue Apr 15 19:39:15 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:39:15 +0200 (CEST) Subject: [Lxml-checkins] r53789 - in lxml/trunk: . src/lxml Message-ID: <20080415173915.8FACA498138@codespeak.net> Author: scoder Date: Tue Apr 15 19:39:12 2008 New Revision: 53789 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/schematron.pxi Log: r3982 at delle: sbehnel | 2008-04-15 19:37:57 +0200 fix for schematron error reporting Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 19:39:12 2008 @@ -162,7 +162,7 @@ self._error_log.connect() if _LIBXML_VERSION_INT >= 20632: schematron.xmlSchematronSetValidStructuredErrors( - valid_ctxt, _receiveError, self.error_log) + valid_ctxt, _receiveError, self._error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Tue Apr 15 19:55:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:55:10 +0200 (CEST) 
Subject: [Lxml-checkins] r53790 - in lxml/trunk: . doc Message-ID: <20080415175510.8FD95498136@codespeak.net> Author: scoder Date: Tue Apr 15 19:55:09 2008 New Revision: 53790 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt lxml/trunk/version.txt Log: r3984 at delle: sbehnel | 2008-04-15 19:44:25 +0200 prepare release of 2.1beta1 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 19:55:09 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.1beta1 (2008-04-15) +===================== Features added -------------- Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Tue Apr 15 19:55:09 2008 @@ -142,8 +142,8 @@ source release. If you can't wait, consider trying a less recent release version first. -The latest version is `lxml 2.1alpha1`_, released 2008-03-27 -(`changes for 2.1alpha1`_). `Older versions`_ are listed below. +The latest version is `lxml 2.1beta1`_, released 2008-04-15 +(`changes for 2.1beta1`_). `Older versions`_ are listed below. Please take a look at the `installation instructions`_! @@ -211,6 +211,10 @@ Old Versions ------------ +* `lxml 2.1alpha1`_, released 2008-03-27 (`changes for 2.1alpha1`_) + +* `lxml 2.0.4`_, released 2008-04-14 (`changes for 2.0.4`_) + * `lxml 2.0.3`_, released 2008-03-26 (`changes for 2.0.3`_) * `lxml 2.0.2`_, released 2008-02-22 (`changes for 2.0.2`_) @@ -269,6 +273,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.1beta1`: lxml-2.1beta1.tgz .. _`lxml 2.1alpha1`: lxml-2.1alpha1.tgz .. _`lxml 2.0.3`: lxml-2.0.3.tgz .. _`lxml 2.0.2`: lxml-2.0.2.tgz @@ -299,6 +304,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.1beta1`: changes-2.1beta1.html .. 
_`changes for 2.1alpha1`: changes-2.1alpha1.html .. _`changes for 2.0.3`: changes-2.0.3.html .. _`changes for 2.0.2`: changes-2.0.2.html Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Apr 15 19:55:09 2008 @@ -1 +1 @@ -2.1alpha1 +2.1alpha2 From scoder at codespeak.net Tue Apr 15 19:55:16 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:55:16 +0200 (CEST) Subject: [Lxml-checkins] r53791 - in lxml/trunk: . src/lxml Message-ID: <20080415175516.808B4498136@codespeak.net> Author: scoder Date: Tue Apr 15 19:55:15 2008 New Revision: 53791 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi Log: r3985 at delle: sbehnel | 2008-04-15 19:53:52 +0200 fix for resolving to filenames in custom resolvers Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 19:55:15 2008 @@ -19,6 +19,8 @@ Bugs fixed ---------- +* Resolving to a filename in custom resolvers didn't work. + * lxml did not honour libxslt's second error state "STOPPED", which let some XSLT errors pass silently. 
Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Apr 15 19:55:15 2008 @@ -333,7 +333,7 @@ c_context, _cstr(data)) elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( - c_context, _cstr(doc_ref._data_bytes)) + c_context, _cstr(doc_ref._filename)) elif doc_ref._type == PARSER_DATA_FILE: file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) From scoder at codespeak.net Tue Apr 15 19:56:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:56:28 +0200 (CEST) Subject: [Lxml-checkins] r53792 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080415175628.1305F49813F@codespeak.net> Author: scoder Date: Tue Apr 15 19:56:28 2008 New Revision: 53792 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/parser.pxi Log: trunk merge -c 53791: custom resolver fix Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Tue Apr 15 19:56:28 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Resolving to a filename in custom resolvers didn't work. + * Memory leak in Schematron with libxml2 >= 2.6.31. 
Other changes Modified: lxml/branch/lxml-2.0/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/parser.pxi Tue Apr 15 19:56:28 2008 @@ -333,7 +333,7 @@ c_context, _cstr(data)) elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( - c_context, _cstr(doc_ref._data_bytes)) + c_context, _cstr(doc_ref._filename)) elif doc_ref._type == PARSER_DATA_FILE: file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) From scoder at codespeak.net Tue Apr 15 20:20:04 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:20:04 +0200 (CEST) Subject: [Lxml-checkins] r53793 - in lxml/trunk: . src/lxml/tests Message-ID: <20080415182004.CA7A0169F21@codespeak.net> Author: scoder Date: Tue Apr 15 20:20:04 2008 New Revision: 53793 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r3988 at delle: sbehnel | 2008-04-15 20:18:50 +0200 test for custom filename resolver Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue Apr 15 20:20:04 2008 @@ -612,6 +612,28 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_filename_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(attribute_defaults=True) + assertEqual = self.assertEqual + test_url = u"__nosuch.dtd" + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_filename( + fileInTestDir('test.dtd'), context) + + parser.resolvers.add(MyResolver()) + + xml = u'' % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals( + 
root.attrib, {'default': 'valueA'}) + self.assertEquals( + root[0].attrib, {'default': 'valueB'}) + def test_resolve_empty(self): parse = self.etree.parse parser = self.etree.XMLParser(load_dtd=True) From scoder at codespeak.net Tue Apr 15 20:22:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:22:03 +0200 (CEST) Subject: [Lxml-checkins] r53794 - lxml/branch/lxml-2.0/src/lxml/tests Message-ID: <20080415182203.6B4C0169F29@codespeak.net> Author: scoder Date: Tue Apr 15 20:22:03 2008 New Revision: 53794 Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py Log: trunk merge -c 53793 (test case) Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py Tue Apr 15 20:22:03 2008 @@ -533,6 +533,28 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_filename_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(attribute_defaults=True) + assertEqual = self.assertEqual + test_url = u"__nosuch.dtd" + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_filename( + fileInTestDir('test.dtd'), context) + + parser.resolvers.add(MyResolver()) + + xml = u'' % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals( + root.attrib, {'default': 'valueA'}) + self.assertEquals( + root[0].attrib, {'default': 'valueB'}) + def test_resolve_empty(self): parse = self.etree.parse parser = self.etree.XMLParser(load_dtd=True) From scoder at codespeak.net Tue Apr 15 20:28:48 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:28:48 +0200 (CEST) Subject: [Lxml-checkins] r53795 - in lxml/branch/lxml-2.0: . 
src/lxml src/lxml/tests Message-ID: <20080415182848.567A2169F36@codespeak.net> Author: scoder Date: Tue Apr 15 20:28:46 2008 New Revision: 53795 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/tests/test_xslt.py lxml/branch/lxml-2.0/src/lxml/xslt.pxd lxml/branch/lxml-2.0/src/lxml/xslt.pxi Log: trunk merge -c 53787: XSLT error handling Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Tue Apr 15 20:28:46 2008 @@ -13,6 +13,9 @@ * Resolving to a filename in custom resolvers didn't work. +* lxml did not honour libxslt's second error state "STOPPED", which + let some XSLT errors pass silently. + * Memory leak in Schematron with libxml2 >= 2.6.31. Other changes Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/tests/test_xslt.py (original) +++ lxml/branch/lxml-2.0/src/lxml/tests/test_xslt.py Tue Apr 15 20:28:46 2008 @@ -276,6 +276,23 @@ ''', st.tostring(res)) + def test_xslt_parameter_invalid(self): + tree = self.parse('BC') + style = self.parse('''\ + + + + + +''') + + st = etree.XSLT(style) + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="") + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="....") + if etree.LIBXSLT_VERSION < (1,1,18): # later versions produce no error def test_xslt_parameter_missing(self): @@ -482,9 +499,8 @@ source = self.parse(xml) styledoc = self.parse(xslt) style = etree.XSLT(styledoc) - result = style.apply(source) - self.assertEqual('', style.tostring(result)) - self.assertEqual('', str(result)) + + self.assertRaises(etree.XSLTApplyError, style, source) self.assert_("TEST TEST TEST" in [entry.message for entry in style.error_log]) Modified: lxml/branch/lxml-2.0/src/lxml/xslt.pxd 
============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xslt.pxd (original) +++ lxml/branch/lxml-2.0/src/lxml/xslt.pxd Tue Apr 15 20:28:46 2008 @@ -8,6 +8,11 @@ cdef int LIBXSLT_VERSION cdef extern from "libxslt/xsltInternals.h": + ctypedef enum xsltTransformState: + XSLT_STATE_OK # 0 + XSLT_STATE_ERROR # 1 + XSLT_STATE_STOPPED # 2 + ctypedef struct xsltDocument: xmlDoc* doc @@ -22,6 +27,7 @@ void* _private xmlDict* dict int profile + xsltTransformState state cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) nogil cdef void xsltFreeStylesheet(xsltStylesheet* sheet) nogil Modified: lxml/branch/lxml-2.0/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xslt.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/xslt.pxi Tue Apr 15 20:28:46 2008 @@ -420,6 +420,11 @@ c_result = self._run_transform( c_doc, _kw, context, transform_ctxt) + if transform_ctxt.state != xslt.XSLT_STATE_OK: + if c_result is not NULL: + tree.xmlFreeDoc(c_result) + c_result = NULL + if transform_ctxt.profile: c_profile_doc = xslt.xsltGetProfileInformation(transform_ctxt) if c_profile_doc is not NULL: From scoder at codespeak.net Tue Apr 15 20:31:17 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:31:17 +0200 (CEST) Subject: [Lxml-checkins] r53796 - in lxml/trunk: . doc Message-ID: <20080415183117.15967169F39@codespeak.net> Author: scoder Date: Tue Apr 15 20:31:16 2008 New Revision: 53796 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/main.txt Log: r3991 at delle: sbehnel | 2008-04-15 20:30:02 +0200 doc fix Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Tue Apr 15 20:31:16 2008 @@ -275,6 +275,7 @@ .. _`lxml 2.1beta1`: lxml-2.1beta1.tgz .. _`lxml 2.1alpha1`: lxml-2.1alpha1.tgz +.. 
_`lxml 2.0.4`: lxml-2.0.4.tgz .. _`lxml 2.0.3`: lxml-2.0.3.tgz .. _`lxml 2.0.2`: lxml-2.0.2.tgz .. _`lxml 2.0.1`: lxml-2.0.1.tgz @@ -306,6 +307,7 @@ .. _`changes for 2.1beta1`: changes-2.1beta1.html .. _`changes for 2.1alpha1`: changes-2.1alpha1.html +.. _`changes for 2.0.4`: changes-2.0.4.html .. _`changes for 2.0.3`: changes-2.0.3.html .. _`changes for 2.0.2`: changes-2.0.2.html .. _`changes for 2.0.1`: changes-2.0.1.html From scoder at codespeak.net Tue Apr 15 20:32:40 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:32:40 +0200 (CEST) Subject: [Lxml-checkins] r53797 - lxml/tag/lxml-2.0.4 Message-ID: <20080415183240.7B4E3169ED1@codespeak.net> Author: scoder Date: Tue Apr 15 20:32:38 2008 New Revision: 53797 Added: lxml/tag/lxml-2.0.4/ - copied from r53747, lxml/branch/lxml-2.0/ Log: tag for 2.0.4 From scoder at codespeak.net Tue Apr 15 20:33:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:33:20 +0200 (CEST) Subject: [Lxml-checkins] r53798 - lxml/tag/lxml-2.1beta1 Message-ID: <20080415183320.C9BE2169F36@codespeak.net> Author: scoder Date: Tue Apr 15 20:33:19 2008 New Revision: 53798 Added: lxml/tag/lxml-2.1beta1/ - copied from r53796, lxml/trunk/ Log: tag for 2.1beta1 From scoder at codespeak.net Tue Apr 15 20:35:40 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:35:40 +0200 (CEST) Subject: [Lxml-checkins] r53799 - lxml/trunk Message-ID: <20080415183540.BDECA169F3A@codespeak.net> Author: scoder Date: Tue Apr 15 20:35:40 2008 New Revision: 53799 Modified: lxml/trunk/ (props changed) lxml/trunk/version.txt Log: r3995 at delle: sbehnel | 2008-04-15 20:33:35 +0200 version fix Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Apr 15 20:35:40 2008 @@ -1 +1 @@ -2.1alpha2 +2.1beta1 From scoder at codespeak.net Tue Apr 15 
20:35:49 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:35:49 +0200 (CEST) Subject: [Lxml-checkins] r53800 - lxml/tag/lxml-2.1beta1 Message-ID: <20080415183549.BE3284980FD@codespeak.net> Author: scoder Date: Tue Apr 15 20:35:49 2008 New Revision: 53800 Removed: lxml/tag/lxml-2.1beta1/ Log: removed tag for update From scoder at codespeak.net Tue Apr 15 20:36:05 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:36:05 +0200 (CEST) Subject: [Lxml-checkins] r53801 - lxml/tag/lxml-2.1beta1 Message-ID: <20080415183605.7E997169F3A@codespeak.net> Author: scoder Date: Tue Apr 15 20:36:05 2008 New Revision: 53801 Added: lxml/tag/lxml-2.1beta1/ - copied from r53799, lxml/trunk/ Log: tag for 2.1beta1 From scoder at codespeak.net Fri Apr 18 21:42:32 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 18 Apr 2008 21:42:32 +0200 (CEST) Subject: [Lxml-checkins] r53865 - lxml/trunk Message-ID: <20080418194232.D63F24981D1@codespeak.net> Author: scoder Date: Fri Apr 18 21:42:30 2008 New Revision: 53865 Modified: lxml/trunk/ (props changed) lxml/trunk/CREDITS.txt Log: r4001 at delle: sbehnel | 2008-04-18 21:38:05 +0200 cleanup in credits Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Fri Apr 18 21:42:30 2008 @@ -2,40 +2,54 @@ Credits ======= -Stefan Behnel - main developer and maintainer +Main contributors +================= -Martijn Faassen - creator of lxml and initial main developer +Stefan Behnel + main developer and maintainer -Ian Bicking - creator and maintainer of lxml.html +Martijn Faassen + creator of lxml and initial main developer -Holger Joukl - bug reports, feedback and development on lxml.objectify +Ian Bicking + creator and maintainer of lxml.html -Sidnei da Sivla - official MS Windows builds +Holger Joukl + bug reports, feedback and development 
on lxml.objectify -Marc-Antoine Parent - XPath extension function help and patches +Sidnei da Sivla + official MS Windows builds -Olivier Grisel - improved (c)ElementTree compatibility patches, - website improvements. +Marc-Antoine Parent + XPath extension function help and patches -Kasimier Buchcik - help with specs and libxml2 +Olivier Grisel + improved (c)ElementTree compatibility patches, + website improvements. -Florian Wagner - help with copy.deepcopy support, bug reporting - -Emil Kroymann - help with encoding support, bug reporting +Kasimier Buchcik + help with specs and libxml2 -Slou - help with index() support, bug reporting +Florian Wagner + help with copy.deepcopy support, bug reporting -Paul Everitt - bug reporting, feedback on API design - -Victor Ng - Discussions on memory management strategies, vlibxml2 +Emil Kroymann + help with encoding support, bug reporting + +Paul Everitt + bug reporting, feedback on API design -Robert Kern - feedback on API design +Victor Ng + Discussions on memory management strategies, vlibxml2 -Andreas Pakulat - rpath linking support, doc improvements +Robert Kern + feedback on API design -Steve Howe - Windows builds +Andreas Pakulat + rpath linking support, doc improvements -David Sankel - building statically on Windows +David Sankel + building statically on Windows ... and lots of other people who contributed to lxml by reporting @@ -47,15 +61,13 @@ Special thanks goes to: ======================= -* the libxml2 project and especially Daniel Veillard for a great XML - library. +* Daniel Veillard and the libxml2 project for a great XML library. -* Fredrik Lundh for the ElementTree API. +* Fredrik Lundh for ElementTree, its API, and the competition through + cElementTree. * Greg Ewing (Pyrex) and Robert Bradshaw (Cython) for the binding technology. * the codespeak crew, in particular Philipp von Weitershausen and Holger Krekel for hosting lxml on codespeak.net - -* Infrae for initiating the project. 
From scoder at codespeak.net Fri Apr 18 21:42:36 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 18 Apr 2008 21:42:36 +0200 (CEST) Subject: [Lxml-checkins] r53866 - in lxml/trunk: . doc Message-ID: <20080418194236.C82504981D1@codespeak.net> Author: scoder Date: Fri Apr 18 21:42:35 2008 New Revision: 53866 Added: lxml/trunk/doc/docstructure.py Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mkhtml.py Log: r4002 at delle: sbehnel | 2008-04-18 21:38:47 +0200 external module to describe doc structure Added: lxml/trunk/doc/docstructure.py ============================================================================== --- (empty file) +++ lxml/trunk/doc/docstructure.py Fri Apr 18 21:42:35 2008 @@ -0,0 +1,24 @@ + +SITE_STRUCTURE = [ + ('lxml', ('main.txt', 'intro.txt', '../INSTALL.txt', 'lxml2.txt', + 'performance.txt', 'compatibility.txt', 'FAQ.txt')), + ('Developing with lxml', ('tutorial.txt', '@API reference', + 'api.txt', 'parsing.txt', + 'validation.txt', 'xpathxslt.txt', + 'objectify.txt', 'lxmlhtml.txt', + 'cssselect.txt', 'elementsoup.txt')), + ('Extending lxml', ('resolvers.txt', 'extensions.txt', + 'element_classes.txt', 'sax.txt', 'capi.txt')), + ('Developing lxml', ('build.txt', 'lxml-source-howto.txt', + '@Release Changelog', '../CREDITS.txt')), + ] + +HREF_MAP = { + "API reference" : "api/index.html" +} + +BASENAME_MAP = { + 'main' : 'index', + 'INSTALL' : 'installation', + 'CREDITS' : 'credits', +} Modified: lxml/trunk/doc/mkhtml.py ============================================================================== --- lxml/trunk/doc/mkhtml.py (original) +++ lxml/trunk/doc/mkhtml.py Fri Apr 18 21:42:35 2008 @@ -1,21 +1,8 @@ +from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP from lxml.etree import (parse, fromstring, ElementTree, Element, SubElement, XPath) import os, shutil, re, sys, copy, time -SITE_STRUCTURE = [ - ('lxml', ('main.txt', 'intro.txt', '../INSTALL.txt', 'lxml2.txt', - 'performance.txt', 
'compatibility.txt', 'FAQ.txt')), - ('Developing with lxml', ('tutorial.txt', '@API reference', - 'api.txt', 'parsing.txt', - 'validation.txt', 'xpathxslt.txt', - 'objectify.txt', 'lxmlhtml.txt', - 'cssselect.txt', 'elementsoup.txt')), - ('Extending lxml', ('resolvers.txt', 'extensions.txt', - 'element_classes.txt', 'sax.txt', 'capi.txt')), - ('Developing lxml', ('build.txt', 'lxml-source-howto.txt', - '@Release Changelog')), - ] - RST2HTML_OPTIONS = " ".join([ "--no-toc-backlinks", "--strip-comments", @@ -23,15 +10,6 @@ "--date", ]) -HREF_MAP = { - "API reference" : "api/index.html" -} - -BASENAME_MAP = { - 'main' : 'index', - 'INSTALL' : 'installation', -} - htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"} find_title = XPath("/h:html/h:head/h:title/text()", namespaces=htmlnsmap) From scoder at codespeak.net Fri Apr 18 21:42:41 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 18 Apr 2008 21:42:41 +0200 (CEST) Subject: [Lxml-checkins] r53867 - in lxml/trunk: . doc Message-ID: <20080418194241.BA6E74981D1@codespeak.net> Author: scoder Date: Fri Apr 18 21:42:41 2008 New Revision: 53867 Added: lxml/trunk/doc/mklatex.py lxml/trunk/doc/rest2latex.py Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile lxml/trunk/doc/performance.txt Log: r4003 at delle: sbehnel | 2008-04-18 21:39:50 +0200 PDF doc support Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Fri Apr 18 21:42:41 2008 @@ -2,6 +2,7 @@ TESTFLAGS=-p -v TESTOPTS= SETUPFLAGS= +LXMLVERSION=`cat version.txt` all: inplace @@ -41,7 +42,7 @@ $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS) html: inplace - PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . `cat version.txt` + PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . ${LXMLVERSION} rm -fr doc/html/api @[ -x "`which epydoc`" ] \ && (cd src && echo "Generating API docs ..." 
&& \ @@ -50,6 +51,13 @@ --name lxml --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") +pdf: + $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION} + (cd doc/pdf && pdflatex lxmldoc.tex && pdflatex lxmldoc.tex) + @echo "PDF available as doc/pdf/lxmldoc.pdf" + +# Two pdflatex runs are needed to build the correct Table of contents. + test: test_inplace valtest: valgrind_test_inplace Added: lxml/trunk/doc/mklatex.py ============================================================================== --- (empty file) +++ lxml/trunk/doc/mklatex.py Fri Apr 18 21:42:41 2008 @@ -0,0 +1,197 @@ +# The script builds the LaTeX documentation. +# Testing: +# python mklatex.py latex .. 1.0 + +from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP +import os, shutil, re, sys + +TARGET_FILE = "lxmldoc.tex" + +RST2LATEX_OPTIONS = " ".join([ +# "--no-toc-backlinks", + "--strip-comments", + "--language en", +# "--date", + "--use-latex-footnotes", + "--use-latex-citations", + "--use-latex-toc", + #"--font-encoding=T1", + ]) + +htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"} + +replace_invalid = re.compile(r'[-_/.\s\\]').sub + +def rest2latex(script, source_path, dest_path): + command = ('%s %s %s %s > %s' % + (sys.executable, script, RST2LATEX_OPTIONS, + source_path, dest_path)) + os.system(command) + +def build_pygments_macros(filename): + from pygments.formatters import LatexFormatter + text = LatexFormatter().get_style_defs() + f = file(filename, "w") + f.write(text) + f.close() + +def noop(input): + return input + +counter_no = 0 + +def tex_postprocess(src, dest, want_header = False, process_line=noop): + """ + Postprocessing of the LaTeX file generated from ReST. + + Reads file src and saves to dest only the true content + (without the document header and final) - so it is suitable + to be used as part of the longer document. 
+ + Returns the title of document + + If want_header is set, returns also the document header (as + the list of lines). + """ + title = '' + header = [] + global counter_no + counter_no = counter_no + 1 + counter_text = "listcnt%d" % counter_no + + search_title = re.compile(r'\\title{([^}]*)}').search + skipping = re.compile(r'(\\end{document}|\\tableofcontents)').search + + src = file(src) + dest = file(dest, "w") + + iter_lines = iter(src.readlines()) + for l in iter_lines: + l = process_line(l) + if want_header: + header.append(l) + m = search_title(l) + if m: + title = m.group(0) + if l.startswith("\\maketitle"): + break + + for l in iter_lines: + l = process_line(l) + if skipping(l): + # To-Do minitoc instead of tableofcontents + pass + else: + l = l.replace("listcnt0", counter_text) + dest.write(l) + + if not title: + raise Exception("Bueee, no title") + return title, header + +def publish(dirname, lxml_path, release): + if not os.path.exists(dirname): + os.mkdir(dirname) + + book_title = "lxml %s" % release + + doc_dir = os.path.join(lxml_path, 'doc') + script = os.path.join(doc_dir, 'rest2latex.py') + pubkey = os.path.join(doc_dir, 'pubkey.asc') + + shutil.copy(pubkey, dirname) + + href_map = HREF_MAP.copy() + changelog_basename = 'changes-%s' % release + href_map['Release Changelog'] = changelog_basename + '.tex' + + # build pygments macros + build_pygments_macros(os.path.join(dirname, '_part_pygments.tex')) + + # Used in postprocessing of generated LaTeX files + header = [] + titles = {} + + # Building pages + for section, text_files in SITE_STRUCTURE: + for filename in text_files: + if filename.startswith('@'): + print "Not yet implemented: %s" % filename[1:] + #page_title = filename[1:] + #url = href_map[page_title] + #build_menu_entry(page_title, url, section_head) + else: + path = os.path.join(doc_dir, filename) + basename = os.path.splitext(os.path.basename(filename))[0] + basename = BASENAME_MAP.get(basename, basename) + outname = basename + '.tex' 
+ outpath = os.path.join(dirname, outname) + + print "Creating %s" % outname + rest2latex(script, path, outpath) + + final_name = os.path.join(dirname, "_part_%s" % outname) + + title, hd = tex_postprocess(outpath, final_name, not header) + if not header: + header = hd + titles[outname] = title + + # also convert CHANGES.txt + find_version_title = re.compile( + r'(.*\\section\{)([0-9][^\} ]*)\s+\(([^)]+)\)(\}.*)').search + def change_version_title(line): + m = find_version_title(line) + if m: + line = "%sChanges in version %s, released %s%s" % m.groups() + return line + + chgname = 'changes-%s.tex' % release + chgpath = os.path.join(dirname, chgname) + rest2latex(script, + os.path.join(lxml_path, 'CHANGES.txt'), + chgpath) + tex_postprocess(chgpath, os.path.join(dirname, "_part_%s" % chgname), + process_line=change_version_title) + + # Writing a master file + print "Building %s\n" % TARGET_FILE + master = file( os.path.join(dirname, TARGET_FILE), "w") + for hln in header: + if hln.startswith("\\documentclass"): + #hln = hln.replace('article', 'book') + hln = "\\documentclass[10pt,english]{book}\n\\usepackage[a4paper]{geometry}\n" + elif hln.startswith("\\begin{document}"): + # pygments support + master.write("\\usepackage{fancyvrb}\n") + master.write("\\input{_part_pygments.tex}\n") + elif hln.startswith("\\title{"): + hln = re.sub("\{[^\}]*\}", '{%s}' % book_title, hln) + master.write(hln) + + master.write("\\tableofcontents\n\n") + + for section, text_files in SITE_STRUCTURE: + master.write("\\part{%s}\n\n" % section) + for filename in text_files: + if filename.startswith('@'): + pass + #print "Not yet implemented: %s" % filename[1:] + #page_title = filename[1:] + #url = href_map[page_title] + #build_menu_entry(page_title, url, section_head) + else: + basename = os.path.splitext(os.path.basename(filename))[0] + basename = BASENAME_MAP.get(basename, basename) + outname = basename + '.tex' + ## TODO: true title + master.write("\\chapter{%s}\n\n" % titles[outname]) 
+ master.write("\\input{_part_%s}\n\n" % outname) + + master.write("\\chapter{Changes}\n\n") + master.write("\\input{_part_%s}\n\n" % chgname) + + master.write("\end{document}\n") + +if __name__ == '__main__': + publish(sys.argv[1], sys.argv[2], sys.argv[3]) Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri Apr 18 21:42:41 2008 @@ -71,8 +71,8 @@ a specific part of the API yourself, please consider sending it to the lxml mailing list. -The timings cited below compare lxml 2.0 final (with libxml2 2.6.31) -to the January 2008 SVN trunk versions of ElementTree (1.3alpha) and +The timings cited below compare lxml 2.1 (with libxml2 2.6.32) to the +January 2008 SVN trunk versions of ElementTree (1.3alpha) and cElementTree (1.2.7). They were run single-threaded on a 1.8GHz Intel Core Duo machine under Ubuntu Linux 7.10 (Gutsy). The C libraries were compiled with the same platform specific optimisation flags. 
The @@ -215,10 +215,10 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.0783 0.0777 0.0774 0.0787 0.0781 0.0783 - T2: 0.0799 0.0796 0.0799 0.0879 0.0882 0.0886 - T3: 0.0245 0.0216 0.0217 0.0577 0.0575 0.0572 - T4: 0.0003 0.0003 0.0003 0.0011 0.0011 0.0011 + T1: 0.0792 0.0821 0.0869 0.0741 0.0814 0.0865 + T2: 0.0776 0.0830 0.0885 0.0808 0.0877 0.0933 + T3: 0.0248 0.0231 0.0240 0.0430 0.0444 0.0451 + T4: 0.0003 0.0003 0.0003 0.0007 0.0007 0.0007 cET: -- S- U- -A SA UA T1: 0.0272 0.0264 0.0267 0.0268 0.0261 0.0265 T2: 0.0280 0.0274 0.0273 0.0273 0.0276 0.0275 @@ -315,11 +315,11 @@ The following benchmark appends all root children of the second tree to the root of the first tree:: - lxe: append_from_document (--TR T1,T2) 3.3841 msec/pass + lxe: append_from_document (--TR T1,T2) 2.7261 msec/pass cET: append_from_document (--TR T1,T2) 0.2699 msec/pass ET : append_from_document (--TR T1,T2) 1.2650 msec/pass - lxe: append_from_document (--TR T3,T4) 0.0441 msec/pass + lxe: append_from_document (--TR T3,T4) 0.0460 msec/pass cET: append_from_document (--TR T3,T4) 0.0169 msec/pass ET : append_from_document (--TR T3,T4) 0.0820 msec/pass Added: lxml/trunk/doc/rest2latex.py ============================================================================== --- (empty file) +++ lxml/trunk/doc/rest2latex.py Fri Apr 18 21:42:41 2008 @@ -0,0 +1,66 @@ +#!/usr/bin/python + +# Testing: +# python rest2latex.py objectify.txt > latex/objectify.tex + +""" +A minimal front end to the Docutils Publisher, producing LaTeX with +some syntax highlighting. 
+""" + +# Set to True if you want inline CSS styles instead of classes +INLINESTYLES = False + + +try: + import locale + locale.setlocale(locale.LC_ALL, '') +except: + pass + +# set up Pygments + +from pygments.formatters import LatexFormatter + +# The default formatter +DEFAULT = LatexFormatter() + +# Add name -> formatter pairs for every variant you want to use +VARIANTS = { + # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True), +} + + +from docutils import nodes +from docutils.parsers.rst import directives + +from pygments import highlight +from pygments.lexers import get_lexer_by_name, TextLexer + +def pygments_directive(name, arguments, options, content, lineno, + content_offset, block_text, state, state_machine): + try: + lexer = get_lexer_by_name(arguments[0]) + except ValueError, e: + # no lexer found - use the text one instead of an exception + lexer = TextLexer() + # take an arbitrary option if more than one is given + formatter = options and VARIANTS[options.keys()[0]] or DEFAULT + parsed = highlight(u'\n'.join(content), lexer, formatter) + return [nodes.raw('', parsed, format='latex')] + +pygments_directive.arguments = (1, 0, 1) +pygments_directive.content = 1 +pygments_directive.options = dict([(key, directives.flag) for key in VARIANTS]) + +directives.register_directive('sourcecode', pygments_directive) + + +# run the generation + +from docutils.core import publish_cmdline, default_description + +description = ('Generates LaTeX documents from standalone reStructuredText ' + 'sources. ' + default_description) + +publish_cmdline(writer_name='latex2e', description=description) From scoder at codespeak.net Sat Apr 19 16:10:12 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 16:10:12 +0200 (CEST) Subject: [Lxml-checkins] r53871 - in lxml/trunk: . 
doc Message-ID: <20080419141012.D66082A013F@codespeak.net> Author: scoder Date: Sat Apr 19 16:10:12 2008 New Revision: 53871 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4007 at delle: sbehnel | 2008-04-19 16:08:51 +0200 fix inter-document links in doc PDF Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Sat Apr 19 16:10:12 2008 @@ -112,6 +112,21 @@ header = [] titles = {} + replace_relative_hyperrefs = re.compile( + r'\\href\{([^/}]+)[.]([^.]+)\}\{([^}]+)\}').sub + def build_hyperref(match): + basename, extension, linktext = match.groups() + outname = BASENAME_MAP.get(basename, basename) + if '#' in basename or extension != 'html': + return r'\href{http://codespeak.net/lxml/%s.%s}{%s}' % ( + outname, extension, linktext) + else: + return r"\hyperref[_part_%s.tex]{%s}" % (outname, linktext) + def fix_relative_hyperrefs(line): + if r'\href' not in line: + return line + return replace_relative_hyperrefs(build_hyperref, line) + # Building pages for section, text_files in SITE_STRUCTURE: for filename in text_files: @@ -132,7 +147,8 @@ final_name = os.path.join(dirname, "_part_%s" % outname) - title, hd = tex_postprocess(outpath, final_name, not header) + title, hd = tex_postprocess(outpath, final_name, not header, + process_line=fix_relative_hyperrefs) if not header: header = hd titles[outname] = title @@ -171,6 +187,11 @@ master.write("\\tableofcontents\n\n") + def write_chapter(title, outname): + master.write( + "\\chapter{%s}\n\\label{_part_%s}\n\n\\input{_part_%s}\n\n" % ( + title, outname, outname)) + for section, text_files in SITE_STRUCTURE: master.write("\\part{%s}\n\n" % section) for filename in text_files: @@ -184,12 +205,9 @@ basename = os.path.splitext(os.path.basename(filename))[0] basename = BASENAME_MAP.get(basename, basename) outname = basename + '.tex' - ## TODO: true title - 
master.write("\\chapter{%s}\n\n" % titles[outname]) - master.write("\\input{_part_%s}\n\n" % outname) + write_chapter(titles[outname], outname) - master.write("\\chapter{Changes}\n\n") - master.write("\\input{_part_%s}\n\n" % chgname) + write_chapter("Changes", chgname) master.write("\end{document}\n") From scoder at codespeak.net Sat Apr 19 16:46:56 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 16:46:56 +0200 (CEST) Subject: [Lxml-checkins] r53872 - in lxml/trunk: . doc Message-ID: <20080419144656.5C1BF2A017F@codespeak.net> Author: scoder Date: Sat Apr 19 16:46:53 2008 New Revision: 53872 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4009 at delle: sbehnel | 2008-04-19 16:22:03 +0200 fix up ChangeLog TOC in PDF Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Sat Apr 19 16:46:53 2008 @@ -156,10 +156,12 @@ # also convert CHANGES.txt find_version_title = re.compile( r'(.*\\section\{)([0-9][^\} ]*)\s+\(([^)]+)\)(\}.*)').search - def change_version_title(line): + def fix_changelog(line): m = find_version_title(line) if m: line = "%sChanges in version %s, released %s%s" % m.groups() + else: + line = line.replace(r'\subsection{', r'\subsection*{') return line chgname = 'changes-%s.tex' % release @@ -168,7 +170,7 @@ os.path.join(lxml_path, 'CHANGES.txt'), chgpath) tex_postprocess(chgpath, os.path.join(dirname, "_part_%s" % chgname), - process_line=change_version_title) + process_line=fix_changelog) # Writing a master file print "Building %s\n" % TARGET_FILE From scoder at codespeak.net Sat Apr 19 16:46:59 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 16:46:59 +0200 (CEST) Subject: [Lxml-checkins] r53873 - in lxml/trunk: . 
doc Message-ID: <20080419144659.8084C2A017F@codespeak.net> Author: scoder Date: Sat Apr 19 16:46:58 2008 New Revision: 53873 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r4010 at delle: sbehnel | 2008-04-19 16:43:00 +0200 doc fix Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sat Apr 19 16:46:58 2008 @@ -775,7 +775,7 @@ refer to the `iterparse section`_ of the lxml API documentation to find out what you can do and what you can't do. -.. _`iterparse section`: api.html#iterparse-and-iterwalk +.. _`iterparse section`: parsing.html#iterparse-and-iterwalk How do I output null characters in XML text? From scoder at codespeak.net Sat Apr 19 16:47:04 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 16:47:04 +0200 (CEST) Subject: [Lxml-checkins] r53874 - in lxml/trunk: . doc Message-ID: <20080419144704.1830F2A017F@codespeak.net> Author: scoder Date: Sat Apr 19 16:47:03 2008 New Revision: 53874 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4011 at delle: sbehnel | 2008-04-19 16:45:18 +0200 fix relative hyperrefs in doc PDF Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Sat Apr 19 16:47:03 2008 @@ -112,12 +112,17 @@ header = [] titles = {} - replace_relative_hyperrefs = re.compile( - r'\\href\{([^/}]+)[.]([^.]+)\}\{([^}]+)\}').sub + replace_interdoc_hyperrefs = re.compile( + r'\\href\{([^/}]+)[.]([^./}]+)\}\{([^}]+)\}').sub + replace_docinternal_hyperrefs = re.compile( + r'\\href\{\\#([^}]+)\}').sub def build_hyperref(match): basename, extension, linktext = match.groups() outname = BASENAME_MAP.get(basename, basename) - if '#' in basename or extension != 'html': + if '#' in extension: + anchor = extension.split('#')[-1] + return 
r"\hyperref[%s]{%s}" % (anchor, linktext) + elif extension != 'html': return r'\href{http://codespeak.net/lxml/%s.%s}{%s}' % ( outname, extension, linktext) else: @@ -125,7 +130,8 @@ def fix_relative_hyperrefs(line): if r'\href' not in line: return line - return replace_relative_hyperrefs(build_hyperref, line) + line = replace_interdoc_hyperrefs(build_hyperref, line) + return replace_docinternal_hyperrefs(r'\hyperref[\1]', line) # Building pages for section, text_files in SITE_STRUCTURE: From scoder at codespeak.net Sat Apr 19 17:25:13 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 17:25:13 +0200 (CEST) Subject: [Lxml-checkins] r53875 - lxml/trunk Message-ID: <20080419152513.1A4C62A013F@codespeak.net> Author: scoder Date: Sat Apr 19 17:25:12 2008 New Revision: 53875 Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile Log: r4015 at delle: sbehnel | 2008-04-19 17:11:48 +0200 name PDF doc after lxml version, let 'make docclean' delete doc/pdf Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Sat Apr 19 17:25:12 2008 @@ -54,7 +54,8 @@ pdf: $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION} (cd doc/pdf && pdflatex lxmldoc.tex && pdflatex lxmldoc.tex) - @echo "PDF available as doc/pdf/lxmldoc.pdf" + @mv doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf + @echo "PDF available as doc/pdf/lxmldoc-${LXMLVERSION}.pdf" # Two pdflatex runs are needed to build the correct Table of contents. @@ -75,6 +76,7 @@ docclean: rm -f doc/html/*.html rm -fr doc/html/api + rm -fr doc/pdf realclean: clean docclean find . -name '*.c' -exec rm -f {} \; From scoder at codespeak.net Sat Apr 19 17:25:16 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 17:25:16 +0200 (CEST) Subject: [Lxml-checkins] r53876 - in lxml/trunk: . 
doc Message-ID: <20080419152516.D048D2A013F@codespeak.net> Author: scoder Date: Sat Apr 19 17:25:16 2008 New Revision: 53876 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/main.txt Log: r4016 at delle: sbehnel | 2008-04-19 17:11:59 +0200 link to doc PDF from web site Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sat Apr 19 17:25:16 2008 @@ -47,6 +47,10 @@ Documentation ------------- +The complete lxml documentation is available for download as `PDF +documentation`_. The HTML documentation from this web site is part of +the normal `source download <#download>`_. + * ElementTree: * `ElementTree API`_ @@ -211,6 +215,8 @@ Old Versions ------------ +.. _`PDF documentation`: lxmldoc-2.1beta1.pdf + * `lxml 2.1alpha1`_, released 2008-03-27 (`changes for 2.1alpha1`_) * `lxml 2.0.4`_, released 2008-04-14 (`changes for 2.0.4`_) From scoder at codespeak.net Sat Apr 19 17:25:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 17:25:20 +0200 (CEST) Subject: [Lxml-checkins] r53877 - lxml/trunk Message-ID: <20080419152520.CAD782A013F@codespeak.net> Author: scoder Date: Sat Apr 19 17:25:20 2008 New Revision: 53877 Modified: lxml/trunk/ (props changed) lxml/trunk/CREDITS.txt Log: r4017 at delle: sbehnel | 2008-04-19 17:17:43 +0200 credits update Modified: lxml/trunk/CREDITS.txt ============================================================================== --- lxml/trunk/CREDITS.txt (original) +++ lxml/trunk/CREDITS.txt Sat Apr 19 17:25:20 2008 @@ -9,48 +9,50 @@ main developer and maintainer Martijn Faassen - creator of lxml and initial main developer + creator of lxml and initial main developer Ian Bicking - creator and maintainer of lxml.html + creator and maintainer of lxml.html Holger Joukl - bug reports, feedback and development on lxml.objectify + bug reports, feedback and development on lxml.objectify 
Sidnei da Sivla - official MS Windows builds + official MS Windows builds Marc-Antoine Parent - XPath extension function help and patches + XPath extension function help and patches Olivier Grisel - improved (c)ElementTree compatibility patches, - website improvements. + improved (c)ElementTree compatibility patches, + website improvements. Kasimier Buchcik - help with specs and libxml2 + help with specs and libxml2 Florian Wagner - help with copy.deepcopy support, bug reporting + help with copy.deepcopy support, bug reporting Emil Kroymann - help with encoding support, bug reporting + help with encoding support, bug reporting Paul Everitt - bug reporting, feedback on API design + bug reporting, feedback on API design Victor Ng - Discussions on memory management strategies, vlibxml2 + Discussions on memory management strategies, vlibxml2 Robert Kern - feedback on API design + feedback on API design Andreas Pakulat - rpath linking support, doc improvements + rpath linking support, doc improvements David Sankel - building statically on Windows + building statically on Windows +Marcin Kasperski + PDF documentation generation ... and lots of other people who contributed to lxml by reporting bugs, discussing its functionality or blaming the docs for the bugs in From scoder at codespeak.net Sat Apr 19 17:25:24 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 19 Apr 2008 17:25:24 +0200 (CEST) Subject: [Lxml-checkins] r53878 - in lxml/trunk: . 
doc Message-ID: <20080419152524.521CF2A013F@codespeak.net> Author: scoder Date: Sat Apr 19 17:25:23 2008 New Revision: 53878 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/element_classes.txt Log: r4018 at delle: sbehnel | 2008-04-19 17:23:00 +0200 doc fix Modified: lxml/trunk/doc/element_classes.txt ============================================================================== --- lxml/trunk/doc/element_classes.txt (original) +++ lxml/trunk/doc/element_classes.txt Sat Apr 19 17:25:23 2008 @@ -20,8 +20,11 @@ This defines a new Element class ``HonkElement`` with a property ``honking``. -Note that you cannot (or rather *must not*) instantiate this class yourself. -lxml.etree will do that for you through its normal ElementTree API. +Note that you cannot (or rather *must not*) instantiate this class +yourself. lxml.etree will do that for you through its normal +ElementTree API. All you have to do is tell lxml which class to use +for which kind of Element. This is done through a class lookup +scheme, as described below. .. contents:: .. 
From lxml-checkins at codespeak.net Sun Apr 20 15:51:27 2008 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Sun, 20 Apr 2008 15:51:27 +0200 (CEST) Subject: [Lxml-checkins] Dear lxml-checkins@codespeak.net April 85% 0FF Message-ID: <20080420045020.4263.qmail@eff133.neoplus.adsl.tpnet.pl> Canadian Doctor Erica Best Price On Net http://www.google.com/pagead/iclk?sa=l&ai=grksn&num=89361&adurl=http://eyta.himlook.com?urex From scoder at codespeak.net Sun Apr 20 18:32:36 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 20 Apr 2008 18:32:36 +0200 (CEST) Subject: [Lxml-checkins] r53915 - lxml/trunk Message-ID: <20080420163236.301CF168539@codespeak.net> Author: scoder Date: Sun Apr 20 18:32:33 2008 New Revision: 53915 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r4023 at delle: sbehnel | 2008-04-20 18:30:20 +0200 allow passing all cmd line options also as environment variables Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Sun Apr 20 18:32:33 2008 @@ -256,7 +256,12 @@ sys.argv.remove('--%s' % name) return True except ValueError: - return False + pass + # allow passing all cmd line options also as environment variables + env_val = os.getenv(name.upper().replace('-', '_'), 'false').upper() + if env_val == "true": + return True + return False # pick up any commandline options OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify') From scoder at codespeak.net Sun Apr 20 18:32:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 20 Apr 2008 18:32:39 +0200 (CEST) Subject: [Lxml-checkins] r53916 - lxml/trunk Message-ID: <20080420163239.985EC168539@codespeak.net> Author: scoder Date: Sun Apr 20 18:32:38 2008 New Revision: 53916 Modified: lxml/trunk/ (props changed) lxml/trunk/setup.py Log: r4024 at delle: sbehnel | 2008-04-20 18:31:07 +0200 support 
running tests from setup.py after build (option --run-tests) Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Sun Apr 20 18:32:38 2008 @@ -40,6 +40,7 @@ versioninfo.create_version_h(svn_version) print("Building lxml version %s." % svn_version) +OPTION_RUN_TESTS = setupinfo.has_option('run-tests') branch_link = """ After an official release of a new stable series, current bug fixes become @@ -105,3 +106,8 @@ STATIC_INCLUDE_DIRS, STATIC_LIBRARY_DIRS, STATIC_CFLAGS), **extra_options ) + +if OPTION_RUN_TESTS: + print("Running tests.") + import test + sys.exit( test.main(sys.argv[:1]) ) From scoder at codespeak.net Wed Apr 23 07:15:00 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 07:15:00 +0200 (CEST) Subject: [Lxml-checkins] r54027 - in lxml/trunk: . doc Message-ID: <20080423051500.C0A502A018D@codespeak.net> Author: scoder Date: Wed Apr 23 07:14:59 2008 New Revision: 54027 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/lxml2.txt Log: r4027 at delle: sbehnel | 2008-04-22 23:29:33 +0200 notes on further deprecations Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Wed Apr 23 07:14:59 2008 @@ -180,15 +180,40 @@ * CamelCaseNamed module functions and methods were renamed to their underscore equivalents to follow `PEP 8`_ in naming. 
- - ``etree.setDefaultParser()`` -> ``etree.set_default_parser()`` + - ``etree.clearErrorLog()``, use ``etree.clear_error_log()`` - - ``etree.getDefaultParser()`` -> ``etree.get_default_parser()`` + - ``etree.useGlobalPythonLog()``, use + ``etree.use_global_python_log()`` - - ``etree.useGlobalPythonLog()`` -> ``etree.use_global_python_log()`` + - ``etree.ElementClassLookup.setFallback()``, use + ``etree.ElementClassLookup.set_fallback()`` - - ``XMLParser.setElementClassLookup()`` -> ``.set_element_class_lookup()`` + - ``etree.getDefaultParser()``, use ``etree.get_default_parser()`` - - ``HTMLParser.setElementClassLookup()`` -> ``.set_element_class_lookup()`` + - ``etree.setDefaultParser()``, use ``etree.set_default_parser()`` + + - ``etree.setElementClassLookup()``, use + ``etree.set_element_class_lookup()`` + + - ``XMLParser.setElementClassLookup()``, use ``.set_element_class_lookup()`` + + - ``HTMLParser.setElementClassLookup()``, use ``.set_element_class_lookup()`` + + Note that ``parser.setElementClassLookup()`` has not been removed + yet, although ``parser.set_element_class_lookup()`` should be used + instead. + + - ``xpath_evaluator.registerNamespace()``, use + ``xpath_evaluator.register_namespace()`` + + - ``xpath_evaluator.registerNamespaces()``, use + ``xpath_evaluator.register_namespaces()`` + + - ``objectify.setPytypeAttributeTag``, use + ``objectify.set_pytype_attribute_tag`` + + - ``objectify.setDefaultParser()``, use + ``objectify.set_default_parser()`` * The ``.getiterator()`` method on Elements and ElementTrees was renamed to ``.iter()`` to follow ElementTree 1.3. From scoder at codespeak.net Wed Apr 23 07:15:11 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 07:15:11 +0200 (CEST) Subject: [Lxml-checkins] r54028 - in lxml/trunk: . 
doc Message-ID: <20080423051511.DFCBA2A018D@codespeak.net> Author: scoder Date: Wed Apr 23 07:15:11 2008 New Revision: 54028 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4028 at delle: sbehnel | 2008-04-22 23:30:05 +0200 PDF layout fixes Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Wed Apr 23 07:15:11 2008 @@ -22,6 +22,30 @@ replace_invalid = re.compile(r'[-_/.\s\\]').sub +# LaTeX snippets + +DOCUMENT_CLASS = r""" +\documentclass[10pt,english]{report} +\usepackage[a4paper]{geometry} +\parindent0pt +\parskip1ex +""" + +PYGMENTS_IMPORT = r""" +\usepackage{fancyvrb} +\input{_part_pygments.tex} +""" + +def write_chapter(master, title, outname): + master.write(r""" +\chapter{%s} +\label{_part_%s} +\input{_part_%s} +""".replace(' ', '') % (title, outname, outname)) + + +# the program ---- + def rest2latex(script, source_path, dest_path): command = ('%s %s %s %s > %s' % (sys.executable, script, RST2LATEX_OPTIONS, @@ -182,24 +206,18 @@ print "Building %s\n" % TARGET_FILE master = file( os.path.join(dirname, TARGET_FILE), "w") for hln in header: - if hln.startswith("\\documentclass"): + if hln.startswith(r"\documentclass"): #hln = hln.replace('article', 'book') - hln = "\\documentclass[10pt,english]{book}\n\\usepackage[a4paper]{geometry}\n" - elif hln.startswith("\\begin{document}"): + hln = DOCUMENT_CLASS + elif hln.startswith(r"\begin{document}"): # pygments support - master.write("\\usepackage{fancyvrb}\n") - master.write("\\input{_part_pygments.tex}\n") - elif hln.startswith("\\title{"): + master.write(PYGMENTS_IMPORT) + elif hln.startswith(r"\title{"): hln = re.sub("\{[^\}]*\}", '{%s}' % book_title, hln) master.write(hln) master.write("\\tableofcontents\n\n") - def write_chapter(title, outname): - master.write( - "\\chapter{%s}\n\\label{_part_%s}\n\n\\input{_part_%s}\n\n" % ( - title, outname, outname)) - 
for section, text_files in SITE_STRUCTURE: master.write("\\part{%s}\n\n" % section) for filename in text_files: @@ -213,11 +231,11 @@ basename = os.path.splitext(os.path.basename(filename))[0] basename = BASENAME_MAP.get(basename, basename) outname = basename + '.tex' - write_chapter(titles[outname], outname) + write_chapter(master, titles[outname], outname) - write_chapter("Changes", chgname) + write_chapter(master, "Changes", chgname) - master.write("\end{document}\n") + master.write("\\end{document}\n") if __name__ == '__main__': publish(sys.argv[1], sys.argv[2], sys.argv[3]) From scoder at codespeak.net Wed Apr 23 07:15:19 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 07:15:19 +0200 (CEST) Subject: [Lxml-checkins] r54029 - in lxml/trunk: . doc Message-ID: <20080423051519.3A7B72A024E@codespeak.net> Author: scoder Date: Wed Apr 23 07:15:18 2008 New Revision: 54029 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/lxml2.txt Log: r4029 at delle: sbehnel | 2008-04-22 23:40:39 +0200 rst fix Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Wed Apr 23 07:15:18 2008 @@ -183,7 +183,7 @@ - ``etree.clearErrorLog()``, use ``etree.clear_error_log()`` - ``etree.useGlobalPythonLog()``, use - ``etree.use_global_python_log()`` + ``etree.use_global_python_log()`` - ``etree.ElementClassLookup.setFallback()``, use ``etree.ElementClassLookup.set_fallback()`` From scoder at codespeak.net Wed Apr 23 07:15:24 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 07:15:24 +0200 (CEST) Subject: [Lxml-checkins] r54030 - in lxml/trunk: . 
doc Message-ID: <20080423051524.A8FB02A024E@codespeak.net> Author: scoder Date: Wed Apr 23 07:15:24 2008 New Revision: 54030 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4030 at delle: sbehnel | 2008-04-22 23:50:23 +0200 PDF fixes Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Wed Apr 23 07:15:24 2008 @@ -213,13 +213,16 @@ # pygments support master.write(PYGMENTS_IMPORT) elif hln.startswith(r"\title{"): - hln = re.sub("\{[^\}]*\}", '{%s}' % book_title, hln) + hln = re.sub("\{[^\}]*\}", + r'{%s\\\\\\vspace{1em}\\includegraphics{../html/tagpython.png}}' % book_title, hln) + elif hln.startswith("pdftitle"): + hln = re.sub("\{[^\}]*\}", r'{%s}' % book_title, hln) master.write(hln) - master.write("\\tableofcontents\n\n") + master.write("\\tableofcontents\n") for section, text_files in SITE_STRUCTURE: - master.write("\\part{%s}\n\n" % section) + master.write("\n\n\\part{%s}\n" % section) for filename in text_files: if filename.startswith('@'): pass From scoder at codespeak.net Wed Apr 23 07:15:30 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 07:15:30 +0200 (CEST) Subject: [Lxml-checkins] r54031 - lxml/trunk Message-ID: <20080423051530.206222A024E@codespeak.net> Author: scoder Date: Wed Apr 23 07:15:29 2008 New Revision: 54031 Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile Log: r4031 at delle: sbehnel | 2008-04-22 23:53:08 +0200 run pdfopt on PDF docs Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Wed Apr 23 07:15:29 2008 @@ -54,7 +54,7 @@ pdf: $(PYTHON) doc/mklatex.py doc/pdf . 
${LXMLVERSION} (cd doc/pdf && pdflatex lxmldoc.tex && pdflatex lxmldoc.tex) - @mv doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf + @pdfopt doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf @echo "PDF available as doc/pdf/lxmldoc-${LXMLVERSION}.pdf" # Two pdflatex runs are needed to build the correct Table of contents. From scoder at codespeak.net Wed Apr 23 20:22:45 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 20:22:45 +0200 (CEST) Subject: [Lxml-checkins] r54068 - in lxml/trunk: . doc Message-ID: <20080423182245.E087916A3D6@codespeak.net> Author: scoder Date: Wed Apr 23 20:22:45 2008 New Revision: 54068 Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile lxml/trunk/doc/mklatex.py Log: r4037 at delle: sbehnel | 2008-04-23 18:15:02 +0200 include generated API documentation in PDF Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Wed Apr 23 20:22:45 2008 @@ -41,17 +41,30 @@ ftest_inplace: inplace $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS) -html: inplace - PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . ${LXMLVERSION} +apihtml: inplace rm -fr doc/html/api @[ -x "`which epydoc`" ] \ && (cd src && echo "Generating API docs ..." && \ PYTHONPATH=. epydoc -v --docformat "restructuredtext en" \ -o ../doc/html/api --no-private --exclude='[.]html[.]tests|[.]_' \ - --name lxml --url http://codespeak.net/lxml/ lxml/) \ + --name "lxml API" --url http://codespeak.net/lxml/ lxml/) \ + || (echo "not generating epydoc API documentation") + +html: inplace apihtml + PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . ${LXMLVERSION} + +apipdf: inplace + rm -fr doc/pdf + mkdir -p doc/pdf + @[ -x "`which epydoc`" ] \ + && (cd src && echo "Generating API docs ..." && \ + PYTHONPATH=. 
epydoc -v --latex --docformat "restructuredtext en" \ + -o ../doc/pdf --no-private --exclude='([.]html)?[.]tests|[.]_' \ + --exclude-introspect='html[.]clean' \ + --name "lxml API" --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") -pdf: +pdf: apipdf $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION} (cd doc/pdf && pdflatex lxmldoc.tex && pdflatex lxmldoc.tex) @pdfopt doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Wed Apr 23 20:22:45 2008 @@ -5,23 +5,41 @@ from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP import os, shutil, re, sys +try: + set +except NameError: + # Python 2.3 + from sets import Set as set + TARGET_FILE = "lxmldoc.tex" RST2LATEX_OPTIONS = " ".join([ # "--no-toc-backlinks", "--strip-comments", "--language en", -# "--date", + "--date", "--use-latex-footnotes", "--use-latex-citations", "--use-latex-toc", - #"--font-encoding=T1", + "--font-encoding=T1", + "--output-encoding=utf-8", + "--input-encoding=utf-8", ]) htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"} replace_invalid = re.compile(r'[-_/.\s\\]').sub +replace_epydoc_macros = re.compile(r'(,\s*amssymb|dvips\s*,\s*)').sub +replace_rst_macros = re.compile(r'(\\usepackage\{color}|\\usepackage\[[^]]*]\{hyperref})').sub + +FILENAME_MAP = { + "@API reference" : "api.tex" +} + +BASENAME_MAP = BASENAME_MAP.copy() +BASENAME_MAP.update({'api' : 'lxmlapi'}) + # LaTeX snippets DOCUMENT_CLASS = r""" @@ -36,12 +54,18 @@ \input{_part_pygments.tex} """ -def write_chapter(master, title, outname): +EPYDOC_IMPORT = r""" +\input{_part_epydoc.tex} +""" + +def write_chapter(master, title, filename): + filename = os.path.join(os.path.dirname(filename), + "_part_%s" % os.path.basename(filename)) master.write(r""" \chapter{%s} -\label{_part_%s} -\input{_part_%s} 
-""".replace(' ', '') % (title, outname, outname)) +\label{%s} +\input{%s} +""".replace(' ', '') % (title, filename, filename)) # the program ---- @@ -57,8 +81,25 @@ text = LatexFormatter().get_style_defs() f = file(filename, "w") f.write(text) + f.write('\n') f.close() +def copy_epydoc_macros(src, dest, existing_header_lines): + doc = file(src, 'r') + out = file(dest, "w") + for line in doc: + if line.startswith('%% generator'): + break + if line.startswith('%') or \ + r'\documentclass' in line or \ + r'\makeindex' in line: + continue + if line.startswith(r'\usepackage') and line in existing_header_lines: + continue + out.write( replace_epydoc_macros('', line) ) + out.close() + doc.close() + def noop(input): return input @@ -79,6 +120,7 @@ """ title = '' header = [] + add_header_line = header.append global counter_no counter_no = counter_no + 1 counter_text = "listcnt%d" % counter_no @@ -92,8 +134,10 @@ iter_lines = iter(src.readlines()) for l in iter_lines: l = process_line(l) + if not l: + continue if want_header: - header.append(l) + add_header_line(replace_rst_macros('', l)) m = search_title(l) if m: title = m.group(0) @@ -131,6 +175,7 @@ # build pygments macros build_pygments_macros(os.path.join(dirname, '_part_pygments.tex')) + have_epydoc_macros = False # Used in postprocessing of generated LaTeX files header = [] @@ -160,28 +205,43 @@ # Building pages for section, text_files in SITE_STRUCTURE: for filename in text_files: - if filename.startswith('@'): + special = False + if filename in FILENAME_MAP: + outname = FILENAME_MAP[filename] + if not have_epydoc_macros: + have_epydoc_macros = True + copy_epydoc_macros( + os.path.join(dirname, outname), + os.path.join(dirname, '_part_epydoc.tex'), + set(header)) + special = True + elif filename.startswith('@'): print "Not yet implemented: %s" % filename[1:] + continue #page_title = filename[1:] #url = href_map[page_title] #build_menu_entry(page_title, url, section_head) else: - path = os.path.join(doc_dir, 
filename) basename = os.path.splitext(os.path.basename(filename))[0] basename = BASENAME_MAP.get(basename, basename) outname = basename + '.tex' - outpath = os.path.join(dirname, outname) + + outpath = os.path.join(dirname, outname) + print "Creating %s" % outname - print "Creating %s" % outname + if not special: + path = os.path.join(doc_dir, filename) rest2latex(script, path, outpath) - final_name = os.path.join(dirname, "_part_%s" % outname) + final_name = os.path.join(dirname, os.path.dirname(outname), + "_part_%s" % os.path.basename(outname)) - title, hd = tex_postprocess(outpath, final_name, not header, - process_line=fix_relative_hyperrefs) - if not header: - header = hd - titles[outname] = title + title, hd = tex_postprocess(outpath, final_name, + want_header = not header, + process_line=fix_relative_hyperrefs) + if not header: + header = hd + titles[outname] = title # also convert CHANGES.txt find_version_title = re.compile( @@ -209,8 +269,10 @@ if hln.startswith(r"\documentclass"): #hln = hln.replace('article', 'book') hln = DOCUMENT_CLASS + elif hln.startswith("%% generator "): + master.write(EPYDOC_IMPORT) elif hln.startswith(r"\begin{document}"): - # pygments support + # pygments and epydoc support master.write(PYGMENTS_IMPORT) elif hln.startswith(r"\title{"): hln = re.sub("\{[^\}]*\}", @@ -224,8 +286,10 @@ for section, text_files in SITE_STRUCTURE: master.write("\n\n\\part{%s}\n" % section) for filename in text_files: - if filename.startswith('@'): - pass + if filename in FILENAME_MAP: + outname = FILENAME_MAP[filename] + elif filename.startswith('@'): + continue #print "Not yet implemented: %s" % filename[1:] #page_title = filename[1:] #url = href_map[page_title] @@ -234,7 +298,7 @@ basename = os.path.splitext(os.path.basename(filename))[0] basename = BASENAME_MAP.get(basename, basename) outname = basename + '.tex' - write_chapter(master, titles[outname], outname) + write_chapter(master, titles[outname], outname) write_chapter(master, "Changes", 
chgname) From scoder at codespeak.net Wed Apr 23 21:45:40 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 23 Apr 2008 21:45:40 +0200 (CEST) Subject: [Lxml-checkins] r54070 - in lxml/trunk: . doc Message-ID: <20080423194540.7FFA32A025E@codespeak.net> Author: scoder Date: Wed Apr 23 21:45:40 2008 New Revision: 54070 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4039 at delle: sbehnel | 2008-04-23 20:44:49 +0200 PDF cleanup Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Wed Apr 23 21:45:40 2008 @@ -65,7 +65,7 @@ \chapter{%s} \label{%s} \input{%s} -""".replace(' ', '') % (title, filename, filename)) +""" % (title, filename, filename)) # the program ---- @@ -94,8 +94,11 @@ r'\documentclass' in line or \ r'\makeindex' in line: continue - if line.startswith(r'\usepackage') and line in existing_header_lines: - continue + if line.startswith(r'\usepackage'): + if line in existing_header_lines: + continue + if '{hyperref}' in line: + line = line.replace('black', 'blue') out.write( replace_epydoc_macros('', line) ) out.close() doc.close() From scoder at codespeak.net Thu Apr 24 00:46:43 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:46:43 +0200 (CEST) Subject: [Lxml-checkins] r54072 - in lxml/trunk: . 
doc Message-ID: <20080423224643.932B916A223@codespeak.net> Author: scoder Date: Thu Apr 24 00:46:41 2008 New Revision: 54072 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/capi.txt lxml/trunk/doc/tutorial.txt Log: r4041 at delle: sbehnel | 2008-04-23 20:59:38 +0200 doc fixes Modified: lxml/trunk/doc/capi.txt ============================================================================== --- lxml/trunk/doc/capi.txt (original) +++ lxml/trunk/doc/capi.txt Thu Apr 24 00:46:41 2008 @@ -87,11 +87,8 @@ #include "etree.h" /* setup code */ - static PyObject* m_etree; - m_etree = _ADD_YOUR_WAY_TO_IMPORT_A_MODULE_("lxml.etree"); + import_lxml__etree() - import_etree(m_etree); - -Note that including ``etree.h`` does not automatically include the header -files it requires. Note also that the above list of common imports may not be -sufficient. +Note that including ``etree.h`` does not automatically include the +header files it requires. Note also that the above list of common +includes may not be sufficient. Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Thu Apr 24 00:46:41 2008 @@ -1,6 +1,10 @@ ======================= The lxml.etree Tutorial ======================= + +.. meta:: + :description: The lxml tutorial on XML that feels like Python + :keywords: lxml, etree, tutorial, ElementTree, Python, XML, HTML :Author: Stefan Behnel From scoder at codespeak.net Thu Apr 24 00:46:49 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:46:49 +0200 (CEST) Subject: [Lxml-checkins] r54073 - in lxml/trunk: . 
doc Message-ID: <20080423224649.BDBEB16A224@codespeak.net> Author: scoder Date: Thu Apr 24 00:46:48 2008 New Revision: 54073 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt lxml/trunk/doc/mklatex.py Log: r4042 at delle: sbehnel | 2008-04-23 21:23:38 +0200 doc fixes Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Apr 24 00:46:48 2008 @@ -831,7 +831,7 @@ How can I find out which namespace prefixes are used in a document? ------------------------------------------------------------------- -You can traverse the document (``getiterator()``) and collect the prefix +You can traverse the document (``root.iter()``) and collect the prefix attributes from all Elements into a set. However, it is unlikely that you really want to do that. You do not need these prefixes, honestly. You only need the namespace URIs. All namespace comparisons use these, so feel free to Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Thu Apr 24 00:46:48 2008 @@ -3,7 +3,7 @@ # python mklatex.py latex .. 
1.0 from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP -import os, shutil, re, sys +import os, shutil, re, sys, datetime try: set @@ -17,7 +17,7 @@ # "--no-toc-backlinks", "--strip-comments", "--language en", - "--date", +# "--date", "--use-latex-footnotes", "--use-latex-citations", "--use-latex-toc", @@ -29,6 +29,7 @@ htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"} replace_invalid = re.compile(r'[-_/.\s\\]').sub +replace_content = re.compile("\{[^\}]*\}").sub replace_epydoc_macros = re.compile(r'(,\s*amssymb|dvips\s*,\s*)').sub replace_rst_macros = re.compile(r'(\\usepackage\{color}|\\usepackage\[[^]]*]\{hyperref})').sub @@ -278,10 +279,14 @@ # pygments and epydoc support master.write(PYGMENTS_IMPORT) elif hln.startswith(r"\title{"): - hln = re.sub("\{[^\}]*\}", - r'{%s\\\\\\vspace{1em}\\includegraphics{../html/tagpython.png}}' % book_title, hln) + hln = replace_content( + r'{%s\\\\\\vspace{1em}\\includegraphics{../html/tagpython.png}}' % book_title, hln) + elif hln.startswith(r"\date{"): + hln = replace_content( + r'{%s}' % datetime.date.today().isoformat(), hln) elif hln.startswith("pdftitle"): - hln = re.sub("\{[^\}]*\}", r'{%s}' % book_title, hln) + hln = replace_content( + r'{%s}' % book_title, hln) master.write(hln) master.write("\\tableofcontents\n") From scoder at codespeak.net Thu Apr 24 00:46:56 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:46:56 +0200 (CEST) Subject: [Lxml-checkins] r54074 - in lxml/trunk: . doc/html Message-ID: <20080423224656.1945816A225@codespeak.net> Author: scoder Date: Thu Apr 24 00:46:55 2008 New Revision: 54074 Added: lxml/trunk/doc/html/tagpython-big.png (contents, props changed) Modified: lxml/trunk/ (props changed) Log: r4043 at delle: sbehnel | 2008-04-23 21:39:42 +0200 larger copy of the lxml logo Added: lxml/trunk/doc/html/tagpython-big.png ============================================================================== Binary file. No diff available. 
From scoder at codespeak.net Thu Apr 24 00:47:01 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:47:01 +0200 (CEST) Subject: [Lxml-checkins] r54075 - in lxml/trunk: . doc Message-ID: <20080423224701.EC9BA16A223@codespeak.net> Author: scoder Date: Thu Apr 24 00:47:01 2008 New Revision: 54075 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4044 at delle: sbehnel | 2008-04-23 21:46:38 +0200 use bigger logo in PDF Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Thu Apr 24 00:47:01 2008 @@ -179,7 +179,6 @@ # build pygments macros build_pygments_macros(os.path.join(dirname, '_part_pygments.tex')) - have_epydoc_macros = False # Used in postprocessing of generated LaTeX files header = [] @@ -207,6 +206,7 @@ return replace_docinternal_hyperrefs(r'\hyperref[\1]', line) # Building pages + have_epydoc_macros = False for section, text_files in SITE_STRUCTURE: for filename in text_files: special = False @@ -280,7 +280,7 @@ master.write(PYGMENTS_IMPORT) elif hln.startswith(r"\title{"): hln = replace_content( - r'{%s\\\\\\vspace{1em}\\includegraphics{../html/tagpython.png}}' % book_title, hln) + r'{%s\\\\\\vspace{1cm}\\includegraphics[width=2.5cm]{../html/tagpython-big.png}}' % book_title, hln) elif hln.startswith(r"\date{"): hln = replace_content( r'{%s}' % datetime.date.today().isoformat(), hln) From scoder at codespeak.net Thu Apr 24 00:47:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:47:08 +0200 (CEST) Subject: [Lxml-checkins] r54076 - in lxml/trunk: . 
benchmark Message-ID: <20080423224708.21BB716A224@codespeak.net> Author: scoder Date: Thu Apr 24 00:47:07 2008 New Revision: 54076 Modified: lxml/trunk/ (props changed) lxml/trunk/benchmark/bench_objectify.py lxml/trunk/benchmark/benchbase.py Log: r4045 at delle: sbehnel | 2008-04-23 23:44:28 +0200 changed trees to a somewhat more realistic szenario with smaller XML vocabularies - absolute numbers are no longer comparable to previous benchmarks Modified: lxml/trunk/benchmark/bench_objectify.py ============================================================================== --- lxml/trunk/benchmark/bench_objectify.py (original) +++ lxml/trunk/benchmark/bench_objectify.py Thu Apr 24 00:47:07 2008 @@ -46,14 +46,14 @@ def bench_attributes_deep(self, root): "1 2 4" for i in self.repeat3000: - root.zzzzz['{cdefg}z00000'] + root.zzzzz['{cdefg}a00001'] def bench_attributes_deep_cached(self, root): "1 2 4" cache1 = root.zzzzz - cache2 = cache1['{cdefg}z00000'] + cache2 = cache1['{cdefg}a00001'] for i in self.repeat3000: - root.zzzzz['{cdefg}z00000'] + root.zzzzz['{cdefg}a00001'] def bench_objectpath(self, root): "1 2 4" @@ -63,15 +63,15 @@ def bench_objectpath_deep(self, root): "1 2 4" - path = self.objectify.ObjectPath(".zzzzz.{cdefg}z00000") + path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001") for i in self.repeat3000: path(root) def bench_objectpath_deep_cached(self, root): "1 2 4" cache1 = root.zzzzz - cache2 = cache1['{cdefg}z00000'] - path = self.objectify.ObjectPath(".zzzzz.{cdefg}z00000") + cache2 = cache1['{cdefg}a00001'] + path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001") for i in self.repeat3000: path(root) Modified: lxml/trunk/benchmark/benchbase.py ============================================================================== --- lxml/trunk/benchmark/benchbase.py (original) +++ lxml/trunk/benchmark/benchbase.py Thu Apr 24 00:47:07 2008 @@ -199,8 +199,9 @@ el = SubElement(root, "{abc}"+ch1*5, attributes) el.text = text for ch2 in atoz: + tag = 
"{cdefg}%s00001" % ch2 for i in range(20 * TREE_FACTOR): - SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text + SubElement(el, tag).tail = text t = current_time() - t return (root, t) @@ -216,7 +217,7 @@ el = SubElement(root, "{abc}"+ch1*5, attributes) el.text = text for ch2 in atoz: - SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text + SubElement(el, "{cdefg}%s00001" % ch2).tail = text t = current_time() - t return (root, t) @@ -229,7 +230,7 @@ children = [root] for i in range(6 + TREE_FACTOR): tag_no = count().next - children = [ SubElement(c, "{cdefg}a%05d" % i, attributes) + children = [ SubElement(c, "{cdefg}a%05d" % (i%8), attributes) for i,c in enumerate(chain(children, children, children)) ] for child in children: child.text = text From scoder at codespeak.net Thu Apr 24 00:47:14 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 00:47:14 +0200 (CEST) Subject: [Lxml-checkins] r54077 - in lxml/trunk: . doc Message-ID: <20080423224714.0AE1016A225@codespeak.net> Author: scoder Date: Thu Apr 24 00:47:13 2008 New Revision: 54077 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r4046 at delle: sbehnel | 2008-04-23 23:45:33 +0200 updated benchmark results Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Thu Apr 24 00:47:13 2008 @@ -71,16 +71,16 @@ a specific part of the API yourself, please consider sending it to the lxml mailing list. -The timings cited below compare lxml 2.1 (with libxml2 2.6.32) to the -January 2008 SVN trunk versions of ElementTree (1.3alpha) and +The timings cited below compare lxml 2.1 (with libxml2 2.6.33) to the +April 2008 SVN trunk versions of ElementTree (1.3alpha) and cElementTree (1.2.7). They were run single-threaded on a 1.8GHz Intel Core Duo machine under Ubuntu Linux 7.10 (Gutsy). 
The C libraries were compiled with the same platform specific optimisation flags. The Python interpreter (2.5.1) was used as provided by the distribution. -.. _`bench_etree.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_etree.py -.. _`bench_xpath.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_xpath.py -.. _`bench_objectify.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_objectify.py +.. _`bench_etree.py`: http://codespeak.net/svn/lxml/trunk/benchmark/bench_etree.py +.. _`bench_xpath.py`: http://codespeak.net/svn/lxml/trunk/benchmark/bench_xpath.py +.. _`bench_objectify.py`: http://codespeak.net/svn/lxml/trunk/benchmark/bench_objectify.py The scripts run a number of simple tests on the different libraries, using different XML tree configurations: different tree sizes (T1-4), with or @@ -114,84 +114,93 @@ executes entirely at the C level, without any interaction with Python code. The results are rather impressive, especially for UTF-8, which is native to libxml2. 
While 20 to 40 times faster than (c)ElementTree -1.2, lxml is still more than 5 times as fast as the much improved +1.2, lxml is still more than 7 times as fast as the much improved ElementTree 1.3:: - lxe: tostring_utf16 (SATR T1) 19.0921 msec/pass - cET: tostring_utf16 (SATR T1) 129.8430 msec/pass - ET : tostring_utf16 (SATR T1) 136.1301 msec/pass - - lxe: tostring_utf16 (UATR T1) 20.4630 msec/pass - cET: tostring_utf16 (UATR T1) 130.1570 msec/pass - ET : tostring_utf16 (UATR T1) 136.3101 msec/pass - - lxe: tostring_utf16 (S-TR T2) 18.8632 msec/pass - cET: tostring_utf16 (S-TR T2) 136.9388 msec/pass - ET : tostring_utf16 (S-TR T2) 143.9550 msec/pass - - lxe: tostring_utf8 (S-TR T2) 14.4310 msec/pass - cET: tostring_utf8 (S-TR T2) 137.0859 msec/pass - ET : tostring_utf8 (S-TR T2) 144.3110 msec/pass - - lxe: tostring_utf8 (U-TR T3) 2.6381 msec/pass - cET: tostring_utf8 (U-TR T3) 52.1040 msec/pass - ET : tostring_utf8 (U-TR T3) 53.1070 msec/pass + lxe: tostring_utf16 (SATR T1) 25.7590 msec/pass + cET: tostring_utf16 (SATR T1) 179.6291 msec/pass + ET : tostring_utf16 (SATR T1) 188.5638 msec/pass + + lxe: tostring_utf16 (UATR T1) 26.0060 msec/pass + cET: tostring_utf16 (UATR T1) 176.9981 msec/pass + ET : tostring_utf16 (UATR T1) 188.2110 msec/pass + + lxe: tostring_utf16 (S-TR T2) 26.9201 msec/pass + cET: tostring_utf16 (S-TR T2) 182.5061 msec/pass + ET : tostring_utf16 (S-TR T2) 190.2061 msec/pass + + lxe: tostring_utf8 (S-TR T2) 19.5830 msec/pass + cET: tostring_utf8 (S-TR T2) 183.0020 msec/pass + ET : tostring_utf8 (S-TR T2) 187.7251 msec/pass + + lxe: tostring_utf8 (U-TR T3) 5.5292 msec/pass + cET: tostring_utf8 (U-TR T3) 56.1349 msec/pass + ET : tostring_utf8 (U-TR T3) 56.6628 msec/pass + +The same applies to plain text serialisation. 
Note that cElementTree +does not currently support this, as it is new in ET 1.3:: + + lxe: tostring_text_ascii (S-TR T1) 4.5149 msec/pass + ET : tostring_text_ascii (S-TR T1) 87.6551 msec/pass + + lxe: tostring_text_ascii (S-TR T3) 1.2901 msec/pass + ET : tostring_text_ascii (S-TR T3) 27.5211 msec/pass For parsing, on the other hand, the advantage is clearly with cElementTree. The (c)ET libraries use a very thin layer on top of the expat parser, which is known to be extremely fast:: - lxe: parse_stringIO (SAXR T1) 144.1851 msec/pass - cET: parse_stringIO (SAXR T1) 14.4269 msec/pass - ET : parse_stringIO (SAXR T1) 245.9190 msec/pass - - lxe: parse_stringIO (S-XR T3) 5.6100 msec/pass - cET: parse_stringIO (S-XR T3) 5.3229 msec/pass - ET : parse_stringIO (S-XR T3) 82.4831 msec/pass - - lxe: parse_stringIO (UAXR T3) 23.4420 msec/pass - cET: parse_stringIO (UAXR T3) 30.2689 msec/pass - ET : parse_stringIO (UAXR T3) 165.7169 msec/pass + lxe: parse_stringIO (SAXR T1) 40.6771 msec/pass + cET: parse_stringIO (SAXR T1) 19.3741 msec/pass + ET : parse_stringIO (SAXR T1) 355.7711 msec/pass + + lxe: parse_stringIO (S-XR T3) 5.9960 msec/pass + cET: parse_stringIO (S-XR T3) 5.8751 msec/pass + ET : parse_stringIO (S-XR T3) 93.7259 msec/pass + + lxe: parse_stringIO (UAXR T3) 26.2671 msec/pass + cET: parse_stringIO (UAXR T3) 30.6449 msec/pass + ET : parse_stringIO (UAXR T3) 178.8890 msec/pass While about as fast for smaller documents, the expat parser allows cET -to be up to 10 times faster than lxml on plain parser performance for +to be up to 2 times faster than lxml on plain parser performance for large input documents. 
Similar timings can be observed for the ``iterparse()`` function:: - lxe: iterparse_stringIO (SAXR T1) 160.3689 msec/pass - cET: iterparse_stringIO (SAXR T1) 19.1891 msec/pass - ET : iterparse_stringIO (SAXR T1) 274.8971 msec/pass - - lxe: iterparse_stringIO (UAXR T3) 24.9629 msec/pass - cET: iterparse_stringIO (UAXR T3) 31.7740 msec/pass - ET : iterparse_stringIO (UAXR T3) 173.8000 msec/pass + lxe: iterparse_stringIO (SAXR T1) 50.8120 msec/pass + cET: iterparse_stringIO (SAXR T1) 24.9379 msec/pass + ET : iterparse_stringIO (SAXR T1) 388.9420 msec/pass + + lxe: iterparse_stringIO (UAXR T3) 29.0790 msec/pass + cET: iterparse_stringIO (UAXR T3) 32.1240 msec/pass + ET : iterparse_stringIO (UAXR T3) 189.1720 msec/pass However, if you benchmark the complete round-trip of a serialise-parse cycle, the numbers will look similar to these:: - lxe: write_utf8_parse_stringIO (S-TR T1) 160.0718 msec/pass - cET: write_utf8_parse_stringIO (S-TR T1) 207.6778 msec/pass - ET : write_utf8_parse_stringIO (S-TR T1) 450.2120 msec/pass - - lxe: write_utf8_parse_stringIO (UATR T2) 173.5830 msec/pass - cET: write_utf8_parse_stringIO (UATR T2) 253.0849 msec/pass - ET : write_utf8_parse_stringIO (UATR T2) 519.2261 msec/pass - - lxe: write_utf8_parse_stringIO (S-TR T3) 8.4269 msec/pass - cET: write_utf8_parse_stringIO (S-TR T3) 75.7639 msec/pass - ET : write_utf8_parse_stringIO (S-TR T3) 156.1930 msec/pass - - lxe: write_utf8_parse_stringIO (SATR T4) 1.2100 msec/pass - cET: write_utf8_parse_stringIO (SATR T4) 6.4859 msec/pass - ET : write_utf8_parse_stringIO (SATR T4) 9.9051 msec/pass - -For applications that require a high parser throughput and do little -serialization, cET is the best choice. Also for iterparse -applications that extract small amounts of data from large XML data -sets. If it comes to round-trip performance, however, lxml tends to -be between 30% and multiple times faster in total. 
So, whenever the -input documents are not considerably bigger than the output, lxml is -the clear winner. + lxe: write_utf8_parse_stringIO (S-TR T1) 63.7550 msec/pass + cET: write_utf8_parse_stringIO (S-TR T1) 292.0721 msec/pass + ET : write_utf8_parse_stringIO (S-TR T1) 635.2799 msec/pass + + lxe: write_utf8_parse_stringIO (UATR T2) 75.0258 msec/pass + cET: write_utf8_parse_stringIO (UATR T2) 341.7251 msec/pass + ET : write_utf8_parse_stringIO (UATR T2) 713.1951 msec/pass + + lxe: write_utf8_parse_stringIO (S-TR T3) 11.4899 msec/pass + cET: write_utf8_parse_stringIO (S-TR T3) 96.8502 msec/pass + ET : write_utf8_parse_stringIO (S-TR T3) 185.6079 msec/pass + + lxe: write_utf8_parse_stringIO (SATR T4) 1.2081 msec/pass + cET: write_utf8_parse_stringIO (SATR T4) 6.8581 msec/pass + ET : write_utf8_parse_stringIO (SATR T4) 10.6261 msec/pass + +For applications that require a high parser throughput of large files, +and that do little to no serialization, cET is the best choice. Also +for iterparse applications that extract small amounts of data from +large XML data sets that do not fit into the memory. If it comes to +round-trip performance, however, lxml tends to be multiple times +faster in total. So, whenever the input documents are not +considerably larger than the output, lxml is the clear winner. Regarding HTML parsing, Ian Bicking has done some `benchmarking on lxml's HTML parser`_, comparing it to a number of other famous HTML @@ -214,24 +223,25 @@ restructuring. 
This can be seen from the tree setup times of the benchmark (given in seconds):: - lxe: -- S- U- -A SA UA - T1: 0.0792 0.0821 0.0869 0.0741 0.0814 0.0865 - T2: 0.0776 0.0830 0.0885 0.0808 0.0877 0.0933 - T3: 0.0248 0.0231 0.0240 0.0430 0.0444 0.0451 - T4: 0.0003 0.0003 0.0003 0.0007 0.0007 0.0007 - cET: -- S- U- -A SA UA - T1: 0.0272 0.0264 0.0267 0.0268 0.0261 0.0265 - T2: 0.0280 0.0274 0.0273 0.0273 0.0276 0.0275 - T3: 0.0065 0.0066 0.0065 0.0111 0.0088 0.0088 + lxe: -- S- U- -A SA UA + T1: 0.0437 0.0498 0.0516 0.0430 0.0498 0.0519 + T2: 0.0550 0.0643 0.0677 0.0612 0.0685 0.0721 + T3: 0.0168 0.0142 0.0159 0.0338 0.0350 0.0359 + T4: 0.0003 0.0002 0.0003 0.0007 0.0007 0.0007 + cET: -- S- U- -A SA UA + T1: 0.0093 0.0093 0.0093 0.0097 0.0094 0.0094 + T2: 0.0153 0.0155 0.0152 0.0157 0.0154 0.0154 + T3: 0.0076 0.0076 0.0076 0.0099 0.0122 0.0100 T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 - ET : -- S- U- -A SA UA - T1: 0.1302 0.1903 0.2208 0.1265 0.2542 0.1267 - T2: 0.2994 0.1301 0.3402 0.3746 0.1326 0.4170 - T3: 0.0301 0.0310 0.0302 0.0348 0.3654 0.0349 - T4: 0.0006 0.0005 0.0008 0.0006 0.0007 0.0006 + ET : -- S- U- -A SA UA + T1: 0.1074 0.1669 0.1050 0.2054 0.2401 0.1047 + T2: 0.2920 0.1172 0.3393 0.4021 0.1184 0.4216 + T3: 0.0347 0.0331 0.0316 0.0368 0.3944 0.0377 + T4: 0.0006 0.0005 0.0007 0.0006 0.0007 0.0006 + While lxml is still faster than ET in most cases (10-70%), cET can be up to -three times faster than lxml here. One of the reasons is that lxml must +five times faster than lxml here. One of the reasons is that lxml must additionally discard the created Python elements after their use, when they are no longer referenced. ET and cET represent the tree itself through these objects, which reduces the overhead in creating them. 
@@ -255,26 +265,26 @@ This handicap is also visible when accessing single children:: - lxe: first_child (--TR T2) 0.2429 msec/pass - cET: first_child (--TR T2) 0.2170 msec/pass - ET : first_child (--TR T2) 0.9968 msec/pass - - lxe: last_child (--TR T1) 0.2470 msec/pass - cET: last_child (--TR T1) 0.2291 msec/pass - ET : last_child (--TR T1) 0.9830 msec/pass + lxe: first_child (--TR T2) 0.2341 msec/pass + cET: first_child (--TR T2) 0.2198 msec/pass + ET : first_child (--TR T2) 0.8960 msec/pass + + lxe: last_child (--TR T1 ) 0.2549 msec/pass + cET: last_child (--TR T1 ) 0.2251 msec/pass + ET : last_child (--TR T1 ) 0.8969 msec/pass ... unless you also add the time to find a child index in a bigger list. ET and cET use Python lists here, which are based on arrays. The data structure used by libxml2 is a linked tree, and thus, a linked list of children:: - lxe: middle_child (--TR T1) 0.2759 msec/pass - cET: middle_child (--TR T1) 0.2229 msec/pass - ET : middle_child (--TR T1) 1.0030 msec/pass - - lxe: middle_child (--TR T2) 1.7071 msec/pass - cET: middle_child (--TR T2) 0.2229 msec/pass - ET : middle_child (--TR T2) 0.9930 msec/pass + lxe: middle_child (--TR T1) 0.2699 msec/pass + cET: middle_child (--TR T1) 0.2089 msec/pass + ET : middle_child (--TR T1) 0.8910 msec/pass + + lxe: middle_child (--TR T2) 1.9410 msec/pass + cET: middle_child (--TR T2) 0.2151 msec/pass + ET : middle_child (--TR T2) 0.8960 msec/pass Element creation @@ -284,21 +294,21 @@ in. 
This results in a major performance difference for creating independent Elements that end up in independently created documents:: - lxe: create_elements (--TC T2) 2.8961 msec/pass + lxe: create_elements (--TC T2) 1.7340 msec/pass cET: create_elements (--TC T2) 0.1929 msec/pass - ET : create_elements (--TC T2) 1.3590 msec/pass + ET : create_elements (--TC T2) 1.3809 msec/pass Therefore, it is always preferable to create Elements for the document they are supposed to end up in, either as SubElements of an Element or using the explicit ``Element.makeelement()`` call:: - lxe: makeelement (--TC T2) 1.9000 msec/pass - cET: makeelement (--TC T2) 0.3211 msec/pass - ET : makeelement (--TC T2) 1.6358 msec/pass - - lxe: create_subelements (--TC T2) 1.7891 msec/pass - cET: create_subelements (--TC T2) 0.2351 msec/pass - ET : create_subelements (--TC T2) 3.2270 msec/pass + lxe: makeelement (--TC T2) 1.6100 msec/pass + cET: makeelement (--TC T2) 0.3171 msec/pass + ET : makeelement (--TC T2) 1.6270 msec/pass + + lxe: create_subelements (--TC T2) 1.3542 msec/pass + cET: create_subelements (--TC T2) 0.2329 msec/pass + ET : create_subelements (--TC T2) 3.3019 msec/pass So, if the main performance bottleneck of an application is creating large XML trees in memory through calls to Element and SubElement, cET is the best @@ -315,13 +325,13 @@ The following benchmark appends all root children of the second tree to the root of the first tree:: - lxe: append_from_document (--TR T1,T2) 2.7261 msec/pass - cET: append_from_document (--TR T1,T2) 0.2699 msec/pass - ET : append_from_document (--TR T1,T2) 1.2650 msec/pass - - lxe: append_from_document (--TR T3,T4) 0.0460 msec/pass - cET: append_from_document (--TR T3,T4) 0.0169 msec/pass - ET : append_from_document (--TR T3,T4) 0.0820 msec/pass + lxe: append_from_document (--TR T1,T2) 3.0038 msec/pass + cET: append_from_document (--TR T1,T2) 0.2639 msec/pass + ET : append_from_document (--TR T1,T2) 1.2522 msec/pass + + lxe: append_from_document 
(--TR T3,T4) 0.0398 msec/pass + cET: append_from_document (--TR T3,T4) 0.0160 msec/pass + ET : append_from_document (--TR T3,T4) 0.0811 msec/pass Although these are fairly small numbers compared to parsing, this easily shows the different performance classes for lxml and (c)ET. Where the latter do not @@ -332,24 +342,26 @@ This difference is not always as visible, but applies to most parts of the API, like inserting newly created elements:: - lxe: insert_from_document (--TR T1,T2) 5.7020 msec/pass - cET: insert_from_document (--TR T1,T2) 0.4041 msec/pass - ET : insert_from_document (--TR T1,T2) 1.4789 msec/pass + lxe: insert_from_document (--TR T1,T2) 4.9140 msec/pass + cET: insert_from_document (--TR T1,T2) 0.4108 msec/pass + ET : insert_from_document (--TR T1,T2) 1.4670 msec/pass or replacing the child slice by a newly created element:: - lxe: replace_children_element (--TC T1) 0.2210 msec/pass + lxe: replace_children_element (--TC T1) 0.1500 msec/pass cET: replace_children_element (--TC T1) 0.0238 msec/pass ET : replace_children_element (--TC T1) 0.1600 msec/pass as opposed to replacing the slice with an existing element from the same document:: - lxe: replace_children (--TC T1) 0.0179 msec/pass + lxe: replace_children (--TC T1) 0.0160 msec/pass cET: replace_children (--TC T1) 0.0119 msec/pass - ET : replace_children (--TC T1) 0.0739 msec/pass + ET : replace_children (--TC T1) 0.0741 msec/pass -You should keep this difference in mind when you merge very large trees. +While these numbers are too small to provide a major performance +impact in practice, you should keep this difference in mind when you +merge very large trees. 
deepcopy @@ -357,17 +369,17 @@ Deep copying a tree is fast in lxml:: - lxe: deepcopy_all (--TR T1) 9.7558 msec/pass - cET: deepcopy_all (--TR T1) 120.6188 msec/pass - ET : deepcopy_all (--TR T1) 902.6880 msec/pass - - lxe: deepcopy_all (-ATR T2) 12.3210 msec/pass - cET: deepcopy_all (-ATR T2) 136.9810 msec/pass - ET : deepcopy_all (-ATR T2) 944.2801 msec/pass - - lxe: deepcopy_all (S-TR T3) 8.3981 msec/pass - cET: deepcopy_all (S-TR T3) 35.6541 msec/pass - ET : deepcopy_all (S-TR T3) 221.6041 msec/pass + lxe: deepcopy_all (--TR T1) 9.4090 msec/pass + cET: deepcopy_all (--TR T1) 120.1589 msec/pass + ET : deepcopy_all (--TR T1) 901.3789 msec/pass + + lxe: deepcopy_all (-ATR T2) 12.4569 msec/pass + cET: deepcopy_all (-ATR T2) 135.8809 msec/pass + ET : deepcopy_all (-ATR T2) 940.7840 msec/pass + + lxe: deepcopy_all (S-TR T3) 2.7640 msec/pass + cET: deepcopy_all (S-TR T3) 30.1108 msec/pass + ET : deepcopy_all (S-TR T3) 228.4350 msec/pass So, for example, if you have a database-like scenario where you parse in a large tree and then search and copy independent subtrees from it for further @@ -382,42 +394,42 @@ especially if few elements are of interest or the target element tag name is known, lxml is a good choice:: - lxe: getiterator_all (--TR T1) 5.7251 msec/pass - cET: getiterator_all (--TR T1) 39.9489 msec/pass - ET : getiterator_all (--TR T1) 23.0000 msec/pass - - lxe: getiterator_islice (--TR T2) 0.0830 msec/pass - cET: getiterator_islice (--TR T2) 0.3440 msec/pass - ET : getiterator_islice (--TR T2) 0.2429 msec/pass - - lxe: getiterator_tag (--TR T2) 0.3011 msec/pass - cET: getiterator_tag (--TR T2) 14.1001 msec/pass - ET : getiterator_tag (--TR T2) 7.4241 msec/pass - - lxe: getiterator_tag_all (--TR T2) 0.6340 msec/pass - cET: getiterator_tag_all (--TR T2) 40.7901 msec/pass - ET : getiterator_tag_all (--TR T2) 21.0390 msec/pass + lxe: getiterator_all (--TR T1) 5.0449 msec/pass + cET: getiterator_all (--TR T1) 42.0539 msec/pass + ET : getiterator_all (--TR T1) 
22.9158 msec/pass + + lxe: getiterator_islice (--TR T2) 0.0789 msec/pass + cET: getiterator_islice (--TR T2) 0.3579 msec/pass + ET : getiterator_islice (--TR T2) 0.2351 msec/pass + + lxe: getiterator_tag (--TR T2) 0.0651 msec/pass + cET: getiterator_tag (--TR T2) 0.7648 msec/pass + ET : getiterator_tag (--TR T2) 0.4380 msec/pass + + lxe: getiterator_tag_all (--TR T2) 0.8650 msec/pass + cET: getiterator_tag_all (--TR T2) 42.7120 msec/pass + ET : getiterator_tag_all (--TR T2) 21.5559 msec/pass This translates directly into similar timings for ``Element.findall()``:: - lxe: findall (--TR T2) 7.8950 msec/pass - cET: findall (--TR T2) 44.5340 msec/pass - ET : findall (--TR T2) 27.1149 msec/pass - - lxe: findall (--TR T3) 1.7281 msec/pass - cET: findall (--TR T3) 12.9611 msec/pass - ET : findall (--TR T3) 8.6131 msec/pass - - lxe: findall_tag (--TR T2) 0.7720 msec/pass - cET: findall_tag (--TR T2) 40.6358 msec/pass - ET : findall_tag (--TR T2) 21.4581 msec/pass - - lxe: findall_tag (--TR T3) 0.2050 msec/pass - cET: findall_tag (--TR T3) 9.6831 msec/pass - ET : findall_tag (--TR T3) 5.2109 msec/pass + lxe: findall (--TR T2) 6.8750 msec/pass + cET: findall (--TR T2) 46.8600 msec/pass + ET : findall (--TR T2) 27.0121 msec/pass + + lxe: findall (--TR T3) 1.5690 msec/pass + cET: findall (--TR T3) 13.6340 msec/pass + ET : findall (--TR T3) 8.8100 msec/pass + + lxe: findall_tag (--TR T2) 1.0221 msec/pass + cET: findall_tag (--TR T2) 42.8400 msec/pass + ET : findall_tag (--TR T2) 21.4801 msec/pass + + lxe: findall_tag (--TR T3) 0.4241 msec/pass + cET: findall_tag (--TR T3) 10.7069 msec/pass + ET : findall_tag (--TR T3) 5.8560 msec/pass Note that all three libraries currently use the same Python implementation for -``findall()``, except for their native tree iterator. +``findall()``, except for their native tree iterator (``element.iter()``). XPath @@ -430,38 +442,38 @@ of the lxml API you use. 
The most straight forward way is to call the ``xpath()`` method on an Element or ElementTree:: - lxe: xpath_method (--TC T1) 1.7459 msec/pass - lxe: xpath_method (--TC T2) 22.0850 msec/pass - lxe: xpath_method (--TC T3) 0.1309 msec/pass - lxe: xpath_method (--TC T4) 1.0772 msec/pass + lxe: xpath_method (--TC T1) 1.5969 msec/pass + lxe: xpath_method (--TC T2) 21.3680 msec/pass + lxe: xpath_method (--TC T3) 0.1218 msec/pass + lxe: xpath_method (--TC T4) 1.0300 msec/pass This is well suited for testing and when the XPath expressions are as diverse as the trees they are called on. However, if you have a single XPath expression that you want to apply to a larger number of different elements, the ``XPath`` class is the most efficient way to do it:: - lxe: xpath_class (--TC T1) 0.6740 msec/pass - lxe: xpath_class (--TC T2) 3.1760 msec/pass - lxe: xpath_class (--TC T3) 0.0548 msec/pass - lxe: xpath_class (--TC T4) 0.1700 msec/pass + lxe: xpath_class (--TC T1) 0.6590 msec/pass + lxe: xpath_class (--TC T2) 2.9969 msec/pass + lxe: xpath_class (--TC T3) 0.0520 msec/pass + lxe: xpath_class (--TC T4) 0.1619 msec/pass Note that this still allows you to use variables in the expression, so you can parse it once and then adapt it through variables at call time. 
In other cases, where you have a fixed Element or ElementTree and want to run different expressions on it, you should consider the ``XPathEvaluator``:: - lxe: xpath_element (--TR T1) 0.4151 msec/pass - lxe: xpath_element (--TR T2) 11.6129 msec/pass - lxe: xpath_element (--TR T3) 0.1299 msec/pass - lxe: xpath_element (--TR T4) 0.3409 msec/pass + lxe: xpath_element (--TR T1) 0.4120 msec/pass + lxe: xpath_element (--TR T2) 11.5321 msec/pass + lxe: xpath_element (--TR T3) 0.1152 msec/pass + lxe: xpath_element (--TR T4) 0.3202 msec/pass While it looks slightly slower, creating an XPath object for each of the expressions generates a much higher overhead here:: - lxe: xpath_class_repeat (--TC T1) 1.6699 msec/pass - lxe: xpath_class_repeat (--TC T2) 20.4420 msec/pass - lxe: xpath_class_repeat (--TC T3) 0.1230 msec/pass - lxe: xpath_class_repeat (--TC T4) 0.9859 msec/pass + lxe: xpath_class_repeat (--TC T1) 1.5409 msec/pass + lxe: xpath_class_repeat (--TC T2) 20.2711 msec/pass + lxe: xpath_class_repeat (--TC T3) 0.1161 msec/pass + lxe: xpath_class_repeat (--TC T4) 0.9799 msec/pass A longer example @@ -628,21 +640,21 @@ tree. 
It avoids step-by-step Python element instantiations along the path, which can substantially improve the access time:: - lxe: attribute (--TR T1) 9.4581 msec/pass - lxe: attribute (--TR T2) 52.5560 msec/pass - lxe: attribute (--TR T4) 9.1729 msec/pass - - lxe: objectpath (--TR T1) 4.8690 msec/pass - lxe: objectpath (--TR T2) 47.8780 msec/pass - lxe: objectpath (--TR T4) 4.7870 msec/pass - - lxe: attributes_deep (--TR T1) 54.7471 msec/pass - lxe: attributes_deep (--TR T2) 62.7451 msec/pass - lxe: attributes_deep (--TR T4) 15.1050 msec/pass - - lxe: objectpath_deep (--TR T1) 48.2810 msec/pass - lxe: objectpath_deep (--TR T2) 51.3949 msec/pass - lxe: objectpath_deep (--TR T4) 6.1419 msec/pass + lxe: attribute (--TR T1) 8.4081 msec/pass + lxe: attribute (--TR T2) 51.3301 msec/pass + lxe: attribute (--TR T4) 8.2269 msec/pass + + lxe: objectpath (--TR T1) 4.6120 msec/pass + lxe: objectpath (--TR T2) 47.0440 msec/pass + lxe: objectpath (--TR T4) 4.4930 msec/pass + + lxe: attributes_deep (--TR T1) 12.6550 msec/pass + lxe: attributes_deep (--TR T2) 56.0241 msec/pass + lxe: attributes_deep (--TR T4) 12.5690 msec/pass + + lxe: objectpath_deep (--TR T1) 5.9190 msec/pass + lxe: objectpath_deep (--TR T2) 49.6972 msec/pass + lxe: objectpath_deep (--TR T4) 5.7530 msec/pass Note, however, that parsing ObjectPath expressions is not for free either, so this is most effective for frequently accessing the same element. 
@@ -672,17 +684,17 @@ subtrees and elements) to cache, you can trade memory usage against access speed:: - lxe: attribute_cached (--TR T1) 7.5061 msec/pass - lxe: attribute_cached (--TR T2) 50.1881 msec/pass - lxe: attribute_cached (--TR T4) 7.4170 msec/pass - - lxe: attributes_deep_cached (--TR T1) 48.7239 msec/pass - lxe: attributes_deep_cached (--TR T2) 55.2199 msec/pass - lxe: attributes_deep_cached (--TR T4) 9.9740 msec/pass - - lxe: objectpath_deep_cached (--TR T1) 43.4160 msec/pass - lxe: objectpath_deep_cached (--TR T2) 47.6480 msec/pass - lxe: objectpath_deep_cached (--TR T4) 3.4680 msec/pass + lxe: attribute_cached (--TR T1) 6.4209 msec/pass + lxe: attribute_cached (--TR T2) 48.0378 msec/pass + lxe: attribute_cached (--TR T4) 6.3779 msec/pass + + lxe: attributes_deep_cached (--TR T1) 7.8559 msec/pass + lxe: attributes_deep_cached (--TR T2) 51.0719 msec/pass + lxe: attributes_deep_cached (--TR T4) 7.7350 msec/pass + + lxe: objectpath_deep_cached (--TR T1) 3.2761 msec/pass + lxe: objectpath_deep_cached (--TR T2) 45.7590 msec/pass + lxe: objectpath_deep_cached (--TR T4) 3.1459 msec/pass Things to note: you cannot currently use ``weakref.WeakKeyDictionary`` objects for this as lxml's element objects do not support weak references (which are From scoder at codespeak.net Thu Apr 24 22:04:05 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:05 +0200 (CEST) Subject: [Lxml-checkins] r54102 - in lxml/trunk: . 
doc Message-ID: <20080424200405.8A30116A019@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:01 2008 New Revision: 54102 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r4053 at delle: sbehnel | 2008-04-24 07:56:37 +0200 doc update: make clear in performance.txt that lxml really is fast Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Thu Apr 24 22:04:01 2008 @@ -10,11 +10,26 @@ :keywords: lxml performance, lxml.etree, lxml.objectify, benchmarks, ElementTree -As an XML library, lxml.etree is very fast. It is also slow. As with -all software, it depends on what you do with it. Rest assured that -lxml is fast enough for most applications, so lxml is probably -somewhere between 'fast enough' and 'the best choice' for yours. Read -some messages_ from happy_ users_ to see what we mean. +lxml.etree is a very fast XML library. Most of this is due to the +speed of libxml2, e.g. the parser and serialiser, or the XPath engine. +Other areas of lxml were specifically written for high performance in +high-level operations, such as the tree iterators. + +On the other hand, the simplicity of lxml sometimes hides internal +operations that are more costly than the API suggests. If you are not +aware of these cases, lxml may not always perform as you expect. A +common example in the Python world is the Python list type. New users +often expect it to be a linked list, while it actually is implemented +as an array, which results in a completely different complexity for +common operations. + +Similarly, the tree model of libxml2 is more complex than what lxml's +ElementTree API projects into Python space, so some operations may +show unexpected performance. Rest assured that most lxml users will +not notice this in real life, as lxml is very fast in absolute +numbers. 
It is definitely fast enough for most applications, so lxml +is probably somewhere between 'fast enough' and 'the best choice' for +yours. Read some messages_ from happy_ users_ to see what we mean. .. _messages: http://permalink.gmane.org/gmane.comp.python.lxml.devel/3250 .. _happy: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 @@ -235,7 +250,7 @@ T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 ET : -- S- U- -A SA UA T1: 0.1074 0.1669 0.1050 0.2054 0.2401 0.1047 - T2: 0.2920 0.1172 0.3393 0.4021 0.1184 0.4216 + T2: 0.2920 0.1172 0.3393 0.3830 0.1184 0.4215 T3: 0.0347 0.0331 0.0316 0.0368 0.3944 0.0377 T4: 0.0006 0.0005 0.0007 0.0006 0.0007 0.0006 From scoder at codespeak.net Thu Apr 24 22:04:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:10 +0200 (CEST) Subject: [Lxml-checkins] r54103 - in lxml/trunk: . benchmark Message-ID: <20080424200410.31C1816A088@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:08 2008 New Revision: 54103 Modified: lxml/trunk/ (props changed) lxml/trunk/benchmark/bench_etree.py lxml/trunk/benchmark/bench_objectify.py lxml/trunk/benchmark/bench_xpath.py lxml/trunk/benchmark/benchbase.py Log: r4054 at delle: sbehnel | 2008-04-24 07:57:21 +0200 faster benchmark runs: avoid rebuilding trees when the benchmark does not change them Modified: lxml/trunk/benchmark/bench_etree.py ============================================================================== --- lxml/trunk/benchmark/bench_etree.py (original) +++ lxml/trunk/benchmark/bench_etree.py Thu Apr 24 22:04:08 2008 @@ -3,7 +3,8 @@ from StringIO import StringIO import benchbase -from benchbase import with_attributes, with_text, onlylib, serialized, children +from benchbase import (with_attributes, with_text, onlylib, + serialized, children, nochange) TEXT = "some ASCII text" UTEXT = u"some klingon: \F8D2" @@ -13,39 +14,47 @@ ############################################################ class BenchMark(benchbase.TreeBenchMark): + 
@nochange def bench_iter_children(self, root): for child in root: pass + @nochange def bench_iter_children_reversed(self, root): for child in reversed(root): pass + @nochange def bench_first_child(self, root): - for i in range(1000): + for i in self.repeat1000: child = root[0] + @nochange def bench_last_child(self, root): - for i in range(1000): + for i in self.repeat1000: child = root[-1] + @nochange def bench_middle_child(self, root): pos = len(root) / 2 - for i in range(1000): + for i in self.repeat1000: child = root[pos] + @nochange @with_attributes(False) @with_text(text=True) @onlylib('lxe', 'ET') def bench_tostring_text_ascii(self, root): self.etree.tostring(root, method="text") + @nochange @with_attributes(False) @with_text(text=True, utext=True) @onlylib('lxe') def bench_tostring_text_utf16(self, root): self.etree.tostring(root, method="text", encoding='UTF-16') + @nochange @with_attributes(False) @with_text(text=True, utext=True) @onlylib('lxe', 'ET') @@ -55,6 +64,7 @@ self.etree.tostring(child, method="text", encoding='UTF-8', with_tail=True) + @nochange @with_attributes(False) @with_text(text=True, utext=True) @onlylib('lxe') @@ -63,22 +73,26 @@ for child in children: self.etree.tostring(child, method="text", encoding=unicode) + @nochange @with_attributes(True, False) @with_text(text=True, utext=True) def bench_tostring_utf8(self, root): self.etree.tostring(root, encoding='UTF-8') + @nochange @with_attributes(True, False) @with_text(text=True, utext=True) def bench_tostring_utf16(self, root): self.etree.tostring(root, encoding='UTF-16') + @nochange @with_attributes(True, False) @with_text(text=True, utext=True) def bench_tostring_utf8_unicode_XML(self, root): xml = unicode(self.etree.tostring(root, encoding='UTF-8'), 'UTF-8') self.etree.XML(xml) + @nochange @with_attributes(True, False) @with_text(text=True, utext=True) def bench_write_utf8_parse_stringIO(self, root): @@ -149,12 +163,14 @@ def bench_clear(self, root): root.clear() + @nochange @children 
def bench_has_children(self, children): for child in children: if child and child and child and child and child: pass + @nochange @children def bench_len(self, children): for child in children: @@ -172,12 +188,14 @@ el = Element('{test}test') child.append(el) + @nochange @children def bench_makeelement(self, children): empty_attrib = {} for child in children: child.makeelement('{test}test', empty_attrib) + @nochange @children def bench_create_elements(self, children): Element = self.etree.Element @@ -224,28 +242,34 @@ for child in children: child.get('a') + @nochange def bench_root_getchildren(self, root): root.getchildren() + @nochange def bench_root_list_children(self, root): list(root) + @nochange @children def bench_getchildren(self, children): for child in children: child.getchildren() + @nochange @children def bench_get_children_slice(self, children): for child in children: child[:] + @nochange @children def bench_get_children_slice_2x(self, children): for child in children: child[:] child[:] + @nochange @children @with_attributes(True, False) @with_text(utext=True, text=True, no_text=True) @@ -253,34 +277,38 @@ for child in children: copy.deepcopy(child) + @nochange @with_attributes(True, False) @with_text(utext=True, text=True, no_text=True) def bench_deepcopy_all(self, root): copy.deepcopy(root) + @nochange @children def bench_tag(self, children): for child in children: child.tag + @nochange @children def bench_tag_repeat(self, children): for child in children: - for i in repeat(0, 100): + for i in self.repeat100: child.tag + @nochange @with_text(utext=True, text=True, no_text=True) @children def bench_text(self, children): for child in children: child.text + @nochange @with_text(utext=True, text=True, no_text=True) @children def bench_text_repeat(self, children): - repeat = range(500) for child in children: - for i in repeat: + for i in self.repeat500: child.text @children @@ -295,65 +323,82 @@ for child in children: child.text = text + @nochange 
@onlylib('lxe') def bench_index(self, root): for child in root: root.index(child) + @nochange @onlylib('lxe') def bench_index_slice(self, root): for child in root[5:100]: root.index(child, 5, 100) + @nochange @onlylib('lxe') def bench_index_slice_neg(self, root): for child in root[-100:-5]: root.index(child, start=-100, stop=-5) + @nochange def bench_getiterator_all(self, root): list(root.getiterator()) + @nochange def bench_getiterator_islice(self, root): list(islice(root.getiterator(), 10, 110)) + @nochange def bench_getiterator_tag(self, root): list(islice(root.getiterator(self.SEARCH_TAG), 3, 10)) + @nochange def bench_getiterator_tag_all(self, root): list(root.getiterator(self.SEARCH_TAG)) + @nochange def bench_getiterator_tag_none(self, root): list(root.getiterator("{ThisShould}NeverExist")) + @nochange def bench_getiterator_tag_text(self, root): [ e.text for e in root.getiterator(self.SEARCH_TAG) ] + @nochange def bench_findall(self, root): root.findall(".//*") + @nochange def bench_findall_child(self, root): root.findall(".//*/" + self.SEARCH_TAG) + @nochange def bench_findall_tag(self, root): root.findall(".//" + self.SEARCH_TAG) + @nochange def bench_findall_path(self, root): root.findall(".//*[%s]/./%s/./*" % (self.SEARCH_TAG, self.SEARCH_TAG)) + @nochange @onlylib('lxe') def bench_xpath_path(self, root): ns, tag = self.SEARCH_TAG[1:].split('}') root.xpath(".//*[p:%s]/./p:%s/./*" % (tag,tag), namespaces = {'p':ns}) + @nochange @onlylib('lxe') def bench_iterfind(self, root): list(root.iterfind(".//*")) + @nochange @onlylib('lxe') def bench_iterfind_tag(self, root): list(root.iterfind(".//" + self.SEARCH_TAG)) + @nochange @onlylib('lxe') def bench_iterfind_islice(self, root): list(islice(root.iterfind(".//*"), 10, 110)) Modified: lxml/trunk/benchmark/bench_objectify.py ============================================================================== --- lxml/trunk/benchmark/bench_objectify.py (original) +++ lxml/trunk/benchmark/bench_objectify.py Thu Apr 24 
22:04:08 2008 @@ -3,7 +3,8 @@ from StringIO import StringIO import benchbase -from benchbase import with_attributes, with_text, onlylib, serialized, children +from benchbase import (with_attributes, with_text, onlylib, + serialized, children, nochange) ############################################################ # Benchmarks @@ -22,6 +23,7 @@ parser.setElementClassLookup(lookup) super(BenchMark, self).__init__(etree, parser) + @nochange def bench_attribute(self, root): "1 2 4" for i in self.repeat3000: @@ -37,17 +39,20 @@ for i in self.repeat3000: root.XYZ = "5" + @nochange def bench_attribute_cached(self, root): "1 2 4" cache = root.zzzzz for i in self.repeat3000: root.zzzzz + @nochange def bench_attributes_deep(self, root): "1 2 4" for i in self.repeat3000: root.zzzzz['{cdefg}a00001'] + @nochange def bench_attributes_deep_cached(self, root): "1 2 4" cache1 = root.zzzzz @@ -55,18 +60,21 @@ for i in self.repeat3000: root.zzzzz['{cdefg}a00001'] + @nochange def bench_objectpath(self, root): "1 2 4" path = self.objectify.ObjectPath(".zzzzz") for i in self.repeat3000: path(root) + @nochange def bench_objectpath_deep(self, root): "1 2 4" path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001") for i in self.repeat3000: path(root) + @nochange def bench_objectpath_deep_cached(self, root): "1 2 4" cache1 = root.zzzzz @@ -79,9 +87,11 @@ def bench_annotate(self, root): self.objectify.annotate(root) + @nochange def bench_descendantpaths(self, root): root.descendantpaths() + @nochange @with_text(text=True) def bench_type_inference(self, root): "1 2 4" @@ -89,6 +99,7 @@ for i in self.repeat1000: el.getchildren() + @nochange @with_text(text=True) def bench_type_inference_annotated(self, root): "1 2 4" @@ -97,6 +108,7 @@ for i in self.repeat1000: el.getchildren() + @nochange @children def bench_elementmaker(self, children): E = self.objectify.E Modified: lxml/trunk/benchmark/bench_xpath.py ============================================================================== --- 
lxml/trunk/benchmark/bench_xpath.py (original) +++ lxml/trunk/benchmark/bench_xpath.py Thu Apr 24 22:04:08 2008 @@ -10,6 +10,7 @@ ############################################################ class XPathBenchMark(benchbase.TreeBenchMark): + @nochange @onlylib('lxe') @children def bench_xpath_class(self, children): @@ -17,6 +18,7 @@ for child in children: xpath(child) + @nochange @onlylib('lxe') @children def bench_xpath_class_repeat(self, children): @@ -24,18 +26,21 @@ xpath = self.etree.XPath("./*[0]") xpath(child) + @nochange @onlylib('lxe') def bench_xpath_element(self, root): xpath = self.etree.XPathElementEvaluator(root) for child in root: xpath.evaluate("./*[0]") + @nochange @onlylib('lxe') @children def bench_xpath_method(self, children): for child in children: child.xpath("./*[0]") + @nochange @onlylib('lxe') @children def bench_xpath_old_extensions(self, children): @@ -50,6 +55,7 @@ for child in children: xpath(child) + @nochange @onlylib('lxe') @children def bench_xpath_extensions(self, children): Modified: lxml/trunk/benchmark/benchbase.py ============================================================================== --- lxml/trunk/benchmark/benchbase.py (original) +++ lxml/trunk/benchmark/benchbase.py Thu Apr 24 22:04:08 2008 @@ -18,6 +18,7 @@ def initArgs(argv): + global TREE_FACTOR try: argv.remove('-l') # use large trees @@ -83,6 +84,11 @@ function.CHILDREN = True return function +def nochange(function): + "Decorator for benchmarks that do not change the XML tree" + function.NO_CHANGE = True + return function + ############################################################ # benchmark baseclass ############################################################ @@ -92,6 +98,9 @@ class TreeBenchMark(object): atoz = string.ascii_lowercase + repeat100 = range(100) + repeat500 = range(500) + repeat1000 = range(1000) _LIB_NAME_MAP = { 'etree' : 'lxe', @@ -286,12 +295,14 @@ serialized = getattr(method, 'STRING', False) children = getattr(method, 'CHILDREN', False) + 
no_change = getattr(method, 'NO_CHANGE', False) for tree_tuple in tree_tuples: for tn in sorted(getattr(method, 'TEXT', (0,))): for an in sorted(getattr(method, 'ATTRIBUTES', (0,))): benchmarks.append((name, method_call, tree_tuple, - tn, an, serialized, children)) + tn, an, serialized, children, + no_change)) return benchmarks @@ -349,7 +360,8 @@ print " T%d:" % (i+1), ' '.join("%6.4f" % t for t in tree_times) print -def runBench(suite, method_name, method_call, tree_set, tn, an, serial, children): +def runBench(suite, method_name, method_call, tree_set, tn, an, + serial, children, no_change): if method_call is None: raise SkippedTest @@ -359,14 +371,19 @@ tree_builders = [ suite.tree_builder(tree, tn, an, serial, children) for tree in tree_set ] + if no_change or serial: + args = tuple([ build() for build in tree_builders ]) + else: + args = () + times = [] - args = () + gc.collect() for i in range(3): - gc.collect() gc.disable() t = -1 for i in call_repeat: - args = [ build() for build in tree_builders ] + if not no_change and not serial: + args = [ build() for build in tree_builders ] t_one_call = current_time() method_call(*args) t_one_call = current_time() - t_one_call @@ -376,14 +393,16 @@ t = min(t, t_one_call) times.append(1000.0 * t) gc.enable() - del args + gc.collect() + if not isinstance(args, tuple): + del args return times def runBenchmarks(benchmark_suites, benchmarks): for bench_calls in izip(*benchmarks): for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)): bench_name = benchmark_setup[0] - tree_set_name = build_treeset_name(*benchmark_setup[-5:]) + tree_set_name = build_treeset_name(*benchmark_setup[-6:-1]) print "%-3s: %-28s" % (bench.lib_name, bench_name[6:34]), print "(%-10s)" % tree_set_name, sys.stdout.flush() From scoder at codespeak.net Thu Apr 24 22:04:14 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:14 +0200 (CEST) Subject: [Lxml-checkins] r54104 - in lxml/trunk: 
. benchmark Message-ID: <20080424200414.7D8FC16A089@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:14 2008 New Revision: 54104 Modified: lxml/trunk/ (props changed) lxml/trunk/benchmark/bench_etree.py Log: r4055 at delle: sbehnel | 2008-04-24 09:53:44 +0200 fix benchmark Modified: lxml/trunk/benchmark/bench_etree.py ============================================================================== --- lxml/trunk/benchmark/bench_etree.py (original) +++ lxml/trunk/benchmark/bench_etree.py Thu Apr 24 22:04:14 2008 @@ -57,7 +57,7 @@ @nochange @with_attributes(False) @with_text(text=True, utext=True) - @onlylib('lxe', 'ET') + @onlylib('lxe') @children def bench_tostring_text_utf8_with_tail(self, children): for child in children: From scoder at codespeak.net Thu Apr 24 22:04:19 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:19 +0200 (CEST) Subject: [Lxml-checkins] r54105 - in lxml/trunk: . benchmark Message-ID: <20080424200419.53A5516A087@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:18 2008 New Revision: 54105 Modified: lxml/trunk/ (props changed) lxml/trunk/benchmark/benchbase.py Log: r4056 at delle: sbehnel | 2008-04-24 09:55:51 +0200 more reliable benchmark numbers Modified: lxml/trunk/benchmark/benchbase.py ============================================================================== --- lxml/trunk/benchmark/benchbase.py (original) +++ lxml/trunk/benchmark/benchbase.py Thu Apr 24 22:04:18 2008 @@ -371,18 +371,18 @@ tree_builders = [ suite.tree_builder(tree, tn, an, serial, children) for tree in tree_set ] - if no_change or serial: - args = tuple([ build() for build in tree_builders ]) - else: - args = () + rebuild_trees = not no_change and not serial + + args = tuple([ build() for build in tree_builders ]) + method_call(*args) # run once to skip setup overhead times = [] - gc.collect() for i in range(3): + gc.collect() gc.disable() t = -1 for i in call_repeat: - if not no_change and not serial: + if rebuild_trees: 
args = [ build() for build in tree_builders ] t_one_call = current_time() method_call(*args) @@ -393,9 +393,10 @@ t = min(t, t_one_call) times.append(1000.0 * t) gc.enable() - gc.collect() - if not isinstance(args, tuple): - del args + if rebuild_trees: + args = () + args = () + gc.collect() return times def runBenchmarks(benchmark_suites, benchmarks): From scoder at codespeak.net Thu Apr 24 22:04:23 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:23 +0200 (CEST) Subject: [Lxml-checkins] r54106 - in lxml/trunk: . doc Message-ID: <20080424200423.2679316A08A@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:22 2008 New Revision: 54106 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4057 at delle: sbehnel | 2008-04-24 10:50:47 +0200 doc PDF: move changelog and API docs into appendix, some simplification in mklatex.py Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Thu Apr 24 22:04:22 2008 @@ -2,7 +2,7 @@ # Testing: # python mklatex.py latex .. 
1.0 -from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP +from docstructure import SITE_STRUCTURE, BASENAME_MAP import os, shutil, re, sys, datetime try: @@ -34,10 +34,6 @@ replace_epydoc_macros = re.compile(r'(,\s*amssymb|dvips\s*,\s*)').sub replace_rst_macros = re.compile(r'(\\usepackage\{color}|\\usepackage\[[^]]*]\{hyperref})').sub -FILENAME_MAP = { - "@API reference" : "api.tex" -} - BASENAME_MAP = BASENAME_MAP.copy() BASENAME_MAP.update({'api' : 'lxmlapi'}) @@ -173,10 +169,6 @@ shutil.copy(pubkey, dirname) - href_map = HREF_MAP.copy() - changelog_basename = 'changes-%s' % release - href_map['Release Changelog'] = changelog_basename + '.tex' - # build pygments macros build_pygments_macros(os.path.join(dirname, '_part_pygments.tex')) @@ -209,33 +201,20 @@ have_epydoc_macros = False for section, text_files in SITE_STRUCTURE: for filename in text_files: - special = False - if filename in FILENAME_MAP: - outname = FILENAME_MAP[filename] - if not have_epydoc_macros: - have_epydoc_macros = True - copy_epydoc_macros( - os.path.join(dirname, outname), - os.path.join(dirname, '_part_epydoc.tex'), - set(header)) - special = True - elif filename.startswith('@'): - print "Not yet implemented: %s" % filename[1:] + if filename.startswith('@'): continue #page_title = filename[1:] #url = href_map[page_title] #build_menu_entry(page_title, url, section_head) - else: - basename = os.path.splitext(os.path.basename(filename))[0] - basename = BASENAME_MAP.get(basename, basename) - outname = basename + '.tex' - + + basename = os.path.splitext(os.path.basename(filename))[0] + basename = BASENAME_MAP.get(basename, basename) + outname = basename + '.tex' outpath = os.path.join(dirname, outname) - print "Creating %s" % outname + path = os.path.join(doc_dir, filename) - if not special: - path = os.path.join(doc_dir, filename) - rest2latex(script, path, outpath) + print "Creating %s" % outname + rest2latex(script, path, outpath) final_name = os.path.join(dirname, 
os.path.dirname(outname), "_part_%s" % os.path.basename(outname)) @@ -247,7 +226,19 @@ header = hd titles[outname] = title - # also convert CHANGES.txt + # integrate generated API docs + + print "Integrating API docs" + apidocsname = 'api.tex' + apipath = os.path.join(dirname, apidocsname) + tex_postprocess(apipath, os.path.join(dirname, "_part_%s" % apidocsname), + process_line=fix_relative_hyperrefs) + copy_epydoc_macros(apipath, os.path.join(dirname, '_part_epydoc.tex'), + set(header)) + + # convert CHANGES.txt + + print "Integrating ChangeLog" find_version_title = re.compile( r'(.*\\section\{)([0-9][^\} ]*)\s+\(([^)]+)\)(\}.*)').search def fix_changelog(line): @@ -294,9 +285,7 @@ for section, text_files in SITE_STRUCTURE: master.write("\n\n\\part{%s}\n" % section) for filename in text_files: - if filename in FILENAME_MAP: - outname = FILENAME_MAP[filename] - elif filename.startswith('@'): + if filename.startswith('@'): continue #print "Not yet implemented: %s" % filename[1:] #page_title = filename[1:] @@ -308,8 +297,13 @@ outname = basename + '.tex' write_chapter(master, titles[outname], outname) + master.write("\\appendix\n") + master.write("\\begin{appendix}\n") + write_chapter(master, "Changes", chgname) + write_chapter(master, "Generated API documentation", apidocsname) + master.write("\\end{appendix}\n") master.write("\\end{document}\n") if __name__ == '__main__': From scoder at codespeak.net Thu Apr 24 22:04:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:28 +0200 (CEST) Subject: [Lxml-checkins] r54107 - in lxml/trunk: . 
benchmark doc Message-ID: <20080424200428.C5FD716A019@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:27 2008 New Revision: 54107 Modified: lxml/trunk/ (props changed) lxml/trunk/benchmark/bench_etree.py lxml/trunk/doc/performance.txt Log: r4058 at delle: sbehnel | 2008-04-24 15:48:15 +0200 some more benchmark results on text serialisation Modified: lxml/trunk/benchmark/bench_etree.py ============================================================================== --- lxml/trunk/benchmark/bench_etree.py (original) +++ lxml/trunk/benchmark/bench_etree.py Thu Apr 24 22:04:27 2008 @@ -51,6 +51,13 @@ @with_attributes(False) @with_text(text=True, utext=True) @onlylib('lxe') + def bench_tostring_text_unicode(self, root): + self.etree.tostring(root, method="text", encoding=unicode) + + @nochange + @with_attributes(False) + @with_text(text=True, utext=True) + @onlylib('lxe', 'ET') def bench_tostring_text_utf16(self, root): self.etree.tostring(root, method="text", encoding='UTF-16') @@ -65,15 +72,6 @@ encoding='UTF-8', with_tail=True) @nochange - @with_attributes(False) - @with_text(text=True, utext=True) - @onlylib('lxe') - @children - def bench_tostring_text_unicode(self, children): - for child in children: - self.etree.tostring(child, method="text", encoding=unicode) - - @nochange @with_attributes(True, False) @with_text(text=True, utext=True) def bench_tostring_utf8(self, root): Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Thu Apr 24 22:04:27 2008 @@ -129,8 +129,8 @@ executes entirely at the C level, without any interaction with Python code. The results are rather impressive, especially for UTF-8, which is native to libxml2. 
While 20 to 40 times faster than (c)ElementTree -1.2, lxml is still more than 7 times as fast as the much improved -ElementTree 1.3:: +1.2 (which is part of the standard library in Python 2.5), lxml is +still more than 7 times as fast as the much improved ElementTree 1.3:: lxe: tostring_utf16 (SATR T1) 25.7590 msec/pass cET: tostring_utf16 (SATR T1) 179.6291 msec/pass @@ -155,11 +155,25 @@ The same applies to plain text serialisation. Note that cElementTree does not currently support this, as it is new in ET 1.3:: - lxe: tostring_text_ascii (S-TR T1) 4.5149 msec/pass - ET : tostring_text_ascii (S-TR T1) 87.6551 msec/pass + lxe: tostring_text_ascii (S-TR T1) 3.8729 msec/pass + ET : tostring_text_ascii (S-TR T1) 90.7841 msec/pass - lxe: tostring_text_ascii (S-TR T3) 1.2901 msec/pass - ET : tostring_text_ascii (S-TR T3) 27.5211 msec/pass + lxe: tostring_text_ascii (S-TR T3) 1.1508 msec/pass + ET : tostring_text_ascii (S-TR T3) 28.0581 msec/pass + + lxe: tostring_text_utf16 (S-TR T1) 5.6219 msec/pass + ET : tostring_text_utf16 (S-TR T1) 87.4891 msec/pass + + lxe: tostring_text_utf16 (U-TR T1) 7.0660 msec/pass + ET : tostring_text_utf16 (U-TR T1) 82.1049 msec/pass + +Unlike ElementTree, the ``tostring()`` function in lxml also supports +serialisation to a Python unicode string object:: + + lxe: tostring_text_unicode (S-TR T1) 4.2419 msec/pass + lxe: tostring_text_unicode (U-TR T1) 5.2760 msec/pass + lxe: tostring_text_unicode (S-TR T3) 1.3049 msec/pass + lxe: tostring_text_unicode (U-TR T3) 1.4210 msec/pass For parsing, on the other hand, the advantage is clearly with cElementTree. The (c)ET libraries use a very thin layer on top of the From scoder at codespeak.net Thu Apr 24 22:04:33 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:33 +0200 (CEST) Subject: [Lxml-checkins] r54108 - in lxml/trunk: . 
doc Message-ID: <20080424200433.14F7116A088@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:32 2008 New Revision: 54108 Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile lxml/trunk/doc/mklatex.py Log: r4059 at delle: sbehnel | 2008-04-24 16:49:37 +0200 fix page numbers Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Thu Apr 24 22:04:32 2008 @@ -66,7 +66,9 @@ pdf: apipdf $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION} - (cd doc/pdf && pdflatex lxmldoc.tex && pdflatex lxmldoc.tex) + (cd doc/pdf && pdflatex lxmldoc.tex \ + && pdflatex lxmldoc.tex \ + && pdflatex lxmldoc.tex) @pdfopt doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf @echo "PDF available as doc/pdf/lxmldoc-${LXMLVERSION}.pdf" Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Thu Apr 24 22:04:32 2008 @@ -280,6 +280,7 @@ r'{%s}' % book_title, hln) master.write(hln) + master.write("\\setcounter{page}{2}\n") master.write("\\tableofcontents\n") for section, text_files in SITE_STRUCTURE: From scoder at codespeak.net Thu Apr 24 22:04:38 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:38 +0200 (CEST) Subject: [Lxml-checkins] r54109 - in lxml/trunk: . 
doc Message-ID: <20080424200438.43C9D16A089@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:37 2008 New Revision: 54109 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r4060 at delle: sbehnel | 2008-04-24 18:04:50 +0200 cmd line option for rst2latex Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Thu Apr 24 22:04:37 2008 @@ -24,6 +24,7 @@ "--font-encoding=T1", "--output-encoding=utf-8", "--input-encoding=utf-8", + "--graphicx-option=pdftex", ]) htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"} From scoder at codespeak.net Thu Apr 24 22:04:44 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:44 +0200 (CEST) Subject: [Lxml-checkins] r54110 - in lxml/trunk: . doc Message-ID: <20080424200444.3FF6416A089@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:42 2008 New Revision: 54110 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/extensions.txt lxml/trunk/doc/xpathxslt.txt Log: r4061 at delle: sbehnel | 2008-04-24 18:11:40 +0200 moved XSLT extension element docs from xpathxslt.txt to extensions.txt Modified: lxml/trunk/doc/extensions.txt ============================================================================== --- lxml/trunk/doc/extensions.txt (original) +++ lxml/trunk/doc/extensions.txt Thu Apr 24 22:04:42 2008 @@ -1,3 +1,4 @@ +====================================== Extension functions for XPath and XSLT ====================================== @@ -8,10 +9,39 @@ +as well as extension elements in XSLT as in the following example: + +.. sourcecode:: xml + + + + + + + + +.. contents:: +.. 
+ 1 XPath Extension functions + 1.1 The FunctionNamespace + 1.2 Global prefix assignment + 1.3 The XPath context + 1.4 Evaluators and XSLT + 1.5 Evaluator-local extensions + 1.6 What to return from a function + 2 XSLT extension elements + 2.1 Declaring extension elements + 2.2 Applying XSL templates + 2.3 Working with read-only elements + + +XPath Extension functions +========================= + Here is how an extension function looks like. As the first argument, -it always receives a context object (see below). The other arguments -are provided by the respective call in the XPath expression, one in -the following examples. Any number of arguments is allowed: +it always receives an opaque context object (see below). The other +arguments are provided by the respective call in the XPath expression, +one in the following examples. Any number of arguments is allowed: .. sourcecode:: pycon @@ -23,23 +53,13 @@ ... return "Got %d arguments." % len(args) -.. contents:: -.. - 1 The FunctionNamespace - 2 Global prefix assignment - 3 The XPath context - 4 Evaluators and XSLT - 5 Evaluator-local extensions - 6 What to return from a function - - The FunctionNamespace --------------------- -In order to use a function in XPath/XSLT, it needs to have a (namespaced) name -by which it can be called during evaluation. This is done using the -FunctionNamespace class. For simplicity, we choose the empty namespace -(None): +In order to use a function in XPath or XSLT, it needs to have a +(namespaced) name by which it can be called during evaluation. This +is done using the FunctionNamespace class. For simplicity, we choose +the empty namespace (None): .. sourcecode:: pycon @@ -407,3 +427,156 @@ avoid relying on it in your code. Note that elements taken from the source document (the most common case) do not suffer from this restriction. They will always be passed unchanged. 
+ + +XSLT extension elements +======================= + +Just like the XPath extension functions described above, lxml supports +custom extension *elements* in XSLT. This means, you can write XSLT +code like this: + +.. sourcecode:: xml + + + + + + + +And then you can implement the element in Python like this: + +.. sourcecode:: pycon + + >>> class MyExtElement(etree.XSLTExtension): + ... def execute(self, context, self_node, input_node, output_parent): + ... print "Hello from XSLT!" + ... output_parent.text = "I did it!" + ... # just copy own content input to output + ... output_parent.extend( list(self_node) ) + +The arguments passed to the ``.execute()`` method are + +context + The opaque evaluation context. You need this when calling back + into the XSLT processor. + +self_node + A read-only Element object that represents the extension element + in the stylesheet. + +input_node + The current context Element in the input document (also read-only). + +output_parent + The current insertion point in the output document. You can + append elements or set the text value (not the tail). Apart from + that, the Element is read-only. + + +Declaring extension elements +---------------------------- + +In XSLT, extension elements can be used like any other XSLT element, +except that they must be declared as extensions using the standard +XSLT ``extension-element-prefixes`` option: + +.. sourcecode:: pycon + + >>> xslt_ext_tree = etree.XML(''' + ... + ... + ... XYZ + ... + ... + ... --xyz-- + ... + ... ''') + +To register the extension, add its namespace and name to the extension +mapping of the XSLT object: + +.. sourcecode:: pycon + + >>> my_extension = MyExtElement() + >>> extensions = { ('testns', 'ext') : my_extension } + >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) + +Note how we pass an instance here, not the class of the extension. +Now we can run the transformation and see how our extension is +called: + +.. 
sourcecode:: pycon + + >>> root = etree.XML('') + >>> result = transform(root) + Hello from XSLT! + >>> str(result) + '\nI did it!XYZ\n' + + +Applying XSL templates +---------------------- + +XSLT extensions are a very powerful feature that allows you to +interact directly with the XSLT processor. You have full read-only +access to the input document and the stylesheet, and you can even call +back into the XSLT processor to process templates. Here is an example +that passes an Element into the ``.apply_templates()`` method of the +``XSLTExtension`` instance: + +.. sourcecode:: pycon + + >>> class MyExtElement(etree.XSLTExtension): + ... def execute(self, context, self_node, input_node, output_parent): + ... child = self_node[0] + ... results = self.apply_templates(context, child) + ... output_parent.append(results[0]) + + >>> my_extension = MyExtElement() + >>> extensions = { ('testns', 'ext') : my_extension } + >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) + + >>> root = etree.XML('') + >>> result = transform(root) + >>> str(result) + '\n--xyz--\n' + +Note how we applied the templates to a child of the extension element +itself, i.e. to an element inside the stylesheet instead of an element +of the input document. + + +Working with read-only elements +------------------------------- + +There is one important thing to keep in mind: all Elements that the +``execute()`` method gets to deal with are read-only Elements, so you +cannot modify them. They also will not easily work in the API. For +example, you cannot pass them to the ``tostring()`` function or wrap +them in an ``ElementTree``. + +What you can do, however, is to deepcopy them to make them normal +Elements, and then modify them using the normal etree API. So this +will work: + +.. sourcecode:: pycon + + >>> from copy import deepcopy + >>> class MyExtElement(etree.XSLTExtension): + ... def execute(self, context, self_node, input_node, output_parent): + ... 
child = deepcopy(self_node[0]) + ... child.text = "NEW TEXT" + ... output_parent.append(child) + + >>> my_extension = MyExtElement() + >>> extensions = { ('testns', 'ext') : my_extension } + >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) + + >>> root = etree.XML('') + >>> result = transform(root) + >>> str(result) + '\nNEW TEXT\n' Modified: lxml/trunk/doc/xpathxslt.txt ============================================================================== --- lxml/trunk/doc/xpathxslt.txt (original) +++ lxml/trunk/doc/xpathxslt.txt Thu Apr 24 22:04:42 2008 @@ -18,10 +18,9 @@ 2 XSLT 2.1 XSLT result objects 2.2 Stylesheet parameters - 2.3 Extension elements - 2.4 The ``xslt()`` tree method - 2.5 Dealing with stylesheet complexity - 2.6 Profiling + 2.3 The ``xslt()`` tree method + 2.4 Dealing with stylesheet complexity + 2.5 Profiling The usual setup procedure: @@ -43,7 +42,8 @@ .. _ElementPath: http://effbot.org/zone/element-xpath.htm .. _`find, findall and findtext`: http://effbot.org/zone/element.htm#searching-for-subelements -.. _`custom extension functions`: extensions.html +.. _`custom extension functions`: extensions.html#xpath-extension-functions +.. _`XSLT extension elements`: extensions.html#xslt-extension-elements There are also specialized XPath evaluator classes that are more efficient for frequent evaluation: ``XPath`` and ``XPathEvaluator``. See the `performance @@ -141,8 +141,8 @@ XPath return values ------------------- -The return values of XPath evaluations vary, depending on the XPath expression -used: +The return value types of XPath evaluations vary, depending on the +XPath expression used: * True or False, when the XPath expression has a boolean result @@ -403,10 +403,11 @@ >>> doc = etree.parse(f) >>> result_tree = transform(doc) -By default, XSLT supports all extension functions from libxslt and libexslt as -well as Python regular expressions through the `EXSLT regexp functions`_. 
-Also see the documentation on `custom extension functions`_ and `document -resolvers`_. There is a separate section on `controlling access`_ to external +By default, XSLT supports all extension functions from libxslt and +libexslt as well as Python regular expressions through the `EXSLT +regexp functions`_. Also see the documentation on `custom extension +functions`_, `XSLT extension elements`_ and `document resolvers`_. +There is a separate section on `controlling access`_ to external documents and resources. .. _`EXSLT regexp functions`: http://www.exslt.org/regexp/ @@ -507,147 +508,6 @@ '\nText\n' -Extension elements ------------------- - -Just like `custom extension functions`_, lxml supports custom -extension *elements* in XSLT. This means, you can write XSLT code -like this: - -.. sourcecode:: xml - - - - - - - -And then you can implement the element in Python like this: - -.. sourcecode:: pycon - - >>> class MyExtElement(etree.XSLTExtension): - ... def execute(self, context, self_node, input_node, output_parent): - ... print "Hello from XSLT!" - ... output_parent.text = "I did it!" - ... # just copy own content input to output - ... output_parent.extend( list(self_node) ) - -The arguments passed to this function are - -context - The opaque evaluation context. You need this when calling back - into the XSLT processor. - -self_node - A read-only Element object that represents the extension element - in the stylesheet. - -input_node - The current context Element in the input document (also read-only). - -output_parent - The current insertion point in the output document. You can - append elements or set the text value (not the tail). Apart from - that, the Element is read-only. - -In XSLT, extension elements can be used like any other XSLT element, -except that they must be declared as extensions using the standard -XSLT ``extension-element-prefixes`` option: - -.. sourcecode:: pycon - - >>> xslt_ext_tree = etree.XML(''' - ... - ... - ... XYZ - ... - ... - ... 
--xyz-- - ... - ... ''') - -To register the extension, add its namespace and name to the extension -mapping of the XSLT object: - -.. sourcecode:: pycon - - >>> my_extension = MyExtElement() - >>> extensions = { ('testns', 'ext') : my_extension } - >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) - -Note how we pass an instance here, not the class of the extension. -Now we can run the transformation and see how our extension is -called: - -.. sourcecode:: pycon - - >>> root = etree.XML('') - >>> result = transform(root) - Hello from XSLT! - >>> str(result) - '\nI did it!XYZ\n' - -XSLT extensions are a very powerful feature that allows you to -interact directly with the XSLT processor. You have full read-only -access to the input document and the stylesheet, and you can even call -back into the XSLT processor to process templates. Here is an example -that passes an Element into the ``.apply_templates()`` method of the -``XSLTExtension`` instance: - -.. sourcecode:: pycon - - >>> class MyExtElement(etree.XSLTExtension): - ... def execute(self, context, self_node, input_node, output_parent): - ... child = self_node[0] - ... results = self.apply_templates(context, child) - ... output_parent.append(results[0]) - - >>> my_extension = MyExtElement() - >>> extensions = { ('testns', 'ext') : my_extension } - >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) - - >>> root = etree.XML('') - >>> result = transform(root) - >>> str(result) - '\n--xyz--\n' - -Note how we applied the templates to a child of the extension element -itself, i.e. to an element inside the stylesheet instead of an element -of the input document. - -There is one important thing to keep in mind: all Elements that the -``execute()`` method gets to deal with are read-only Elements, so you -cannot modify them. They also will not easily work in the API. For -example, you cannot pass them to the ``tostring()`` function or wrap -them in an ``ElementTree``. 
- -What you can do, however, is to deepcopy them to make them normal -Elements, and then modify them using the normal etree API. So this -will work: - -.. sourcecode:: pycon - - >>> from copy import deepcopy - >>> class MyExtElement(etree.XSLTExtension): - ... def execute(self, context, self_node, input_node, output_parent): - ... child = deepcopy(self_node[0]) - ... child.text = "NEW TEXT" - ... output_parent.append(child) - - >>> my_extension = MyExtElement() - >>> extensions = { ('testns', 'ext') : my_extension } - >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions) - - >>> root = etree.XML('') - >>> result = transform(root) - >>> str(result) - '\nNEW TEXT\n' - - The ``xslt()`` tree method -------------------------- @@ -696,10 +556,11 @@ forward. A third thing to remember is the support for `custom extension -functions`_. Some things are much easier to do in XSLT than in -Python, while for others it is the complete opposite. Finding the -right mixture of Python code and XSL code can help a great deal in -keeping applications well designed and maintainable. +functions`_ and `XSLT extension elements`_. Some things are much +easier to express in XSLT than in Python, while for others it is the +complete opposite. Finding the right mixture of Python code and XSL +code can help a great deal in keeping applications well designed and +maintainable. 
Profiling From scoder at codespeak.net Thu Apr 24 22:04:50 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:50 +0200 (CEST) Subject: [Lxml-checkins] r54111 - lxml/trunk Message-ID: <20080424200450.8AD7916A019@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:47 2008 New Revision: 54111 Modified: lxml/trunk/ (props changed) lxml/trunk/Makefile Log: r4062 at delle: sbehnel | 2008-04-24 18:17:15 +0200 epydoc gen fix Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Thu Apr 24 22:04:47 2008 @@ -47,6 +47,7 @@ && (cd src && echo "Generating API docs ..." && \ PYTHONPATH=. epydoc -v --docformat "restructuredtext en" \ -o ../doc/html/api --no-private --exclude='[.]html[.]tests|[.]_' \ + --exclude-introspect='[.]usedoctest' \ --name "lxml API" --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") @@ -60,7 +61,7 @@ && (cd src && echo "Generating API docs ..." && \ PYTHONPATH=. epydoc -v --latex --docformat "restructuredtext en" \ -o ../doc/pdf --no-private --exclude='([.]html)?[.]tests|[.]_' \ - --exclude-introspect='html[.]clean' \ + --exclude-introspect='html[.]clean|[.]usedoctest' \ --name "lxml API" --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") From scoder at codespeak.net Thu Apr 24 22:04:54 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:54 +0200 (CEST) Subject: [Lxml-checkins] r54112 - in lxml/trunk: . 
doc Message-ID: <20080424200454.6790A16A08A@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:53 2008 New Revision: 54112 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/extensions.txt Log: r4063 at delle: sbehnel | 2008-04-24 18:23:21 +0200 doc fixes Modified: lxml/trunk/doc/extensions.txt ============================================================================== --- lxml/trunk/doc/extensions.txt (original) +++ lxml/trunk/doc/extensions.txt Thu Apr 24 22:04:53 2008 @@ -1,6 +1,6 @@ -====================================== -Extension functions for XPath and XSLT -====================================== +==================================== +Python extensions for XPath and XSLT +==================================== This document describes how to use Python extension functions in XPath and XSLT like this: @@ -9,7 +9,7 @@ -as well as extension elements in XSLT as in the following example: +and extension elements in XSLT as in the following example: .. sourcecode:: xml @@ -39,9 +39,9 @@ ========================= Here is how an extension function looks like. As the first argument, -it always receives an opaque context object (see below). The other -arguments are provided by the respective call in the XPath expression, -one in the following examples. Any number of arguments is allowed: +it always receives a context object (see below). The other arguments +are provided by the respective call in the XPath expression, one in +the following examples. Any number of arguments is allowed: .. sourcecode:: pycon From scoder at codespeak.net Thu Apr 24 22:04:58 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:04:58 +0200 (CEST) Subject: [Lxml-checkins] r54113 - in lxml/trunk: . 
src/lxml Message-ID: <20080424200458.0F84A16A08C@codespeak.net> Author: scoder Date: Thu Apr 24 22:04:57 2008 New Revision: 54113 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xslt.pxi Log: r4064 at delle: sbehnel | 2008-04-24 18:37:25 +0200 docstring fix Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Thu Apr 24 22:04:57 2008 @@ -328,6 +328,8 @@ Keyword arguments of the constructor: + - extensions: a dict mapping ``(namespace, name)`` pairs to + extension functions or extension elements - regexp: enable exslt regular expression support in XPath (default: True) - access_control: access restrictions for network or file From scoder at codespeak.net Thu Apr 24 22:05:02 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:05:02 +0200 (CEST) Subject: [Lxml-checkins] r54114 - in lxml/trunk: . src/lxml Message-ID: <20080424200502.67EB416A08C@codespeak.net> Author: scoder Date: Thu Apr 24 22:05:01 2008 New Revision: 54114 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xsltext.pxi Log: r4065 at delle: sbehnel | 2008-04-24 18:37:39 +0200 cleanup Modified: lxml/trunk/src/lxml/xsltext.pxi ============================================================================== --- lxml/trunk/src/lxml/xsltext.pxi (original) +++ lxml/trunk/src/lxml/xsltext.pxi Thu Apr 24 22:05:01 2008 @@ -69,9 +69,9 @@ cdef _registerXSLTExtensions(xslt.xsltTransformContext* c_ctxt, extension_dict): - for ns, name in extension_dict: + for ns_utf, name_utf in extension_dict: xslt.xsltRegisterExtElement( - c_ctxt, _cstr(name), _cstr(ns), _callExtensionElement) + c_ctxt, _cstr(name_utf), _cstr(ns_utf), _callExtensionElement) cdef void _callExtensionElement(xslt.xsltTransformContext* c_ctxt, xmlNode* c_context_node, @@ -94,7 +94,7 @@ dict_result = python.PyDict_GetItem( context._extension_elements, (c_uri, 
c_inst_node.name)) if dict_result is NULL: - raise KeyError("extension element %s not found", + raise KeyError("extension element %s not found" % c_inst_node.name) extension = dict_result From scoder at codespeak.net Thu Apr 24 22:05:07 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:05:07 +0200 (CEST) Subject: [Lxml-checkins] r54115 - in lxml/trunk: . doc Message-ID: <20080424200507.1D2E816A08E@codespeak.net> Author: scoder Date: Thu Apr 24 22:05:06 2008 New Revision: 54115 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r4066 at delle: sbehnel | 2008-04-24 18:43:34 +0200 additional doc heading Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Thu Apr 24 22:05:06 2008 @@ -16,6 +16,10 @@ using BeautifulSoup, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. + +Parsing with the soupparser +=========================== + The functions ``fromstring()`` and ``parse()`` behave as known from ElementTree. The first returns a root Element, the latter returns an ElementTree. From scoder at codespeak.net Thu Apr 24 22:05:11 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:05:11 +0200 (CEST) Subject: [Lxml-checkins] r54116 - in lxml/trunk: . 
doc Message-ID: <20080424200511.ED1E316A08C@codespeak.net> Author: scoder Date: Thu Apr 24 22:05:10 2008 New Revision: 54116 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/cssselect.txt Log: r4067 at delle: sbehnel | 2008-04-24 19:38:29 +0200 rst fix Modified: lxml/trunk/doc/cssselect.txt ============================================================================== --- lxml/trunk/doc/cssselect.txt (original) +++ lxml/trunk/doc/cssselect.txt Thu Apr 24 22:05:10 2008 @@ -80,7 +80,7 @@ Namespaces ----------- +========== In CSS you can use ``namespace-prefix|element``, similar to ``namespace-prefix:element`` in an XPath expression. In fact, it maps From scoder at codespeak.net Thu Apr 24 22:05:15 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:05:15 +0200 (CEST) Subject: [Lxml-checkins] r54117 - lxml/trunk Message-ID: <20080424200515.EB06716A089@codespeak.net> Author: scoder Date: Thu Apr 24 22:05:15 2008 New Revision: 54117 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r4068 at delle: sbehnel | 2008-04-24 19:56:09 +0200 changelog cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Apr 24 22:05:15 2008 @@ -269,19 +269,6 @@ * ``makeparser()`` function in ``lxml.objectify`` to create a new parser with the usual objectify setup. -Bugs fixed ----------- - -Other changes -------------- - - -2.0beta2 (2008-01-26) -===================== - -Features added --------------- - * Plain ASCII XPath string results are no longer forced into unicode objects as in 2.0beta1, but are returned as plain strings as before. @@ -292,35 +279,6 @@ * More accurate exception messages in validator creation. -Bugs fixed ----------- - -* Missing import in ``lxml.html.clean``. - -* Some Python 2.4-isms prevented lxml from building/running under - Python 2.3. 
- -Other changes -------------- - -* Exceptions carry only the part of the error log that is related to - the operation that caused the error. - -* ``XMLSchema()`` and ``RelaxNG()`` now enforce passing the source - file/filename through the ``file`` keyword argument. - -* The test suite now skips most doctests under Python 2.3. - -* ``make clean`` no longer removes the .c files (use ``make - realclean`` instead) - - -2.0beta1 (2008-01-11) -===================== - -Features added --------------- - * Parse-time XML schema validation (``schema`` parser keyword). * XPath string results of the ``text()`` function and attribute @@ -337,74 +295,9 @@ * ``entity.text`` returns the textual representation of the entity, e.g. ``&``. -Bugs fixed ----------- - -* XPath on ElementTrees could crash when selecting the virtual root - node of the ElementTree. - -* Compilation ``--without-threading`` was buggy in alpha5/6. - -Other changes -------------- - -* Minor performance tweaks for Element instantiation and subelement - creation - - -2.0alpha6 (2007-12-19) -====================== - -Features added --------------- - * New properties ``position`` and ``code`` on ParseError exception (as in ET 1.3) -Bugs fixed ----------- - -* Memory leak in the ``parse()`` function. - -* Minor bugs in XSLT error message formatting. - -* Result document memory leak in target parser. - -Other changes -------------- - -* Various places in the XPath, XSLT and iteration APIs now require - keyword-only arguments. - -* The argument order in ``element.itersiblings()`` was changed to - match the order used in all other iteration methods. The second - argument ('preceding') is now a keyword-only argument. - -* The ``getiterator()`` method on Elements and ElementTrees was - reverted to return an iterator as it did in lxml 1.x. The ET API - specification allows it to return either a sequence or an iterator, - and it traditionally returned a sequence in ET and an iterator in - lxml. 
However, it is now deprecated in favour of the ``iter()`` - method, which should be used in new code wherever possible. - -* The 'pretty printed' serialisation of ElementTree objects now - inserts newlines at the root level between processing instructions, - comments and the root tag. - -* A 'pretty printed' serialisation is now terminated with a newline. - -* Second argument to ``lxml.etree.Extension()`` helper is no longer - required, third argument is now a keyword-only argument ``ns``. - -* ``lxml.html.tostring`` takes an ``encoding`` argument. - - -2.0alpha5 (2007-11-24) -====================== - -Features added --------------- - * Rich comparison of ``element.attrib`` proxies. * ElementTree compatible TreeBuilder class. @@ -425,98 +318,6 @@ ``NOPARSE_MARKUP`` (like ``# doctest: +NOPARSE_MARKUP``) to suppress the special checking for one test. -Bugs fixed ----------- - -* Target parser failed to report comments. - -* In the ``lxml.html`` ``iter_links`` method, links in ```` - tags weren't recognized. (Note: plugin-specific link parameters - still aren't recognized.) Also, the ```` tag, though not - standard, is now included in ``lxml.html.defs.special_inline_tags``. - -* Using custom resolvers on XSLT stylesheets parsed from a string - could request ill-formed URLs. - -* With ``lxml.doctestcompare`` if you do ```` in your - output, it will then be namespace-neutral (before the ellipsis was - treated as a real namespace). - -Other changes -------------- - -* The module source files were renamed to "lxml.*.pyx", such as - "lxml.etree.pyx". This was changed for consistency with the way - Pyrex commonly handles package imports. The main effect is that - classes now know about their fully qualified class name, including - the package name of their module. - -* Keyword-only arguments in some API functions, especially in the - parsers and serialisers. 
- - -1.3.6 (2007-10-29) -================== - -Bugs fixed ----------- - -* Backported decref crash fix from 2.0 - -* Well hidden free-while-in-use crash bug in ObjectPath - -Other changes -------------- - -* The test suites now run ``gc.collect()`` in the ``tearDown()`` - methods. While this makes them take a lot longer to run, it also - makes it easier to link a specific test to garbage collection - problems that would otherwise appear in later tests. - - -1.3.5 (2007-10-22) -================== - -Features added --------------- - -Bugs fixed ----------- - -* lxml.etree could crash when adding more than 10000 namespaces to a - document - -* lxml failed to serialise namespace declarations of elements other - than the root node of a tree - - -2.0alpha4 (2007-10-07) -====================== - -Features added --------------- - -Bugs fixed ----------- - -* AttributeError in feed parser on parse errors - -Other changes -------------- - -* Tag name validation in lxml.etree (and lxml.html) now distinguishes - between HTML tags and XML tags based on the parser that was used to - parse or create them. HTML tags no longer reject any non-ASCII - characters in tag names but only spaces and the special characters - ``<>&/"'``. - - -2.0alpha3 (2007-09-26) -====================== - -Features added --------------- - * Separate ``feed_error_log`` property for the feed parser interface. The normal parser interface and ``iterparse`` continue to use ``error_log``. @@ -539,29 +340,6 @@ * ``annotate()`` function in objectify can annotate with Python types and XSI types in one step. Accompanied by ``xsiannotate()`` and ``pyannotate()``. 
-Bugs fixed ----------- - -* XML feed parser setup problem - -* Type annotation for unicode strings in ``DataElement()`` - -Other changes -------------- - -* lxml.etree now emits a warning if you use XPath with libxml2 2.6.27 - (which can crash on certain XPath errors) - -* Type annotation in objectify now preserves the already annotated type by - default to prevent loosing type information that is already there. - - -2.0alpha2 (2007-09-15) -====================== - -Features added --------------- - * ``ET.write()``, ``tostring()`` and ``tounicode()`` now accept a keyword argument ``method`` that can be one of 'xml' (or None), 'html' or 'text' to serialise as XML, HTML or plain text content. @@ -580,28 +358,6 @@ * ElementTree-like feed parser interface on XMLParser and HTMLParser (``feed()`` and ``close()`` methods) -Bugs fixed ----------- - -* lxml failed to serialise namespace declarations of elements other than the - root node of a tree - -* Race condition in XSLT where the resolver context leaked between concurrent - XSLT calls - -Other changes -------------- - -* ``element.getiterator()`` returns a list, use ``element.iter()`` to retrieve - an iterator (ElementTree 1.3 compatible behaviour) - - -2.0alpha1 (2007-09-02) -====================== - -Features added --------------- - * Reimplemented ``objectify.E`` for better performance and improved integration with objectify. Provides extended type support based on registered PyTypes. @@ -657,6 +413,48 @@ Bugs fixed ---------- +* Missing import in ``lxml.html.clean``. + +* Some Python 2.4-isms prevented lxml from building/running under + Python 2.3. + +* XPath on ElementTrees could crash when selecting the virtual root + node of the ElementTree. + +* Compilation ``--without-threading`` was buggy in alpha5/6. + +* Memory leak in the ``parse()`` function. + +* Minor bugs in XSLT error message formatting. + +* Result document memory leak in target parser. + +* Target parser failed to report comments. 
+ +* In the ``lxml.html`` ``iter_links`` method, links in ```` + tags weren't recognized. (Note: plugin-specific link parameters + still aren't recognized.) Also, the ```` tag, though not + standard, is now included in ``lxml.html.defs.special_inline_tags``. + +* Using custom resolvers on XSLT stylesheets parsed from a string + could request ill-formed URLs. + +* With ``lxml.doctestcompare`` if you do ```` in your + output, it will then be namespace-neutral (before the ellipsis was + treated as a real namespace). + +* AttributeError in feed parser on parse errors + +* XML feed parser setup problem + +* Type annotation for unicode strings in ``DataElement()`` + +* lxml failed to serialise namespace declarations of elements other than the + root node of a tree + +* Race condition in XSLT where the resolver context leaked between concurrent + XSLT calls + * lxml.etree did not check tag/attribute names * The XML parser did not report undefined entities as error @@ -671,6 +469,69 @@ Other changes ------------- +* Exceptions carry only the part of the error log that is related to + the operation that caused the error. + +* ``XMLSchema()`` and ``RelaxNG()`` now enforce passing the source + file/filename through the ``file`` keyword argument. + +* The test suite now skips most doctests under Python 2.3. + +* ``make clean`` no longer removes the .c files (use ``make + realclean`` instead) + +* Minor performance tweaks for Element instantiation and subelement + creation + +* Various places in the XPath, XSLT and iteration APIs now require + keyword-only arguments. + +* The argument order in ``element.itersiblings()`` was changed to + match the order used in all other iteration methods. The second + argument ('preceding') is now a keyword-only argument. + +* The ``getiterator()`` method on Elements and ElementTrees was + reverted to return an iterator as it did in lxml 1.x. 
The ET API + specification allows it to return either a sequence or an iterator, + and it traditionally returned a sequence in ET and an iterator in + lxml. However, it is now deprecated in favour of the ``iter()`` + method, which should be used in new code wherever possible. + +* The 'pretty printed' serialisation of ElementTree objects now + inserts newlines at the root level between processing instructions, + comments and the root tag. + +* A 'pretty printed' serialisation is now terminated with a newline. + +* Second argument to ``lxml.etree.Extension()`` helper is no longer + required, third argument is now a keyword-only argument ``ns``. + +* ``lxml.html.tostring`` takes an ``encoding`` argument. + +* The module source files were renamed to "lxml.*.pyx", such as + "lxml.etree.pyx". This was changed for consistency with the way + Pyrex commonly handles package imports. The main effect is that + classes now know about their fully qualified class name, including + the package name of their module. + +* Keyword-only arguments in some API functions, especially in the + parsers and serialisers. + +* Tag name validation in lxml.etree (and lxml.html) now distinguishes + between HTML tags and XML tags based on the parser that was used to + parse or create them. HTML tags no longer reject any non-ASCII + characters in tag names but only spaces and the special characters + ``<>&/"'``. + +* lxml.etree now emits a warning if you use XPath with libxml2 2.6.27 + (which can crash on certain XPath errors) + +* Type annotation in objectify now preserves the already annotated type by + default to prevent loosing type information that is already there. 
+ +* ``element.getiterator()`` returns a list, use ``element.iter()`` to retrieve + an iterator (ElementTree 1.3 compatible behaviour) + * objectify.PyType for None is now called "NoneType" * ``el.getiterator()`` renamed to ``el.iter()``, following ElementTree 1.3 - @@ -684,6 +545,41 @@ * Network access in parsers disabled by default +1.3.6 (2007-10-29) +================== + +Bugs fixed +---------- + +* Backported decref crash fix from 2.0 + +* Well hidden free-while-in-use crash bug in ObjectPath + +Other changes +------------- + +* The test suites now run ``gc.collect()`` in the ``tearDown()`` + methods. While this makes them take a lot longer to run, it also + makes it easier to link a specific test to garbage collection + problems that would otherwise appear in later tests. + + +1.3.5 (2007-10-22) +================== + +Features added +-------------- + +Bugs fixed +---------- + +* lxml.etree could crash when adding more than 10000 namespaces to a + document + +* lxml failed to serialise namespace declarations of elements other + than the root node of a tree + + 1.3.4 (2007-08-30) ================== From scoder at codespeak.net Thu Apr 24 22:25:19 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 24 Apr 2008 22:25:19 +0200 (CEST) Subject: [Lxml-checkins] r54120 - lxml/branch/lxml-2.0/doc Message-ID: <20080424202519.5214716A424@codespeak.net> Author: scoder Date: Thu Apr 24 22:25:18 2008 New Revision: 54120 Modified: lxml/branch/lxml-2.0/doc/capi.txt lxml/branch/lxml-2.0/doc/tutorial.txt Log: doc fixes Modified: lxml/branch/lxml-2.0/doc/capi.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/capi.txt (original) +++ lxml/branch/lxml-2.0/doc/capi.txt Thu Apr 24 22:25:18 2008 @@ -85,11 +85,8 @@ #include "etree.h" /* setup code */ - static PyObject* m_etree; - m_etree = _ADD_YOUR_WAY_TO_IMPORT_A_MODULE_("lxml.etree"); + import_lxml__etree() - import_etree(m_etree); - -Note that 
including ``etree.h`` does not automatically include the header -files it requires. Note also that the above list of common imports may not be -sufficient. +Note that including ``etree.h`` does not automatically include the +header files it requires. Note also that the above list of common +includes may not be sufficient. Modified: lxml/branch/lxml-2.0/doc/tutorial.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/tutorial.txt (original) +++ lxml/branch/lxml-2.0/doc/tutorial.txt Thu Apr 24 22:25:18 2008 @@ -1,6 +1,10 @@ ======================= The lxml.etree Tutorial ======================= + +.. meta:: + :description: The lxml tutorial on XML that feels like Python + :keywords: lxml, etree, tutorial, ElementTree, Python, XML, HTML :Author: Stefan Behnel From scoder at codespeak.net Fri Apr 25 11:07:42 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 25 Apr 2008 11:07:42 +0200 (CEST) Subject: [Lxml-checkins] r54125 - in lxml/trunk: . doc Message-ID: <20080425090742.1868D2A013F@codespeak.net> Author: scoder Date: Fri Apr 25 11:07:41 2008 New Revision: 54125 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt lxml/trunk/doc/api.txt lxml/trunk/doc/lxml-source-howto.txt lxml/trunk/doc/main.txt lxml/trunk/doc/mkhtml.py Log: r4086 at delle: sbehnel | 2008-04-25 11:06:16 +0200 HTML doc link fixes Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Apr 25 11:07:41 2008 @@ -181,7 +181,7 @@ .. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244 .. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 -.. _`sites that link to lxml`: http://www.google.com/search?as_lq=http%3A%2F%2Fcodespeak.net%2Flxml +.. 
_`sites that link to lxml`: http://www.google.com/search?as_lq=http:%2F%2Fcodespeak.net%2Flxml What is the difference between lxml.etree and lxml.objectify? Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri Apr 25 11:07:41 2008 @@ -78,7 +78,7 @@ lxml.etree comes with a number of `different lookup schemes`_ to customize the mapping between libxml2 nodes and the Element classes used by lxml.etree. -.. _`custom element subclasses`: namespace_extensions.html +.. _`custom element subclasses`: element_classes.html .. _`objectify`: objectify.html .. _`different lookup schemes`: element_classes.html#setting-up-a-class-lookup-scheme .. _`Amara bindery`: http://uche.ogbuji.net/tech/4suite/amara/ Modified: lxml/trunk/doc/lxml-source-howto.txt ============================================================================== --- lxml/trunk/doc/lxml-source-howto.txt (original) +++ lxml/trunk/doc/lxml-source-howto.txt Fri Apr 25 11:07:41 2008 @@ -13,7 +13,7 @@ to start working on it. You might also be interested in the companion document that describes `how to build lxml from sources`_. -.. _lxml: http://codespeak.net/lxml +.. _lxml: http://codespeak.net/lxml/ .. _`how to build lxml from sources`: build.html .. _`ReStructured Text`: http://docutils.sourceforge.net/rst.html .. _epydoc: http://epydoc.sourceforge.net/ Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Apr 25 11:07:41 2008 @@ -198,7 +198,7 @@ unexpected behaviour of lxml is a bug or not, please ask on the `mailing list`_ first. Do not forget to search the archive (e.g. with Gmane_)! -.. _`launchpad bug tracker`: https://launchpad.net/lxml +.. 
_`launchpad bug tracker`: https://launchpad.net/lxml/ License Modified: lxml/trunk/doc/mkhtml.py ============================================================================== --- lxml/trunk/doc/mkhtml.py (original) +++ lxml/trunk/doc/mkhtml.py Fri Apr 25 11:07:41 2008 @@ -17,6 +17,7 @@ find_menu = XPath("//h:ul[@id=$name]", namespaces=htmlnsmap) find_page_end = XPath("/h:html/h:body/h:div[last()]", namespaces=htmlnsmap) +find_words = re.compile('(\w+)').findall replace_invalid = re.compile(r'[-_/.\s\\]').sub def make_menu_section_head(section, menuroot): @@ -51,7 +52,7 @@ subul = SubElement(title, "ul", {"class":"submenu"}) for heading in headings: li = SubElement(subul, "li", {"class":"menu item"}) - ref = replace_invalid('-', heading.lower()) + ref = '-'.join(find_words(heading.lower())) a = SubElement(li, "a", href=url+'#'+ref) a.text = heading From scoder at codespeak.net Fri Apr 25 12:08:56 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 25 Apr 2008 12:08:56 +0200 (CEST) Subject: [Lxml-checkins] r54126 - in lxml/trunk: . doc Message-ID: <20080425100856.7EBF64980FB@codespeak.net> Author: scoder Date: Fri Apr 25 12:08:54 2008 New Revision: 54126 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt lxml/trunk/doc/main.txt lxml/trunk/doc/mkhtml.py lxml/trunk/doc/resolvers.txt Log: r4088 at delle: sbehnel | 2008-04-25 12:07:27 +0200 HTML link/anchor fixes in docs Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Fri Apr 25 12:08:54 2008 @@ -7,7 +7,7 @@ forgiving and has superiour `support for encoding detection`_. .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ -.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit +.. 
_`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode%2C%20Dammit .. _ElementSoup: http://effbot.org/zone/element-soup.htm lxml can benefit from the parsing capabilities of BeautifulSoup Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Apr 25 12:08:54 2008 @@ -187,7 +187,7 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev .. _Gmane: http://blog.gmane.org/gmane.comp.python.lxml.devel -.. _Google: http://www.google.com/webhp?q=site:codespeak.net/mailman/listinfo/lxml-dev%20 +.. _Google: http://www.google.com/webhp?q=site:codespeak.net%2Fmailman%2Flistinfo%2Flxml-dev+ Bug tracker Modified: lxml/trunk/doc/mkhtml.py ============================================================================== --- lxml/trunk/doc/mkhtml.py (original) +++ lxml/trunk/doc/mkhtml.py Fri Apr 25 12:08:54 2008 @@ -21,9 +21,10 @@ replace_invalid = re.compile(r'[-_/.\s\\]').sub def make_menu_section_head(section, menuroot): - section_head = menuroot.xpath("//ul[@id=$section]/li", section=section) + section_id = section + '-section' + section_head = menuroot.xpath("//ul[@id=$section]/li", section=section_id) if not section_head: - ul = SubElement(menuroot, "ul", id=section) + ul = SubElement(menuroot, "ul", id=section_id) section_head = SubElement(ul, "li") title = SubElement(section_head, "span", {"class":"section title"}) title.text = section @@ -41,7 +42,7 @@ headings=find_headings(tree)) def build_menu_entry(page_title, url, section_head, headings=None): - page_id = replace_invalid(' ', os.path.splitext(url)[0]) + page_id = replace_invalid(' ', os.path.splitext(url)[0]) + '-menu' ul = SubElement(section_head, "ul", {"class":"menu foreign", "id":page_id}) title = SubElement(ul, "li", {"class":"menu title"}) @@ -52,7 +53,7 @@ subul = SubElement(title, "ul", 
{"class":"submenu"}) for heading in headings: li = SubElement(subul, "li", {"class":"menu item"}) - ref = '-'.join(find_words(heading.lower())) + ref = '-'.join(find_words(replace_invalid(' ', heading.lower()))) a = SubElement(li, "a", href=url+'#'+ref) a.text = heading @@ -63,7 +64,8 @@ tag = el.tag if tag[0] != '{': el.tag = "{http://www.w3.org/1999/xhtml}" + tag - current_menu = find_menu(menu_root, name=replace_invalid(' ', name)) + current_menu = find_menu( + menu_root, name=replace_invalid(' ', name + '-menu')) if current_menu: for submenu in current_menu: submenu.set("class", submenu.get("class", ""). Modified: lxml/trunk/doc/resolvers.txt ============================================================================== --- lxml/trunk/doc/resolvers.txt (original) +++ lxml/trunk/doc/resolvers.txt Fri Apr 25 12:08:54 2008 @@ -3,7 +3,7 @@ .. contents:: .. - 1 Resolvers + 1 URI Resolvers 2 Document loading in context 3 I/O access control in XSLT @@ -13,8 +13,8 @@ etree.Resolver class. -Resolvers ---------- +URI Resolvers +------------- Here is an example of a custom resolver: From scoder at codespeak.net Fri Apr 25 12:23:52 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 25 Apr 2008 12:23:52 +0200 (CEST) Subject: [Lxml-checkins] r54127 - in lxml/trunk: . src/lxml Message-ID: <20080425102352.1ECBD169EC8@codespeak.net> Author: scoder Date: Fri Apr 25 12:23:51 2008 New Revision: 54127 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx Log: r4090 at delle: sbehnel | 2008-04-25 12:22:28 +0200 make XML() function explicit in objectify instead of copying it from the fromstring function (function name, docstring, etc.) 
Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Apr 25 12:23:51 2008 @@ -1762,7 +1762,17 @@ parser = objectify_parser return _fromstring(xml, parser) -XML = fromstring +def XML(xml, parser=None): + """XML(xml, parser=None) + + Objectify specific version of the lxml.etree XML() literal factory + that uses the objectify parser. + + You can pass a different parser as second argument. + """ + if parser is None: + parser = objectify_parser + return _fromstring(xml, parser) cdef object _parse _parse = etree.parse From scoder at codespeak.net Fri Apr 25 23:31:48 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 25 Apr 2008 23:31:48 +0200 (CEST) Subject: [Lxml-checkins] r54136 - in lxml/trunk: . src/lxml Message-ID: <20080425213148.AE265169EE2@codespeak.net> Author: scoder Date: Fri Apr 25 23:31:46 2008 New Revision: 54136 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/classlookup.pxi lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/nsclasses.pxi lxml/trunk/src/lxml/objectpath.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/readonlytree.pxi lxml/trunk/src/lxml/relaxng.pxi lxml/trunk/src/lxml/schematron.pxi lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/xmlid.pxi lxml/trunk/src/lxml/xmlschema.pxi lxml/trunk/src/lxml/xpath.pxi lxml/trunk/src/lxml/xslt.pxi lxml/trunk/src/lxml/xsltext.pxi Log: r4092 at delle: sbehnel | 2008-04-25 22:47:11 +0200 simpler exception raising code Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) 
+++ lxml/trunk/src/lxml/apihelpers.pxi Fri Apr 25 23:31:46 2008 @@ -26,9 +26,11 @@ elif isinstance(input, _Document): doc = <_Document>input else: - raise TypeError("Invalid input object: %s" % type(input)) + raise TypeError, "Invalid input object: %s" % \ + python._fqtypename(input) if doc is None: - raise ValueError("Input object has no document: %s" % type(input)) + raise ValueError, "Input object has no document: %s" % \ + python._fqtypename(input) else: return doc @@ -46,9 +48,11 @@ elif isinstance(input, _Document): node = (<_Document>input).getroot() else: - raise TypeError("Invalid input object: %s" % type(input)) + raise TypeError, "Invalid input object: %s" % \ + python._fqtypename(input) if node is None: - raise ValueError("Input object has no element: %s" % type(input)) + raise ValueError, "Input object has no element: %s" % \ + python._fqtypename(input) else: return node @@ -213,7 +217,8 @@ cdef xmlNs* c_ns # 'extra' is not checked here (expected to be a keyword dict) if attrib is not None and not hasattr(attrib, 'items'): - raise TypeError("Invalid attribute dictionary: %s" % type(attrib)) + raise TypeError, "Invalid attribute dictionary: %s" % \ + python._fqtypename(attrib) if extra is not None and extra: if attrib is None: attrib = extra @@ -307,7 +312,7 @@ else: c_href = _cstr(ns) if _delAttributeFromNsName(element._c_node, c_href, _cstr(tag)): - raise KeyError(key) + raise KeyError, key return 0 cdef int _delAttributeFromNsName(xmlNode* c_node, char* c_href, char* c_name): @@ -780,9 +785,8 @@ # *replacing* children stepwise with list => check size! 
seqlength = len(elements) if seqlength != slicelength: - raise ValueError( - "attempt to assign sequence of size %d " - "to extended slice of size %d" % (seqlength, slicelength)) + raise ValueError, "attempt to assign sequence of size %d " \ + "to extended slice of size %d" % (seqlength, slicelength) if c_node is NULL: # no children yet => add all elements straight away @@ -1000,7 +1004,7 @@ assert isutf8py(s) != -1, \ "All strings must be XML compatible, either Unicode or ASCII" else: - raise TypeError("Argument must be string or unicode.") + raise TypeError, "Argument must be string or unicode." return s cdef object _encodeFilename(object filename): @@ -1014,7 +1018,7 @@ return python.PyUnicode_AsEncodedString( filename, _C_FILENAME_ENCODING, NULL) else: - raise TypeError("Argument must be string or unicode.") + raise TypeError, "Argument must be string or unicode." cdef object _encodeFilenameUTF8(object filename): """Recode filename as UTF-8. Tries ASCII, local filesystem encoding and @@ -1043,7 +1047,7 @@ if python.PyUnicode_Check(filename): return python.PyUnicode_AsUTF8String(filename) else: - raise TypeError("Argument must be string or unicode.") + raise TypeError, "Argument must be string or unicode." cdef _getNsTag(tag): """Given a tag, find namespace URI and tag name. 
@@ -1062,16 +1066,16 @@ c_tag = c_tag + 1 c_ns_end = cstd.strchr(c_tag, c'}') if c_ns_end is NULL: - raise ValueError("Invalid tag name") + raise ValueError, "Invalid tag name" nslen = c_ns_end - c_tag taglen = python.PyString_GET_SIZE(tag) - nslen - 2 if taglen == 0: - raise ValueError("Empty tag name") + raise ValueError, "Empty tag name" if nslen > 0: ns = python.PyString_FromStringAndSize(c_tag, nslen) tag = python.PyString_FromStringAndSize(c_ns_end+1, taglen) elif python.PyString_GET_SIZE(tag) == 0: - raise ValueError("Empty tag name") + raise ValueError, "Empty tag name" return ns, tag cdef int _pyXmlNameIsValid(name_utf8): @@ -1123,26 +1127,26 @@ cdef int _tagValidOrRaise(tag_utf) except -1: if not _pyXmlNameIsValid(tag_utf): - raise ValueError("Invalid tag name %r" % \ - python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')) + raise ValueError, "Invalid tag name %r" % \ + python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict') return 0 cdef int _htmlTagValidOrRaise(tag_utf) except -1: if not _pyHtmlNameIsValid(tag_utf): - raise ValueError("Invalid HTML tag name %r" % \ - python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')) + raise ValueError, "Invalid HTML tag name %r" % \ + python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict') return 0 cdef int _attributeValidOrRaise(name_utf) except -1: if not _pyXmlNameIsValid(name_utf): - raise ValueError("Invalid attribute name %r" % \ - python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict')) + raise ValueError, "Invalid attribute name %r" % \ + python.PyUnicode_FromEncodedObject(name_utf, 'UTF-8', 'strict') return 0 cdef int _prefixValidOrRaise(tag_utf) except -1: if not _pyXmlNameIsValid(tag_utf): - raise ValueError("Invalid namespace prefix %r" % \ - python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict')) + raise ValueError, "Invalid namespace prefix %r" % \ + python.PyUnicode_FromEncodedObject(tag_utf, 'UTF-8', 'strict') return 0 cdef object 
_namespacedName(xmlNode* c_node): Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Fri Apr 25 23:31:46 2008 @@ -134,28 +134,28 @@ elif issubclass(element, ElementBase): self.element_class = element else: - raise TypeError("element class must be subclass of ElementBase") + raise TypeError, "element class must be subclass of ElementBase" if comment is None: self.comment_class = _Comment elif issubclass(comment, CommentBase): self.comment_class = comment else: - raise TypeError("comment class must be subclass of CommentBase") + raise TypeError, "comment class must be subclass of CommentBase" if entity is None: self.entity_class = _Entity elif issubclass(entity, EntityBase): self.entity_class = entity else: - raise TypeError("Entity class must be subclass of EntityBase") + raise TypeError, "Entity class must be subclass of EntityBase" if pi is None: self.pi_class = None # special case, see below elif issubclass(pi, PIBase): self.pi_class = pi else: - raise TypeError("PI class must be subclass of PIBase") + raise TypeError, "PI class must be subclass of PIBase" cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node): "Trivial class lookup function that always returns the default class." 
Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Fri Apr 25 23:31:46 2008 @@ -78,7 +78,7 @@ try: f.read except AttributeError: - raise TypeError("Argument is not a file-like object") + raise TypeError, "Argument is not a file-like object" doc_ref = _InputDocument() doc_ref._type = PARSER_DATA_FILE if base_url is not None: Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Fri Apr 25 23:31:46 2008 @@ -40,13 +40,13 @@ elif hasattr(file, 'read'): self._c_dtd = _parseDtdFromFilelike(file) else: - raise DTDParseError("file must be a filename or file-like object") + raise DTDParseError, "file must be a filename or file-like object" elif external_id is not None: self._error_log.connect() self._c_dtd = xmlparser.xmlParseDTD(external_id, NULL) self._error_log.disconnect() else: - raise DTDParseError("either filename or external ID required") + raise DTDParseError, "either filename or external ID required" if self._c_dtd is NULL: raise DTDParseError( Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Apr 25 23:31:46 2008 @@ -57,8 +57,7 @@ for extension in extensions: for (ns_uri, name), function in extension.items(): if name is None: - raise ValueError( - "extensions must have non empty names") + raise ValueError, "extensions must have non empty names" ns_utf = self._to_utf(ns_uri) name_utf = self._to_utf(name) python.PyDict_SetItem( @@ -72,11 +71,11 @@ ns = [] for prefix, ns_uri in namespaces: if prefix is None or not prefix: - raise TypeError( - "empty namespace prefix is not supported in XPath") + raise 
TypeError, \ + "empty namespace prefix is not supported in XPath" if ns_uri is None or not ns_uri: - raise TypeError( - "setting default namespace is not supported in XPath") + raise TypeError, \ + "setting default namespace is not supported in XPath" prefix_utf = self._to_utf(prefix) ns_uri_utf = self._to_utf(ns_uri) python.PyList_Append(ns, (prefix_utf, ns_uri_utf)) @@ -139,7 +138,7 @@ cdef addNamespace(self, prefix, ns_uri): if prefix is None: - raise TypeError("empty prefix is not supported in XPath") + raise TypeError, "empty prefix is not supported in XPath" prefix_utf = self._to_utf(prefix) ns_uri_utf = self._to_utf(ns_uri) new_item = (prefix_utf, ns_uri_utf) @@ -161,7 +160,7 @@ cdef registerNamespace(self, prefix, ns_uri): if prefix is None: - raise TypeError("empty prefix is not supported in XPath") + raise TypeError, "empty prefix is not supported in XPath" prefix_utf = self._to_utf(prefix) ns_uri_utf = self._to_utf(ns_uri) python.PyList_Append(self._global_namespaces, prefix_utf) @@ -279,17 +278,16 @@ def __get__(self): cdef xmlNode* c_node if self._xpathCtxt is NULL: - raise XPathError( - "XPath context is only usable during the evaluation") + raise XPathError, \ + "XPath context is only usable during the evaluation" c_node = self._xpathCtxt.node if c_node is NULL: - raise XPathError("no context node") + raise XPathError, "no context node" if c_node.doc != self._xpathCtxt.doc: - raise XPathError( - "document-external context nodes are not supported") + raise XPathError, \ + "document-external context nodes are not supported" if self._doc is None: - raise XPathError( - "document context is missing") + raise XPathError, "document context is missing" return _elementFactory(self._doc, c_node) property eval_context: @@ -477,15 +475,16 @@ xpath.xmlXPathNodeSetAdd(resultSet, node._c_node) else: xpath.xmlXPathFreeNodeSet(resultSet) - raise XPathResultError("This is not a node: %r" % element) + raise XPathResultError, "This is not a node: %r" % element else: - 
raise XPathResultError("Unknown return type: %s" % type(obj)) + raise XPathResultError, "Unknown return type: %s" % \ + python._fqtypename(obj) return xpath.xmlXPathWrapNodeSet(resultSet) cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj, _Document doc): if xpathObj.type == xpath.XPATH_UNDEFINED: - raise XPathResultError("Undefined xpath result") + raise XPathResultError, "Undefined xpath result" elif xpathObj.type == xpath.XPATH_NODESET: return _createNodeSetResult(xpathObj, doc) elif xpathObj.type == xpath.XPATH_BOOLEAN: @@ -506,7 +505,7 @@ elif xpathObj.type == xpath.XPATH_XSLT_TREE: raise NotImplementedError else: - raise XPathResultError("Unknown xpath result %s" % str(xpathObj.type)) + raise XPathResultError, "Unknown xpath result %s" % str(xpathObj.type) cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc): cdef xmlNode* c_node @@ -546,8 +545,8 @@ c_node.type == tree.XML_XINCLUDE_END: continue else: - raise NotImplementedError( - "Not yet implemented result node type: %d" % c_node.type) + raise NotImplementedError, \ + "Not yet implemented result node type: %d" % c_node.type python.PyList_Append(result, value) return result Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Apr 25 23:31:46 2008 @@ -28,7 +28,7 @@ elif event == 'pi': event_filter |= ITERPARSE_FILTER_PI else: - raise ValueError("invalid event name '%s'" % event) + raise ValueError, "invalid event name '%s'" % event return event_filter cdef int _countNsDefs(xmlNode* c_node): @@ -407,7 +407,7 @@ return context def copy(self): - raise TypeError("iterparse parsers cannot be copied") + raise TypeError, "iterparse parsers cannot be copied" def __iter__(self): return self @@ -433,7 +433,7 @@ data = self._source.read(__ITERPARSE_CHUNK_SIZE) if not python.PyString_Check(data): self._source = None - raise 
TypeError("reading file objects must return plain strings") + raise TypeError, "reading file objects must return plain strings" elif data: if self._for_html: error = htmlparser.htmlParseChunk( Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Apr 25 23:31:46 2008 @@ -422,7 +422,7 @@ self._doc = _documentOrRaise(tree) root_name, public_id, system_url = self._doc.getdoctype() if not root_name and (public_id or system_url): - raise ValueError("Could not find root node") + raise ValueError, "Could not find root node" property root_name: "Returns the name of the root node as defined by the DOCTYPE." @@ -541,7 +541,7 @@ cdef bint left_to_right cdef Py_ssize_t slicelength, step if value is None: - raise ValueError("cannot assign None") + raise ValueError, "cannot assign None" if python.PySlice_Check(x): # slice assignment _findChildSlice(x, self._c_node, &c_node, &step, &slicelength) @@ -557,7 +557,7 @@ element = value c_node = _findChild(self._c_node, x) if c_node is NULL: - raise IndexError("list index out of range") + raise IndexError, "list index out of range" c_next = element._c_node.next _removeText(c_node.next) tree.xmlReplaceNode(c_node, element._c_node) @@ -592,7 +592,7 @@ # item deletion c_node = _findChild(self._c_node, x) if c_node is NULL: - raise IndexError("index out of range: %d" % x) + raise IndexError, "index out of range: %d" % x _removeText(c_node.next) _removeNode(self._doc, c_node) @@ -645,7 +645,7 @@ if self._c_node.parent != NULL and not _isElement(self._c_node.parent): if element._c_node.type != tree.XML_PI_NODE: if element._c_node.type != tree.XML_COMMENT_NODE: - raise TypeError("Only processing instructions and comments can be siblings of the root element") + raise TypeError, "Only processing instructions and comments can be siblings of the root element" element.tail = None 
_appendSibling(self, element) @@ -662,7 +662,7 @@ if self._c_node.parent != NULL and not _isElement(self._c_node.parent): if element._c_node.type != tree.XML_PI_NODE: if element._c_node.type != tree.XML_COMMENT_NODE: - raise TypeError("Only processing instructions and comments can be siblings of the root element") + raise TypeError, "Only processing instructions and comments can be siblings of the root element" element.tail = None _prependSibling(self, element) @@ -731,7 +731,7 @@ cdef xmlNode* c_next c_node = element._c_node if c_node.parent is not self._c_node: - raise ValueError("Element is not a child of this node.") + raise ValueError, "Element is not a child of this node." c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) @@ -750,7 +750,7 @@ cdef xmlNode* c_new_next c_old_node = old_element._c_node if c_old_node.parent is not self._c_node: - raise ValueError("Element is not a child of this node.") + raise ValueError, "Element is not a child of this node." c_old_next = c_old_node.next c_new_node = new_element._c_node c_new_next = c_new_node.next @@ -948,7 +948,7 @@ # indexing c_node = _findChild(self._c_node, x) if c_node is NULL: - raise IndexError("list index out of range") + raise IndexError, "list index out of range" return _elementFactory(self._doc, c_node) def __len__(self): @@ -998,7 +998,7 @@ cdef xmlNode* c_start_node c_child = child._c_node if c_child.parent is not self._c_node: - raise ValueError("Element is not a child of this node.") + raise ValueError, "Element is not a child of this node." 
# handle the unbounded search straight away (normal case) if stop is None and (start is None or start == 0): @@ -1021,7 +1021,7 @@ c_stop = stop if c_stop == 0 or \ c_start >= c_stop and (c_stop > 0 or c_start < 0): - raise ValueError("list.index(x): x not in slice") + raise ValueError, "list.index(x): x not in slice" # for negative slice indices, check slice before searching index if c_start < 0 or c_stop < 0: @@ -1039,9 +1039,9 @@ if c_start_node == c_child: # found! before slice end? if c_stop < 0 and l <= -c_stop: - raise ValueError("list.index(x): x not in slice") + raise ValueError, "list.index(x): x not in slice" elif c_start < 0: - raise ValueError("list.index(x): x not in slice") + raise ValueError, "list.index(x): x not in slice" # now determine the index backwards from child c_child = c_child.prev @@ -1066,9 +1066,9 @@ else: return k if c_start != 0 or c_stop != 0: - raise ValueError("list.index(x): x not in slice") + raise ValueError, "list.index(x): x not in slice" else: - raise ValueError("list.index(x): x not in list") + raise ValueError, "list.index(x): x not in list" def get(self, key, default=None): """get(self, key, default=None) @@ -1359,7 +1359,7 @@ cdef class __ContentOnlyElement(_Element): cdef int _raiseImmutable(self) except -1: - raise TypeError("this element does not have children or attributes") + raise TypeError, "this element does not have children or attributes" def set(self, key, value): "set(self, key, value)" @@ -1404,7 +1404,7 @@ if python.PySlice_Check(x): return [] else: - raise IndexError("list index out of range") + raise IndexError, "list index out of range" def __len__(self): "__len__(self)" @@ -1521,7 +1521,7 @@ Relocate the ElementTree to a new root node. 
""" if root._c_node.type != tree.XML_ELEMENT_NODE: - raise TypeError("Only elements can be the root of an ElementTree") + raise TypeError, "Only elements can be the root of an ElementTree" self._context_node = root self._doc = None @@ -1602,7 +1602,7 @@ cdef char* c_path doc = self._context_node._doc if element._doc is not doc: - raise ValueError("Element is not in this tree.") + raise ValueError, "Element is not in this tree." c_doc = _fakeRootDoc(doc._c_doc, self._context_node._c_node) c_path = tree.xmlGetNodePath(element._c_node) _destroyFakeDoc(doc._c_doc, c_doc) @@ -1836,12 +1836,12 @@ def pop(self, key, *default): if python.PyTuple_GET_SIZE(default) > 1: - raise TypeError("pop expected at most 2 arguments, got %d" % ( - python.PyTuple_GET_SIZE(default)+1)) + raise TypeError, "pop expected at most 2 arguments, got %d" % ( + python.PyTuple_GET_SIZE(default)+1) result = _getAttributeValue(self._element, key, None) if result is None: if python.PyTuple_GET_SIZE(default) == 0: - raise KeyError(key) + raise KeyError, key else: result = python.PyTuple_GET_ITEM(default, 0) python.Py_INCREF(result) @@ -1862,7 +1862,7 @@ def __getitem__(self, key): result = _getAttributeValue(self._element, key, None) if result is None: - raise KeyError(key) + raise KeyError, key else: return result @@ -2295,9 +2295,9 @@ c_name = _cstr(name_utf) if c_name[0] == c'#': if not _characterReferenceIsValid(c_name + 1): - raise ValueError("Invalid character reference: '%s'" % name) + raise ValueError, "Invalid character reference: '%s'" % name elif not _xmlNameIsValid(c_name): - raise ValueError("Invalid entity reference: '%s'" % name) + raise ValueError, "Invalid entity reference: '%s'" % name c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) c_node = _createEntity(c_doc, c_name) @@ -2466,8 +2466,8 @@ cdef bint write_declaration if encoding is _unicode: if xml_declaration: - raise ValueError( - "Serialisation to unicode must not request an XML declaration") + raise ValueError, \ + 
"Serialisation to unicode must not request an XML declaration" write_declaration = 0 elif xml_declaration is None: # by default, write an XML declaration only for non-standard encodings @@ -2486,8 +2486,8 @@ encoding, method, write_declaration, 1, pretty_print, with_tail) else: - raise TypeError("Type '%s' cannot be serialized." % - type(element_or_tree)) + raise TypeError, "Type '%s' cannot be serialized." % \ + python._fqtypename(element_or_tree) def tostringlist(element_or_tree, *args, **kwargs): """tostringlist(element_or_tree, *args, **kwargs) @@ -2530,8 +2530,8 @@ return _tostring((<_ElementTree>element_or_tree)._context_node, _unicode, method, 0, 1, pretty_print, with_tail) else: - raise TypeError("Type '%s' cannot be serialized." % - type(element_or_tree)) + raise TypeError, "Type '%s' cannot be serialized." % \ + type(element_or_tree) def parse(source, _BaseParser parser=None, *, base_url=None): """parse(source, parser=None, base_url=None) @@ -2622,8 +2622,8 @@ Raises `AssertionError` if the document does not comply with the schema. """ if not self(etree): - raise AssertionError(self._error_log._buildExceptionMessage( - "Document does not comply with schema")) + raise AssertionError, self._error_log._buildExceptionMessage( + "Document does not comply with schema") property error_log: "The log of validation errors and warnings." Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Apr 25 23:31:46 2008 @@ -213,8 +213,8 @@ # properties are looked up /after/ __setattr__, so we must emulate them if tag == 'text' or tag == 'pyval': # read-only ! 
- raise TypeError("attribute '%s' of '%s' objects is not writable" % - (tag, _typename(self))) + raise TypeError, "attribute '%s' of '%s' objects is not writable" % \ + (tag, _typename(self)) elif tag == 'tail': cetree.setTailText(self._c_node, value) return @@ -269,7 +269,7 @@ if key == 0: return self else: - raise IndexError(key) + raise IndexError, str(key) if key < 0: c_node = c_parent.last else: @@ -277,7 +277,7 @@ c_node = _findFollowingSibling( c_node, tree._getNs(c_self_node), c_self_node.name, key) if c_node is NULL: - raise IndexError(key) + raise IndexError, str(key) return elementFactory(self._doc, c_node) def __setitem__(self, key, value): @@ -306,7 +306,7 @@ if self._c_node.parent is NULL: # the 'root[i] = ...' case - raise TypeError("assignment to root element is invalid") + raise TypeError, "assignment to root element is invalid" if python.PySlice_Check(key): # slice assignment @@ -320,7 +320,7 @@ c_node = _findFollowingSibling( c_node, tree._getNs(self._c_node), self._c_node.name, key) if c_node is NULL: - raise IndexError(key) + raise IndexError, str(key) element = elementFactory(self._doc, c_node) _replaceElement(element, value) @@ -328,7 +328,7 @@ cdef Py_ssize_t start, stop, step, slicelength parent = self.getparent() if parent is None: - raise TypeError("deleting items not supported by root element") + raise TypeError, "deleting items not supported by root element" if python.PySlice_Check(key): # slice deletion del_items = list(self)[key] @@ -445,8 +445,8 @@ cdef object _lookupChildOrRaise(_Element parent, tag): element = _lookupChild(parent, tag) if element is None: - raise AttributeError("no such child: " + - _buildChildTag(parent, tag)) + raise AttributeError, \ + "no such child: " + _buildChildTag(parent, tag) return element cdef object _buildChildTag(_Element parent, tag): @@ -529,7 +529,7 @@ else: c_step = (slice).step if c_step == 0: - raise ValueError("Invalid slice") + raise ValueError, "Invalid slice" del_items = target[slice] # 
collect new values @@ -550,10 +550,10 @@ # sanity check - raise what a list would raise if c_step != 1 and \ python.PyList_GET_SIZE(del_items) != python.PyList_GET_SIZE(new_items): - raise ValueError( + raise ValueError, \ "attempt to assign sequence of size %d to extended slice of size %d" % ( - python.PyList_GET_SIZE(new_items), - python.PyList_GET_SIZE(del_items))) + python.PyList_GET_SIZE(new_items), + python.PyList_GET_SIZE(del_items)) # replace existing items pos = 0 @@ -768,7 +768,7 @@ elif isinstance(other, StringElement): return _numericValueOf(self) * textOf((other)._c_node) else: - raise TypeError("invalid types for * operator") + raise TypeError, "invalid types for * operator" def __mod__(self, other): return _strValueOf(self) % other @@ -845,7 +845,7 @@ return False value = __parseBoolAsInt(s) if value == -1: - raise ValueError("Invalid boolean value: '%s'" % s) + raise ValueError, "Invalid boolean value: '%s'" % s return value cdef inline int __parseBoolAsInt(text): @@ -914,13 +914,13 @@ cdef object _schema_types def __init__(self, name, type_check, type_class, stringify=None): if not python._isString(name): - raise TypeError("Type name must be a string") + raise TypeError, "Type name must be a string" if type_check is not None and not callable(type_check): - raise TypeError("Type check function must be callable (or None)") + raise TypeError, "Type check function must be callable (or None)" if name != TREE_PYTYPE_NAME and \ not issubclass(type_class, ObjectifiedDataElement): - raise TypeError( - "Data classes must inherit from ObjectifiedDataElement") + raise TypeError, \ + "Data classes must inherit from ObjectifiedDataElement" self.name = name self._type = type_class self.type_check = type_check @@ -943,7 +943,7 @@ ignored. Raises ValueError if the dependencies cannot be fulfilled. 
""" if self.name == TREE_PYTYPE_NAME: - raise ValueError("Cannot register tree type") + raise ValueError, "Cannot register tree type" if self.type_check is not None: for item in _TYPE_CHECKS: if item[0] is self.type_check: @@ -965,7 +965,7 @@ if last_pos == -1: _TYPE_CHECKS.append(entry) elif first_pos > last_pos: - raise ValueError("inconsistent before/after dependencies") + raise ValueError, "inconsistent before/after dependencies" else: _TYPE_CHECKS.insert(last_pos, entry) @@ -1723,7 +1723,7 @@ elif isinstance(new_parser, etree.XMLParser): objectify_parser = new_parser else: - raise TypeError("parser must inherit from lxml.etree.XMLParser") + raise TypeError, "parser must inherit from lxml.etree.XMLParser" def makeparser(**kw): """makeparser(remove_blank_text=True, **kw) @@ -1871,7 +1871,7 @@ prefix, name = _xsi.split(':', 1) ns = nsmap.get(prefix) if ns != XML_SCHEMA_NS: - raise ValueError("XSD types require the XSD namespace") + raise ValueError, "XSD types require the XSD namespace" elif nsmap is _DEFAULT_NSMAP: name = _xsi _xsi = 'xsd:' + _xsi @@ -1883,7 +1883,7 @@ _xsi = prefix + ':' + _xsi break else: - raise ValueError("XSD types require the XSD namespace") + raise ValueError, "XSD types require the XSD namespace" python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_TYPE_ATTR, _xsi) if _pytype is None: # allow using unregistered or even wrong xsi:type names Modified: lxml/trunk/src/lxml/nsclasses.pxi ============================================================================== --- lxml/trunk/src/lxml/nsclasses.pxi (original) +++ lxml/trunk/src/lxml/nsclasses.pxi Fri Apr 25 23:31:46 2008 @@ -56,14 +56,14 @@ cdef python.PyObject* dict_result dict_result = python.PyDict_GetItem(self._entries, name) if dict_result is NULL: - raise KeyError("Name not registered.") + raise KeyError, "Name not registered." 
return dict_result cdef object _getForString(self, char* name): cdef python.PyObject* dict_result dict_result = python.PyDict_GetItemString(self._entries, name) if dict_result is NULL: - raise KeyError("Name not registered.") + raise KeyError, "Name not registered." return dict_result def __iter__(self): @@ -82,8 +82,8 @@ "Dictionary-like registry for namespace implementation classes" def __setitem__(self, name, item): if not python.PyType_Check(item) or not issubclass(item, ElementBase): - raise NamespaceRegistryError( - "Registered element classes must be subtypes of ElementBase") + raise NamespaceRegistryError, \ + "Registered element classes must be subtypes of ElementBase" if name is not None: name = _utf8(name) self._entries[name] = item @@ -186,11 +186,11 @@ cdef class _FunctionNamespaceRegistry(_NamespaceRegistry): def __setitem__(self, name, item): if not callable(item): - raise NamespaceRegistryError( - "Registered functions must be callable.") + raise NamespaceRegistryError, \ + "Registered functions must be callable." 
if not name: - raise ValueError( - "extensions must have non empty names") + raise ValueError, \ + "extensions must have non empty names" self._entries[_utf8(name)] = item def __repr__(self): Modified: lxml/trunk/src/lxml/objectpath.pxi ============================================================================== --- lxml/trunk/src/lxml/objectpath.pxi (original) +++ lxml/trunk/src/lxml/objectpath.pxi Fri Apr 25 23:31:46 2008 @@ -50,7 +50,7 @@ python.Py_INCREF(default) use_default = 1 elif use_default > 1: - raise TypeError("invalid number of arguments: needs one or two") + raise TypeError, "invalid number of arguments: needs one or two" return _findObjectPath(root, self._c_path, self._path_len, default, use_default) @@ -113,15 +113,15 @@ # path '.child' => ignore root python.PyList_Append(new_path, _RELATIVE_PATH_SEGMENT) elif index != 0: - raise ValueError("index not allowed on root node") + raise ValueError, "index not allowed on root node" elif not has_dot: - raise ValueError("invalid path") + raise ValueError, "invalid path" python.PyList_Append(new_path, (ns, name, index)) path_pos = match.end() if python.PyList_GET_SIZE(new_path) == 0 or \ python.PyString_GET_SIZE(path) > path_pos: - raise ValueError("invalid path") + raise ValueError, "invalid path" return new_path cdef _parseObjectPathList(path): @@ -146,17 +146,17 @@ else: index_end = cstd.strchr(index_pos + 1, c']') if index_end is NULL: - raise ValueError("index must be enclosed in []") + raise ValueError, "index must be enclosed in []" index = python.PyNumber_Int( python.PyString_FromStringAndSize( index_pos + 1, (index_end - index_pos - 1))) if python.PyList_GET_SIZE(new_path) == 0 and index != 0: - raise ValueError("index not allowed on root node") + raise ValueError, "index not allowed on root node" name = python.PyString_FromStringAndSize( c_name, (index_pos - c_name)) python.PyList_Append(new_path, (ns, name, index)) if python.PyList_GET_SIZE(new_path) == 0: - raise ValueError("invalid path") + 
raise ValueError, "invalid path" return new_path cdef _ObjectPath* _buildObjectPathSegments(path_list) except NULL: @@ -194,9 +194,9 @@ if c_href is NULL or c_href[0] == c'\0': c_href = tree._getNs(c_node) if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError( - "root element does not match: need %s, got %s" % - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag)) + raise ValueError, \ + "root element does not match: need %s, got %s" % \ + (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) while c_node is not NULL: c_path_len = c_path_len - 1 @@ -221,7 +221,7 @@ return default_value else: tag = cetree.namespacedNameFromNsName(c_href, c_name) - raise AttributeError("no such child: " + tag) + raise AttributeError, "no such child: " + tag cdef _createObjectPath(_Element root, _ObjectPath* c_path, Py_ssize_t c_path_len, int replace, value): @@ -236,7 +236,7 @@ cdef char* c_name cdef Py_ssize_t c_index if c_path_len == 1: - raise TypeError("cannot update root node") + raise TypeError, "cannot update root node" c_node = root._c_node c_name = c_path[0].name @@ -244,9 +244,9 @@ if c_href is NULL or c_href[0] == c'\0': c_href = tree._getNs(c_node) if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError( - "root element does not match: need %s, got %s" % - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag)) + raise ValueError, \ + "root element does not match: need %s, got %s" % \ + (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) while c_path_len > 1: c_path_len = c_path_len - 1 @@ -265,8 +265,8 @@ if c_child is not NULL: c_node = c_child elif c_index != 0: - raise TypeError( - "creating indexed path attributes is not supported") + raise TypeError, \ + "creating indexed path attributes is not supported" elif c_path_len == 1: _appendValue(cetree.elementFactory(root._doc, c_node), cetree.namespacedNameFromNsName(c_href, c_name), Modified: lxml/trunk/src/lxml/parser.pxi 
============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Apr 25 23:31:46 2008 @@ -269,8 +269,8 @@ if remaining <= 0: self._bytes = self._filelike.read(c_size) if not python.PyString_Check(self._bytes): - raise TypeError( - "reading file objects must return plain strings") + raise TypeError, \ + "reading file objects must return plain strings" remaining = python.PyString_GET_SIZE(self._bytes) self._bytes_read = 0 if remaining == 0: @@ -420,7 +420,7 @@ result = python.PyThread_acquire_lock( self._lock, python.WAIT_LOCK) if result == 0: - raise ParserError("parser locking failed") + raise ParserError, "parser locking failed" self._error_log.connect() if self._validator is not None: self._validator.connect(self._c_ctxt) @@ -473,7 +473,7 @@ filename, (ctxt.lastError.message).strip()) else: message = "Error reading '%s'" % filename - raise IOError(message) + raise IOError, message elif error_log: raise error_log._buildParseException( XMLSyntaxError, "Document is not well formed") @@ -564,7 +564,7 @@ if not isinstance(self, HTMLParser) and \ not isinstance(self, XMLParser) and \ not isinstance(self, iterparse): - raise TypeError("This class cannot be instantiated") + raise TypeError, "This class cannot be instantiated" self._parse_options = parse_options self._filename = filename @@ -585,7 +585,7 @@ c_encoding = tree.xmlParseCharEncoding(_cstr(encoding)) if c_encoding == tree.XML_CHAR_ENCODING_ERROR or \ c_encoding == tree.XML_CHAR_ENCODING_NONE: - raise LookupError("unknown encoding: '%s'" % encoding) + raise LookupError, "unknown encoding: '%s'" % encoding self._default_encoding = encoding self._default_encoding_int = c_encoding @@ -775,7 +775,7 @@ cdef xmlparser.xmlParserCtxt* pctxt cdef char* c_encoding if c_len > python.INT_MAX: - raise ParserError("string is too long to parse it with libxml2") + raise ParserError, "string is too long to parse it with libxml2" 
context = self._getParserContext() context.prepare() @@ -910,13 +910,13 @@ py_buffer_len = python.PyString_GET_SIZE(data) elif python.PyUnicode_Check(data): if _UNICODE_ENCODING is NULL: - raise ParserError( - "Unicode parsing is not supported on this platform") + raise ParserError, \ + "Unicode parsing is not supported on this platform" c_encoding = _UNICODE_ENCODING c_data = python.PyUnicode_AS_DATA(data) py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) else: - raise TypeError("Parsing requires string data") + raise TypeError, "Parsing requires string data" context = self._getPushParserContext() pctxt = context._c_ctxt @@ -1344,7 +1344,7 @@ return _parseFilelikeDocument( source, _encodeFilenameUTF8(url), parser) - raise TypeError("cannot parse from '%s'" % python._fqtypename(source)) + raise TypeError, "cannot parse from '%s'" % python._fqtypename(source) cdef _Document _parseDocumentFromURL(url, _BaseParser parser): cdef xmlDoc* c_doc @@ -1355,13 +1355,13 @@ cdef xmlDoc* c_doc if python.PyUnicode_Check(text): if _hasEncodingDeclaration(text): - raise ValueError( - "Unicode strings with encoding declaration are not supported.") + raise ValueError, \ + "Unicode strings with encoding declaration are not supported." 
# pass native unicode only if libxml2 can handle it if _UNICODE_ENCODING is NULL: text = python.PyUnicode_AsUTF8String(text) elif not python.PyString_Check(text): - raise ValueError("can only parse strings") + raise ValueError, "can only parse strings" if python.PyUnicode_Check(url): url = python.PyUnicode_AsUTF8String(url) c_doc = _parseDoc(text, url, parser) Modified: lxml/trunk/src/lxml/readonlytree.pxi ============================================================================== --- lxml/trunk/src/lxml/readonlytree.pxi (original) +++ lxml/trunk/src/lxml/readonlytree.pxi Fri Apr 25 23:31:46 2008 @@ -78,7 +78,7 @@ cdef xmlNode* c_node c_node = _findChild(self._c_node, index) if c_node is NULL: - raise IndexError("list index out of range") + raise IndexError, "list index out of range" return _newReadOnlyProxy(self._source_proxy, c_node) def __getslice__(self, Py_ssize_t start, Py_ssize_t stop): @@ -320,8 +320,8 @@ elif isinstance(element, _ReadOnlyElementProxy): c_node = (<_ReadOnlyElementProxy>element)._c_node else: - raise TypeError("invalid value to append()") + raise TypeError, "invalid value to append()" if c_node is NULL: - raise TypeError("invalid element") + raise TypeError, "invalid element" return c_node Modified: lxml/trunk/src/lxml/relaxng.pxi ============================================================================== --- lxml/trunk/src/lxml/relaxng.pxi (original) +++ lxml/trunk/src/lxml/relaxng.pxi Fri Apr 25 23:31:46 2008 @@ -47,7 +47,7 @@ if c_href is NULL or \ cstd.strcmp(c_href, 'http://relaxng.org/ns/structure/1.0') != 0: - raise RelaxNGParseError("Document is not Relax NG") + raise RelaxNGParseError, "Document is not Relax NG" self._error_log.connect() fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(fake_c_doc) @@ -61,7 +61,7 @@ self._error_log.connect() parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc) else: - raise RelaxNGParseError("No tree or file given") + raise 
RelaxNGParseError, "No tree or file given" if parser_ctxt is NULL: self._error_log.disconnect() Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Fri Apr 25 23:31:46 2008 @@ -85,8 +85,8 @@ self._c_schema_doc = NULL _Validator.__init__(self) if not config.ENABLE_SCHEMATRON: - raise SchematronError( - "lxml.etree was compiled without Schematron support.") + raise SchematronError, \ + "lxml.etree was compiled without Schematron support." if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) @@ -103,7 +103,7 @@ self._error_log.connect() parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename)) else: - raise SchematronParseError("No tree or file given") + raise SchematronParseError, "No tree or file given" if parser_ctxt is NULL: self._error_log.disconnect() Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Fri Apr 25 23:31:46 2008 @@ -15,7 +15,7 @@ return OUTPUT_METHOD_HTML if method == "text": return OUTPUT_METHOD_TEXT - raise ValueError("unknown output method %r" % method) + raise ValueError, "unknown output method %r" % method cdef _textToString(xmlNode* c_node, encoding, bint with_tail): cdef bint needs_conversion @@ -89,8 +89,8 @@ # encoding during output enchandler = tree.xmlFindCharEncodingHandler(c_enc) if enchandler is NULL and c_enc is not NULL: - raise LookupError(python.PyString_FromFormat( - "unknown encoding: '%s'", c_enc)) + raise LookupError, python.PyString_FromFormat( + "unknown encoding: '%s'", c_enc) c_buffer = tree.xmlAllocOutputBuffer(enchandler) if c_buffer is NULL: tree.xmlCharEncCloseFunc(enchandler) @@ -278,13 +278,13 @@ _writeFilelikeWriter, _closeFilelikeWriter, self, 
enchandler) if c_buffer is NULL: - raise IOError("Could not create I/O writer context.") + raise IOError, "Could not create I/O writer context." return c_buffer cdef int write(self, char* c_buffer, int size): try: if self._filelike is None: - raise IOError("File is already closed") + raise IOError, "File is already closed" py_buffer = python.PyString_FromStringAndSize(c_buffer, size) self._filelike.write(py_buffer) return size @@ -328,8 +328,8 @@ return enchandler = tree.xmlFindCharEncodingHandler(c_enc) if enchandler is NULL: - raise LookupError(python.PyString_FromFormat( - "unknown encoding: '%s'", c_enc)) + raise LookupError, python.PyString_FromFormat( + "unknown encoding: '%s'", c_enc) if _isString(f): filename8 = _encodeFilename(f) @@ -343,7 +343,8 @@ c_buffer = writer._createOutputBuffer(enchandler) else: tree.xmlCharEncCloseFunc(enchandler) - raise TypeError("File or filename expected, got '%s'" % type(f)) + raise TypeError, \ + "File or filename expected, got '%s'" % python._fqtypename(f) _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, write_xml_declaration, write_doctype, @@ -380,7 +381,8 @@ writer.error_log.disconnect() tree.xmlOutputBufferClose(c_buffer) else: - raise TypeError("File or filename expected, got '%s'" % type(f)) + raise TypeError, \ + "File or filename expected, got '%s'" % python._fqtypename(f) finally: _destroyFakeDoc(c_base_doc, c_doc) @@ -393,14 +395,14 @@ errors = writer.error_log if len(errors): message = errors[0].message - raise C14NError(message) + raise C14NError, message # dump node to file (mainly for debug) cdef _dumpToFile(f, xmlNode* c_node, bint pretty_print, bint with_tail): cdef tree.xmlOutputBuffer* c_buffer if not python.PyFile_Check(f): - raise ValueError("not a file") + raise ValueError, "not a file" c_buffer = tree.xmlOutputBufferCreateFile(python.PyFile_AsFile(f), NULL) tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, pretty_print, NULL) if with_tail: Modified: lxml/trunk/src/lxml/xmlid.pxi 
============================================================================== --- lxml/trunk/src/lxml/xmlid.pxi (original) +++ lxml/trunk/src/lxml/xmlid.pxi Fri Apr 25 23:31:46 2008 @@ -69,7 +69,7 @@ cdef _Document doc doc = _documentOrRaise(etree) if doc._c_doc.ids is NULL: - raise ValueError("No ID dictionary available.") + raise ValueError, "No ID dictionary available." self._doc = doc self._keys = None self._items = None @@ -85,10 +85,10 @@ id_utf = _utf8(id_name) c_id = tree.xmlHashLookup(c_ids, _cstr(id_utf)) if c_id is NULL: - raise KeyError("key not found.") + raise KeyError, "key not found." c_attr = c_id.attr if c_attr is NULL or c_attr.parent is NULL: - raise KeyError("ID attribute not found.") + raise KeyError, "ID attribute not found." return _elementFactory(self._doc, c_attr.parent) def get(self, id_name): Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Fri Apr 25 23:31:46 2008 @@ -47,7 +47,7 @@ c_href = _getNs(c_node) if c_href is NULL or \ cstd.strcmp(c_href, 'http://www.w3.org/2001/XMLSchema') != 0: - raise XMLSchemaParseError("Document is not XML Schema") + raise XMLSchemaParseError, "Document is not XML Schema" fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) self._error_log.connect() @@ -62,7 +62,7 @@ self._error_log.connect() parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(doc._c_doc) else: - raise XMLSchemaParseError("No tree or file given") + raise XMLSchemaParseError, "No tree or file given" if parser_ctxt is not NULL: self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt) Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Fri Apr 25 23:31:46 2008 @@ -158,7 +158,7 @@ result = python.PyThread_acquire_lock( self._eval_lock, 
python.WAIT_LOCK) if result == 0: - raise ParserError("parser locking failed") + raise ParserError, "parser locking failed" return 0 cdef void _unlock(self): Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri Apr 25 23:31:46 2008 @@ -278,8 +278,8 @@ if extensions is not None and extensions: for ns_name_tuple, extension in extensions.items(): if ns_name_tuple[0] is None: - raise XSLTExtensionError( - "extensions must not have empty namespaces") + raise XSLTExtensionError, \ + "extensions must not have empty namespaces" if isinstance(extension, XSLTExtension): if self._extension_elements is EMPTY_READ_ONLY_DICT: self._extension_elements = {} @@ -727,10 +727,10 @@ cdef char* c_href cdef xmlAttr* c_attr if self._c_node.content is NULL: - raise ValueError("PI lacks content") + raise ValueError, "PI lacks content" hrefs_utf = _FIND_PI_HREF(' ' + self._c_node.content) if len(hrefs_utf) != 1: - raise ValueError("malformed PI attributes") + raise ValueError, "malformed PI attributes" href_utf = hrefs_utf[0] c_href = _cstr(href_utf) @@ -756,20 +756,20 @@ # try XPath search root = _findStylesheetByID(self._doc, funicode(c_href)) if not root: - raise ValueError("reference to non-existing embedded stylesheet") + raise ValueError, "reference to non-existing embedded stylesheet" elif len(root) > 1: - raise ValueError("ambiguous reference to embedded stylesheet") + raise ValueError, "ambiguous reference to embedded stylesheet" result_node = root[0] return _elementTreeFactory(result_node._doc, result_node) def set(self, key, value): if key != "href": - raise AttributeError( - "only setting the 'href' attribute is supported on XSLT-PIs") + raise AttributeError, \ + "only setting the 'href' attribute is supported on XSLT-PIs" if value is None: attrib = "" elif '"' in value or '>' in value: - raise ValueError("Invalid URL, must not contain 
'\"' or '>'") + raise ValueError, "Invalid URL, must not contain '\"' or '>'" else: attrib = ' href="%s"' % value text = ' ' + self.text Modified: lxml/trunk/src/lxml/xsltext.pxi ============================================================================== --- lxml/trunk/src/lxml/xsltext.pxi (original) +++ lxml/trunk/src/lxml/xsltext.pxi Fri Apr 25 23:31:46 2008 @@ -58,8 +58,8 @@ tree.xmlUnlinkNode(c_node) proxy.free_after_use() else: - raise TypeError("unsupported XSLT result type: %d" % - c_node.type) + raise TypeError, \ + "unsupported XSLT result type: %d" % c_node.type c_node = c_next finally: # free all intermediate nodes that will not be freed by proxies @@ -94,8 +94,8 @@ dict_result = python.PyDict_GetItem( context._extension_elements, (c_uri, c_inst_node.name)) if dict_result is NULL: - raise KeyError("extension element %s not found" % - c_inst_node.name) + raise KeyError, \ + "extension element %s not found" % c_inst_node.name extension = dict_result try: From scoder at codespeak.net Fri Apr 25 23:31:52 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 25 Apr 2008 23:31:52 +0200 (CEST) Subject: [Lxml-checkins] r54137 - in lxml/trunk: . src/lxml Message-ID: <20080425213152.9604A169EE4@codespeak.net> Author: scoder Date: Fri Apr 25 23:31:52 2008 New Revision: 54137 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/python.pxd Log: r4093 at delle: sbehnel | 2008-04-25 23:30:13 +0200 cleanup in exception __init__ Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Apr 25 23:31:52 2008 @@ -105,7 +105,11 @@ this one. 
""" def __init__(self, message, error_log=None): - _initError(self, message) + if python.PY_VERSION_HEX >= 0x02050000: + # Python >= 2.5 uses new style class exceptions + super(_LxmlError, self).__init__(message) + else: + super_init(self, message) if error_log is None: self.error_log = __copyGlobalErrorLog() else: @@ -114,16 +118,8 @@ cdef object _LxmlError _LxmlError = LxmlError -def _superError(obj, message): - super(_LxmlError, obj).__init__(message) - -cdef object _initError -if isinstance(_LxmlError, type): - _initError = _superError # Python >= 2.5 -else: - _initError = Error.__init__ # Python <= 2.4 - -del _superError +cdef object super_init +super_init = Error.__init__ # superclass for all syntax errors Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Apr 25 23:31:52 2008 @@ -6,6 +6,7 @@ ctypedef int size_t cdef int INT_MAX cdef int PY_SSIZE_T_MAX + cdef int PY_VERSION_HEX cdef void Py_INCREF(object o) cdef void Py_DECREF(object o) From scoder at codespeak.net Sun Apr 27 21:03:04 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 27 Apr 2008 21:03:04 +0200 (CEST) Subject: [Lxml-checkins] r54165 - in lxml/trunk: . doc Message-ID: <20080427190304.C63D52A0151@codespeak.net> Author: scoder Date: Sun Apr 27 21:02:49 2008 New Revision: 54165 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/objectify.txt Log: r4096 at delle: sbehnel | 2008-04-27 20:59:31 +0200 more robust test Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Sun Apr 27 21:02:49 2008 @@ -1063,7 +1063,9 @@ .. 
sourcecode:: pycon >>> el = objectify.DataElement('5', _xsi='string') - >>> for prefix, namespace in el.nsmap.items(): + >>> namespaces = el.nsmap.items() + >>> namespaces.sort() + >>> for prefix, namespace in namespaces: ... print prefix, '-', namespace py - http://codespeak.net/lxml/objectify/pytype xsd - http://www.w3.org/2001/XMLSchema @@ -1079,10 +1081,12 @@ >>> el = objectify.DataElement('5', _xsi='foo:string', ... nsmap={'foo': 'http://www.w3.org/2001/XMLSchema'}) - >>> for prefix, namespace in el.nsmap.items(): + >>> namespaces = el.nsmap.items() + >>> namespaces.sort() + >>> for prefix, namespace in namespaces: ... print prefix, '-', namespace - py - http://codespeak.net/lxml/objectify/pytype foo - http://www.w3.org/2001/XMLSchema + py - http://codespeak.net/lxml/objectify/pytype xsi - http://www.w3.org/2001/XMLSchema-instance >>> print el.get("{http://www.w3.org/2001/XMLSchema-instance}type") @@ -1096,11 +1100,13 @@ >>> el = objectify.DataElement('5', _xsi='foo:string', ... nsmap={'foo': 'http://www.w3.org/2001/XMLSchema', ... 'myxsi': 'http://www.w3.org/2001/XMLSchema-instance'}) - >>> for prefix, namespace in el.nsmap.items(): + >>> namespaces = el.nsmap.items() + >>> namespaces.sort() + >>> for prefix, namespace in namespaces: ... print prefix, '-', namespace - py - http://codespeak.net/lxml/objectify/pytype foo - http://www.w3.org/2001/XMLSchema myxsi - http://www.w3.org/2001/XMLSchema-instance + py - http://codespeak.net/lxml/objectify/pytype >>> print el.get("{http://www.w3.org/2001/XMLSchema-instance}type") foo:string From scoder at codespeak.net Sun Apr 27 21:03:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 27 Apr 2008 21:03:09 +0200 (CEST) Subject: [Lxml-checkins] r54166 - in lxml/trunk: . 
src/lxml Message-ID: <20080427190309.317E42A0151@codespeak.net> Author: scoder Date: Sun Apr 27 21:03:08 2008 New Revision: 54166 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/python.pxd Log: r4097 at delle: sbehnel | 2008-04-27 21:00:32 +0200 fixes for namespace setup on new Elements Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Apr 27 21:03:08 2008 @@ -2,6 +2,30 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Passing an nsmap when creating an Element will no longer strip + redundantly defined namespace URIs. This prevented the definition + of more than one prefix for a namespace on the same Element. + +Other changes +------------- + +* If the default namespace is redundantly defined with a prefix on the + same Element, the prefix will now be preferred for subelements and + attributes. This allows users to work around a problem in libxml2 + where attributes from the default namespace could serialise without + a prefix even when they appear on an Element with a different + namespace (i.e. they would end up in the wrong namespace). + + 2.1beta1 (2008-04-15) ===================== Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sun Apr 27 21:03:08 2008 @@ -189,18 +189,31 @@ doc._setNodeNs(c_node, _cstr(node_ns_utf)) return 0 - for prefix, href in nsmap.items(): + # Sort the prefixes backwards to move the default namespace to the + # end. This makes sure libxml2 prefers a prefix if the ns is + # defined redundantly on the same element. 
That way, users can + # work around a problem themselves where default namespace + # attributes on non-default namespaced elements serialise without + # prefix (i.e. into the non-default namespace). + nsdefs = list(nsmap.items()) + if python.PyList_GET_SIZE(nsdefs) > 1: + python.PyList_Sort(nsdefs) + python.PyList_Reverse(nsdefs) + + for prefix, href in nsdefs: href_utf = _utf8(href) c_href = _cstr(href_utf) - if prefix is not None and prefix: + if prefix is not None: prefix_utf = _utf8(prefix) _prefixValidOrRaise(prefix_utf) c_prefix = _cstr(prefix_utf) else: c_prefix = NULL - # add namespace with prefix if ns is not already known - c_ns = tree.xmlSearchNsByHref(doc._c_doc, c_node, c_href) - if c_ns is NULL: + # add namespace with prefix if it is not already known + c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix) + if c_ns is NULL or \ + c_ns.href is NULL or \ + cstd.strcmp(c_ns.href, c_href) != 0: c_ns = tree.xmlNewNs(c_node, c_href, c_prefix) if href_utf == node_ns_utf: tree.xmlSetNs(c_node, c_ns) Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sun Apr 27 21:03:08 2008 @@ -59,6 +59,8 @@ cdef int PyList_Insert(object l, Py_ssize_t index, object o) except -1 cdef object PyList_AsTuple(object l) cdef void PyList_Clear(object l) + cdef void PyList_Sort(object l) + cdef void PyList_Reverse(object l) cdef int PyDict_SetItemString(object d, char* key, object value) except -1 cdef int PyDict_SetItem(object d, object key, object value) except -1 From scoder at codespeak.net Sun Apr 27 21:09:05 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 27 Apr 2008 21:09:05 +0200 (CEST) Subject: [Lxml-checkins] r54167 - in lxml/trunk: . 
src/lxml Message-ID: <20080427190905.DF3EE168531@codespeak.net> Author: scoder Date: Sun Apr 27 21:09:01 2008 New Revision: 54167 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/python.pxd Log: r4100 at delle: sbehnel | 2008-04-27 21:07:36 +0200 signature fix Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sun Apr 27 21:09:01 2008 @@ -56,11 +56,10 @@ cdef void PyList_SET_ITEM(object l, Py_ssize_t index, object value) cdef int PyList_Append(object l, object obj) except -1 cdef int PyList_Reverse(object l) except -1 + cdef int PyList_Sort(object l) except -1 cdef int PyList_Insert(object l, Py_ssize_t index, object o) except -1 cdef object PyList_AsTuple(object l) cdef void PyList_Clear(object l) - cdef void PyList_Sort(object l) - cdef void PyList_Reverse(object l) cdef int PyDict_SetItemString(object d, char* key, object value) except -1 cdef int PyDict_SetItem(object d, object key, object value) except -1 From scoder at codespeak.net Sun Apr 27 22:04:25 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 27 Apr 2008 22:04:25 +0200 (CEST) Subject: [Lxml-checkins] r54168 - in lxml/trunk: . 
src/lxml Message-ID: <20080427200425.1B8D3168568@codespeak.net> Author: scoder Date: Sun Apr 27 22:04:22 2008 New Revision: 54168 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/python.pxd Log: r4102 at delle: sbehnel | 2008-04-27 22:02:55 +0200 cleanup of last change: avoid stuff that will break with Py3 anyway Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sun Apr 27 22:04:22 2008 @@ -189,16 +189,17 @@ doc._setNodeNs(c_node, _cstr(node_ns_utf)) return 0 - # Sort the prefixes backwards to move the default namespace to the - # end. This makes sure libxml2 prefers a prefix if the ns is - # defined redundantly on the same element. That way, users can - # work around a problem themselves where default namespace - # attributes on non-default namespaced elements serialise without - # prefix (i.e. into the non-default namespace). nsdefs = list(nsmap.items()) - if python.PyList_GET_SIZE(nsdefs) > 1: - python.PyList_Sort(nsdefs) - python.PyList_Reverse(nsdefs) + if None in nsmap and python.PyList_GET_SIZE(nsdefs) > 1: + # Move the default namespace to the end. This makes sure libxml2 + # prefers a prefix if the ns is defined redundantly on the same + # element. That way, users can work around a problem themselves + # where default namespace attributes on non-default namespaced + # elements serialise without prefix (i.e. into the non-default + # namespace). 
+ item = (None, nsmap[None]) + nsdefs.remove(item) + nsdefs.append(item) for prefix, href in nsdefs: href_utf = _utf8(href) Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Sun Apr 27 22:04:22 2008 @@ -56,7 +56,6 @@ cdef void PyList_SET_ITEM(object l, Py_ssize_t index, object value) cdef int PyList_Append(object l, object obj) except -1 cdef int PyList_Reverse(object l) except -1 - cdef int PyList_Sort(object l) except -1 cdef int PyList_Insert(object l, Py_ssize_t index, object o) except -1 cdef object PyList_AsTuple(object l) cdef void PyList_Clear(object l)