From ianb at codespeak.net Sat Apr 5 17:39:15 2008 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 5 Apr 2008 17:39:15 +0200 (CEST) Subject: [Lxml-checkins] r53392 - in lxml/trunk: . src/lxml/html Message-ID: <20080405153915.7AE2316AA5A@codespeak.net> Author: ianb Date: Sat Apr 5 17:39:13 2008 New Revision: 53392 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/__init__.py Log: fix cssselect method Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Apr 5 17:39:13 2008 @@ -15,6 +15,8 @@ * lxml.etree accepted non well-formed namespace prefix names. +* HTML elements' ``.cssselect()`` method was broken. + Other changes ------------- Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Sat Apr 5 17:39:13 2008 @@ -194,7 +194,7 @@ that pre-compiling the expression can provide a substantial speedup. """ - return cssselect.CSSSelect(expr)(self) + return cssselect.CSSSelector(expr)(self) ######################################## ## Link functions From lxml-checkins at codespeak.net Mon Apr 7 09:05:51 2008 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Mon, 7 Apr 2008 09:05:51 +0200 (CEST) Subject: [Lxml-checkins] Gucci 48757 Message-ID: <20080407100449.2614.qmail@adsl-dyn230.78-99-169.t-com.sk> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20080407/86e58e97/attachment-0001.htm From jholg at codespeak.net Mon Apr 7 15:29:27 2008 From: jholg at codespeak.net (jholg at codespeak.net) Date: Mon, 7 Apr 2008 15:29:27 +0200 (CEST) Subject: [Lxml-checkins] r53527 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20080407132927.D18F916A273@codespeak.net> Author: jholg Date: Mon Apr 7 15:29:27 2008 New Revision: 53527 Modified: lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: Added __int__, __long__, __float__, __complex__ methods to StringElement, plus tests. Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Mon Apr 7 15:29:27 2008 @@ -773,6 +773,18 @@ def __mod__(self, other): return _strValueOf(self) % other + def __int__(self): + return int(textOf(self._c_node)) + + def __long__(self): + return long(textOf(self._c_node)) + + def __float__(self): + return float(textOf(self._c_node)) + + def __complex__(self): + return complex(textOf(self._c_node)) + cdef class NoneElement(ObjectifiedDataElement): def __str__(self): return "None" Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Apr 7 15:29:27 2008 @@ -815,6 +815,26 @@ el = objectify.DataElement(s) val = 5 self.assertRaises(TypeError, el.__mod__, val) + + def test_type_str_as_int(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(int(el), 1) + + def test_type_str_as_long(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(long(el), 1) + + def test_type_str_as_float(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(float(el), 1) + + def test_type_str_as_complex(self): + v = "1" + el = objectify.DataElement(v) + self.assertEquals(complex(el), 1) def test_type_str_mod_data_elements(self): s = "%d %f %s %r" From scoder at codespeak.net Tue Apr 8 19:05:14 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 8 Apr 2008 19:05:14 +0200 
(CEST) Subject: [Lxml-checkins] r53591 - in lxml/branch/lxml-2.0: . src/lxml/html Message-ID: <20080408170514.1032916A968@codespeak.net> Author: scoder Date: Tue Apr 8 19:05:13 2008 New Revision: 53591 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/html/__init__.py Log: merged in cssselect() bug fix by Ian Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Tue Apr 8 19:05:13 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* HTML elements' ``.cssselect()`` method was broken. + * ``ElementTree.find*()`` didn't accept QName objects. Other changes Modified: lxml/branch/lxml-2.0/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/html/__init__.py (original) +++ lxml/branch/lxml-2.0/src/lxml/html/__init__.py Tue Apr 8 19:05:13 2008 @@ -194,7 +194,7 @@ that pre-compiling the expression can provide a substantial speedup. """ - return cssselect.CSSSelect(expr)(self) + return cssselect.CSSSelector(expr)(self) ######################################## ## Link functions From scoder at codespeak.net Thu Apr 10 08:00:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:39 +0200 (CEST) Subject: [Lxml-checkins] r53639 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080410060039.EFDE42A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:37 2008 New Revision: 53639 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: r3916 at delle: sbehnel | 2008-04-08 09:45:15 +0200 objectify: let BoolElement inherit from IntElement (as in Python), lots of cleanup in 'special methods' Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Thu Apr 10 08:00:37 2008 @@ -618,41 +618,44 @@ cetree.setNodeText(self._c_node, s) cdef class NumberElement(ObjectifiedDataElement): - cdef object _type + cdef object _parse_value def _setValueParser(self, function): - "Set the function that parses the Python value from a string." - self._type = function + """Set the function that parses the Python value from a string. - cdef _value(self): - return self._type(textOf(self._c_node)) + Do not use this unless you know what you are doing. 
+ """ + self._parse_value = function property pyval: def __get__(self): - return self._value() + return _parseNumber(self) def __int__(self): - return int(textOf(self._c_node)) + return int(_parseNumber(self)) def __long__(self): - return long(textOf(self._c_node)) + return long(_parseNumber(self)) def __float__(self): - return float(textOf(self._c_node)) + return float(_parseNumber(self)) + + def __complex__(self): + return complex(_parseNumber(self)) def __str__(self): - return str(self._type(textOf(self._c_node))) + return str(_parseNumber(self)) def __repr__(self): - return repr(self._type(textOf(self._c_node))) + return repr(_parseNumber(self)) + + def __oct__(self): + return oct(_parseNumber(self)) -# def __oct__(self): -# def __hex__(self): + def __hex__(self): + return hex(_parseNumber(self)) def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _numericValueOf(self), other, op) + return _richcmpPyvals(self, other, op) def __add__(self, other): return _numericValueOf(self) + _numericValueOf(other) @@ -710,15 +713,15 @@ cdef class IntElement(NumberElement): def _init(self): - self._type = int + self._parse_value = int cdef class LongElement(NumberElement): def _init(self): - self._type = long + self._parse_value = long cdef class FloatElement(NumberElement): def _init(self): - self._type = float + self._parse_value = float cdef class StringElement(ObjectifiedDataElement): """String data class. @@ -748,10 +751,7 @@ return len(text) > 0 def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _strValueOf(self), other, op) + return _richcmpPyvals(self, other, op) def __add__(self, other): text = _strValueOf(self) @@ -807,61 +807,64 @@ def __get__(self): return None -cdef class BoolElement(ObjectifiedDataElement): +cdef class BoolElement(IntElement): """Boolean type base on string values: 'true' or 'false'. 
+ + Note that this inherits from IntElement to mimic the behaviour of + Python's bool type. """ - cdef int _boolval(self) except -1: - cdef char* c_str - text = textOf(self._c_node) - if text is None: - return 0 - c_str = _cstr(text) - if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == c'F': - if c_str[1] == c'\0' or text == "false" or text.lower() == "false": - # '0' or 'f' or 'false' - return 0 - elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': - if c_str[1] == c'\0' or text == "true" or text.lower() == "true": - # '1' or 't' or 'true' - return 1 - raise ValueError("Invalid boolean value: '%s'" % text) + def _init(self): + self._parse_value = __parseBool def __nonzero__(self): - if self._boolval(): - return True - else: - return False + return __parseBool(textOf(self._c_node)) def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - if hasattr(self, 'pyval'): - self_val = self.pyval - else: - self_val = bool(self) - return python.PyObject_RichCompare(self_val, other, op) + return _richcmpPyvals(self, other, op) def __str__(self): - if self._boolval(): - return "True" - else: - return "False" + return str(__parseBool(textOf(self._c_node))) def __repr__(self): - if self._boolval(): - return "True" - else: - return "False" + return repr(__parseBool(textOf(self._c_node))) property pyval: def __get__(self): - return self.__nonzero__() + return __parseBool(textOf(self._c_node)) def __checkBool(s): - if s != 'true' and s != 'false' and s != '1' and s != '0': + cdef int value = -1 + if s is not None: + value = __parseBoolAsInt(s) + if value == -1: raise ValueError -cdef object _strValueOf(obj): +cpdef __parseBool(s): + cdef int value + if s is None: + return False + value = __parseBoolAsInt(s) + if value == -1: + raise ValueError("Invalid boolean value: '%s'" % s) + return value + +cdef inline int __parseBoolAsInt(text): + cdef char* c_str + c_str = _cstr(text) + if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == 
c'F': + if c_str[1] == c'\0' or text == "false" or text.lower() == "false": + # '0' or 'f' or 'false' + return 0 + elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': + if c_str[1] == c'\0' or text == "true" or text.lower() == "true": + # '1' or 't' or 'true' + return 1 + return -1 + +cdef inline _parseNumber(NumberElement element): + return element._parse_value(textOf(element._c_node)) + +cdef inline object _strValueOf(obj): if python._isString(obj): return obj if isinstance(obj, _Element): @@ -870,15 +873,20 @@ return '' return str(obj) -cdef object _numericValueOf(obj): +cdef inline object _numericValueOf(obj): if isinstance(obj, NumberElement): - return (obj)._type( - textOf((obj)._c_node)) + return _parseNumber(obj) elif hasattr(obj, 'pyval'): # not always numeric, but Python will raise the right exception return obj.pyval return obj +cdef inline _richcmpPyvals(left, right, int op): + left = getattr3(left, 'pyval', left) + right = getattr3(right, 'pyval', right) + return python.PyObject_RichCompare(left, right, op) + + ################################################################################ # Python type registry Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Thu Apr 10 08:00:37 2008 @@ -737,10 +737,20 @@ root = Element("{objectified}root") root.bool = True self.assertEquals(root.bool, True) + self.assertEquals(root.bool + root.bool, True + True) + self.assertEquals(True + root.bool, True + root.bool) + self.assertEquals(root.bool * root.bool, True * True) + self.assertEquals(int(root.bool), int(True)) + self.assertEquals(complex(root.bool), complex(True)) self.assert_(isinstance(root.bool, objectify.BoolElement)) root.bool = False self.assertEquals(root.bool, False) + self.assertEquals(root.bool + root.bool, False + False) + self.assertEquals(False + 
root.bool, False + root.bool) + self.assertEquals(root.bool * root.bool, False * False) + self.assertEquals(int(root.bool), int(False)) + self.assertEquals(complex(root.bool), complex(False)) self.assert_(isinstance(root.bool, objectify.BoolElement)) def test_data_element_bool(self): From scoder at codespeak.net Thu Apr 10 08:00:44 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:44 +0200 (CEST) Subject: [Lxml-checkins] r53640 - in lxml/trunk: . doc Message-ID: <20080410060044.C03302A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:43 2008 New Revision: 53640 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r3917 at delle: sbehnel | 2008-04-09 15:18:10 +0200 more 'who uses lxml' entries Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Apr 10 08:00:43 2008 @@ -154,12 +154,14 @@ * lwebstring_, an XML template engine * OpenXMLlib_, a library for handling OpenXML document meta data * Pycoon_, a WSGI web development framework based on XML pipelines -* rfadict_, an RDFa parser with a simple dictionary-like interface. +* Rambler_, a meta search engine that aggregates different data sources +* rdfadict_, an RDFa parser with a simple dictionary-like interface. Zope3 and some of its extensions have good support for lxml: * gocept.lxml_, Zope3 interface bindings for lxml * z3c.rml_, an implementation of ReportLab's RML format +* zif.sedna_, an XQuery based interface to the Sedna OpenSource XML database And don't miss the quotes by our generally happy_ users_, and other `sites that link to lxml`_. @@ -172,8 +174,10 @@ .. _lwebstring: http://pypi.python.org/pypi/lwebstring .. _OpenXMLlib: http://permalink.gmane.org/gmane.comp.python.lxml.devel/3250 .. _Pycoon: http://pypi.python.org/pypi/pycoon -.. _rfadict: http://pypi.python.org/pypi/rdfadict +.. 
_Rambler: http://beta.rambler.ru/srch?query=python+lxml&searchtype=web +.. _rdfadict: http://pypi.python.org/pypi/rdfadict .. _z3c.rml: http://pypi.python.org/pypi/z3c.rml +.. _zif.sedna: http://pypi.python.org/pypi/zif.sedna .. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244 .. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 From scoder at codespeak.net Thu Apr 10 08:00:49 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:00:49 +0200 (CEST) Subject: [Lxml-checkins] r53641 - in lxml/trunk: . src/lxml Message-ID: <20080410060049.C8A3F2A0192@codespeak.net> Author: scoder Date: Thu Apr 10 08:00:48 2008 New Revision: 53641 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xmlerror.pxd lxml/trunk/src/lxml/xmlerror.pxi Log: r3918 at delle: sbehnel | 2008-04-10 07:58:58 +0200 new error constants as of libxml2 2.6.32 Modified: lxml/trunk/src/lxml/xmlerror.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxd (original) +++ lxml/trunk/src/lxml/xmlerror.pxd Thu Apr 10 08:00:48 2008 @@ -39,6 +39,7 @@ XML_FROM_WRITER = 25 # The xmlwriter module XML_FROM_MODULE = 26 # The dynamically loaded module modul XML_FROM_I18N = 27 # The module handling character conversion + XML_FROM_SCHEMATRONV = 28 # The Schematron validator module ctypedef enum xmlParserErrors: XML_ERR_OK = 0 @@ -345,6 +346,7 @@ XML_TREE_INVALID_HEX = 1300 XML_TREE_INVALID_DEC = 1301 # 1301 XML_TREE_UNTERMINATED_ENTITY = 1302 # 1302 + XML_TREE_NOT_UTF8 = 1303 # 1303 XML_SAVE_NOT_UTF8 = 1400 XML_SAVE_CHAR_INVALID = 1401 # 1401 XML_SAVE_NO_DOCTYPE = 1402 # 1402 @@ -720,6 +722,8 @@ XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088 XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089 XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090 + XML_SCHEMATRONV_ASSERT = 4000 # 4000 + XML_SCHEMATRONV_REPORT = 4001 XML_MODULE_OPEN = 4900 # 4900 XML_MODULE_CLOSE = 4901 # 4901 XML_CHECK_FOUND_ELEMENT = 
5000 Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Thu Apr 10 08:00:48 2008 @@ -584,6 +584,7 @@ WRITER=25 MODULE=26 I18N=27 +SCHEMATRONV=28 """,) cdef object __PARSER_ERROR_TYPES @@ -898,13 +899,14 @@ TREE_INVALID_HEX=1300 TREE_INVALID_DEC=1301 TREE_UNTERMINATED_ENTITY=1302 +TREE_NOT_UTF8=1303 SAVE_NOT_UTF8=1400 SAVE_CHAR_INVALID=1401 SAVE_NO_DOCTYPE=1402 SAVE_UNKNOWN_ENCODING=1403 -REGEXP_COMPILE_ERROR=1450 """, """\ +REGEXP_COMPILE_ERROR=1450 IO_UNKNOWN=1500 IO_EACCES=1501 IO_EAGAIN=1502 @@ -996,9 +998,9 @@ SCHEMAP_EXTENSION_NO_BASE=1707 SCHEMAP_FACET_NO_VALUE=1708 SCHEMAP_FAILED_BUILD_IMPORT=1709 -SCHEMAP_GROUP_NONAME_NOREF=1710 """, """\ +SCHEMAP_GROUP_NONAME_NOREF=1710 SCHEMAP_IMPORT_NAMESPACE_NOT_URI=1711 SCHEMAP_IMPORT_REDEFINE_NSNAME=1712 SCHEMAP_IMPORT_SCHEMA_NOT_URI=1713 @@ -1059,9 +1061,9 @@ SCHEMAP_DEF_AND_PREFIX=1768 SCHEMAP_UNKNOWN_INCLUDE_CHILD=1769 SCHEMAP_INCLUDE_SCHEMA_NOT_URI=1770 -SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 """, """\ +SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 SCHEMAP_NOT_SCHEMA=1772 SCHEMAP_UNKNOWN_MEMBER_TYPE=1773 SCHEMAP_INVALID_ATTR_USE=1774 @@ -1124,9 +1126,9 @@ SCHEMAV_CVC_MINLENGTH_VALID=1831 SCHEMAV_CVC_MAXLENGTH_VALID=1832 SCHEMAV_CVC_MININCLUSIVE_VALID=1833 -SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 """, """\ +SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 SCHEMAV_CVC_MINEXCLUSIVE_VALID=1835 SCHEMAV_CVC_MAXEXCLUSIVE_VALID=1836 SCHEMAV_CVC_TOTALDIGITS_VALID=1837 @@ -1195,9 +1197,9 @@ SCHEMAP_SRC_SIMPLE_TYPE_4=3003 SCHEMAP_SRC_RESOLVE=3004 SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE=3005 -SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 """, """\ +SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES=3007 SCHEMAP_ST_PROPS_CORRECT_1=3008 SCHEMAP_ST_PROPS_CORRECT_2=3009 @@ -1256,9 +1258,9 @@ SCHEMAP_CVC_SIMPLE_TYPE=3062 SCHEMAP_COS_CT_EXTENDS_1_1=3063 
SCHEMAP_SRC_IMPORT_1_1=3064 -SCHEMAP_SRC_IMPORT_1_2=3065 """, """\ +SCHEMAP_SRC_IMPORT_1_2=3065 SCHEMAP_SRC_IMPORT_2=3066 SCHEMAP_SRC_IMPORT_2_1=3067 SCHEMAP_SRC_IMPORT_2_2=3068 @@ -1285,6 +1287,8 @@ SCHEMAP_AU_PROPS_CORRECT=3089 SCHEMAP_A_PROPS_CORRECT_3=3090 SCHEMAP_COS_ALL_LIMITED=3091 +SCHEMATRONV_ASSERT=4000 +SCHEMATRONV_REPORT=4001 MODULE_OPEN=4900 MODULE_CLOSE=4901 CHECK_FOUND_ELEMENT=5000 From scoder at codespeak.net Thu Apr 10 08:02:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 08:02:09 +0200 (CEST) Subject: [Lxml-checkins] r53642 - lxml/branch/lxml-2.0/src/lxml Message-ID: <20080410060209.88A04168031@codespeak.net> Author: scoder Date: Thu Apr 10 08:02:08 2008 New Revision: 53642 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi Log: trunk merge -c 53641: new error constants as of libxml2 2.6.32 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd (original) +++ lxml/branch/lxml-2.0/src/lxml/xmlerror.pxd Thu Apr 10 08:02:08 2008 @@ -39,6 +39,7 @@ XML_FROM_WRITER = 25 # The xmlwriter module XML_FROM_MODULE = 26 # The dynamically loaded module modul XML_FROM_I18N = 27 # The module handling character conversion + XML_FROM_SCHEMATRONV = 28 # The Schematron validator module ctypedef enum xmlParserErrors: XML_ERR_OK = 0 @@ -345,6 +346,7 @@ XML_TREE_INVALID_HEX = 1300 XML_TREE_INVALID_DEC = 1301 # 1301 XML_TREE_UNTERMINATED_ENTITY = 1302 # 1302 + XML_TREE_NOT_UTF8 = 1303 # 1303 XML_SAVE_NOT_UTF8 = 1400 XML_SAVE_CHAR_INVALID = 1401 # 1401 XML_SAVE_NO_DOCTYPE = 1402 # 1402 @@ -720,6 +722,8 @@ XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088 XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089 XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090 + XML_SCHEMATRONV_ASSERT = 4000 # 4000 + XML_SCHEMATRONV_REPORT = 4001 XML_MODULE_OPEN = 4900 # 4900 XML_MODULE_CLOSE = 4901 # 4901 
XML_CHECK_FOUND_ELEMENT = 5000 Modified: lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/xmlerror.pxi Thu Apr 10 08:02:08 2008 @@ -603,6 +603,7 @@ WRITER=25 MODULE=26 I18N=27 +SCHEMATRONV=28 """,) cdef object __PARSER_ERROR_TYPES @@ -917,13 +918,14 @@ TREE_INVALID_HEX=1300 TREE_INVALID_DEC=1301 TREE_UNTERMINATED_ENTITY=1302 +TREE_NOT_UTF8=1303 SAVE_NOT_UTF8=1400 SAVE_CHAR_INVALID=1401 SAVE_NO_DOCTYPE=1402 SAVE_UNKNOWN_ENCODING=1403 -REGEXP_COMPILE_ERROR=1450 """, """\ +REGEXP_COMPILE_ERROR=1450 IO_UNKNOWN=1500 IO_EACCES=1501 IO_EAGAIN=1502 @@ -1015,9 +1017,9 @@ SCHEMAP_EXTENSION_NO_BASE=1707 SCHEMAP_FACET_NO_VALUE=1708 SCHEMAP_FAILED_BUILD_IMPORT=1709 -SCHEMAP_GROUP_NONAME_NOREF=1710 """, """\ +SCHEMAP_GROUP_NONAME_NOREF=1710 SCHEMAP_IMPORT_NAMESPACE_NOT_URI=1711 SCHEMAP_IMPORT_REDEFINE_NSNAME=1712 SCHEMAP_IMPORT_SCHEMA_NOT_URI=1713 @@ -1078,9 +1080,9 @@ SCHEMAP_DEF_AND_PREFIX=1768 SCHEMAP_UNKNOWN_INCLUDE_CHILD=1769 SCHEMAP_INCLUDE_SCHEMA_NOT_URI=1770 -SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 """, """\ +SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771 SCHEMAP_NOT_SCHEMA=1772 SCHEMAP_UNKNOWN_MEMBER_TYPE=1773 SCHEMAP_INVALID_ATTR_USE=1774 @@ -1143,9 +1145,9 @@ SCHEMAV_CVC_MINLENGTH_VALID=1831 SCHEMAV_CVC_MAXLENGTH_VALID=1832 SCHEMAV_CVC_MININCLUSIVE_VALID=1833 -SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 """, """\ +SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834 SCHEMAV_CVC_MINEXCLUSIVE_VALID=1835 SCHEMAV_CVC_MAXEXCLUSIVE_VALID=1836 SCHEMAV_CVC_TOTALDIGITS_VALID=1837 @@ -1214,9 +1216,9 @@ SCHEMAP_SRC_SIMPLE_TYPE_4=3003 SCHEMAP_SRC_RESOLVE=3004 SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE=3005 -SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 """, """\ +SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006 SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES=3007 SCHEMAP_ST_PROPS_CORRECT_1=3008 SCHEMAP_ST_PROPS_CORRECT_2=3009 @@ -1275,9 +1277,9 @@ 
SCHEMAP_CVC_SIMPLE_TYPE=3062 SCHEMAP_COS_CT_EXTENDS_1_1=3063 SCHEMAP_SRC_IMPORT_1_1=3064 -SCHEMAP_SRC_IMPORT_1_2=3065 """, """\ +SCHEMAP_SRC_IMPORT_1_2=3065 SCHEMAP_SRC_IMPORT_2=3066 SCHEMAP_SRC_IMPORT_2_1=3067 SCHEMAP_SRC_IMPORT_2_2=3068 @@ -1304,6 +1306,8 @@ SCHEMAP_AU_PROPS_CORRECT=3089 SCHEMAP_A_PROPS_CORRECT_3=3090 SCHEMAP_COS_ALL_LIMITED=3091 +SCHEMATRONV_ASSERT=4000 +SCHEMATRONV_REPORT=4001 MODULE_OPEN=4900 MODULE_CLOSE=4901 CHECK_FOUND_ELEMENT=5000 From scoder at codespeak.net Thu Apr 10 09:00:15 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:15 +0200 (CEST) Subject: [Lxml-checkins] r53645 - in lxml/trunk: . src/lxml Message-ID: <20080410070015.603012A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:14 2008 New Revision: 53645 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/schematron.pxd lxml/trunk/src/lxml/schematron.pxi Log: r3923 at delle: sbehnel | 2008-04-10 08:43:38 +0200 untested support for schematron error reporting (only used with libxml2 2.6.32+) Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Thu Apr 10 09:00:14 2008 @@ -70,6 +70,10 @@ /* schematron was added in libxml2 2.6.21 */ #ifdef LIBXML_SCHEMATRON_ENABLED # define ENABLE_SCHEMATRON 1 +# if LIBXML_VERSION < 20632 + /* schematron error reporting was added in libxml2 2.6.32 */ +# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) +# endif #else # define ENABLE_SCHEMATRON 0 # define XML_SCHEMATRON_OUT_QUIET 0 @@ -85,8 +89,10 @@ # define xmlSchematronNewValidCtxt(schema, options) NULL # define xmlSchematronValidateDoc(ctxt, doc) 0 # define xmlSchematronFreeValidCtxt(ctxt) +# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) #endif + /* work around MSDEV 6.0 */ #if (_MSC_VER == 1200) && (WINVER < 0x0500) long 
_ftol( double ); //defined by VC6 C libs Modified: lxml/trunk/src/lxml/schematron.pxd ============================================================================== --- lxml/trunk/src/lxml/schematron.pxd (original) +++ lxml/trunk/src/lxml/schematron.pxd Thu Apr 10 09:00:14 2008 @@ -1,4 +1,4 @@ -cimport tree +cimport tree, xmlerror from tree cimport xmlDoc, xmlDtd cdef extern from "libxml/schematron.h": @@ -28,3 +28,6 @@ cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt) nogil cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt) nogil cdef void xmlSchematronFree(xmlSchematron* schema) nogil + cdef void xmlSchematronSetValidStructuredErrors( + xmlSchematronValidCtxt* ctxt, + xmlerror.xmlStructuredErrorFunc error_func, void *data) Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Thu Apr 10 09:00:14 2008 @@ -141,10 +141,12 @@ doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - options = schematron.XML_SCHEMATRON_OUT_QUIET - #if tree.LIBXML_VERSION <= 20630: # ... and later? 
- # hack to switch off stderr output - options = options | schematron.XML_SCHEMATRON_OUT_XML + if _LIBXML_VERSION_INT >= 20632: + options = schematron.XML_SCHEMATRON_OUT_ERROR + else: + options = schematron.XML_SCHEMATRON_OUT_QUIET + # hack to switch off stderr output + options = options | schematron.XML_SCHEMATRON_OUT_XML valid_ctxt = schematron.xmlSchematronNewValidCtxt( self._c_schema, options) @@ -152,6 +154,8 @@ return python.PyErr_NoMemory() self._error_log.connect() + schematron.xmlSchematronSetValidStructuredErrors( + valid_ctxt, _receiveError, self.error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Thu Apr 10 09:00:20 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:20 +0200 (CEST) Subject: [Lxml-checkins] r53646 - lxml/trunk Message-ID: <20080410070020.2E5982A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:19 2008 New Revision: 53646 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r3924 at delle: sbehnel | 2008-04-10 08:44:13 +0200 switch on dependency tracking for Cython 0.9.6.13+ Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Thu Apr 10 09:00:19 2008 @@ -83,8 +83,7 @@ if not CYTHON_INSTALLED: return [] from Cython.Compiler.Version import version - # currently, no official Cython release supports this ... - if True or split_version(version) <= (0,9,6,12): + if split_version(version) < (0,9,6,13): return [] package_dir = os.path.join(get_base_dir(), PACKAGE_PATH) From scoder at codespeak.net Thu Apr 10 09:00:24 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:24 +0200 (CEST) Subject: [Lxml-checkins] r53647 - in lxml/trunk: . 
src/lxml Message-ID: <20080410070024.88CC02A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:23 2008 New Revision: 53647 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/iterparse.pxi Log: r3925 at delle: sbehnel | 2008-04-10 08:53:27 +0200 fix for crash bug 211711: iterparse didn't set up parser hash table Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Thu Apr 10 09:00:23 2008 @@ -385,6 +385,7 @@ None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() + __GLOBAL_PARSER_CONTEXT.initParserDict(context._c_ctxt) context.prepare() # parser will not be unlocked - no other methods supported From scoder at codespeak.net Thu Apr 10 09:00:29 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:00:29 +0200 (CEST) Subject: [Lxml-checkins] r53648 - in lxml/trunk: . src/lxml/tests Message-ID: <20080410070029.758312A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:00:28 2008 New Revision: 53648 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3926 at delle: sbehnel | 2008-04-10 08:59:00 +0200 mark bug fixed, new test case Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Apr 10 09:00:28 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Crash bug in iterparse when moving elements into other documents. + * ``ElementTree.find*()`` didn't accept QName objects. * lxml.etree accepted non well-formed namespace prefix names. 
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Apr 10 09:00:28 2008 @@ -2696,6 +2696,19 @@ [1,2,1,4], counts) + def test_iterparse_move_elements(self): + iterparse = self.etree.iterparse + f = StringIO('') + + for event, node in etree.iterparse(f): pass + + root = etree.Element('new_root', {}) + root[:] = node[:] + + self.assertEquals( + ['b', 'c'], + [ el.tag for el in root ]) + def test_parse_file(self): parse = self.etree.parse # from file From scoder at codespeak.net Thu Apr 10 09:04:39 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Apr 2008 09:04:39 +0200 (CEST) Subject: [Lxml-checkins] r53649 - in lxml/branch/lxml-2.0: . src/lxml src/lxml/tests Message-ID: <20080410070439.F07522A0192@codespeak.net> Author: scoder Date: Thu Apr 10 09:04:38 2008 New Revision: 53649 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/iterparse.pxi lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py Log: trunk merge -r 53647:53648: iterparse crash fix Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Thu Apr 10 09:04:38 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Crash bug in iterparse when moving elements into other documents. + * HTML elements' ``.cssselect()`` method was broken. * ``ElementTree.find*()`` didn't accept QName objects. 
Modified: lxml/branch/lxml-2.0/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/iterparse.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/iterparse.pxi Thu Apr 10 09:04:38 2008 @@ -325,6 +325,7 @@ None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() + __GLOBAL_PARSER_CONTEXT.initParserDict(context._c_ctxt) context._setEventFilter(events, tag) context.prepare() # parser will not be unlocked - no other methods supported Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-2.0/src/lxml/tests/test_elementtree.py Thu Apr 10 09:04:38 2008 @@ -2567,6 +2567,19 @@ [1,2,1,4], counts) + def test_iterparse_move_elements(self): + iterparse = self.etree.iterparse + f = StringIO('') + + for event, node in etree.iterparse(f): pass + + root = etree.Element('new_root', {}) + root[:] = node[:] + + self.assertEquals( + ['b', 'c'], + [ el.tag for el in root ]) + def test_parse_file(self): parse = self.etree.parse # from file From scoder at codespeak.net Fri Apr 11 19:32:59 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:32:59 +0200 (CEST) Subject: [Lxml-checkins] r53695 - in lxml/trunk: . 
doc Message-ID: <20080411173259.075FD2A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:32:55 2008 New Revision: 53695 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r3932 at delle: sbehnel | 2008-04-11 15:16:32 +0200 link to HTML benchmarks Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri Apr 11 19:32:55 2008 @@ -193,6 +193,16 @@ input documents are not considerably bigger than the output, lxml is the clear winner. +Regarding HTML parsing, Ian Bicking has done some `benchmarking on +lxml's HTML parser`_, comparing it to a number of other famous HTML +parser tools for Python. lxml wins this contest by quite a length. +To give an idea, the numbers suggest that lxml.html can run a couple +of parse-serialise cycles in the time that other tools need for +parsing alone. The comparison even shows some very favourable results +regarding memory consumption. + +.. _`benchmarking on lxml's HTML parser`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + The ElementTree API =================== From scoder at codespeak.net Fri Apr 11 19:33:02 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:02 +0200 (CEST) Subject: [Lxml-checkins] r53696 - in lxml/trunk: . 
src/lxml Message-ID: <20080411173302.AB7192A01BB@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:02 2008 New Revision: 53696 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/python.pxd Log: r3933 at delle: sbehnel | 2008-04-11 15:17:54 +0200 cimport fix Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Apr 11 19:33:02 2008 @@ -1,4 +1,4 @@ -from tree cimport FILE +from cstd cimport FILE cdef extern from "Python.h": ctypedef struct PyObject From scoder at codespeak.net Fri Apr 11 19:33:06 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:06 +0200 (CEST) Subject: [Lxml-checkins] r53697 - in lxml/trunk: . src/lxml Message-ID: <20080411173306.DF3D52A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:06 2008 New Revision: 53697 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/python.pxd Log: r3934 at delle: sbehnel | 2008-04-11 19:15:10 +0200 cleanup Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Apr 11 19:33:06 2008 @@ -120,6 +120,6 @@ NOWAIT_LOCK cdef extern from "etree_defs.h": # redefines some functions as macros - cdef int _isString(object obj) + cdef bint _isString(object obj) cdef char* _fqtypename(object t) cdef object PY_NEW(object t) From scoder at codespeak.net Fri Apr 11 19:33:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:10 +0200 (CEST) Subject: [Lxml-checkins] r53698 - in lxml/trunk: . 
src/lxml Message-ID: <20080411173310.CAB0A2A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:10 2008 New Revision: 53698 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi Log: r3935 at delle: sbehnel | 2008-04-11 19:17:10 +0200 cleanup Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Apr 11 19:33:10 2008 @@ -564,13 +564,13 @@ # special str/unicode subclasses cdef class _ElementUnicodeResult(python.unicode): - cdef _Element parent + cdef _Element _parent cdef readonly object is_tail cdef readonly object is_text cdef readonly object is_attribute def getparent(self): - return self.parent + return self._parent class _ElementStringResult(str): # we need to use a Python class here, str cannot be C-subclassed @@ -596,7 +596,7 @@ return result else: uresult = _ElementUnicodeResult(string_value) - uresult.parent = parent + uresult._parent = parent uresult.is_attribute = is_attribute uresult.is_tail = is_tail uresult.is_text = is_text From scoder at codespeak.net Fri Apr 11 19:33:18 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Apr 2008 19:33:18 +0200 (CEST) Subject: [Lxml-checkins] r53699 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20080411173318.9BDD52A01B9@codespeak.net> Author: scoder Date: Fri Apr 11 19:33:18 2008 New Revision: 53699 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/saxparser.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: r3936 at delle: sbehnel | 2008-04-11 19:31:10 +0200 support for CDATA blocks: parser option and CDATA() text factory Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Apr 11 19:33:18 2008 @@ -8,6 +8,12 @@ Features added -------------- +* Parser option `strip_cdata` for normalising or keeping CDATA + sections. Defaults to ``True`` as before, thus replacing CDATA + sections by their text content. + +* ``CDATA()`` factory to wrap string content as CDATA section. + Bugs fixed ---------- Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri Apr 11 19:33:18 2008 @@ -30,15 +30,16 @@ .. contents:: .. - 1 lxml.etree - 2 Other Element APIs - 3 Trees and Documents - 4 Iteration - 5 Error handling on exceptions - 6 Error logging - 7 Serialisation - 8 XInclude and ElementInclude - 9 write_c14n on ElementTree + 1 lxml.etree + 2 Other Element APIs + 3 Trees and Documents + 4 Iteration + 5 Error handling on exceptions + 6 Error logging + 7 Serialisation + 8 CDATA + 9 XInclude and ElementInclude + 10 write_c14n on ElementTree lxml.etree @@ -352,6 +353,50 @@ XMLSyntaxError: ... +CDATA +----- + +By default, lxml's parser will strip CDATA sections from the tree and +replace them by their plain text content. 
As real applications for +CDATA are rare, this is the best way to deal with this issue. + +However, in some cases, keeping CDATA sections or creating them in a +document is required to adhere to existing XML language definitions. +For these special cases, you can instruct the parser to leave CDATA +sections in the document: + +.. sourcecode:: pycon + + >>> parser = etree.XMLParser(strip_cdata=False) + >>> root = etree.XML('', parser) + >>> root.text + 'test' + + >>> etree.tostring(root) + '' + +Note how the ``.text`` property does not give any indication that the +text content is wrapped by a CDATA section. If you want to make sure +your data is wrapped by a CDATA block, you can use the ``CDATA()`` +text wrapper: + +.. sourcecode:: pycon + + >>> root.text = 'test' + + >>> root.text + 'test' + >>> etree.tostring(root) + 'test' + + >>> root.text = etree.CDATA(root.text) + + >>> root.text + 'test' + >>> etree.tostring(root) + '' + + XInclude and ElementInclude --------------------------- Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Apr 11 19:33:18 2008 @@ -449,8 +449,17 @@ if value is None: return 0 # now add new text node with value at start - text = _utf8(value) - c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text)) + if python._isString(value): + text = _utf8(value) + c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text)) + elif isinstance(value, CDATA): + c_text_node = tree.xmlNewCDataBlock( + c_node.doc, _cstr((value)._utf8_data), + python.PyString_GET_SIZE((value)._utf8_data)) + else: + # this will raise the right error + _utf8(value) + return -1 if c_node.children is NULL: tree.xmlAddChild(c_node, c_text_node) else: @@ -593,6 +602,8 @@ while c_node is not NULL: if c_node.type == tree.XML_TEXT_NODE: return c_node + if c_node.type == tree.XML_CDATA_SECTION_NODE: + return c_node elif 
c_node.type == tree.XML_XINCLUDE_START or \ c_node.type == tree.XML_XINCLUDE_END: c_node = c_node.next Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Apr 11 19:33:18 2008 @@ -327,6 +327,7 @@ - remove_blank_text - discard blank text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) - resolve_entities - replace entities by their text value (default: True) @@ -342,7 +343,7 @@ attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, encoding=None, + remove_pis=False, strip_cdata=True, encoding=None, html=False, XMLSchema schema=None): cdef _IterparseContext context cdef char* c_encoding @@ -381,7 +382,7 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT _BaseParser.__init__(self, parse_options, html, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, None, filename, encoding) context = <_IterparseContext>self._getPushParserContext() Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Apr 11 19:33:18 2008 @@ -2264,6 +2264,20 @@ PI = ProcessingInstruction +cdef class CDATA: + """CDATA(data) + + CDATA factory. This factory creates an opaque data object that + can be used to set Element text. 
The usual way to use it is:: + + >>> from lxml import etree + >>> el = etree.Element('content') + >>> el.text = etree.CDATA('a string') + """ + cdef object _utf8_data + def __init__(self, data): + self._utf8_data = _utf8(data) + def Entity(name): """Entity(name) Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Apr 11 19:33:18 2008 @@ -550,6 +550,7 @@ cdef bint _for_html cdef bint _remove_comments cdef bint _remove_pis + cdef bint _strip_cdata cdef XMLSchema _schema cdef object _filename cdef object _target @@ -557,7 +558,8 @@ cdef int _default_encoding_int def __init__(self, int parse_options, bint for_html, XMLSchema schema, - remove_comments, remove_pis, target, filename, encoding): + remove_comments, remove_pis, strip_cdata, target, + filename, encoding): cdef int c_encoding if not isinstance(self, HTMLParser) and \ not isinstance(self, XMLParser) and \ @@ -570,6 +572,7 @@ self._for_html = for_html self._remove_comments = remove_comments self._remove_pis = remove_pis + self._strip_cdata = strip_cdata self._schema = schema self._resolvers = _ResolverRegistry() @@ -601,8 +604,9 @@ pctxt.sax.comment = NULL if self._remove_pis: pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + if self._strip_cdata: + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL return self._parser_context cdef _ParserContext _getPushParserContext(self): @@ -621,8 +625,9 @@ pctxt.sax.comment = NULL if self._remove_pis: pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + if self._strip_cdata: + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL return self._push_parser_context cdef _ParserContext 
_createContext(self, target): @@ -700,6 +705,7 @@ parser._for_html = self._for_html parser._remove_comments = self._remove_comments parser._remove_pis = self._remove_pis + parser._strip_cdata = self._strip_cdata parser._filename = self._filename parser._resolvers = self._resolvers parser._target = self._target @@ -1051,6 +1057,7 @@ - remove_blank_text - discard blank text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) - resolve_entities - replace entities by their text value (default: True) @@ -1068,8 +1075,8 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, target=None, encoding=None, - XMLSchema schema=None): + remove_pis=False, strip_cdata=True, target=None, + encoding=None, XMLSchema schema=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -1092,9 +1099,11 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA _BaseParser.__init__(self, parse_options, 0, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, target, None, encoding) cdef class ETCompatXMLParser(XMLParser): @@ -1110,7 +1119,8 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True, target=None, encoding=None, schema=None): + remove_pis=True, strip_cdata=True, target=None, + encoding=None, schema=None): XMLParser.__init__(self, attribute_defaults=attribute_defaults, dtd_validation=dtd_validation, @@ -1123,6 +1133,7 @@ 
resolve_entities=resolve_entities, remove_comments=remove_comments, remove_pis=remove_pis, + strip_cdata=strip_cdata, target=target, encoding=encoding, schema=schema) @@ -1180,6 +1191,7 @@ - remove_blank_text - discard empty text nodes - remove_comments - discard comments - remove_pis - discard processing instructions + - strip_cdata - replace CDATA sections by normal text content (default: True) - compact - safe memory for short text content (default: True) Other keyword arguments: @@ -1193,7 +1205,7 @@ """ def __init__(self, *, recover=True, no_network=True, remove_blank_text=False, compact=True, remove_comments=False, - remove_pis=False, target=None, encoding=None, + remove_pis=False, strip_cdata=True, target=None, encoding=None, XMLSchema schema=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS @@ -1207,7 +1219,7 @@ parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT _BaseParser.__init__(self, parse_options, 1, schema, - remove_comments, remove_pis, + remove_comments, remove_pis, strip_cdata, target, None, encoding) cdef HTMLParser __DEFAULT_HTML_PARSER Modified: lxml/trunk/src/lxml/saxparser.pxi ============================================================================== --- lxml/trunk/src/lxml/saxparser.pxi (original) +++ lxml/trunk/src/lxml/saxparser.pxi Fri Apr 11 19:33:18 2008 @@ -37,6 +37,7 @@ cdef xmlparser.startElementSAXFunc _origSaxStartNoNs cdef xmlparser.endElementSAXFunc _origSaxEndNoNs cdef xmlparser.charactersSAXFunc _origSaxData + cdef xmlparser.cdataBlockSAXFunc _origSaxCData cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype cdef xmlparser.commentSAXFunc _origSaxComment cdef xmlparser.processingInstructionSAXFunc _origSaxPi @@ -76,10 +77,12 @@ if self._target._sax_event_propagate & SAX_EVENT_DATA: self._origSaxData = sax.characters + self._origSaxCData = sax.cdataBlock else: - self._origSaxData = sax.characters = NULL + self._origSaxData = sax.characters = sax.cdataBlock = NULL if 
self._target._sax_event_filter & SAX_EVENT_DATA: sax.characters = _handleSaxData + sax.cdataBlock = _handleSaxCData # doctype propagation is always required for entity replacement self._origSaxDoctype = sax.internalSubset @@ -249,6 +252,21 @@ except: context._handleSaxException(c_ctxt) +cdef void _handleSaxCData(void* ctxt, char* c_data, int data_len) with gil: + cdef _SaxParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_SaxParserContext>c_ctxt._private + if context._origSaxCData is not NULL: + context._origSaxCData(c_ctxt, c_data, data_len) + try: + context._target._handleSaxData( + python.PyUnicode_DecodeUTF8(c_data, data_len, NULL)) + except: + context._handleSaxException(c_ctxt) + cdef void _handleSaxDoctype(void* ctxt, char* c_name, char* c_public, char* c_system) with gil: cdef _SaxParserContext context Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Apr 11 19:33:18 2008 @@ -462,6 +462,29 @@ "data-B", "end-root", "pi-test-c"], events) + def test_parser_target_cdata(self): + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + def end(self, tag): + events.append("end-" + tag) + def data(self, data): + events.append("data-" + data) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target(), + strip_cdata=False) + + parser.feed('AB') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "data-A", "start-a", + "data-ca", "end-a", "data-B", "end-root"], + events) + def test_iterwalk_tag(self): iterwalk = self.etree.iterwalk root = self.etree.XML('') @@ -666,6 +689,55 @@ self.assertRaises(ValueError, Entity, '#abc') self.assertRaises(ValueError, Entity, '#xxyz') + def test_cdata(self): + 
CDATA = self.etree.CDATA + Element = self.etree.Element + tostring = self.etree.tostring + + root = Element("root") + root.text = CDATA('test') + + self.assertEquals('test', + root.text) + self.assertEquals('', + tostring(root)) + + def test_cdata_type(self): + CDATA = self.etree.CDATA + Element = self.etree.Element + root = Element("root") + + root.text = CDATA("test") + self.assertEquals('test', root.text) + + root.text = CDATA(u"test") + self.assertEquals('test', root.text) + + self.assertRaises(TypeError, CDATA, 1) + + def test_cdata_errors(self): + CDATA = self.etree.CDATA + Element = self.etree.Element + + root = Element("root") + cdata = CDATA('test') + + self.assertRaises(TypeError, + setattr, root, 'tail', cdata) + self.assertRaises(TypeError, + root.set, 'attr', cdata) + self.assertRaises(TypeError, + operator.setitem, root.attrib, 'attr', cdata) + + def test_cdata_parser(self): + tostring = self.etree.tostring + parser = self.etree.XMLParser(strip_cdata=False) + root = self.etree.XML('', parser) + + self.assertEquals('test', root.text) + self.assertEquals('', + tostring(root)) + # TypeError in etree, AssertionError in ElementTree; def test_setitem_assert(self): Element = self.etree.Element Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri Apr 11 19:33:18 2008 @@ -181,6 +181,7 @@ cdef xmlNode* xmlNewDocComment(xmlDoc* doc, char* content) nogil cdef xmlNode* xmlNewDocPI(xmlDoc* doc, char* name, char* content) nogil cdef xmlNode* xmlNewReference(xmlDoc* doc, char* name) nogil + cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, char* text, int len) nogil cdef xmlNs* xmlNewNs(xmlNode* node, char* href, char* prefix) nogil cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil From scoder at codespeak.net Sat Apr 12 14:33:29 2008 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:29 +0200 (CEST) Subject: [Lxml-checkins] r53716 - lxml/trunk Message-ID: <20080412123329.2F26516A45F@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:27 2008 New Revision: 53716 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3943 at delle: sbehnel | 2008-04-11 19:39:06 +0200 rst fix Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Apr 12 14:33:27 2008 @@ -8,7 +8,7 @@ Features added -------------- -* Parser option `strip_cdata` for normalising or keeping CDATA +* Parser option ``strip_cdata`` for normalising or keeping CDATA sections. Defaults to ``True`` as before, thus replacing CDATA sections by their text content. From scoder at codespeak.net Sat Apr 12 14:33:34 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:34 +0200 (CEST) Subject: [Lxml-checkins] r53717 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20080412123334.B38DC16A460@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:33 2008 New Revision: 53717 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py Log: r3944 at delle: sbehnel | 2008-04-12 12:02:37 +0200 fix for CDATA parsing in iterparse() Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Sat Apr 12 14:33:33 2008 @@ -380,6 +380,8 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT + if not strip_cdata: + parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA _BaseParser.__init__(self, parse_options, html, schema, remove_comments, remove_pis, strip_cdata, Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Apr 12 14:33:33 2008 @@ -2709,6 +2709,16 @@ ['b', 'c'], [ el.tag for el in root ]) + def test_iterparse_cdata(self): + tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('test', + tostring(context.root)) + def test_parse_file(self): parse = self.etree.parse # from file Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Apr 12 14:33:33 2008 @@ -409,6 +409,26 @@ a = iterator.root self.assertEquals(a.text, text) + def test_iterparse_cdata(self): 
+ tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('test', + tostring(context.root)) + + def test_iterparse_keep_cdata(self): + tostring = self.etree.tostring + f = StringIO('') + context = self.etree.iterparse(f, strip_cdata=False) + content = [ el.text for event,el in context ] + + self.assertEquals(['test'], content) + self.assertEquals('', + tostring(context.root)) + def test_parser_encoding_unknown(self): self.assertRaises( LookupError, self.etree.XMLParser, encoding="hopefully unknown") From scoder at codespeak.net Sat Apr 12 14:33:38 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:38 +0200 (CEST) Subject: [Lxml-checkins] r53718 - in lxml/trunk: . src/lxml/tests Message-ID: <20080412123338.CC8C416A463@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:38 2008 New Revision: 53718 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r3945 at delle: sbehnel | 2008-04-12 12:05:52 +0200 cleanup Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Apr 12 14:33:38 2008 @@ -409,16 +409,6 @@ a = iterator.root self.assertEquals(a.text, text) - def test_iterparse_cdata(self): - tostring = self.etree.tostring - f = StringIO('') - context = self.etree.iterparse(f) - content = [ el.text for event,el in context ] - - self.assertEquals(['test'], content) - self.assertEquals('test', - tostring(context.root)) - def test_iterparse_keep_cdata(self): tostring = self.etree.tostring f = StringIO('') From scoder at codespeak.net Sat Apr 12 14:33:43 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:43 +0200 (CEST) Subject: [Lxml-checkins] r53719 - in 
lxml/trunk: . src/lxml/tests Message-ID: <20080412123343.3EFB316A463@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:42 2008 New Revision: 53719 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3946 at delle: sbehnel | 2008-04-12 12:06:43 +0200 ET test for CDATA parsing Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Apr 12 14:33:42 2008 @@ -2770,6 +2770,14 @@ tree.getroot() ) + def test_parse_cdata(self): + tostring = self.etree.tostring + root = self.etree.XML('') + + self.assertEquals('test', root.text) + self.assertEquals('test', + tostring(root)) + def test_parse_with_encoding(self): # this can fail in libxml2 <= 2.6.22 parse = self.etree.parse From scoder at codespeak.net Sat Apr 12 14:33:47 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 12 Apr 2008 14:33:47 +0200 (CEST) Subject: [Lxml-checkins] r53720 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20080412123347.0495716A466@codespeak.net> Author: scoder Date: Sat Apr 12 14:33:47 2008 New Revision: 53720 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_xslt.py Log: r3947 at delle: sbehnel | 2008-04-12 12:11:43 +0200 test case for currently lacking XSLT error on wrong stylesheet parameters Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Sat Apr 12 14:33:47 2008 @@ -299,6 +299,24 @@ ''', str(res)) + def _test_xslt_parameter_invalid(self): + tree = self.parse('BC') + style = self.parse('''\ + + + + +''') + + st = etree.XSLT(style) + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="test") + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="") + res = self.assertRaises(etree.XSLTApplyError, + st, tree, bar="....") + if etree.LIBXSLT_VERSION < (1,1,18): # later versions produce no error def test_xslt_parameter_missing(self): From scoder at codespeak.net Sun Apr 13 18:30:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 18:30:08 +0200 (CEST) Subject: [Lxml-checkins] r53742 - in lxml/trunk: . doc Message-ID: <20080413163008.39FA149812A@codespeak.net> Author: scoder Date: Sun Apr 13 18:30:06 2008 New Revision: 53742 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3953 at delle: sbehnel | 2008-04-13 18:28:47 +0200 doc update, section on using lxml.html.soupparser as a fallback Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Sun Apr 13 18:30:06 2008 @@ -3,7 +3,7 @@ ==================== BeautifulSoup_ is a Python package that parses broken HTML. 
While libxml2 -(and thus lxml) can also parse broken HTML, BeautifulSoup is somewhat more +(and thus lxml) can also parse broken HTML, BeautifulSoup is a bit more forgiving and has superiour `support for encoding detection`_. .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ @@ -13,7 +13,7 @@ lxml can benefit from the parsing capabilities of BeautifulSoup through the ``lxml.html.soupparser`` module. It provides three main functions: ``fromstring()`` and ``parse()`` to parse a string or file -using BeautifulSoup, and `convert_tree()` to convert an existing +using BeautifulSoup, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. The functions ``fromstring()`` and ``parse()`` behave as known from @@ -58,6 +58,10 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. + +Entity handling +=============== + By default, the BeautifulSoup parser also replaces the entities it finds by their character equivalent. @@ -94,3 +98,45 @@ >>> tostring(body, method="html", encoding=unicode) u'\xa9\u20ac-\xf5\u01bd

' + + +Using soupparser as a fallback +============================== + +The downside of using this parser is that it is `much slower`_ than +the HTML parser of lxml. So if performance matters, you might want to +consider using ``soupparser`` only as a fallback for certain cases. + +.. _`much slower`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + +One common problem of lxml's parser is that it might not get the +encoding right in cases where the document contains a ``<meta>`` tag +at the wrong place. In this case, you can exploit the fact that lxml +serialises much faster than most other HTML libraries for Python. +Just serialise the document to unicode and if that gives you an +exception, re-parse it with BeautifulSoup to see if that works +better. + +.. sourcecode:: pycon + + >>> tag_soup = '''\ + ... + ... + ... + ... Hello W\xc3\xb6rld! + ... + ... Hi all + ... ''' + + >>> import lxml.html + >>> import lxml.html.soupparser + + >>> root = lxml.html.fromstring(tag_soup) + >>> try: + ... ignore = tostring(root, encoding=unicode) + ... except UnicodeDecodeError: + ... root = lxml.html.soupparser.fromstring(tag_soup) + ... # try again, but don't catch the exception this time + ... ignore = tostring(root, encoding=unicode) + From scoder at codespeak.net Sun Apr 13 19:17:57 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 19:17:57 +0200 (CEST) Subject: [Lxml-checkins] r53744 - in lxml/trunk: . 
doc Message-ID: <20080413171757.404E7168521@codespeak.net> Author: scoder Date: Sun Apr 13 19:17:56 2008 New Revision: 53744 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3955 at delle: sbehnel | 2008-04-13 19:16:41 +0200 doc fix Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Sun Apr 13 19:17:56 2008 @@ -22,7 +22,8 @@ There is also a legacy module called ``lxml.html.ElementSoup``, which mimics the interface provided by ElementTree's own ElementSoup_ -module. +module. Note that the ``soupparser`` module was added in lxml 2.0.3. +Previous versions of lxml 2.0.x only have the ``ElementSoup`` module. Here is a document full of tag soup, similar to, but not quite like, HTML: @@ -73,7 +74,7 @@ u'\xa9\u20ac-\xf5\u01bd' If you want them back on the way out, you can just serialise with the -default encoding, which is 'US-ASCII'. The 'html' method +default encoding, which is 'US-ASCII'. .. sourcecode:: pycon From scoder at codespeak.net Sun Apr 13 19:27:22 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 19:27:22 +0200 (CEST) Subject: [Lxml-checkins] r53745 - in lxml/branch/lxml-2.0: . 
doc Message-ID: <20080413172722.453B949812A@codespeak.net> Author: scoder Date: Sun Apr 13 19:27:20 2008 New Revision: 53745 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/doc/main.txt lxml/branch/lxml-2.0/version.txt Log: prepare release of 2.0.4 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Sun Apr 13 19:27:20 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.0.4 (2008-04-13) +================== Features added -------------- Modified: lxml/branch/lxml-2.0/doc/main.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/main.txt (original) +++ lxml/branch/lxml-2.0/doc/main.txt Sun Apr 13 19:27:20 2008 @@ -145,8 +145,8 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0.3`_, released 2008-03-26 -(`changes for 2.0.3`_). `Older versions`_ are listed below. +The latest version is `lxml 2.0.4`_, released 2008-04-13 +(`changes for 2.0.4`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -206,6 +206,8 @@ Old Versions ------------ +* `lxml 2.0.3`_, released 2008-03-26 (`changes for 2.0.3`_) + * `lxml 2.0.2`_, released 2008-02-22 (`changes for 2.0.2`_) * `lxml 2.0.1`_, released 2008-02-13 (`changes for 2.0.1`_) @@ -262,6 +264,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.0.4`: lxml-2.0.4.tgz .. _`lxml 2.0.3`: lxml-2.0.3.tgz .. _`lxml 2.0.2`: lxml-2.0.2.tgz .. _`lxml 2.0.1`: lxml-2.0.1.tgz @@ -291,6 +294,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.0.4`: changes-2.0.4.html .. _`changes for 2.0.3`: changes-2.0.3.html .. _`changes for 2.0.2`: changes-2.0.2.html .. 
_`changes for 2.0.1`: changes-2.0.1.html Modified: lxml/branch/lxml-2.0/version.txt ============================================================================== --- lxml/branch/lxml-2.0/version.txt (original) +++ lxml/branch/lxml-2.0/version.txt Sun Apr 13 19:27:20 2008 @@ -1 +1 @@ -2.0.3 +2.0.4 From scoder at codespeak.net Sun Apr 13 20:28:23 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:28:23 +0200 (CEST) Subject: [Lxml-checkins] r53746 - lxml/branch/lxml-2.0/doc Message-ID: <20080413182823.406462A0187@codespeak.net> Author: scoder Date: Sun Apr 13 20:28:20 2008 New Revision: 53746 Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt Log: partial doc merge from trunk Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.0/doc/elementsoup.txt Sun Apr 13 20:28:20 2008 @@ -3,22 +3,28 @@ ==================== BeautifulSoup_ is a Python package that parses broken HTML. While libxml2 -(and thus lxml) can also parse broken HTML, BeautifulSoup is much more +(and thus lxml) can also parse broken HTML, BeautifulSoup is a bit more forgiving and has superiour `support for encoding detection`_. .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ .. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/documentation.html#Beautiful%20Soup%20Gives%20You%20Unicode,%20Dammit +.. _ElementSoup: http://effbot.org/zone/element-soup.htm lxml can benefit from the parsing capabilities of BeautifulSoup through the ``lxml.html.soupparser`` module. It provides three main functions: ``fromstring()`` and ``parse()`` to parse a string or file -using BeautifulSoup, and `convert_tree()` to convert an existing +using BeautifulSoup, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. 
The functions ``fromstring()`` and ``parse()`` behave as known from ElementTree. The first returns a root Element, the latter returns an ElementTree. +There is also a legacy module called ``lxml.html.ElementSoup``, which +mimics the interface provided by ElementTree's own ElementSoup_ +module. Note that the ``soupparser`` module was added in lxml 2.0.3. +Previous versions of lxml 2.0.x only have the ``ElementSoup`` module. + Here is a document full of tag soup, similar to, but not quite like, HTML:: >>> tag_soup = 'Hello</head<body onload=crash()>Hi all<p>' @@ -47,6 +53,10 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. + +Entity handling +=============== + By default, the BeautifulSoup parser also replaces the entities it finds by their character equivalent:: @@ -83,4 +93,41 @@ mimics the interface provided by ElementTree's own ElementSoup_ module. -.. _ElementSoup: http://effbot.org/zone/element-soup.htm + +Using soupparser as a fallback +============================== + +The downside of using this parser is that it is `much slower`_ than +the HTML parser of lxml. So if performance matters, you might want to +consider using ``soupparser`` only as a fallback for certain cases. + +.. _`much slower`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ + +One common problem of lxml's parser is that it might not get the +encoding right in cases where the document contains a ``<meta>`` tag +at the wrong place. In this case, you can exploit the fact that lxml +serialises much faster than most other HTML libraries for Python. +Just serialise the document to unicode and if that gives you an +exception, re-parse it with BeautifulSoup to see if that works +better:: + + >>> tag_soup = '''\ + ... <meta http-equiv="Content-Type" + ... content="text/html;charset=utf-8" /> + ... <html> + ... <head> + ... <title>Hello W\xc3\xb6rld! + ... + ... Hi all + ... 
''' + + >>> import lxml.html + >>> import lxml.html.soupparser + + >>> root = lxml.html.fromstring(tag_soup) + >>> try: + ... ignore = tostring(root, encoding=unicode) + ... except UnicodeDecodeError: + ... root = lxml.html.soupparser.fromstring(tag_soup) + ... # try again, but don't catch the exception this time + ... ignore = tostring(root, encoding=unicode) From scoder at codespeak.net Sun Apr 13 20:28:50 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:28:50 +0200 (CEST) Subject: [Lxml-checkins] r53747 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080413182850.AD1ED16844F@codespeak.net> Author: scoder Date: Sun Apr 13 20:28:50 2008 New Revision: 53747 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi Log: missing bug fix for 2.0.4 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Sun Apr 13 20:28:50 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Hanging thread in conjunction with GTK threading. + * Crash bug in iterparse when moving elements into other documents. * HTML elements' ``.cssselect()`` method was broken. Modified: lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/apihelpers.pxi Sun Apr 13 20:28:50 2008 @@ -1103,20 +1103,16 @@ Returns None if not a file object. 
""" # file instances have a name attribute - try: - return source.name - except AttributeError: - pass + filename = getattr3(source, 'name', None) + if filename is not None: + return filename # gzip file instances have a filename attribute - try: - return source.filename - except AttributeError: - pass + filename = getattr3(source, 'filename', None) + if filename is not None: + return filename # urllib2 provides a geturl() method - try: - geturl = source.geturl - except AttributeError: - # can't determine filename - return None - else: + geturl = getattr3(source, 'geturl', None) + if geturl is not None: return geturl() + # can't determine filename + return None From scoder at codespeak.net Sun Apr 13 20:37:36 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 13 Apr 2008 20:37:36 +0200 (CEST) Subject: [Lxml-checkins] r53748 - lxml/branch/lxml-2.0/doc Message-ID: <20080413183736.1FB8F168554@codespeak.net> Author: scoder Date: Sun Apr 13 20:37:35 2008 New Revision: 53748 Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt Log: remove redundant example code from docs Modified: lxml/branch/lxml-2.0/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.0/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.0/doc/elementsoup.txt Sun Apr 13 20:37:35 2008 @@ -129,5 +129,3 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) - ... # try again, but don't catch the exception this time - ... 
ignore = tostring(root, encoding=unicode) From scoder at codespeak.net Mon Apr 14 09:58:41 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 09:58:41 +0200 (CEST) Subject: [Lxml-checkins] r53750 - lxml/trunk Message-ID: <20080414075841.DDC8F39B594@codespeak.net> Author: scoder Date: Mon Apr 14 09:58:39 2008 New Revision: 53750 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/INSTALL.txt Log: r3957 at delle: sbehnel | 2008-04-13 20:12:13 +0200 cleanup after release of 2.0.4 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Apr 14 09:58:39 2008 @@ -17,14 +17,8 @@ Bugs fixed ---------- -* Crash bug in iterparse when moving elements into other documents. - -* ``ElementTree.find*()`` didn't accept QName objects. - * lxml.etree accepted non well-formed namespace prefix names. -* HTML elements' ``.cssselect()`` method was broken. - Other changes ------------- @@ -35,6 +29,27 @@ context. +2.0.4 (2008-04-13) +================== + +Features added +-------------- + +Bugs fixed +---------- + +* Hanging thread in conjunction with GTK threading. + +* Crash bug in iterparse when moving elements into other documents. + +* HTML elements' ``.cssselect()`` method was broken. + +* ``ElementTree.find*()`` didn't accept QName objects. + +Other changes +------------- + + 2.1alpha1 (2008-03-27) ====================== Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Mon Apr 14 09:58:39 2008 @@ -11,7 +11,8 @@ * libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html - If you want to use XPath, do not use libxml2 2.6.27. + If you want to use XPath, do not use libxml2 2.6.27. We recommend + libxml2 2.6.28 or later. * libxslt 1.1.15 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html From scoder at codespeak.net Mon Apr 14 09:58:45 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 09:58:45 +0200 (CEST) Subject: [Lxml-checkins] r53751 - in lxml/trunk: . doc Message-ID: <20080414075845.68A4D39B594@codespeak.net> Author: scoder Date: Mon Apr 14 09:58:43 2008 New Revision: 53751 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r3958 at delle: sbehnel | 2008-04-13 20:35:31 +0200 remove redundant example code from docs Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Mon Apr 14 09:58:43 2008 @@ -138,6 +138,3 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) - ... # try again, but don't catch the exception this time - ... ignore = tostring(root, encoding=unicode) - From scoder at codespeak.net Mon Apr 14 13:11:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 14 Apr 2008 13:11:28 +0200 (CEST) Subject: [Lxml-checkins] r53753 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080414111128.3EE1016853F@codespeak.net> Author: scoder Date: Mon Apr 14 13:11:26 2008 New Revision: 53753 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/schematron.pxi Log: memory leak in Schematron with libxml2 >= 2.6.31 Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Mon Apr 14 13:11:26 2008 @@ -2,6 +2,21 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Memory leak in Schematron with libxml2 >= 2.6.31. 
+ +Other changes +------------- + + 2.0.4 (2008-04-13) ================== Modified: lxml/branch/lxml-2.0/src/lxml/schematron.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/schematron.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/schematron.pxi Mon Apr 14 13:11:26 2008 @@ -74,24 +74,26 @@ the file system. """ cdef schematron.xmlSchematron* _c_schema + cdef xmlDoc* _c_schema_doc def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlNode* c_node - cdef xmlDoc* c_doc cdef char* c_href cdef schematron.xmlSchematronParserCtxt* parser_ctxt + self._c_schema = NULL + self._c_schema_doc = NULL _Validator.__init__(self) if not config.ENABLE_SCHEMATRON: raise SchematronError( "lxml.etree was compiled without Schematron support.") - self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - c_doc = _copyDocRoot(doc._c_doc, root_node._c_node) + self._c_schema_doc = _copyDocRoot(doc._c_doc, root_node._c_node) self._error_log.connect() - parser_ctxt = schematron.xmlSchematronNewDocParserCtxt(c_doc) + parser_ctxt = schematron.xmlSchematronNewDocParserCtxt( + self._c_schema_doc) elif file is not None: filename = _getFilenameForFile(file) if filename is None: @@ -100,12 +102,14 @@ filename = _encodeFilename(filename) self._error_log.connect() parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename)) - c_doc = NULL else: raise SchematronParseError("No tree or file given") if parser_ctxt is NULL: self._error_log.disconnect() + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) + self._c_schema_doc = NULL python.PyErr_NoMemory() return @@ -114,16 +118,17 @@ schematron.xmlSchematronFreeParserCtxt(parser_ctxt) if self._c_schema is NULL: - if _LIBXML_VERSION_INT >= 20631: - # leak in older versions instead of just crashing - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise 
SchematronParseError( "Document is not a valid Schematron schema", self._error_log) def __dealloc__(self): schematron.xmlSchematronFree(self._c_schema) + if _LIBXML_VERSION_INT >= 20631: + # earlier libxml2 versions may have freed the document in + # xmlSchematronFree() already, we don't know ... + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) def __call__(self, etree): """__call__(self, etree) From scoder at codespeak.net Tue Apr 15 15:28:48 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:48 +0200 (CEST) Subject: [Lxml-checkins] r53781 - in lxml/trunk: . src/lxml Message-ID: <20080415132848.0126E169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:46 2008 New Revision: 53781 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/schematron.pxd lxml/trunk/src/lxml/schematron.pxi Log: r3965 at delle: sbehnel | 2008-04-14 12:17:48 +0200 fix for Schematron error logging on libxml2 2.6.32 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 15:28:46 2008 @@ -8,6 +8,8 @@ Features added -------------- +* Error logging in Schematron (requires libxml2 2.6.32 or later). + * Parser option ``strip_cdata`` for normalising or keeping CDATA sections. Defaults to ``True`` as before, thus replacing CDATA sections by their text content. 
Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Tue Apr 15 15:28:46 2008 @@ -73,11 +73,13 @@ # if LIBXML_VERSION < 20632 /* schematron error reporting was added in libxml2 2.6.32 */ # define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data) +# define XML_SCHEMATRON_OUT_ERROR 0 # endif #else # define ENABLE_SCHEMATRON 0 # define XML_SCHEMATRON_OUT_QUIET 0 # define XML_SCHEMATRON_OUT_XML 0 +# define XML_SCHEMATRON_OUT_ERROR 0 typedef void xmlSchematron; typedef void xmlSchematronParserCtxt; typedef void xmlSchematronValidCtxt; Modified: lxml/trunk/src/lxml/schematron.pxd ============================================================================== --- lxml/trunk/src/lxml/schematron.pxd (original) +++ lxml/trunk/src/lxml/schematron.pxd Tue Apr 15 15:28:46 2008 @@ -7,12 +7,13 @@ ctypedef struct xmlSchematronValidCtxt ctypedef enum xmlSchematronValidOptions: - XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report - XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report - XML_SCHEMATRON_OUT_XML = 4 # output SVRL - XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor - XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer - XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism + XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report + XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report + XML_SCHEMATRON_OUT_XML = 4 # output SVRL + XML_SCHEMATRON_OUT_ERROR = 8 # output via xmlStructuredErrorFunc + XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor + XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer + XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt( xmlDoc* doc) nogil Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) 
+++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 15:28:46 2008 @@ -141,7 +141,8 @@ doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - if _LIBXML_VERSION_INT >= 20632: + if _LIBXML_VERSION_INT >= 20632 and \ + schematron.XML_SCHEMATRON_OUT_ERROR != 0: options = schematron.XML_SCHEMATRON_OUT_ERROR else: options = schematron.XML_SCHEMATRON_OUT_QUIET @@ -154,8 +155,9 @@ return python.PyErr_NoMemory() self._error_log.connect() - schematron.xmlSchematronSetValidStructuredErrors( - valid_ctxt, _receiveError, self.error_log) + if _LIBXML_VERSION_INT >= 20632: + schematron.xmlSchematronSetValidStructuredErrors( + valid_ctxt, _receiveError, self.error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Tue Apr 15 15:28:53 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:53 +0200 (CEST) Subject: [Lxml-checkins] r53782 - in lxml/trunk: . src/lxml Message-ID: <20080415132853.86DFF169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:52 2008 New Revision: 53782 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/schematron.pxi Log: r3966 at delle: sbehnel | 2008-04-14 13:07:21 +0200 memory leak in schematron with libxml2 >= 2.6.31 Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 15:28:52 2008 @@ -74,24 +74,26 @@ the file system. 
""" cdef schematron.xmlSchematron* _c_schema + cdef xmlDoc* _c_schema_doc def __init__(self, etree=None, *, file=None): cdef _Document doc cdef _Element root_node cdef xmlNode* c_node - cdef xmlDoc* c_doc cdef char* c_href cdef schematron.xmlSchematronParserCtxt* parser_ctxt + self._c_schema = NULL + self._c_schema_doc = NULL _Validator.__init__(self) if not config.ENABLE_SCHEMATRON: raise SchematronError( "lxml.etree was compiled without Schematron support.") - self._c_schema = NULL if etree is not None: doc = _documentOrRaise(etree) root_node = _rootNodeOrRaise(etree) - c_doc = _copyDocRoot(doc._c_doc, root_node._c_node) + self._c_schema_doc = _copyDocRoot(doc._c_doc, root_node._c_node) self._error_log.connect() - parser_ctxt = schematron.xmlSchematronNewDocParserCtxt(c_doc) + parser_ctxt = schematron.xmlSchematronNewDocParserCtxt( + self._c_schema_doc) elif file is not None: filename = _getFilenameForFile(file) if filename is None: @@ -100,12 +102,14 @@ filename = _encodeFilename(filename) self._error_log.connect() parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename)) - c_doc = NULL else: raise SchematronParseError("No tree or file given") if parser_ctxt is NULL: self._error_log.disconnect() + if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) + self._c_schema_doc = NULL python.PyErr_NoMemory() return @@ -114,16 +118,17 @@ schematron.xmlSchematronFreeParserCtxt(parser_ctxt) if self._c_schema is NULL: - if _LIBXML_VERSION_INT >= 20631: - # leak in older versions instead of just crashing - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise SchematronParseError( "Document is not a valid Schematron schema", self._error_log) def __dealloc__(self): schematron.xmlSchematronFree(self._c_schema) + if _LIBXML_VERSION_INT >= 20631: + # earlier libxml2 versions may have freed the document in + # xmlSchematronFree() already, we don't know ... 
+ if self._c_schema_doc is not NULL: + tree.xmlFreeDoc(self._c_schema_doc) def __call__(self, etree): """__call__(self, etree) From scoder at codespeak.net Tue Apr 15 15:28:58 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:28:58 +0200 (CEST) Subject: [Lxml-checkins] r53783 - lxml/trunk Message-ID: <20080415132858.A47C2169EB0@codespeak.net> Author: scoder Date: Tue Apr 15 15:28:57 2008 New Revision: 53783 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r3967 at delle: sbehnel | 2008-04-14 13:11:16 +0200 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 15:28:57 2008 @@ -19,6 +19,8 @@ Bugs fixed ---------- +* Memory leak in Schematron with libxml2 >= 2.6.31. + * lxml.etree accepted non well-formed namespace prefix names. Other changes From scoder at codespeak.net Tue Apr 15 15:29:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:03 +0200 (CEST) Subject: [Lxml-checkins] r53784 - in lxml/trunk: . 
src/lxml Message-ID: <20080415132903.7DDE1169EB3@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:02 2008 New Revision: 53784 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xmlerror.pxi lxml/trunk/src/lxml/xslt.pxi Log: r3968 at delle: sbehnel | 2008-04-15 15:26:33 +0200 some cleanup, small fix: line count starts at 1 Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Tue Apr 15 15:29:02 2008 @@ -461,7 +461,7 @@ if cstd.strstr(msg, 'line %d'): c_error.line = cstd.va_int(args) else: - c_error.line = -1 + c_error.line = 0 if cstd.strstr(msg, 'element %s'): c_element = cstd.va_charptr(args) else: Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Tue Apr 15 15:29:02 2008 @@ -496,22 +496,24 @@ if resolver_context is not None and resolver_context._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) + c_result = NULL resolver_context._raise_if_stored() if context._exc._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) + c_result = NULL context._exc._raise_if_stored() if c_result is NULL: # last error seems to be the most accurate here error = self._error_log.last_error if error is not None and error.message: - if error.line >= 0: + if error.line > 0: message = "%s, line %d" % (error.message, error.line) else: message = error.message - elif error is not None and error.line >= 0: + elif error is not None and error.line > 0: message = "Error applying stylesheet, line %d" % error.line else: message = "Error applying stylesheet" From scoder at codespeak.net Tue Apr 15 15:29:08 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:08 +0200 (CEST) Subject: [Lxml-checkins] r53785 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20080415132908.9287C169EAF@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:07 2008 New Revision: 53785 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r3969 at delle: sbehnel | 2008-04-15 15:26:49 +0200 test cleanup Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Apr 15 15:29:07 2008 @@ -2428,18 +2428,16 @@ def test_ns_move(self): Element = self.etree.Element - ElementTree = self.etree.ElementTree - - one = self.etree.parse( - StringIO('')) - baz = one.getroot()[0][0] + one = self.etree.fromstring( + '') + baz = one[0][0] - two = ElementTree(Element('root')) - two.getroot().append(baz) + two = Element('root') + two.append(baz) # removing the originating document could cause a crash/error before # as namespace is not moved along with it - del one - self.assertEquals('{http://a.b.c}baz', baz.tag) + del one, baz + self.assertEquals('{http://a.b.c}baz', two[0].tag) def test_ns_decl_tostring(self): tostring = self.etree.tostring From scoder at codespeak.net Tue Apr 15 15:29:13 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 15:29:13 +0200 (CEST) Subject: [Lxml-checkins] r53786 - in lxml/trunk: . 
doc Message-ID: <20080415132913.468A5169EB3@codespeak.net> Author: scoder Date: Tue Apr 15 15:29:12 2008 New Revision: 53786 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/build.txt Log: r3970 at delle: sbehnel | 2008-04-15 15:27:19 +0200 require Cython 0.9.6.12 Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Tue Apr 15 15:29:12 2008 @@ -46,8 +46,8 @@ easy_install Cython==0.9.6.12 -lxml currently requires Cython 0.9.6.11b or 0.9.6.12, later versions -were not tested. +lxml currently requires Cython 0.9.6.12. Any 0.9.6.13 version will not +work, later versions were not tested. Subversion From scoder at codespeak.net Tue Apr 15 16:54:09 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 16:54:09 +0200 (CEST) Subject: [Lxml-checkins] r53787 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20080415145409.B2A5C169EA3@codespeak.net> Author: scoder Date: Tue Apr 15 16:54:07 2008 New Revision: 53787 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xslt.pxi Log: r3978 at delle: sbehnel | 2008-04-15 16:52:51 +0200 some XSLT errors could pass silently Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 16:54:07 2008 @@ -19,6 +19,9 @@ Bugs fixed ---------- +* lxml did not honour libxslt's second error state "STOPPED", which + let some XSLT errors pass silently. + * Memory leak in Schematron with libxml2 >= 2.6.31. * lxml.etree accepted non well-formed namespace prefix names. 
Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Tue Apr 15 16:54:07 2008 @@ -299,11 +299,12 @@ ''', str(res)) - def _test_xslt_parameter_invalid(self): + def test_xslt_parameter_invalid(self): tree = self.parse('BC') style = self.parse('''\ + @@ -311,8 +312,6 @@ st = etree.XSLT(style) res = self.assertRaises(etree.XSLTApplyError, - st, tree, bar="test") - res = self.assertRaises(etree.XSLTApplyError, st, tree, bar="") res = self.assertRaises(etree.XSLTApplyError, st, tree, bar="....") @@ -521,8 +520,8 @@ source = self.parse(xml) styledoc = self.parse(xslt) style = etree.XSLT(styledoc) - result = style(source) - self.assertEqual('', str(result)) + + self.assertRaises(etree.XSLTApplyError, style, source) self.assert_("TEST TEST TEST" in [entry.message for entry in style.error_log]) Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Tue Apr 15 16:54:07 2008 @@ -481,6 +481,11 @@ c_result = self._run_transform( c_doc, _kw, context, transform_ctxt) + if transform_ctxt.state != xslt.XSLT_STATE_OK: + if c_result is not NULL: + tree.xmlFreeDoc(c_result) + c_result = NULL + if transform_ctxt.profile: c_profile_doc = xslt.xsltGetProfileInformation(transform_ctxt) if c_profile_doc is not NULL: From scoder at codespeak.net Tue Apr 15 17:44:13 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 17:44:13 +0200 (CEST) Subject: [Lxml-checkins] r53788 - in lxml/trunk: . 
src/lxml Message-ID: <20080415154413.643B1169F28@codespeak.net> Author: scoder Date: Tue Apr 15 17:44:10 2008 New Revision: 53788 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/proxy.pxi Log: r3980 at delle: sbehnel | 2008-04-15 17:42:57 +0200 huge cleanup in moveNodeToDocument() function Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 17:44:10 2008 @@ -29,6 +29,10 @@ Other changes ------------- +* Major cleanup in internal ``moveNodeToDocument()`` function, which + takes care of namespace cleanup when moving elements between + different namespace contexts. + * New Elements created through the ``makeelement()`` method of an HTML parser or through lxml.html now end up in a new HTML document (doctype HTML 4.01 Transitional) instead of a generic XML document. Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Tue Apr 15 17:44:10 2008 @@ -46,6 +46,18 @@ python.Py_XDECREF(proxy._gc_doc) proxy._gc_doc = NULL +cdef inline void _updateProxyDocument(xmlNode* c_node, _Document doc): + """Replace the document reference of a proxy. + + This may deallocate the original document of the proxy! 
+ """ + cdef _Element element = <_Element>c_node._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc + element._gc_doc = doc + ################################################################################ # temporarily make a node the root node of its document @@ -198,6 +210,72 @@ c_new_ns = c_new_ns.next c_parent = c_parent.parent +ctypedef struct _nscache: + xmlNs** new + xmlNs** old + cstd.size_t size + cstd.size_t last + +cdef int _growNsCache(_nscache* c_ns_cache) except -1: + cdef xmlNs** c_ns_ptr + if c_ns_cache.size == 0: + c_ns_cache.size = 20 + else: + c_ns_cache.size *= 2 + c_ns_ptr = cstd.realloc( + c_ns_cache.new, c_ns_cache.size * sizeof(xmlNs*)) + if c_ns_ptr is not NULL: + c_ns_cache.new = c_ns_ptr + c_ns_ptr = cstd.realloc( + c_ns_cache.old, c_ns_cache.size * sizeof(xmlNs*)) + if c_ns_ptr is not NULL: + c_ns_cache.old = c_ns_ptr + else: + cstd.free(c_ns_cache.new) + cstd.free(c_ns_cache.old) + python.PyErr_NoMemory() + return -1 + return 0 + +cdef inline int _appendToNsCache(_nscache* c_ns_cache, + xmlNs* c_old_ns, xmlNs* c_new_ns) except -1: + if c_ns_cache.last >= c_ns_cache.size: + _growNsCache(c_ns_cache) + c_ns_cache.old[c_ns_cache.last] = c_old_ns + c_ns_cache.new[c_ns_cache.last] = c_new_ns + c_ns_cache.last += 1 + +cdef int _stripRedundantNamespaceDeclarations( + xmlNode* c_element, _nscache* c_ns_cache, xmlNs** c_del_ns_list) except -1: + """Removes namespace declarations from an element that are already + defined in its parents. Does not free the xmlNs's, just prepends + them to the c_del_ns_list. 
+ """ + cdef xmlNs* c_ns + cdef xmlNs* c_ns_next + cdef xmlNs** c_nsdef + # use a xmlNs** to handle assignments to "c_element.nsDef" correctly + c_nsdef = &c_element.nsDef + while c_nsdef[0] is not NULL: + c_ns = tree.xmlSearchNsByHref( + c_element.doc, c_element.parent, c_nsdef[0].href) + if c_ns is NULL: + # new namespace href => keep and cache the ns declaration + _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0]) + c_nsdef = &c_nsdef[0].next + else: + # known namespace href => strip the ns + if c_ns is tree.xmlSearchNs(c_element.doc, c_element.parent, + c_ns.prefix): + # prefix is not shadowed by parents => ns is reusable + _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns) + # cut out c_nsdef.next and prepend it to garbage chain + c_ns_next = c_nsdef[0].next + c_nsdef[0].next = c_del_ns_list[0] + c_del_ns_list[0] = c_nsdef[0] + c_nsdef[0] = c_ns_next + return 0 + cdef int moveNodeToDocument(_Document doc, xmlNode* c_element) except -1: """Fix the xmlNs pointers of a node and its subtree that were moved. @@ -223,96 +301,48 @@ step 1), but freed only after the complete subtree was traversed and all occurrences were replaced by tree-internal pointers. 
""" - cdef _Element element - cdef xmlDoc* c_doc cdef xmlNode* c_start_node cdef xmlNode* c_node - cdef xmlNs** c_ns_ptr - cdef xmlNs** c_ns_new_cache - cdef xmlNs** c_ns_old_cache + cdef _nscache c_ns_cache cdef xmlNs* c_ns cdef xmlNs* c_ns_next cdef xmlNs* c_nsdef - cdef xmlNs* c_new_ns - cdef xmlNs* c_del_ns - cdef cstd.size_t i, c_cache_size, c_cache_last + cdef xmlNs* c_del_ns_list + cdef cstd.size_t i if not tree._isElementOrXInclude(c_element): return 0 - c_doc = c_element.doc c_start_node = c_element - c_ns_new_cache = NULL - c_ns_old_cache = NULL - c_cache_size = 0 - c_cache_last = 0 - c_del_ns = NULL + c_del_ns_list = NULL + + c_ns_cache.new = NULL + c_ns_cache.old = NULL + c_ns_cache.size = 0 + c_ns_cache.last = 0 while c_element is not NULL: # 1) cut out namespaces defined here that are already known by # the ancestors - c_nsdef = c_element.nsDef - if c_nsdef is not NULL: - # start with second nsdef to keep c_element.nsDef for now - while c_nsdef.next is not NULL: - if c_nsdef.next is c_element.ns: - c_nsdef = c_nsdef.next - continue - c_ns = tree.xmlSearchNsByHref( - c_element.doc, c_element.parent, c_nsdef.next.href) - if c_ns is NULL: - c_nsdef = c_nsdef.next - continue - # cut out c_nsdef.next and prepend it to garbage chain - c_ns_next = c_nsdef.next.next - c_nsdef.next.next = c_del_ns - c_del_ns = c_nsdef.next - c_nsdef.next = c_ns_next - # now handle c_element.nsDef - c_ns = tree.xmlSearchNsByHref( - c_element.doc, c_element.parent, c_element.nsDef.href) - if c_ns is not NULL: - c_ns_next = c_element.nsDef.next - c_element.nsDef.next = c_del_ns - c_del_ns = c_element.nsDef - c_element.nsDef = c_ns_next + if c_element.nsDef is not NULL: + _stripRedundantNamespaceDeclarations( + c_element, &c_ns_cache, &c_del_ns_list) - # 2) make sure the namespace of an element and its attributes - # is declared in this document (i.e. the node or its parents) + # 2) make sure the namespaces of an element and its attributes + # are declared in this document (i.e. 
on the node or its parents) c_node = c_element while c_node is not NULL: if c_node.ns is not NULL: - for i from 0 <= i < c_cache_last: - if c_node.ns is c_ns_old_cache[i]: - c_node.ns = c_ns_new_cache[i] + for i from 0 <= i < c_ns_cache.last: + if c_node.ns is c_ns_cache.old[i]: + c_node.ns = c_ns_cache.new[i] break else: # not in cache => find a replacement from this document - c_new_ns = doc._findOrBuildNodeNs( + c_ns = doc._findOrBuildNodeNs( c_element, c_node.ns.href, c_node.ns.prefix) - if c_cache_last >= c_cache_size: - # must resize cache - if c_cache_size == 0: - c_cache_size = 20 - else: - c_cache_size *= 2 - c_ns_ptr = cstd.realloc( - c_ns_new_cache, c_cache_size * sizeof(xmlNs*)) - if c_ns_ptr is not NULL: - c_ns_new_cache = c_ns_ptr - c_ns_ptr = cstd.realloc( - c_ns_old_cache, c_cache_size * sizeof(xmlNs*)) - if c_ns_ptr is not NULL: - c_ns_old_cache = c_ns_ptr - else: - cstd.free(c_ns_new_cache) - cstd.free(c_ns_old_cache) - python.PyErr_NoMemory() - return -1 - c_ns_new_cache[c_cache_last] = c_new_ns - c_ns_old_cache[c_cache_last] = c_node.ns - c_cache_last += 1 - c_node.ns = c_new_ns + _appendToNsCache(&c_ns_cache, c_node.ns, c_ns) + c_node.ns = c_ns if c_node is c_element: # after the element, continue with its attributes c_node = c_element.properties @@ -330,12 +360,7 @@ # 3) fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - element = <_Element>c_element._private - if element._doc is not doc: - python.Py_INCREF(doc) - python.Py_DECREF(element._doc) - element._doc = doc - element._gc_doc = doc + _updateProxyDocument(c_element, doc) if c_element is c_start_node: break # all done @@ -353,12 +378,7 @@ # 3) fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - element = <_Element>c_element._private - if element._doc is not doc: - python.Py_INCREF(doc) - python.Py_DECREF(element._doc) - element._doc = doc - element._gc_doc = doc + _updateProxyDocument(c_element, doc) if c_element is c_start_node: break @@ -372,13 +392,13 @@ c_element = c_node # free now unused namespace declarations - if c_del_ns is not NULL: - tree.xmlFreeNsList(c_del_ns) + if c_del_ns_list is not NULL: + tree.xmlFreeNsList(c_del_ns_list) # cleanup - if c_ns_new_cache is not NULL: - cstd.free(c_ns_new_cache) - if c_ns_old_cache is not NULL: - cstd.free(c_ns_old_cache) + if c_ns_cache.new is not NULL: + cstd.free(c_ns_cache.new) + if c_ns_cache.old is not NULL: + cstd.free(c_ns_cache.old) return 0 From scoder at codespeak.net Tue Apr 15 19:39:15 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:39:15 +0200 (CEST) Subject: [Lxml-checkins] r53789 - in lxml/trunk: . src/lxml Message-ID: <20080415173915.8FACA498138@codespeak.net> Author: scoder Date: Tue Apr 15 19:39:12 2008 New Revision: 53789 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/schematron.pxi Log: r3982 at delle: sbehnel | 2008-04-15 19:37:57 +0200 fix for schematron error reporting Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Tue Apr 15 19:39:12 2008 @@ -162,7 +162,7 @@ self._error_log.connect() if _LIBXML_VERSION_INT >= 20632: schematron.xmlSchematronSetValidStructuredErrors( - valid_ctxt, _receiveError, self.error_log) + valid_ctxt, _receiveError, self._error_log) c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) with nogil: ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc) From scoder at codespeak.net Tue Apr 15 19:55:10 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:55:10 +0200 (CEST) 
Subject: [Lxml-checkins] r53790 - in lxml/trunk: . doc Message-ID: <20080415175510.8FD95498136@codespeak.net> Author: scoder Date: Tue Apr 15 19:55:09 2008 New Revision: 53790 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt lxml/trunk/version.txt Log: r3984 at delle: sbehnel | 2008-04-15 19:44:25 +0200 prepare release of 2.1beta1 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 19:55:09 2008 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.1beta1 (2008-04-15) +===================== Features added -------------- Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Tue Apr 15 19:55:09 2008 @@ -142,8 +142,8 @@ source release. If you can't wait, consider trying a less recent release version first. -The latest version is `lxml 2.1alpha1`_, released 2008-03-27 -(`changes for 2.1alpha1`_). `Older versions`_ are listed below. +The latest version is `lxml 2.1beta1`_, released 2008-04-15 +(`changes for 2.1beta1`_). `Older versions`_ are listed below. Please take a look at the `installation instructions`_! @@ -211,6 +211,10 @@ Old Versions ------------ +* `lxml 2.1alpha1`_, released 2008-03-27 (`changes for 2.1alpha1`_) + +* `lxml 2.0.4`_, released 2008-04-14 (`changes for 2.0.4`_) + * `lxml 2.0.3`_, released 2008-03-26 (`changes for 2.0.3`_) * `lxml 2.0.2`_, released 2008-02-22 (`changes for 2.0.2`_) @@ -269,6 +273,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.1beta1`: lxml-2.1beta1.tgz .. _`lxml 2.1alpha1`: lxml-2.1alpha1.tgz .. _`lxml 2.0.3`: lxml-2.0.3.tgz .. _`lxml 2.0.2`: lxml-2.0.2.tgz @@ -299,6 +304,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.1beta1`: changes-2.1beta1.html .. 
_`changes for 2.1alpha1`: changes-2.1alpha1.html .. _`changes for 2.0.3`: changes-2.0.3.html .. _`changes for 2.0.2`: changes-2.0.2.html Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Apr 15 19:55:09 2008 @@ -1 +1 @@ -2.1alpha1 +2.1alpha2 From scoder at codespeak.net Tue Apr 15 19:55:16 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:55:16 +0200 (CEST) Subject: [Lxml-checkins] r53791 - in lxml/trunk: . src/lxml Message-ID: <20080415175516.808B4498136@codespeak.net> Author: scoder Date: Tue Apr 15 19:55:15 2008 New Revision: 53791 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi Log: r3985 at delle: sbehnel | 2008-04-15 19:53:52 +0200 fix for resolving to filenames in custom resolvers Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Apr 15 19:55:15 2008 @@ -19,6 +19,8 @@ Bugs fixed ---------- +* Resolving to a filename in custom resolvers didn't work. + * lxml did not honour libxslt's second error state "STOPPED", which let some XSLT errors pass silently. 
Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Apr 15 19:55:15 2008 @@ -333,7 +333,7 @@ c_context, _cstr(data)) elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( - c_context, _cstr(doc_ref._data_bytes)) + c_context, _cstr(doc_ref._filename)) elif doc_ref._type == PARSER_DATA_FILE: file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) From scoder at codespeak.net Tue Apr 15 19:56:28 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 19:56:28 +0200 (CEST) Subject: [Lxml-checkins] r53792 - in lxml/branch/lxml-2.0: . src/lxml Message-ID: <20080415175628.1305F49813F@codespeak.net> Author: scoder Date: Tue Apr 15 19:56:28 2008 New Revision: 53792 Modified: lxml/branch/lxml-2.0/CHANGES.txt lxml/branch/lxml-2.0/src/lxml/parser.pxi Log: trunk merge -c 53791: custom resolver fix Modified: lxml/branch/lxml-2.0/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.0/CHANGES.txt (original) +++ lxml/branch/lxml-2.0/CHANGES.txt Tue Apr 15 19:56:28 2008 @@ -11,6 +11,8 @@ Bugs fixed ---------- +* Resolving to a filename in custom resolvers didn't work. + * Memory leak in Schematron with libxml2 >= 2.6.31. 
Other changes Modified: lxml/branch/lxml-2.0/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-2.0/src/lxml/parser.pxi Tue Apr 15 19:56:28 2008 @@ -333,7 +333,7 @@ c_context, _cstr(data)) elif doc_ref._type == PARSER_DATA_FILENAME: c_input = xmlparser.xmlNewInputFromFile( - c_context, _cstr(doc_ref._data_bytes)) + c_context, _cstr(doc_ref._filename)) elif doc_ref._type == PARSER_DATA_FILE: file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) From scoder at codespeak.net Tue Apr 15 20:20:04 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:20:04 +0200 (CEST) Subject: [Lxml-checkins] r53793 - in lxml/trunk: . src/lxml/tests Message-ID: <20080415182004.CA7A0169F21@codespeak.net> Author: scoder Date: Tue Apr 15 20:20:04 2008 New Revision: 53793 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r3988 at delle: sbehnel | 2008-04-15 20:18:50 +0200 test for custom filename resolver Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue Apr 15 20:20:04 2008 @@ -612,6 +612,28 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_filename_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(attribute_defaults=True) + assertEqual = self.assertEqual + test_url = u"__nosuch.dtd" + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_filename( + fileInTestDir('test.dtd'), context) + + parser.resolvers.add(MyResolver()) + + xml = u'' % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals( + 
root.attrib, {'default': 'valueA'}) + self.assertEquals( + root[0].attrib, {'default': 'valueB'}) + def test_resolve_empty(self): parse = self.etree.parse parser = self.etree.XMLParser(load_dtd=True) From scoder at codespeak.net Tue Apr 15 20:22:03 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15 Apr 2008 20:22:03 +0200 (CEST) Subject: [Lxml-checkins] r53794 - lxml/branch/lxml-2.0/src/lxml/tests Message-ID: <20080415182203.6B4C0169F29@codespeak.net> Author: scoder Date: Tue Apr 15 20:22:03 2008 New Revision: 53794 Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py Log: trunk merge -c 53793 (test case) Modified: lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.0/src/lxml/tests/test_etree.py Tue Apr 15 20:22:03 2008 @@ -533,6 +533,28 @@ root = tree.getroot() self.assertEquals(root.text, test_url) + def test_resolve_filename_dtd(self): + parse = self.etree.parse + parser = self.etree.XMLParser(attribute_defaults=True) + assertEqual = self.assertEqual + test_url = u"__nosuch.dtd" + + class MyResolver(self.etree.Resolver): + def resolve(self, url, id, context): + assertEqual(url, test_url) + return self.resolve_filename( + fileInTestDir('test.dtd'), context) + + parser.resolvers.add(MyResolver()) + + xml = u'' % test_url + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals( + root.attrib, {'default': 'valueA'}) + self.assertEquals( + root[0].attrib, {'default': 'valueB'}) + def test_resolve_empty(self): parse = self.etree.parse parser = self.etree.XMLParser(load_dtd=True) From scoder at codespeak.net Tue Apr 15 20:28:48 2008 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 15