From scoder at codespeak.net Sat Sep 1 10:45:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 1 Sep 2007 10:45:07 +0200 (CEST) Subject: [Lxml-checkins] r46231 - lxml/trunk/doc Message-ID: <20070901084507.9E7F281B0@code0.codespeak.net> Author: scoder Date: Sat Sep 1 10:45:04 2007 New Revision: 46231 Modified: lxml/trunk/doc/objectify.txt Log: fixed test cases to reflect annotation in objectify.E factory Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Sat Sep 1 10:45:04 2007 @@ -82,6 +82,13 @@ .. _`namespace specific classes`: element_classes.html#namespace-class-lookup +To make the doctests in this document look a little nicer, we also use this: + + >>> import lxml.usedoctest + +Imported from within a doctest, this relieves us from caring about the exact +formatting of XML output. + The lxml.objectify API ====================== @@ -274,18 +281,18 @@ >>> E = objectify.E >>> root = E.root( - ... E.a(5), + ... E.a(5L), ... E.b(6.1), ... E.c(True), ... E.d("how", tell="me") ... ) >>> print etree.tostring(root, pretty_print=True) - - 5 - 6.1 - true - how + + 5 + 6.1 + true + how This allows you to write up a specific language in tags:: @@ -300,9 +307,9 @@ ... ) >>> print etree.tostring(root, pretty_print=True) - - The title - 5 + + The title + 5 From scoder at codespeak.net Sat Sep 1 10:59:47 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 1 Sep 2007 10:59:47 +0200 (CEST) Subject: [Lxml-checkins] r46232 - in lxml/trunk: . 
doc src/lxml Message-ID: <20070901085947.AC74F81B0@code0.codespeak.net> Author: scoder Date: Sat Sep 1 10:59:45 2007 New Revision: 46232 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/objectify.txt lxml/trunk/src/lxml/objectify.pyx Log: made annotation in objectify.ElementMaker optional through 'annotate' kw arg Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 1 10:59:45 2007 @@ -8,6 +8,10 @@ Features added -------------- +* Reimplemented ``objectify.E`` for better performance and improved + integration with objectify. Provides extended type support based on + registered PyTypes. + * XSLT objects now support deep copying * New ``makeSubElement()`` C-API function that allows creating a new @@ -35,8 +39,7 @@ * Schematron validation (incomplete in libxml2) -* Extended type support for ``objectify.E`` based on registered PyTypes. - Supports an additional argument to ``PyType()`` that takes a conversion +* Additional ``stringify`` argument to ``PyType()`` that takes a conversion function to strings to support setting text values from arbitrary types. * Entity support through an ``Entity`` factory and element classes. XML Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Sat Sep 1 10:59:45 2007 @@ -299,17 +299,32 @@ >>> ROOT = objectify.E.root >>> TITLE = objectify.E.title - >>> TYPE = objectify.E.type + >>> HOWMANY = getattr(objectify.E, "how-many") >>> root = ROOT( ... TITLE("The title"), - ... TYPE(5) + ... HOWMANY(5) ... ) >>> print etree.tostring(root, pretty_print=True) The title - 5 + 5 + + +``objectify.E`` is an instance of ``objectify.ElementMaker``. By default, it +creates pytype annotated Elements without a namespace. 
You can switch off the +pytype annotation by passing False to the ``annotate`` keyword argument of the +constructor. You can also pass a default namespace and an ``nsmap``:: + + >>> myE = objectify.ElementMaker(annotate=False, + ... namespace="http://my/ns", nsmap={None : "http://my/ns"}) + + >>> root = myE.root( myE.someint(2) ) + + >>> print etree.tostring(root, pretty_print=True) + + 2 Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 1 10:59:45 2007 @@ -1047,7 +1047,9 @@ cdef object _makeelement cdef object _namespace cdef object _nsmap - def __init__(self, namespace=None, nsmap=None, makeelement=None): + cdef int _annotate + def __init__(self, namespace=None, nsmap=None, annotate=True, + makeelement=None): if nsmap is None: nsmap = _DEFAULT_NSMAP self._nsmap = nsmap @@ -1055,6 +1057,7 @@ self._namespace = None else: self._namespace = "{%s}" % namespace + self._annotate = bool(annotate) if makeelement is not None: assert callable(makeelement) self._makeelement = makeelement @@ -1068,6 +1071,7 @@ element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) element_maker._tag = tag element_maker._nsmap = self._nsmap + element_maker._annotate = self._annotate element_maker._element_factory = self._makeelement return element_maker @@ -1075,6 +1079,7 @@ cdef object _tag cdef object _nsmap cdef object _element_factory + cdef int _annotate def __call__(self, *children, **attrib): cdef _ObjectifyElementMakerCaller elementMaker @@ -1088,6 +1093,7 @@ else: element = self._element_factory(self._tag, attrib, self._nsmap) + pytype_name = None has_children = 0 has_string_value = 0 for child in children: @@ -1113,7 +1119,7 @@ has_children = 1 else: if pytype_name is not None: - # concatenation makes the result a string + # concatenation always makes the result a string has_string_value = 1 pytype_name = 
_typename(child) pytype = python.PyDict_GetItem(_PYTYPE_DICT, pytype_name) @@ -1124,12 +1130,11 @@ child = str(child) _add_text(element, child) - if not has_children: + if self._annotate and not has_children: if has_string_value: cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, "str") elif pytype_name is not None: - cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, - pytype_name) + cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name) return element @@ -1911,6 +1916,10 @@ _parse = etree.parse def parse(f, parser=None): + """Parse a file or file-like object with the objectify parser. + + You can pass a different parser as second argument. + """ if parser is None: parser = objectify_parser return _parse(f, parser) From scoder at codespeak.net Sun Sep 2 17:20:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 17:20:11 +0200 (CEST) Subject: [Lxml-checkins] r46239 - lxml/trunk Message-ID: <20070902152011.513E78141@code0.codespeak.net> Author: scoder Date: Sun Sep 2 17:20:09 2007 New Revision: 46239 Modified: lxml/trunk/setup.py Log: doc clarification Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Sun Sep 2 17:20:09 2007 @@ -71,10 +71,10 @@ Running ``easy_install lxml==dev`` will install it from http://codespeak.net/svn/lxml/trunk#egg=lxml-dev -Current bug fixes for the stable version are at -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . -Running ``easy_install lxml==%(branch_version)sbugfix`` will install this -version from +After an official release of a new stable series, current bug fixes might +become available at +http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . 
Running +``easy_install lxml==%(branch_version)sbugfix`` will install this version from http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix """ % { "branch_version" : versioninfo.branch_version() }) + From scoder at codespeak.net Sun Sep 2 17:20:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 17:20:28 +0200 (CEST) Subject: [Lxml-checkins] r46240 - lxml/trunk/doc Message-ID: <20070902152028.B66EE8141@code0.codespeak.net> Author: scoder Date: Sun Sep 2 17:20:28 2007 New Revision: 46240 Modified: lxml/trunk/doc/FAQ.txt Log: ReST fix Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Sep 2 17:20:28 2007 @@ -446,7 +446,7 @@ that problems become hard to debug and even harder to reproduce in a predictable way. If you encounter crashes in one these systems, but your code runs perfectly when started by hand, the following gives you a few hints for -possible approaches to solve your specific problem:: +possible approaches to solve your specific problem: * make sure you use recent versions of libxml2, libxslt and lxml. The libxml2 developers keep fixing bugs in each release, and lxml also tries to become From scoder at codespeak.net Sun Sep 2 18:13:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 18:13:31 +0200 (CEST) Subject: [Lxml-checkins] r46241 - in lxml/trunk: . 
doc Message-ID: <20070902161331.7F1F680B8@code0.codespeak.net> Author: scoder Date: Sun Sep 2 18:13:29 2007 New Revision: 46241 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/lxml2.txt lxml/trunk/doc/main.txt Log: pre-release cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Sep 2 18:13:29 2007 @@ -2,7 +2,7 @@ lxml changelog ============== -2.0alpha1 (2007-08-31) +2.0alpha1 (2007-09-02) ====================== Features added @@ -39,8 +39,9 @@ * Schematron validation (incomplete in libxml2) -* Additional ``stringify`` argument to ``PyType()`` that takes a conversion - function to strings to support setting text values from arbitrary types. +* Additional ``stringify`` argument to ``objectify.PyType()`` takes a + conversion function to strings to support setting text values from arbitrary + types. * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Sun Sep 2 18:13:29 2007 @@ -89,7 +89,7 @@ facilitate further enhancements and an improved integration between lxml's features. -* lxml.objectify now has its own implementation of the ``E factory``. It uses +* lxml.objectify now has its own implementation of the `E factory`_. It uses the built-in type lookup mechanism of lxml.objectify, thus removing the need for an additional type registry mechanism (as previously available through the ``typemap`` parameter). @@ -104,6 +104,8 @@ bigger overlap with the XSLT code. The main benefits are improved thread safety in the XPath evaluators and Python RegExp support in standard XPath. +.. 
_`E factory`: objectify.html#tree-generation-with-the-e-factory + New modules =========== Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sun Sep 2 18:13:29 2007 @@ -138,7 +138,7 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0alpha1`_, released 2007-08-31 +The latest version is `lxml 2.0alpha1`_, released 2007-09-02 (`changes for 2.0alpha1`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions From scoder at codespeak.net Sun Sep 2 18:34:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 18:34:07 +0200 (CEST) Subject: [Lxml-checkins] r46242 - lxml/trunk/doc Message-ID: <20070902163407.0575D817B@code0.codespeak.net> Author: scoder Date: Sun Sep 2 18:34:07 2007 New Revision: 46242 Modified: lxml/trunk/doc/lxml2.txt Log: typo Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Sun Sep 2 18:34:07 2007 @@ -67,7 +67,7 @@ * The type annotations in lxml.objectify (the ``pytype`` attribute) now use ``NoneType`` for the None value as this is the correct Python type name. - Previously, lxml 1.x used a lower case ``?one``. + Previously, lxml 1.x used a lower case ``none``. * Another change in objectify regards the way it deals with ambiguous types. 
Previously, setting a value like the string ``"3"`` through normal attribute From scoder at codespeak.net Mon Sep 3 11:57:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 11:57:29 +0200 (CEST) Subject: [Lxml-checkins] r46262 - lxml/trunk/src/lxml Message-ID: <20070903095729.8D68C814C@code0.codespeak.net> Author: scoder Date: Mon Sep 3 11:57:27 2007 New Revision: 46262 Modified: lxml/trunk/src/lxml/etree.pyx Log: use list instead of dict in _TempStore to reduce overhead Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Sep 3 11:57:27 2007 @@ -172,16 +172,13 @@ cdef class _TempStore: cdef object _storage def __init__(self): - self._storage = {} + self._storage = [] cdef void add(self, obj): - python.PyDict_SetItem(self._storage, id(obj), obj) + python.PyList_Append(self._storage, obj) cdef void clear(self): - python.PyDict_Clear(self._storage) - - cdef object dictcopy(self): - return self._storage.copy() + del self._storage[:] # class for temporarily storing exceptions raised in extensions cdef class _ExceptionContext: From scoder at codespeak.net Mon Sep 3 12:35:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 12:35:21 +0200 (CEST) Subject: [Lxml-checkins] r46264 - lxml/trunk/src/lxml Message-ID: <20070903103521.65813814F@code0.codespeak.net> Author: scoder Date: Mon Sep 3 12:35:20 2007 New Revision: 46264 Modified: lxml/trunk/src/lxml/xslt.pxi Log: use separate resolver contexts for each XSLT call: exceptions and resolver temp storage must be local Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Mon Sep 3 12:35:20 2007 @@ -58,7 +58,7 @@ cdef _XSLTResolverContext _copy(self): cdef 
_XSLTResolverContext context context = _XSLTResolverContext(self._parser) - context._c_style_doc = _copyDoc(self._c_style_doc, 1) + context._c_style_doc = self._c_style_doc return context cdef xmlDoc* _xslt_resolve_stylesheet(char* c_uri, void* context): @@ -353,7 +353,10 @@ new_xslt._access_control = self._access_control new_xslt._error_log = _ErrorLog() new_xslt._context = self._context._copy() + new_xslt._xslt_resolver_context = self._xslt_resolver_context._copy() + new_xslt._xslt_resolver_context._c_style_doc = _copyDoc( + self._xslt_resolver_context._c_style_doc, 1) c_doc = _copyDoc(self._c_style.doc, 1) new_xslt._c_style = xslt.xsltParseStylesheetDoc(c_doc) @@ -365,6 +368,7 @@ def __call__(self, _input, profile_run=False, **_kw): cdef _XSLTContext context + cdef _XSLTResolverContext resolver_context cdef _Document input_doc cdef _Element root_node cdef _Document result_doc @@ -397,6 +401,9 @@ context = self._context._copy() context.register_context(transform_ctxt, input_doc) + resolver_context = self._xslt_resolver_context._copy() + transform_ctxt._private = resolver_context + c_result = self._run_transform( input_doc, c_doc, _kw, context, transform_ctxt) @@ -412,10 +419,10 @@ self._error_log.disconnect() try: - if self._xslt_resolver_context._has_raised(): + if resolver_context is not None and resolver_context._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) - self._xslt_resolver_context._raise_if_stored() + resolver_context._raise_if_stored() if c_result is NULL: # last error seems to be the most accurate here @@ -431,31 +438,26 @@ message = "Error applying stylesheet" raise XSLTApplyError, message finally: - self._xslt_resolver_context.clear() + if resolver_context is not None: + resolver_context.clear() result_doc = _documentFactory(c_result, input_doc._parser) return _xsltResultTreeFactory(result_doc, self, profile_doc) cdef xmlDoc* _run_transform(self, _Document input_doc, xmlDoc* c_input_doc, - parameters, _XSLTContext context, - 
xslt.xsltTransformContext* transform_ctxt): + parameters, _XSLTContext context, + xslt.xsltTransformContext* transform_ctxt): cdef python.PyThreadState* state - cdef _XSLTResolverContext resolver_context cdef xmlDoc* c_result cdef char** params cdef Py_ssize_t i, parameter_count - resolver_context = _XSLTResolverContext(input_doc._parser) - resolver_context._c_style_doc = self._xslt_resolver_context._c_style_doc - xslt.xsltSetTransformErrorFunc(transform_ctxt, self._error_log, _receiveXSLTError) if self._access_control is not None: self._access_control._register_in_context(transform_ctxt) - transform_ctxt._private = self._xslt_resolver_context - parameter_count = python.PyDict_Size(parameters) if parameter_count > 0: # allocate space for parameters @@ -463,17 +465,21 @@ # and + 1 as array is NULL terminated params = python.PyMem_Malloc( sizeof(char*) * (parameter_count * 2 + 1)) - i = 0 - keep_ref = [] - for key, value in parameters.iteritems(): - k = _utf8(key) - python.PyList_Append(keep_ref, k) - v = _utf8(value) - python.PyList_Append(keep_ref, v) - params[i] = _cstr(k) - i = i + 1 - params[i] = _cstr(v) - i = i + 1 + try: + i = 0 + keep_ref = [] + for key, value in parameters.iteritems(): + k = _utf8(key) + python.PyList_Append(keep_ref, k) + v = _utf8(value) + python.PyList_Append(keep_ref, v) + params[i] = _cstr(k) + i = i + 1 + params[i] = _cstr(v) + i = i + 1 + except: + python.PyMem_Free(params) + raise params[i] = NULL else: params = NULL From scoder at codespeak.net Mon Sep 3 12:35:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 12:35:45 +0200 (CEST) Subject: [Lxml-checkins] r46265 - lxml/trunk Message-ID: <20070903103545.B54D7814F@code0.codespeak.net> Author: scoder Date: Mon Sep 3 12:35:45 2007 New Revision: 46265 Modified: lxml/trunk/CHANGES.txt Log: changelog update Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt 
(original) +++ lxml/trunk/CHANGES.txt Mon Sep 3 12:35:45 2007 @@ -2,6 +2,22 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Race condition in XSLT where the resolver context leaked between concurrent + XSLT calls + +Other changes +------------- + + 2.0alpha1 (2007-09-02) ====================== From scoder at codespeak.net Mon Sep 3 13:36:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:36:00 +0200 (CEST) Subject: [Lxml-checkins] r46267 - lxml/trunk/src/lxml Message-ID: <20070903113600.3CF5A814D@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:35:58 2007 New Revision: 46267 Modified: lxml/trunk/src/lxml/parser.pxi Log: ET 1.3 compatible parser version Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon Sep 3 13:35:58 2007 @@ -484,6 +484,11 @@ return _makeElement(_tag, NULL, None, self, None, None, attrib, nsmap, _extra) + property version: + "The version of the underlying XML parser." + def __get__(self): + return "libxml2 %d.%d.%d" % LIBXML_VERSION + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: """Parse unicode document, share dictionary if possible. 
""" From scoder at codespeak.net Mon Sep 3 13:43:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:43:01 +0200 (CEST) Subject: [Lxml-checkins] r46269 - lxml/trunk/src/lxml/tests Message-ID: <20070903114301.85EC280E9@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:43:00 2007 New Revision: 46269 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: additional test import of cElementTree Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 13:43:00 2007 @@ -14,6 +14,14 @@ ElementTree = None try: + from xml.etree import cElementTree # Python 2.5 +except ImportError: + try: + from celementtree import cElementTree # standard ET + except ImportError: + cElementTree = None + +try: import doctest # check if the system version has everything we need doctest.DocFileSuite From scoder at codespeak.net Mon Sep 3 13:54:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:54:40 +0200 (CEST) Subject: [Lxml-checkins] r46271 - lxml/trunk/src/lxml/tests Message-ID: <20070903115440.9B14B814E@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:54:37 2007 New Revision: 46271 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: fix import Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 13:54:37 2007 @@ -6,7 +6,7 @@ from lxml import etree try: - from xml.etree import ElementTree # Python 2.5 + from xml.etree import ElementTree # Python 2.5+ except ImportError: try: from elementtree import ElementTree # standard ET @@ -14,10 +14,10 @@ ElementTree = None try: - from xml.etree import cElementTree # Python 
2.5 + from xml.etree import cElementTree # Python 2.5+ except ImportError: try: - from celementtree import cElementTree # standard ET + import cElementTree # standard ET except ImportError: cElementTree = None From scoder at codespeak.net Mon Sep 3 13:55:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:55:11 +0200 (CEST) Subject: [Lxml-checkins] r46272 - lxml/trunk/src/lxml/tests Message-ID: <20070903115511.4FCBA814E@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:55:10 2007 New Revision: 46272 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: cleanup of test_elementtree.py to integrate cElementTree Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 3 13:55:10 2007 @@ -9,12 +9,17 @@ """ import unittest, doctest -import os, re, shutil, tempfile, copy +import os, re, shutil, tempfile, copy, operator -from common_imports import StringIO, etree, ElementTree -from common_imports import HelperTestCase, fileInTestDir, canonicalize +from common_imports import StringIO, etree, ElementTree, cElementTree +from common_imports import fileInTestDir, canonicalize -class ETreeTestCaseBase(HelperTestCase): +if cElementTree is not None: + if tuple([int(n) for n in + getattr(cElementTree, "VERSION", "0.0").split(".")]) <= (1,0,6): + cElementTree = None + +class ETreeTestCaseBase(unittest.TestCase): etree = None def setUp(self): @@ -75,7 +80,7 @@ self.assertEquals('one', root[0].tag) self.assertEquals('two', root[1].tag) self.assertEquals('three', root[2].tag) - self.assertRaises(IndexError, root.__getitem__, 3) + self.assertRaises(IndexError, operator.getitem, root, 3) def test_subelement(self): Element = self.etree.Element @@ -116,7 +121,7 @@ root = doc.getroot() self.assertEquals(1, len(root)) self.assertEquals('one', 
root[0].tag) - self.assertRaises(IndexError, root.__getitem__, 1) + self.assertRaises(IndexError, operator.getitem, root, 1) def test_element_indexing_with_text2(self): ElementTree = self.etree.ElementTree @@ -147,7 +152,7 @@ self.assertEquals(d, a[-1]) self.assertEquals(c, a[-2]) self.assertEquals(b, a[-3]) - self.assertRaises(IndexError, a.__getitem__, -4) + self.assertRaises(IndexError, operator.getitem, a, -4) a[-1] = e = Element('e') self.assertEquals(e, a[-1]) del a[-1] @@ -266,7 +271,7 @@ root = doc.getroot() self.assertEquals('One', root.attrib['one']) self.assertEquals('Two', root.attrib['two']) - self.assertRaises(KeyError, root.attrib.__getitem__, 'three') + self.assertRaises(KeyError, operator.getitem, root.attrib, 'three') def test_attributes2(self): ElementTree = self.etree.ElementTree @@ -917,6 +922,18 @@ self.assertXML("", a) + def test_processinginstruction(self): + # lxml.etree separates target and text + Element = self.etree.Element + SubElement = self.etree.SubElement + ProcessingInstruction = self.etree.PI + + a = Element('a') + a.append(ProcessingInstruction('foo', 'some more text')) + self.assertEquals(a[0].tag, ProcessingInstruction) + self.assertXML("", + a) + def test_pi_nonsense(self): ProcessingInstruction = self.etree.ProcessingInstruction pi = ProcessingInstruction('foo') @@ -980,7 +997,7 @@ a = Element('a') b = SubElement(a, 'b') - self.assertRaises(IndexError, a.__setitem__, 1, Element('c')) + self.assertRaises(IndexError, operator.setitem, a, 1, Element('c')) def test_setitem_tail(self): Element = self.etree.Element @@ -1583,7 +1600,7 @@ a.attrib['bar'] = 'Bar' self.assertEquals('Foo', a.attrib['foo']) del a.attrib['foo'] - self.assertRaises(KeyError, a.attrib.__getitem__, 'foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') def test_getslice(self): Element = self.etree.Element @@ -2514,6 +2531,15 @@ self.assertEquals(qname1, qname1) self.assertEquals(qname1, qname2) + def test_parser_version(self): + etree = 
self.etree + parser = etree.XMLParser() + if hasattr(parser, "version"): + # ElementTree 1.3+, cET + self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + + # helper methods + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ @@ -2592,19 +2618,33 @@ mapping["key"] = "value" self.assertEquals("value", mapping["key"]) + # assertFalse doesn't exist in Python 2.3 + try: + unittest.TestCase.assertFalse + except AttributeError: + assertFalse = unittest.TestCase.failIf -class ETreeTestCase(ETreeTestCaseBase): - etree = etree + +if etree: + class ETreeTestCase(ETreeTestCaseBase): + etree = etree if ElementTree: class ElementTreeTestCase(ETreeTestCaseBase): etree = ElementTree +if cElementTree: + class CElementTreeTestCase(ETreeTestCaseBase): + etree = cElementTree + def test_suite(): suite = unittest.TestSuite() - suite.addTests([unittest.makeSuite(ETreeTestCase)]) + if etree: + suite.addTests([unittest.makeSuite(ETreeTestCase)]) if ElementTree: suite.addTests([unittest.makeSuite(ElementTreeTestCase)]) + if cElementTree: + suite.addTests([unittest.makeSuite(CElementTreeTestCase)]) return suite if __name__ == '__main__': From scoder at codespeak.net Mon Sep 3 16:02:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 16:02:37 +0200 (CEST) Subject: [Lxml-checkins] r46276 - lxml/trunk/src/lxml/tests Message-ID: <20070903140237.7C45C8144@code0.codespeak.net> Author: scoder Date: Mon Sep 3 16:02:36 2007 New Revision: 46276 Modified: lxml/trunk/src/lxml/tests/common_imports.py lxml/trunk/src/lxml/tests/test_objectify.py lxml/trunk/src/lxml/tests/test_pyclasslookup.py Log: more test case cleanup Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 16:02:36 2007 @@ -31,12 +31,6 @@ # we need our own version 
to make it work (Python 2.3?) import local_doctest as doctest -try: - from operator import itemgetter -except ImportError: - def itemgetter(item): - return lambda obj: obj[item] - class HelperTestCase(unittest.TestCase): def parse(self, text): f = StringIO(text) Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Sep 3 16:02:36 2007 @@ -9,7 +9,6 @@ from common_imports import etree, StringIO, HelperTestCase, fileInTestDir from common_imports import SillyFileLike, canonicalize, doctest -from common_imports import itemgetter from lxml import objectify @@ -373,7 +372,7 @@ self.assertEquals("0", root.c1.c2[0].text) self.assertEquals("1", root.c1.c2[1].text) self.assertEquals("2", root.c1.c2[2].text) - self.assertRaises(IndexError, itemgetter(3), root.c1.c2) + self.assertRaises(IndexError, operator.getitem, root.c1.c2, 3) def test_child_index_neg(self): root = self.XML(xml_str) @@ -381,7 +380,7 @@ self.assertEquals("0", root.c1.c2[-3].text) self.assertEquals("1", root.c1.c2[-2].text) self.assertEquals("2", root.c1.c2[-1].text) - self.assertRaises(IndexError, itemgetter(-4), root.c1.c2) + self.assertRaises(IndexError, operator.getitem, root.c1.c2, -4) def test_child_len(self): root = self.XML(xml_str) Modified: lxml/trunk/src/lxml/tests/test_pyclasslookup.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_pyclasslookup.py (original) +++ lxml/trunk/src/lxml/tests/test_pyclasslookup.py Mon Sep 3 16:02:36 2007 @@ -9,7 +9,6 @@ from common_imports import etree, StringIO, HelperTestCase, fileInTestDir from common_imports import SillyFileLike, canonicalize, doctest -from common_imports import itemgetter from lxml.pyclasslookup import PythonElementClassLookup From scoder at codespeak.net Mon Sep 3 19:32:24 2007 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 19:32:24 +0200 (CEST) Subject: [Lxml-checkins] r46292 - lxml/trunk/src/lxml/tests Message-ID: <20070903173224.5B117816C@code0.codespeak.net> Author: scoder Date: Mon Sep 3 19:32:23 2007 New Revision: 46292 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: API fix in ET tests Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 3 19:32:23 2007 @@ -687,7 +687,7 @@ SubElement = self.etree.SubElement el = Element('tag') - SubElement(el, 'foo', attrib={'foo':'Foo'}, baz="Baz") + SubElement(el, 'foo', {'foo':'Foo'}, baz="Baz") self.assertEquals("Baz", el[0].attrib['baz']) self.assertEquals('Foo', el[0].attrib['foo']) From scoder at codespeak.net Tue Sep 4 08:46:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 08:46:30 +0200 (CEST) Subject: [Lxml-checkins] r46296 - lxml/trunk Message-ID: <20070904064630.30E8B8173@code0.codespeak.net> Author: scoder Date: Tue Sep 4 08:46:27 2007 New Revision: 46296 Modified: lxml/trunk/version.txt Log: set version to 2.0alpha2 Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Sep 4 08:46:27 2007 @@ -1 +1 @@ -2.0alpha1 +2.0alpha2 From scoder at codespeak.net Tue Sep 4 08:47:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 08:47:00 +0200 (CEST) Subject: [Lxml-checkins] r46297 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070904064700.AB2EE8173@code0.codespeak.net> Author: scoder Date: Tue Sep 4 08:47:00 2007 New Revision: 46297 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_elementtree.py Log: ET-like feed parser interface Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 4 08:47:00 2007 @@ -8,6 +8,9 @@ Features added -------------- +* ElementTree-like feed parser interface on XMLParser and HTMLParser + (``feed()`` and ``close()`` methods) + Bugs fixed ---------- Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 08:47:00 2007 @@ -4,7 +4,14 @@ cimport htmlparser from xmlparser cimport xmlParserCtxt, xmlDict -class XMLSyntaxError(LxmlSyntaxError): +class ParseError(LxmlSyntaxError): + """Syntax error while parsing an XML document. + + For compatibility with ElementTree 1.3 and later. + """ + pass + +class XMLSyntaxError(ParseError): """Syntax error while parsing an XML document. """ pass @@ -381,6 +388,7 @@ cdef xmlParserCtxt* _parser_ctxt cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock + cdef int _feed_parser_running def __init__(self, int parse_options, remove_comments, remove_pis, context_class=_ResolverContext): @@ -489,6 +497,113 @@ def __get__(self): return "libxml2 %d.%d.%d" % LIBXML_VERSION + # feed parser interface + + def feed(self, data): + """Feeds data to the parser. The argument should be an 8-bit string + buffer containing encoded data, although Unicode is supported as long + as both string types are not mixed. + + This is the main entry point to the consumer interface of a parser. + The parser will parse as much of the XML stream as it can on each + call. 
To finish parsing, call the ``close()`` method. + + It is not possible to use the parser in any other way after calling + the ``feed()`` method. The parser can only be reset by calling + ``close()``. + """ + cdef xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len + cdef char* c_data + cdef char* c_encoding + cdef int buffer_len + cdef int error + cdef int recover + if python.PyString_Check(data): + c_encoding = NULL + c_data = _cstr(data) + py_buffer_len = python.PyString_GET_SIZE(data) + elif python.PyUnicode_Check(data): + if _UNICODE_ENCODING is NULL: + raise ParserError, \ + "Unicode parsing is not supported on this platform" + c_encoding = _UNICODE_ENCODING + c_data = python.PyUnicode_AS_DATA(data) + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) + else: + raise TypeError, "Parsing requires string data" + + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + + pctxt = self._parser_ctxt + error = 0 + if not self._feed_parser_running: + self._lockParser() + self._feed_parser_running = 1 + self._error_log.connect() + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + error = xmlparser.xmlCtxtResetPush( + pctxt, c_data, buffer_len, NULL, c_encoding) + py_buffer_len = py_buffer_len - buffer_len + + while error == 0 and py_buffer_len > 0: + c_data = c_data + buffer_len + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + py_buffer_len = py_buffer_len - buffer_len + error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + + if error: + self._feed_parser_running = 0 + try: + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + _handleParseResult(pctxt, pctxt.myDoc, None, + self._error_log, recover) + finally: + self._cleanup() + self._context.clear() + self._error_log.disconnect() + self._unlockParser() + + def close(self): + """Finishes feeding of data to this parser. 
This tells the parser to + process any remaining data in the feed buffer, and then returns the + root Element of the tree that was parsed. + + This method must be called after passing the last chunk of data into + the ``feed()`` method. It should only be called when using the feed + parser interface is used, all other usage is undefined. + """ + cdef xmlParserCtxt* pctxt + cdef xmlDoc* c_doc + cdef _Document doc + cdef int error + if not self._feed_parser_running: + raise XMLSyntaxError, "no element found" + pctxt = self._parser_ctxt + self._feed_parser_running = 0 + error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + try: + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(pctxt, pctxt.myDoc, None, + self._error_log, recover) + finally: + self._cleanup() + self._context.clear() + self._error_log.disconnect() + self._unlockParser() + + doc = _documentFactory(c_doc, self) + return doc.getroot() + + # internal parser methods + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: """Parse unicode document, share dictionary if possible. 
""" Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 4 08:47:00 2007 @@ -2538,6 +2538,34 @@ # ElementTree 1.3+, cET self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + def test_feed_parser(self): + parser = self.etree.XMLParser() + + parser.feed('<') + parser.feed('a test="works"/') + parser.feed('>') + + root = parser.close() + + self.assertEquals(root.tag, "root") + self.assertEquals(root[0].tag, "a") + self.assertEquals(root[0].get("test"), "works") + + def test_feed_parser_error_close_empty(self): + parser = self.etree.XMLParser() + self.assertRaises(Exception, parser.close) + + def test_feed_parser_error_close_incomplete(self): + parser = self.etree.XMLParser() + + parser.feed(' An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070904/d0e19050/attachment.htm From scoder at codespeak.net Tue Sep 4 09:22:22 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 09:22:22 +0200 (CEST) Subject: [Lxml-checkins] r46298 - in lxml/trunk: doc src/lxml Message-ID: <20070904072222.0E095817E@code0.codespeak.net> Author: scoder Date: Tue Sep 4 09:22:21 2007 New Revision: 46298 Modified: lxml/trunk/doc/parsing.txt lxml/trunk/src/lxml/parser.pxi Log: doc update on the feed parser Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Tue Sep 4 09:22:21 2007 @@ -9,8 +9,17 @@ .. contents:: .. 
1 Parsers - 2 iterparse and iterwalk - 3 Python unicode strings + 1.1 Parser options + 1.2 Parsing HTML + 1.3 Doctype information + 2 The feed parser interface + 3 iterparse and iterwalk + 3.1 Selective tag events + 3.2 Modifying the tree + 3.3 iterwalk + 4 Python unicode strings + 4.1 Serialising to Unicode strings + The usual setup procedure:: @@ -167,6 +176,45 @@ ascii +The feed parser interface +========================= + +Since lxml 2.0, the parsers have a feed parser interface that is compatible to +the `ElementTree parsers`_. You can use it to feed data into the parser in a +controlled step-by-step way. Note that you can only use one interface at a +time: the ``parse()`` or ``XML()`` functions, or the feed parser interface. + +.. _`ElementTree parsers`: http://effbot.org/elementtree/elementtree-xmlparser.htm + +To start parsing with a feed parser, just call its ``feed()`` method:: + + >>> parser = etree.XMLParser() + + >>> for data in (''): + ... parser.feed(data) + +When you are done parsing, you **must** call the ``close()`` method to +retrieve the root Element of the parse result document, and to unlock the +parser:: + + >>> root = parser.close() + + >>> print root.tag + root + >>> print root[0].tag + a + +If you do not call ``close()``, the parser will stay locked and subsequent +usages will block till the end of times. So make sure you also close it in +the exception case. + +Another way of achieving the same step-by-step parsing is by writing your own +file-like object that returns a chunk of data on each ``read()`` call. Where +the feed parser interface allows you to actively pass data chunks into the +parser, a file-like object passively responds to ``read()`` requests of the +parser itself. Depending on the data source, either way may be more natural. 
+ + iterparse and iterwalk ====================== Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 09:22:21 2007 @@ -578,7 +578,7 @@ This method must be called after passing the last chunk of data into the ``feed()`` method. It should only be called when using the feed - parser interface is used, all other usage is undefined. + parser interface, all other usage is undefined. """ cdef xmlParserCtxt* pctxt cdef xmlDoc* c_doc From scoder at codespeak.net Tue Sep 4 09:57:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 09:57:20 +0200 (CEST) Subject: [Lxml-checkins] r46299 - in lxml/trunk: . src/lxml Message-ID: <20070904075720.E19E2812C@code0.codespeak.net> Author: scoder Date: Tue Sep 4 09:57:19 2007 New Revision: 46299 Modified: lxml/trunk/Makefile lxml/trunk/src/lxml/classlookup.pxi lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/sax.py Log: docstring cleanup Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Tue Sep 4 09:57:19 2007 @@ -39,7 +39,7 @@ rm -fr doc/html/api @[ -x "`which epydoc`" ] \ && (cd src && echo "Generating API docs ..." && \ - PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ + PYTHONPATH=. epydoc -v -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") # XXX What should the default be? 
Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Tue Sep 4 09:57:19 2007 @@ -231,7 +231,7 @@ cdef class CustomElementClassLookup(FallbackElementClassLookup): """Element class lookup based on a subclass method. - You can inherit from this class and override the method + You can inherit from this class and override the method:: lookup(self, type, doc, namespace, name) Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 4 09:57:19 2007 @@ -266,12 +266,14 @@ _appendValue(self, _buildChildTag(self, tag), value) def __getitem__(self, key): - """Return a sibling, counting from the first child of the parent. + """Return a sibling, counting from the first child of the parent. The + method behaves like both a dict and a sequence. * If argument is an integer, returns the sibling at that position. - * If argument is a string, does the same as getattr(). This is used - to provide namespaces for element lookup. + * If argument is a string, does the same as getattr(). This can be + used to provide namespaces for element lookup, or to look up + children with special names (``text`` etc.). 
""" cdef tree.xmlNode* c_self_node cdef tree.xmlNode* c_parent Modified: lxml/trunk/src/lxml/sax.py ============================================================================== --- lxml/trunk/src/lxml/sax.py (original) +++ lxml/trunk/src/lxml/sax.py Tue Sep 4 09:57:19 2007 @@ -1,8 +1,9 @@ from xml.sax.handler import ContentHandler -from etree import ElementTree, Element, SubElement, LxmlError -from etree import XML, Comment, ProcessingInstruction +import etree +from etree import ElementTree, SubElement +from etree import Comment, ProcessingInstruction -class SaxError(LxmlError): +class SaxError(etree.LxmlError): """General SAX error. """ pass @@ -24,7 +25,7 @@ self._ns_mapping = { None : [None] } self._new_mappings = {} if makeelement is None: - makeelement = Element + makeelement = etree.Element self._makeelement = makeelement def _get_etree(self): From lxml-checkins at codespeak.net Tue Sep 4 21:30:02 2007 From: lxml-checkins at codespeak.net (Viagra.com Inc ®) Date: Tue, 4 Sep 2007 21:30:02 +0200 (CEST) Subject: [Lxml-checkins] Official Site Message-ID: <11023058544.0412311336185.567948300-5610@cimail939.msn.com> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070904/31e65509/attachment.htm From scoder at codespeak.net Tue Sep 4 21:32:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 21:32:51 +0200 (CEST) Subject: [Lxml-checkins] r46311 - lxml/trunk/src/lxml Message-ID: <20070904193251.C0F248130@code0.codespeak.net> Author: scoder Date: Tue Sep 4 21:32:51 2007 New Revision: 46311 Modified: lxml/trunk/src/lxml/parser.pxi Log: feed parser fix Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 21:32:51 2007 @@ -1,4 +1,4 @@ -# XML parser that provides dictionary sharing +# Parsers for XML and HTML cimport xmlparser cimport htmlparser @@ -533,11 +533,6 @@ else: raise TypeError, "Parsing requires string data" - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - pctxt = self._parser_ctxt error = 0 if not self._feed_parser_running: @@ -546,18 +541,26 @@ self._error_log.connect() __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + error = xmlparser.xmlCtxtResetPush( - pctxt, c_data, buffer_len, NULL, c_encoding) + pctxt, c_data, buffer_len, NULL, c_encoding) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len while error == 0 and py_buffer_len > 0: - c_data = c_data + buffer_len if py_buffer_len > python.INT_MAX: buffer_len = python.INT_MAX else: buffer_len = py_buffer_len py_buffer_len = py_buffer_len - buffer_len error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + c_data = c_data + buffer_len if error: self._feed_parser_running = 0 From scoder at codespeak.net Tue Sep 4 21:33:53 2007 From: scoder at codespeak.net (scoder at 
codespeak.net) Date: Tue, 4 Sep 2007 21:33:53 +0200 (CEST) Subject: [Lxml-checkins] r46312 - lxml/trunk/src/lxml/tests Message-ID: <20070904193353.BE2CC8130@code0.codespeak.net> Author: scoder Date: Tue Sep 4 21:33:53 2007 New Revision: 46312 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test case for broken feed parser input Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 4 21:33:53 2007 @@ -2566,6 +2566,19 @@ self.assertRaises(Exception, parser.close) + def test_feed_parser_error_broken(self): + parser = self.etree.XMLParser() + + parser.feed(' Author: ianb Date: Thu Sep 6 17:40:00 2007 New Revision: 46372 Modified: lxml/trunk/src/lxml/html/clean.py Log: typo in copy Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Thu Sep 6 17:40:00 2007 @@ -27,6 +27,7 @@ # +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- # you don't always have to have the charset set, if the page has no charset # and there's UTF7-like code in it. +# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php # This is an IE-specific construct you can have in a stylesheet to @@ -355,7 +356,7 @@ doc = fromstring(html) else: return_string = False - doc = copy.deepcopy(doc) + doc = copy.deepcopy(html) self(doc) if return_string: return tostring(doc) From lxml-checkins at codespeak.net Fri Sep 7 02:54:50 2007 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Fri, 7 Sep 2007 02:54:50 +0200 (CEST) Subject: [Lxml-checkins] apnalbdq lxml-checkins@codespeak.net Offer Message-ID: <20070907145617.23681.qmail@adsl-pool-222.123.32-32.tttmaxnet.com> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070907/39b7c150/attachment.htm From scoder at codespeak.net Mon Sep 10 14:20:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 14:20:58 +0200 (CEST) Subject: [Lxml-checkins] r46438 - lxml/trunk Message-ID: <20070910122058.A18CF810E@code0.codespeak.net> Author: scoder Date: Mon Sep 10 14:20:57 2007 New Revision: 46438 Modified: lxml/trunk/setup.py lxml/trunk/versioninfo.py Log: drop branch link in pre-releases Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Mon Sep 10 14:20:57 2007 @@ -41,6 +41,19 @@ print "Building lxml version", svn_version +branch_link = """ +After an official release of a new stable series, current bug fixes become +available at http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . +Running ``easy_install lxml==%(branch_version)sbugfix`` will install this +version from +http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix + +""" + +if versioninfo.is_pre_release(): + branch_link = "" + + extra_options.update(setupinfo.extra_setup_args()) setup( @@ -55,7 +68,7 @@ description="Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.", - long_description=(("""\ + long_description=((("""\ lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It provides safe and convenient access to these libraries using the ElementTree API. @@ -71,13 +84,7 @@ Running ``easy_install lxml==dev`` will install it from http://codespeak.net/svn/lxml/trunk#egg=lxml-dev -After an official release of a new stable series, current bug fixes might -become available at -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . 
Running -``easy_install lxml==%(branch_version)sbugfix`` will install this version from -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix - -""" % { "branch_version" : versioninfo.branch_version() }) + +""" + branch_link) % { "branch_version" : versioninfo.branch_version() }) + versioninfo.changes()), classifiers = [ versioninfo.dev_status(), Modified: lxml/trunk/versioninfo.py ============================================================================== --- lxml/trunk/versioninfo.py (original) +++ lxml/trunk/versioninfo.py Mon Sep 10 14:20:57 2007 @@ -11,6 +11,10 @@ def branch_version(): return version()[:3] +def is_pre_release(): + version_string = version() + return "dev" in version_string or "alpha" in version_string or "beta" in version_string + def svn_version(): _version = version() src_dir = get_src_dir() From scoder at codespeak.net Mon Sep 10 14:21:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 14:21:10 +0200 (CEST) Subject: [Lxml-checkins] r46439 - lxml/trunk/doc Message-ID: <20070910122110.D99C48111@code0.codespeak.net> Author: scoder Date: Mon Sep 10 14:21:10 2007 New Revision: 46439 Modified: lxml/trunk/doc/parsing.txt Log: small clarification in docs Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Mon Sep 10 14:21:10 2007 @@ -182,7 +182,8 @@ Since lxml 2.0, the parsers have a feed parser interface that is compatible to the `ElementTree parsers`_. You can use it to feed data into the parser in a controlled step-by-step way. Note that you can only use one interface at a -time: the ``parse()`` or ``XML()`` functions, or the feed parser interface. +time with each parser: the ``parse()`` or ``XML()`` functions, or the feed +parser interface. .. 
_`ElementTree parsers`: http://effbot.org/elementtree/elementtree-xmlparser.htm From scoder at codespeak.net Mon Sep 10 16:23:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 16:23:24 +0200 (CEST) Subject: [Lxml-checkins] r46441 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20070910142324.3E714810F@code0.codespeak.net> Author: scoder Date: Mon Sep 10 16:23:22 2007 New Revision: 46441 Modified: lxml/trunk/selftest.py lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/xmlparser.pxd Log: major restructuring of the parser code to better integrate feed parser and (the new) target parser Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Mon Sep 10 16:23:22 2007 @@ -266,7 +266,8 @@ ## """ ## Test HTML parsing. -## >>> p = HTMLTreeBuilder.TreeBuilder() +## >>> # p = HTMLTreeBuilder.TreeBuilder() +## >>> p = ElementTree.HTMLParser() ## >>> p.feed("

spamegg

") ## >>> serialize(p.close()) ## '

spamegg

' Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Mon Sep 10 16:23:22 2007 @@ -13,6 +13,7 @@ cdef int strcmp(char* s1, char* s2) cdef int strncmp(char* s1, char* s2, size_t len) cdef void* memcpy(void* dest, void* src, size_t len) + cdef void* memset(void* s, int c, size_t len) cdef extern from "stdarg.h": ctypedef void *va_list Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Mon Sep 10 16:23:22 2007 @@ -94,9 +94,12 @@ cdef class _ResolverContext(_ExceptionContext): cdef _ResolverRegistry _resolvers cdef _TempStore _storage - def __init__(self, _ResolverRegistry resolvers not None): + def __init__(self, _ResolverRegistry resolvers): _ExceptionContext.__init__(self) - self._resolvers = resolvers + if resolvers is None: + self._resolvers = _ResolverRegistry() + else: + self._resolvers = resolvers self._storage = _TempStore() cdef void clear(self): Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Mon Sep 10 16:23:22 2007 @@ -88,10 +88,10 @@ cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL: cdef _ExceptionContext exc_context - cdef _FileParserContext dtd_parser + cdef _FileReaderContext dtd_parser cdef tree.xmlDtd* c_dtd exc_context = _ExceptionContext() - dtd_parser = _FileParserContext(file, exc_context) + dtd_parser = _FileReaderContext(file, exc_context) c_dtd = dtd_parser._readDtd() Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Sep 10 16:23:22 2007 @@ 
-2131,19 +2131,20 @@ ################################################################################ # Include submodules -include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.) -include "apihelpers.pxi" # Private helper functions -include "xmlerror.pxi" # Error and log handling -include "classlookup.pxi"# Element class lookup mechanisms -include "nsclasses.pxi" # Namespace implementation and registry -include "docloader.pxi" # Support for custom document loaders -include "parser.pxi" # XML Parser -include "serializer.pxi" # XML output functions -include "iterparse.pxi" # incremental XML parsing -include "xmlid.pxi" # XMLID and IDDict -include "extensions.pxi" # XPath/XSLT extension functions -include "xpath.pxi" # XPath evaluation -include "xslt.pxi" # XSL transformations +include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.) +include "apihelpers.pxi" # Private helper functions +include "xmlerror.pxi" # Error and log handling +include "classlookup.pxi" # Element class lookup mechanisms +include "nsclasses.pxi" # Namespace implementation and registry +include "docloader.pxi" # Support for custom document loaders +include "parser.pxi" # XML Parser +include "parsertarget.pxi" # ET Parser target +include "serializer.pxi" # XML output functions +include "iterparse.pxi" # incremental XML parsing +include "xmlid.pxi" # XMLID and IDDict +include "extensions.pxi" # XPath/XSLT extension functions +include "xpath.pxi" # XPath evaluation +include "xslt.pxi" # XSL transformations ################################################################################ Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Mon Sep 10 16:23:22 2007 @@ -17,7 +17,11 @@ cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size) cdef xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, 
char* encoding) cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) + cdef void htmlCtxtReset(xmlParserCtxt* ctxt) + cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) cdef int htmlParseDocument(xmlParserCtxt* ctxt) + cdef int htmlParseChunk(xmlParserCtxt* ctxt, + char* chunk, int size, int terminate) cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, char* filename, char* encoding, Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Mon Sep 10 16:23:22 2007 @@ -48,7 +48,7 @@ c_ns = c_ns.next return count -cdef class _IterparseContext(_ResolverContext): +cdef class _IterparseContext(_ParserContext): cdef xmlparser.startElementNsSAX2Func _origSaxStart cdef xmlparser.endElementNsSAX2Func _origSaxEnd cdef _Element _root @@ -64,8 +64,8 @@ cdef char* _tag_href cdef char* _tag_name - def __init__(self, _ResolverRegistry resolvers): - _ResolverContext.__init__(self, resolvers) + def __init__(self): + _ParserContext.__init__(self) self._ns_stack = [] self._pop_ns = self._ns_stack.pop self._node_stack = [] @@ -73,22 +73,25 @@ self._events = [] self._event_index = 0 - cdef void _wrapCallbacks(self, xmlparser.xmlSAXHandler* sax): + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): "wrap original SAX2 callbacks" + cdef xmlparser.xmlSAXHandler* sax + _ParserContext._initParserContext(self, c_ctxt) + sax = c_ctxt.sax self._origSaxStart = sax.startElementNs # only override start event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_START | \ ITERPARSE_FILTER_START_NS | \ ITERPARSE_FILTER_END_NS): - sax.startElementNs = _saxStart + sax.startElementNs = _iterparseSaxStart self._origSaxEnd = sax.endElementNs # only override end event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_END | \ ITERPARSE_FILTER_END_NS): - 
sax.endElementNs = _saxEnd + sax.endElementNs = _iterparseSaxEnd cdef _setEventFilter(self, events, tag): self._event_filter = _buildIterparseEventFilter(events) @@ -184,9 +187,10 @@ cdef xmlparser.endElementNsSAX2Func _getOrigEnd(xmlparser.xmlParserCtxt* c_ctxt): return (<_IterparseContext>c_ctxt._private)._origSaxEnd -cdef void _saxStart(void* ctxt, char* localname, char* prefix, char* URI, - int nb_namespaces, char** namespaces, - int nb_attributes, int nb_defaulted, char** attributes): +cdef void _iterparseSaxStart(void* ctxt, char* localname, char* prefix, + char* URI, int nb_namespaces, char** namespaces, + int nb_attributes, int nb_defaulted, + char** attributes): # no Python in here! cdef xmlparser.xmlParserCtxt* c_ctxt cdef xmlparser.startElementNsSAX2Func origStart @@ -196,7 +200,7 @@ nb_attributes, nb_defaulted, attributes) _pushSaxStartEvent(c_ctxt, c_ctxt.node) -cdef void _saxEnd(void* ctxt, char* localname, char* prefix, char* URI): +cdef void _iterparseSaxEnd(void* ctxt, char* localname, char* prefix, char* URI): # no Python in here! 
cdef xmlparser.xmlParserCtxt* c_ctxt cdef xmlparser.endElementNsSAX2Func origEnd @@ -276,15 +280,17 @@ parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, - _IterparseContext) + None) context = <_IterparseContext>self._context context._setEventFilter(events, tag) - context._wrapCallbacks(self._parser_ctxt.sax) xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options) xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL) self._lockParser() # will not be unlocked - no other methods supported + cdef _ParserContext _createContext(self, target): + return _IterparseContext() + def __iter__(self): return self @@ -318,7 +324,8 @@ break if error != 0: self._source = None - _raiseParseError(self._parser_ctxt, self._filename, None) + _raiseParseError(self._parser_ctxt, self._filename, + self._context._error_log) if python.PyList_GET_SIZE(context._events) == 0: self.root = context._root self._source = None Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon Sep 10 16:23:22 2007 @@ -2,7 +2,6 @@ cimport xmlparser cimport htmlparser -from xmlparser cimport xmlParserCtxt, xmlDict class ParseError(LxmlSyntaxError): """Syntax error while parsing an XML document. @@ -26,17 +25,17 @@ LXML_HTML_PARSER LXML_ITERPARSE_PARSER -cdef class _ParserContext: +cdef class _ParserDictionaryContext: # Global parser context to share the string dictionary. # - # This class is a singleton! + # This class is a delegate singleton! # - # It creates _ParserContext objects for each thread to keep thread state, + # It creates _ParserDictionaryContext objects for each thread to keep thread state, # but those must never be used directly. Always stick to using the static # __GLOBAL_PARSER_CONTEXT as defined below the class. 
# - cdef xmlDict* _c_dict + cdef tree.xmlDict* _c_dict cdef _BaseParser _default_parser def __dealloc__(self): if self._c_dict is not NULL: @@ -49,33 +48,33 @@ cdef python.PyObject* result thread_dict = python.PyThreadState_GetDict() if thread_dict is not NULL: - python.PyDict_SetItem(thread_dict, "_ParserContext", self) + python.PyDict_SetItem(thread_dict, "_ParserDictionaryContext", self) - cdef _ParserContext _findThreadParserContext(self): - "Find (or create) the _ParserContext object for the current thread" + cdef _ParserDictionaryContext _findThreadParserContext(self): + "Find (or create) the _ParserDictionaryContext object for the current thread" cdef python.PyObject* thread_dict cdef python.PyObject* result - cdef _ParserContext context + cdef _ParserDictionaryContext context thread_dict = python.PyThreadState_GetDict() if thread_dict is NULL: return self d = thread_dict - result = python.PyDict_GetItem(d, "_ParserContext") + result = python.PyDict_GetItem(d, "_ParserDictionaryContext") if result is not NULL: return result - context = _ParserContext() - python.PyDict_SetItem(d, "_ParserContext", context) + context = _ParserDictionaryContext() + python.PyDict_SetItem(d, "_ParserDictionaryContext", context) return context cdef void setDefaultParser(self, _BaseParser parser): "Set the default parser for the current thread" - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() context._default_parser = parser cdef _BaseParser getDefaultParser(self): "Return (or create) the default parser of the current thread" - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() if context._default_parser is None: if self._default_parser is None: @@ -84,9 +83,9 @@ context._default_parser = self._default_parser._copy() return context._default_parser - cdef xmlDict* _getThreadDict(self, xmlDict* default): + cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* 
default): "Return the thread-local dict or create a new one if necessary." - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() if context._c_dict is NULL: # thread dict not yet set up => use default or create a new one @@ -100,9 +99,9 @@ context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict) return context._c_dict - cdef void initThreadDictRef(self, xmlDict** c_dict_ref): - cdef xmlDict* c_dict - cdef xmlDict* c_thread_dict + cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref): + cdef tree.xmlDict* c_dict + cdef tree.xmlDict* c_thread_dict c_dict = c_dict_ref[0] c_thread_dict = self._getThreadDict(c_dict) if c_dict is c_thread_dict: @@ -112,7 +111,7 @@ c_dict_ref[0] = c_thread_dict xmlparser.xmlDictReference(c_thread_dict) - cdef void initParserDict(self, xmlParserCtxt* pctxt): + cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt): "Assure we always use the same string dictionary." self.initThreadDictRef(&pctxt.dict) @@ -127,11 +126,11 @@ # otherwise we'd free data that's in use => segfault self.initThreadDictRef(&result.dict) -cdef _ParserContext __GLOBAL_PARSER_CONTEXT -__GLOBAL_PARSER_CONTEXT = _ParserContext() +cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT +__GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext() __GLOBAL_PARSER_CONTEXT.initMainParserContext() -cdef int _checkThreadDict(xmlDict* c_dict): +cdef int _checkThreadDict(tree.xmlDict* c_dict): """Check that c_dict is either the local thread dictionary or the global parent dictionary. 
""" @@ -205,7 +204,7 @@ ## support for file-like objects ############################################################ -cdef class _FileParserContext: +cdef class _FileReaderContext: cdef object _filelike cdef object _url cdef object _bytes @@ -223,14 +222,15 @@ self._bytes = '' self._bytes_read = 0 - cdef xmlparser.xmlParserInput* _createParserInput(self, xmlParserCtxt* ctxt): + cdef xmlparser.xmlParserInput* _createParserInput( + self, xmlparser.xmlParserCtxt* ctxt): cdef xmlparser.xmlParserInputBuffer* c_buffer c_buffer = xmlparser.xmlAllocParserInputBuffer(0) c_buffer.context = self c_buffer.readcallback = _readFilelikeParser return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) - cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options, + cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options, LxmlParserType parser_type): cdef python.PyThreadState* state cdef xmlDoc* result @@ -291,19 +291,19 @@ return -1 cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size): - return (<_FileParserContext>ctxt).copyToBuffer(c_buffer, c_size) + return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size) ############################################################ ## support for custom document loaders ############################################################ cdef xmlparser.xmlParserInput* _parser_resolve_from_python( - char* c_url, char* c_pubid, xmlParserCtxt* c_context, int* error): + char* c_url, char* c_pubid, xmlparser.xmlParserCtxt* c_context, int* error): # call the Python document loaders cdef xmlparser.xmlParserInput* c_input cdef _ResolverContext context cdef _InputDocument doc_ref - cdef _FileParserContext file_context + cdef _FileReaderContext file_context error[0] = 0 context = <_ResolverContext>c_context._private try: @@ -338,7 +338,7 @@ c_input = xmlparser.xmlNewInputFromFile( c_context, _cstr(doc_ref._data_bytes)) elif doc_ref._type == PARSER_DATA_FILE: - file_context = _FileParserContext(doc_ref._file, context, url) + 
file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) data = file_context @@ -347,7 +347,7 @@ return c_input cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid, - xmlParserCtxt* c_context): + xmlparser.xmlParserCtxt* c_context): # no Python objects here, may be called without thread context ! # when we declare a Python object, Pyrex will INCREF(None) ! cdef xmlparser.xmlParserInput* c_input @@ -379,42 +379,145 @@ ## Parsers ############################################################ +cdef class _ParserContext(_ResolverContext): + cdef _ErrorLog _error_log + cdef xmlparser.xmlParserCtxt* _c_ctxt + def __init__(self): + _ResolverContext.__init__(self, _ResolverRegistry()) + self._error_log = _ErrorLog() + + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): + self._c_ctxt = c_ctxt + + cdef object _handleParseResult(self, _BaseParser parser, + xmlDoc* result, filename): + cdef xmlDoc* c_doc + cdef int recover + recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(self, self._c_ctxt, result, + filename, recover) + return _documentFactory(c_doc, parser) + + cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser, + xmlDoc* result, filename) except NULL: + cdef int recover + recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(self, self._c_ctxt, result, + filename, recover) + +cdef class _InternalParserContext(_ParserContext): + """Parser context for internal single-shot parsing + """ + +cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename, + _ErrorLog error_log) except 0: + if filename is not None and \ + ctxt.lastError.domain == xmlerror.XML_FROM_IO: + if ctxt.lastError.message is not NULL: + message = "Error reading file '%s': %s" % ( + filename, (ctxt.lastError.message).strip()) + else: + message = "Error reading file '%s'" % filename + raise IOError, message + elif 
error_log: + raise XMLSyntaxError, error_log._buildExceptionMessage( + "Document is not well formed") + elif ctxt.lastError.message is not NULL: + message = (ctxt.lastError.message).strip() + if ctxt.lastError.line > 0: + message = "line %d: %s" % (ctxt.lastError.line, message) + raise XMLSyntaxError, message + else: + raise XMLSyntaxError + +cdef xmlDoc* _handleParseResult(_ParserContext context, + xmlparser.xmlParserCtxt* c_ctxt, + xmlDoc* result, filename, + int recover) except NULL: + cdef int well_formed + if c_ctxt.myDoc is not NULL: + if c_ctxt.myDoc != result: + tree.xmlFreeDoc(c_ctxt.myDoc) + c_ctxt.myDoc = NULL + + if result is not NULL: + if recover or (c_ctxt.wellFormed and \ + c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): + well_formed = 1 + elif not c_ctxt.replaceEntities and not c_ctxt.validate \ + and context is not None: + # in this mode, we ignore errors about undefined entities + for error in context._error_log.filter_from_errors(): + if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ + error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: + well_formed = 0 + break + else: + well_formed = 1 + else: + well_formed = 0 + + if well_formed: + __GLOBAL_PARSER_CONTEXT.initDocDict(result) + else: + # free broken document + tree.xmlFreeDoc(result) + result = NULL + + if context is not None and context._has_raised(): + if result is not NULL: + tree.xmlFreeDoc(result) + result = NULL + context._raise_if_stored() + + if result is NULL: + if context is not None: + _raiseParseError(c_ctxt, filename, context._error_log) + else: + _raiseParseError(c_ctxt, filename, None) + elif result.URL is NULL and filename is not None: + result.URL = tree.xmlStrdup(_cstr(filename)) + return result + + cdef class _BaseParser: cdef int _parse_options - cdef _ErrorLog _error_log - cdef readonly _ResolverRegistry resolvers - cdef _ResolverContext _context + cdef _ParserContext _context cdef LxmlParserType _parser_type - cdef xmlParserCtxt* _parser_ctxt + cdef 
xmlparser.xmlParserCtxt* _parser_ctxt cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock cdef int _feed_parser_running def __init__(self, int parse_options, remove_comments, remove_pis, - context_class=_ResolverContext): - cdef xmlParserCtxt* pctxt + target): + cdef xmlparser.xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER - pctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) elif isinstance(self, XMLParser): self._parser_type = LXML_XML_PARSER - pctxt = xmlparser.xmlNewParserCtxt() elif isinstance(self, iterparse): self._parser_type = LXML_ITERPARSE_PARSER - pctxt = xmlparser.xmlNewParserCtxt() else: raise TypeError, "This class cannot be instantiated" + self._parse_options = parse_options + + pctxt = self._newParserCtxt() self._parser_ctxt = pctxt if pctxt is NULL: python.PyErr_NoMemory() - if pctxt.sax != NULL: - if remove_comments: - pctxt.sax.comment = NULL - if remove_pis: - pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + + self._context = self._createContext(target) + self._context._initParserContext(pctxt) + pctxt._private = self._context + + if remove_comments: + pctxt.sax.comment = NULL + if remove_pis: + pctxt.sax.processingInstruction = NULL + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL if not config.ENABLE_THREADING or \ self._parser_type == LXML_ITERPARSE_PARSER: @@ -422,10 +525,18 @@ self._parser_lock = NULL else: self._parser_lock = python.PyThread_allocate_lock() - self._error_log = _ErrorLog() - self.resolvers = _ResolverRegistry() - self._context = context_class(self.resolvers) - pctxt._private = self._context + + cdef _ParserContext _createContext(self, target): + if target is not None: + return _TargetParserContext(target) + else: + return _InternalParserContext() + + cdef xmlparser.xmlParserCtxt* _newParserCtxt(self): + if 
self._parser_type == LXML_HTML_PARSER: + return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) + else: + return xmlparser.xmlNewParserCtxt() def __dealloc__(self): if self._parser_ctxt is not NULL: @@ -434,7 +545,7 @@ python.PyThread_free_lock(self._parser_lock) cdef void _cleanup(self): - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt pctxt = self._parser_ctxt if pctxt is not NULL: if pctxt.spaceTab is not NULL: # work around bug in libxml2 @@ -458,7 +569,11 @@ property error_log: def __get__(self): - return self._error_log.copy() + return self._context._error_log.copy() + + property resolvers: + def __get__(self): + return self._context._resolvers def setElementClassLookup(self, ElementClassLookup lookup = None): "Deprecated, use ``parser.set_element_class_lookup(lookup)`` instead." @@ -497,114 +612,6 @@ def __get__(self): return "libxml2 %d.%d.%d" % LIBXML_VERSION - # feed parser interface - - def feed(self, data): - """Feeds data to the parser. The argument should be an 8-bit string - buffer containing encoded data, although Unicode is supported as long - as both string types are not mixed. - - This is the main entry point to the consumer interface of a parser. - The parser will parse as much of the XML stream as it can on each - call. To finish parsing, call the ``close()`` method. - - It is not possible to use the parser in any other way after calling - the ``feed()`` method. The parser can only be reset by calling - ``close()``. 
- """ - cdef xmlParserCtxt* pctxt - cdef Py_ssize_t py_buffer_len - cdef char* c_data - cdef char* c_encoding - cdef int buffer_len - cdef int error - cdef int recover - if python.PyString_Check(data): - c_encoding = NULL - c_data = _cstr(data) - py_buffer_len = python.PyString_GET_SIZE(data) - elif python.PyUnicode_Check(data): - if _UNICODE_ENCODING is NULL: - raise ParserError, \ - "Unicode parsing is not supported on this platform" - c_encoding = _UNICODE_ENCODING - c_data = python.PyUnicode_AS_DATA(data) - py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) - else: - raise TypeError, "Parsing requires string data" - - pctxt = self._parser_ctxt - error = 0 - if not self._feed_parser_running: - self._lockParser() - self._feed_parser_running = 1 - self._error_log.connect() - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) - - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - - error = xmlparser.xmlCtxtResetPush( - pctxt, c_data, buffer_len, NULL, c_encoding) - - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + buffer_len - - while error == 0 and py_buffer_len > 0: - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - py_buffer_len = py_buffer_len - buffer_len - error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) - c_data = c_data + buffer_len - - if error: - self._feed_parser_running = 0 - try: - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - _handleParseResult(pctxt, pctxt.myDoc, None, - self._error_log, recover) - finally: - self._cleanup() - self._context.clear() - self._error_log.disconnect() - self._unlockParser() - - def close(self): - """Finishes feeding of data to this parser. This tells the parser to - process any remaining data in the feed buffer, and then returns the - root Element of the tree that was parsed. 
- - This method must be called after passing the last chunk of data into - the ``feed()`` method. It should only be called when using the feed - parser interface, all other usage is undefined. - """ - cdef xmlParserCtxt* pctxt - cdef xmlDoc* c_doc - cdef _Document doc - cdef int error - if not self._feed_parser_running: - raise XMLSyntaxError, "no element found" - pctxt = self._parser_ctxt - self._feed_parser_running = 0 - error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) - try: - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - c_doc = _handleParseResult(pctxt, pctxt.myDoc, None, - self._error_log, recover) - finally: - self._cleanup() - self._context.clear() - self._error_log.disconnect() - self._unlockParser() - - doc = _documentFactory(c_doc, self) - return doc.getroot() - # internal parser methods cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: @@ -612,7 +619,7 @@ """ cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover cdef Py_ssize_t py_buffer_len cdef int buffer_len @@ -625,7 +632,7 @@ buffer_len = py_buffer_len self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -642,13 +649,11 @@ self._parse_options) python.PyEval_RestoreThread(state) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, None, - self._error_log, recover) + return self._context._handleParseResultDoc(self, result, None) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len, @@ -657,12 +662,12 @@ """ cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover if c_len > python.INT_MAX: raise 
ParserError, "string is too long to parse it with libxml2" self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -676,24 +681,22 @@ pctxt, c_text, c_len, c_filename, NULL, self._parse_options) python.PyEval_RestoreThread(state) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, None, - self._error_log, recover) + return self._context._handleParseResultDoc(self, result, None) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover cdef int orig_options result = NULL self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -709,108 +712,182 @@ python.PyEval_RestoreThread(state) pctxt.options = orig_options # work around libxml2 problem - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, c_filename, - self._error_log, recover) + return self._context._handleParseResultDoc( + self, result, c_filename) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL: - cdef _FileParserContext file_context + cdef _FileReaderContext file_context cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef char* c_filename cdef int recover if not filename: filename = None self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt 
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - file_context = _FileParserContext(filelike, self._context, filename) + file_context = _FileReaderContext(filelike, self._context, filename) result = file_context._readDoc( pctxt, self._parse_options, self._parser_type) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, filename, - self._error_log, recover) + return self._context._handleParseResultDoc( + self, result, filename) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() -cdef int _raiseParseError(xmlParserCtxt* ctxt, filename, - _ErrorLog error_log) except 0: - if filename is not None and \ - ctxt.lastError.domain == xmlerror.XML_FROM_IO: - if ctxt.lastError.message is not NULL: - message = "Error reading file '%s': %s" % ( - filename, (ctxt.lastError.message).strip()) +############################################################ +## ET feed parser +############################################################ + +cdef class _FeedParser(_BaseParser): + def feed(self, data): + """Feeds data to the parser. The argument should be an 8-bit string + buffer containing encoded data, although Unicode is supported as long + as both string types are not mixed. + + This is the main entry point to the consumer interface of a parser. + The parser will parse as much of the XML stream as it can on each + call. To finish parsing, call the ``close()`` method. + + It is not possible to use the parser in any other way after calling + the ``feed()`` method. The parser can only be reset by calling + ``close()``. 
+ """ + cdef xmlparser.xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len + cdef char* c_data + cdef char* c_encoding + cdef int buffer_len + cdef int error + cdef int recover + if python.PyString_Check(data): + c_encoding = NULL + c_data = _cstr(data) + py_buffer_len = python.PyString_GET_SIZE(data) + elif python.PyUnicode_Check(data): + if _UNICODE_ENCODING is NULL: + raise ParserError, \ + "Unicode parsing is not supported on this platform" + c_encoding = _UNICODE_ENCODING + c_data = python.PyUnicode_AS_DATA(data) + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) else: - message = "Error reading file '%s'" % filename - raise IOError, message - elif error_log is not None: - raise XMLSyntaxError, error_log._buildExceptionMessage( - "Document is not well formed") - elif ctxt.lastError.message is not NULL: - message = (ctxt.lastError.message).strip() - if ctxt.lastError.line > 0: - message = "line %d: %s" % (ctxt.lastError.line, message) - raise XMLSyntaxError, message - else: - raise XMLSyntaxError + raise TypeError, "Parsing requires string data" -cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result, - filename, _ErrorLog error_log, - int recover) except NULL: - cdef _ResolverContext context - cdef int well_formed - if ctxt.myDoc is not NULL: - if ctxt.myDoc != result: - tree.xmlFreeDoc(ctxt.myDoc) - ctxt.myDoc = NULL + pctxt = self._parser_ctxt + error = 0 + if not self._feed_parser_running: + self._lockParser() + self._feed_parser_running = 1 + self._context._error_log.connect() + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - if result is not NULL: - if recover or (ctxt.wellFormed and \ - ctxt.lastError.level < xmlerror.XML_ERR_ERROR): - well_formed = 1 - elif not ctxt.replaceEntities and not ctxt.validate: - # in this mode, we ignore errors about undefined entities - for error in error_log.filter_from_errors(): - if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ - error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: - well_formed = 0 - 
break + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX else: - well_formed = 1 + buffer_len = py_buffer_len + if self._parser_type == LXML_HTML_PARSER: + error = _htmlCtxtResetPush(pctxt, c_data, buffer_len, + c_encoding, self._parse_options) + else: + error = xmlparser.xmlCtxtResetPush( + pctxt, c_data, buffer_len, NULL, c_encoding) + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len + + while error == 0 and py_buffer_len > 0: + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + if self._parser_type == LXML_HTML_PARSER: + error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) + else: + error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len + + if error: + self._feed_parser_running = 0 + try: + self._context._handleParseResult( + self, pctxt.myDoc, None) + finally: + self._cleanup() + self._context.clear() + self._context._error_log.disconnect() + self._unlockParser() + + def close(self): + """Terminates feeding data to this parser. This tells the parser to + process any remaining data in the feed buffer, and then returns the + root Element of the tree that was parsed. + + This method must be called after passing the last chunk of data into + the ``feed()`` method. It should only be called when using the feed + parser interface, all other usage is undefined. 
+ """ + cdef xmlparser.xmlParserCtxt* pctxt + cdef xmlDoc* c_doc + cdef _Document doc + cdef int is_target_parser, error + if not self._feed_parser_running: + raise XMLSyntaxError, "no element found" + pctxt = self._parser_ctxt + self._feed_parser_running = 0 + if self._parser_type == LXML_HTML_PARSER: + error = htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) else: - well_formed = 0 + error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + is_target_parser = isinstance(self._context, _TargetParserContext) + try: + result = self._context._handleParseResult( + self, pctxt.myDoc, None) + finally: + self._cleanup() + self._context.clear() + self._context._error_log.disconnect() + self._unlockParser() - if well_formed: - __GLOBAL_PARSER_CONTEXT.initDocDict(result) + if isinstance(result, _Document): + return (<_Document>result).getroot() else: - # free broken document - tree.xmlFreeDoc(result) - result = NULL + return result - if ctxt._private is not NULL: - context = <_ResolverContext>ctxt._private - if context._has_raised(): - if result is not NULL: - tree.xmlFreeDoc(result) - result = NULL - context._raise_if_stored() +cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt, + char* c_data, int buffer_len, + char* c_encoding, int parse_options) except -1: + cdef xmlparser.xmlParserInput* c_input_stream + # libxml2 crashes if spaceTab is not initialised + if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL: + c_ctxt.spaceTab = tree.xmlMalloc(10 * sizeof(int)) + c_ctxt.spaceMax = 10 - if result is NULL: - _raiseParseError(ctxt, filename, error_log) - elif result.URL is NULL and filename is not None: - result.URL = tree.xmlStrdup(_cstr(filename)) - return result + # libxml2 lacks an HTML push parser setup function + error = xmlparser.xmlCtxtResetPush(c_ctxt, NULL, 0, NULL, c_encoding) + if error: + return error + + # fix libxml2 setup for HTML + c_ctxt.progressive = 1 + c_ctxt.html = 1 + htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options) + + if c_data is not NULL 
and buffer_len > 0: + return htmlparser.htmlParseChunk(c_ctxt, c_data, buffer_len, 0) + return 0 + ############################################################ ## XML parser @@ -824,7 +901,7 @@ xmlparser.XML_PARSE_COMPACT ) -cdef class XMLParser(_BaseParser): +cdef class XMLParser(_FeedParser): """The XML parser. Parsers can be supplied as additional argument to various parse functions of the lxml API. A default parser is always available and can be replaced by a call to the global function @@ -848,6 +925,8 @@ * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) + You can pass a parser target as ``target`` keyword argument. + Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. This does not apply to the default parser. @@ -856,7 +935,7 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False): + remove_pis=False, target=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -880,7 +959,8 @@ if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT - _BaseParser.__init__(self, parse_options, remove_comments, remove_pis) + _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, + target) cdef class ETCompatXMLParser(XMLParser): """An XML parser with an ElementTree compatible default setup. 
See the @@ -893,18 +973,18 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True): + remove_pis=True, target=None): XMLParser.__init__(self, attribute_defaults, dtd_validation, load_dtd, no_network, ns_clean, recover, remove_blank_text, compact, resolve_entities, remove_comments, - remove_pis) + remove_pis, target) cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for XSLT - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef xmlDoc* c_doc cdef int recover pctxt = xmlparser.xmlNewParserCtxt() @@ -916,7 +996,7 @@ pctxt, c_text, NULL, NULL, options) try: recover = options & xmlparser.XML_PARSE_RECOVER - c_doc = _handleParseResult(pctxt, c_doc, None, None, recover) + c_doc = _handleParseResult(None, pctxt, c_doc, None, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -924,7 +1004,7 @@ cdef xmlDoc* _internalParseDocFromFile(char* c_filename, int options, _ResolverContext context) except NULL: # internal parser function for XSLT - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef xmlDoc* c_doc cdef int recover pctxt = xmlparser.xmlNewParserCtxt() @@ -940,7 +1020,7 @@ filename = None else: filename = c_filename - c_doc = _handleParseResult(pctxt, c_doc, filename, None, recover) + c_doc = _handleParseResult(None, pctxt, c_doc, filename, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -987,7 +1067,7 @@ htmlparser.HTML_PARSE_COMPACT ) -cdef class HTMLParser(_BaseParser): +cdef class HTMLParser(_FeedParser): """The HTML parser. This parser allows reading HTML into a normal XML tree. By default, it can read broken (non well-formed) HTML, depending on the capabilities of libxml2. Use the 'recover' option to switch this off. 
@@ -1000,11 +1080,14 @@ * remove_pis - discard processing instructions * compact - safe memory for short text content (default: True) + You can pass a parser target as ``target`` keyword argument. + Note that you should avoid sharing parsers between threads for performance reasons. """ def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True, remove_comments=False, remove_pis=False): + compact=True, remove_comments=False, remove_pis=False, + target=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: @@ -1016,7 +1099,8 @@ if not compact: parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT - _BaseParser.__init__(self, parse_options, remove_comments, remove_pis) + _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, + target) cdef HTMLParser __DEFAULT_HTML_PARSER __DEFAULT_HTML_PARSER = HTMLParser() Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 10 16:23:22 2007 @@ -2538,6 +2538,8 @@ # ElementTree 1.3+, cET self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + # feed parser interface + def test_feed_parser(self): parser = self.etree.XMLParser() @@ -2579,6 +2581,81 @@ self.assertRaises(Exception, parser.close) + # parser target interface + + def test_parser_target_tag(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start") + assertFalse(attrib) + assertEquals("TAG", tag) + def end(self, tag): + events.append("end") + assertEquals("TAG", tag) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed("<TAG/>") + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start", "end"], 
events) + + def test_parser_target_attrib(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + for name, value in attrib.iteritems(): + assertEquals(tag + name, value) + def end(self, tag): + events.append("end-" + tag) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed('<root a="roota" b="rootb"><sub c="subc"/></root>') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "start-sub", "end-sub", "end-root"], + events) + + def test_parser_target_data(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + def end(self, tag): + events.append("end-" + tag) + def data(self, data): + events.append("data-" + data) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed('<root>A<sub/>B</root>') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "data-A", "start-sub", + "end-sub", "data-B", "end-root"], + events) + # helper methods def _writeElement(self, element, encoding='us-ascii'): Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Mon Sep 10 16:23:22 2007 @@ -19,17 +19,25 @@ char* prefix, char* URI) - ctypedef void (*cdataBlockSAXFunc)(void* ctx, - char* value, - int len) + ctypedef void (*charactersSAXFunc)(void* ctx, char* ch, int len) - ctypedef void (*commentSAXFunc)(void* ctx, - char* value) + ctypedef void (*cdataBlockSAXFunc)(void* ctx, char* value, int len) - ctypedef void (*processingInstructionSAXFunc)(void * ctx, + ctypedef void (*commentSAXFunc)(void* ctx, char* value) + + ctypedef void (*processingInstructionSAXFunc)(void* ctx, char* 
target, char* data) + ctypedef void (*internalSubsetSAXFunc)(void* ctx, + char* name, + char* externalID, + char* systemID) + + ctypedef void (*endDocumentSAXFunc)(void* ctx) + + cdef int XML_SAX2_MAGIC + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -38,11 +46,15 @@ xmlInputCloseCallback closecallback ctypedef struct xmlSAXHandler: + internalSubsetSAXFunc internalSubset startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs + charactersSAXFunc characters cdataBlockSAXFunc cdataBlock commentSAXFunc comment processingInstructionSAXFunc processingInstruction + endDocumentSAXFunc endDocument + int initialized cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) @@ -54,6 +66,8 @@ cdef void xmlDictFree(xmlDict* sub) cdef int xmlDictReference(xmlDict* dict) + cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes + ctypedef struct xmlParserCtxt: xmlDoc* myDoc xmlDict* dict @@ -64,11 +78,16 @@ int disableSAX int errNo int replaceEntities + int loadsubset int validate xmlError lastError xmlNode* node xmlSAXHandler* sax int* spaceTab + int spaceMax + int html + int progressive + int charset ctypedef enum xmlParserOption: XML_PARSE_RECOVER = 1 # recover on errors From scoder at codespeak.net Tue Sep 11 12:03:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 11 Sep 2007 12:03:55 +0200 (CEST) Subject: [Lxml-checkins] r46463 - lxml/trunk/src/lxml Message-ID: <20070911100355.A56EF80AF@code0.codespeak.net> Author: scoder Date: Tue Sep 11 12:03:54 2007 New Revision: 46463 Added: lxml/trunk/src/lxml/objectpath.pxi - copied, changed from r46423, lxml/trunk/src/lxml/objectify.pyx Modified: lxml/trunk/src/lxml/objectify.pyx Log: moved objectpath implementation to separate .pxi Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- 
lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 11 12:03:54 2007 @@ -15,9 +15,6 @@ __version__ = etree.__version__ -cdef object SubElement -SubElement = etree.SubElement - cdef object re import re cdef object __builtin__ @@ -494,7 +491,8 @@ for item in value: _appendValue(parent, tag, item) else: - new_element = SubElement(parent, tag) + new_element = cetree.makeSubElement( + parent, tag, None, None, None, None) _setElementValue(new_element, value) cdef _setElementValue(_Element element, value): @@ -1294,341 +1292,6 @@ ################################################################################ -# ObjectPath - -ctypedef struct _ObjectPath: - char* href - char* name - Py_ssize_t index - -cdef class ObjectPath: - """Immutable object that represents a compiled object path. - - Example for a path: 'root.child[1].{other}child[25]' - """ - cdef readonly object find - cdef object _path - cdef object _path_str - cdef _ObjectPath* _c_path - cdef Py_ssize_t _path_len - def __init__(self, path): - if python._isString(path): - self._path = _parseObjectPathString(path) - self._path_str = path - else: - self._path = _parseObjectPathList(path) - self._path_str = '.'.join(path) - self._path_len = python.PyList_GET_SIZE(self._path) - self._c_path = _buildObjectPathSegments(self._path) - self.find = self.__call__ - - def __dealloc__(self): - if self._c_path is not NULL: - python.PyMem_Free(self._c_path) - - def __str__(self): - return self._path_str - - def __call__(self, _Element root not None, *default): - """Follow the attribute path in the object structure and return the - target attribute value. - - If it it not found, either returns a default value (if one was passed - as second argument) or raises AttributeError. 
- """ - cdef Py_ssize_t use_default - use_default = python.PyTuple_GET_SIZE(default) - if use_default == 1: - default = python.PyTuple_GET_ITEM(default, 0) - python.Py_INCREF(default) - use_default = 1 - elif use_default > 1: - raise TypeError, "invalid number of arguments: needs one or two" - return _findObjectPath(root, self._c_path, self._path_len, - default, use_default) - - def hasattr(self, _Element root not None): - try: - _findObjectPath(root, self._c_path, self._path_len, None, 0) - except AttributeError: - return False - return True - - def setattr(self, _Element root not None, value): - """Set the value of the target element in a subtree. - - If any of the children on the path does not exist, it is created. - """ - _createObjectPath(root, self._c_path, self._path_len, 1, value) - - def addattr(self, _Element root not None, value): - """Append a value to the target element in a subtree. - - If any of the children on the path does not exist, it is created. - """ - _createObjectPath(root, self._c_path, self._path_len, 0, value) - -cdef object __MATCH_PATH_SEGMENT -__MATCH_PATH_SEGMENT = re.compile( - r"(\.?)\s*(?:\{([^}]*)\})?\s*([^.{}\[\]\s]+)\s*(?:\[\s*([-0-9]+)\s*\])?", - re.U).match - -cdef object _RELATIVE_PATH_SEGMENT -_RELATIVE_PATH_SEGMENT = (None, None, 0) - -cdef _parseObjectPathString(path): - """Parse object path string into a 'hrefOnameOhrefOnameOOO' string and an - index list. The index list is None if no index was used in the path. - """ - cdef int has_dot - new_path = [] - path = cetree.utf8(path.strip()) - if path == '.': - return [_RELATIVE_PATH_SEGMENT] - path_pos = 0 - while python.PyString_GET_SIZE(path) > 0: - match = __MATCH_PATH_SEGMENT(path, path_pos) - if match is None: - break - - dot, ns, name, index = match.groups() - if index is None or python.PyString_GET_SIZE(index) == 0: - index = 0 - else: - index = python.PyNumber_Int(index) - has_dot = _cstr(dot)[0] == c'.' 
- if python.PyList_GET_SIZE(new_path) == 0: - if has_dot: - # path '.child' => ignore root - python.PyList_Append(new_path, _RELATIVE_PATH_SEGMENT) - elif index != 0: - raise ValueError, "index not allowed on root node" - elif not has_dot: - raise ValueError, "invalid path" - python.PyList_Append(new_path, (ns, name, index)) - - path_pos = match.end() - if python.PyList_GET_SIZE(new_path) == 0 or \ - python.PyString_GET_SIZE(path) > path_pos: - raise ValueError, "invalid path" - return new_path - -cdef _parseObjectPathList(path): - """Parse object path sequence into a 'hrefOnameOhrefOnameOOO' string and - an index list. The index list is None if no index was used in the path. - """ - cdef char* index_pos - cdef char* index_end - cdef char* c_name - new_path = [] - for item in path: - item = item.strip() - if python.PyList_GET_SIZE(new_path) == 0 and item == '': - # path '.child' => ignore root - ns = name = None - index = 0 - else: - ns, name = cetree.getNsTag(item) - c_name = _cstr(name) - index_pos = cstd.strchr(c_name, c'[') - if index_pos is NULL: - index = 0 - else: - name = python.PyString_FromStringAndSize( - c_name, (index_pos - c_name)) - index_pos = index_pos + 1 - index_end = cstd.strchr(index_pos, c']') - if index_end is NULL: - raise ValueError, "index must be enclosed in []" - index = python.PyNumber_Int( - python.PyString_FromStringAndSize( - index_pos, (index_end - index_pos))) - if python.PyList_GET_SIZE(new_path) == 0 and index != 0: - raise ValueError, "index not allowed on root node" - python.PyList_Append(new_path, (ns, name, index)) - if python.PyList_GET_SIZE(new_path) == 0: - raise ValueError, "invalid path" - return new_path - -cdef _ObjectPath* _buildObjectPathSegments(path_list) except NULL: - cdef _ObjectPath* c_path - cdef _ObjectPath* c_path_segments - cdef Py_ssize_t c_len - c_len = python.PyList_GET_SIZE(path_list) - c_path_segments = <_ObjectPath*>python.PyMem_Malloc(sizeof(_ObjectPath) * - c_len) - if c_path_segments is NULL: - 
python.PyErr_NoMemory() - return NULL - c_path = c_path_segments - for href, name, index in path_list: - if href is None: - c_path[0].href = NULL - else: - c_path[0].href = _cstr(href) - if name is None: - c_path[0].name = NULL - else: - c_path[0].name = _cstr(name) - c_path[0].index = index - c_path = c_path + 1 - return c_path_segments - -cdef _findObjectPath(_Element root, _ObjectPath* c_path, Py_ssize_t c_path_len, - default_value, int use_default): - """Follow the path to find the target element. - """ - cdef tree.xmlNode* c_node - cdef char* c_href - cdef char* c_name - cdef Py_ssize_t c_index - c_node = root._c_node - c_name = c_path[0].name - c_href = c_path[0].href - if c_href is NULL or c_href[0] == c'\0': - c_href = tree._getNs(c_node) - if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError, "root element does not match: need %s, got %s" % \ - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) - - while c_node is not NULL: - c_path_len = c_path_len - 1 - if c_path_len <= 0: - return cetree.elementFactory(root._doc, c_node) - - c_path = c_path + 1 - if c_path[0].href is not NULL: - c_href = c_path[0].href # otherwise: keep parent namespace - c_name = c_path[0].name - c_index = c_path[0].index - - if c_index < 0: - c_node = c_node.last - else: - c_node = c_node.children - c_node = _findFollowingSibling(c_node, c_href, c_name, c_index) - - if use_default: - return default_value - else: - tag = cetree.namespacedNameFromNsName(c_href, c_name) - raise AttributeError, "no such child: " + tag - -cdef _createObjectPath(_Element root, _ObjectPath* c_path, - Py_ssize_t c_path_len, int replace, value): - """Follow the path to find the target element, build the missing children - as needed and set the target element to 'value'. If replace is true, an - existing value is replaced, otherwise the new value is added. 
- """ - cdef _Element child - cdef tree.xmlNode* c_node - cdef tree.xmlNode* c_child - cdef char* c_href - cdef char* c_name - cdef Py_ssize_t c_index - if c_path_len == 1: - raise TypeError, "cannot update root node" - - c_node = root._c_node - c_name = c_path[0].name - c_href = c_path[0].href - if c_href is NULL or c_href[0] == c'\0': - c_href = tree._getNs(c_node) - if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError, "root element does not match: need %s, got %s" % \ - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) - - while c_path_len > 1: - c_path_len = c_path_len - 1 - c_path = c_path + 1 - if c_path[0].href is not NULL: - c_href = c_path[0].href # otherwise: keep parent namespace - c_name = c_path[0].name - c_index = c_path[0].index - - if c_index < 0: - c_child = c_node.last - else: - c_child = c_node.children - c_child = _findFollowingSibling(c_child, c_href, c_name, c_index) - - if c_child is not NULL: - c_node = c_child - elif c_index != 0: - raise TypeError, \ - "creating indexed path attributes is not supported" - elif c_path_len == 1: - _appendValue(cetree.elementFactory(root._doc, c_node), - cetree.namespacedNameFromNsName(c_href, c_name), - value) - return - else: - child = SubElement( - cetree.elementFactory(root._doc, c_node), - cetree.namespacedNameFromNsName(c_href, c_name)) - c_node = child._c_node - - # if we get here, the entire path was already there - if replace: - element = cetree.elementFactory(root._doc, c_node) - _replaceElement(element, value) - else: - _appendValue(cetree.elementFactory(root._doc, c_node.parent), - cetree.namespacedName(c_node), value) - -cdef _buildDescendantPaths(tree.xmlNode* c_node, prefix_string): - """Returns a list of all descendant paths. - """ - tag = cetree.namespacedName(c_node) - if prefix_string: - if prefix_string[-1] != '.': - prefix_string = prefix_string + '.' 
- prefix_string = prefix_string + tag - else: - prefix_string = tag - path = [prefix_string] - path_list = [] - _recursiveBuildDescendantPaths(c_node, path, path_list) - return path_list - -cdef _recursiveBuildDescendantPaths(tree.xmlNode* c_node, path, path_list): - """Fills the list 'path_list' with all descendant paths, initial prefix - being in the list 'path'. - """ - cdef python.PyObject* dict_result - cdef tree.xmlNode* c_child - cdef char* c_href - python.PyList_Append(path_list, '.'.join(path)) - tags = {} - c_href = tree._getNs(c_node) - c_child = c_node.children - while c_child is not NULL: - while c_child.type != tree.XML_ELEMENT_NODE: - c_child = c_child.next - if c_child is NULL: - return - if c_href is tree._getNs(c_child): - tag = c_child.name - elif c_href is not NULL and tree._getNs(c_child) is NULL: - # special case: parent has namespace, child does not - tag = '{}' + c_child.name - else: - tag = cetree.namespacedName(c_child) - dict_result = python.PyDict_GetItem(tags, tag) - if dict_result is NULL: - count = 0 - else: - count = (dict_result) + 1 - python.PyDict_SetItem(tags, tag, count) - if count > 0: - tag = tag + '[%d]' % count - python.PyList_Append(path, tag) - _recursiveBuildDescendantPaths(c_child, path, path_list) - del path[-1] - c_child = c_child.next - - -################################################################################ # Type annotations cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): @@ -2058,3 +1721,9 @@ python.PyDict_SetItem(_attributes, PYTYPE_ATTRIBUTE, _pytype) return _makeElement("value", strval, _attributes, nsmap) + + +################################################################################ +# ObjectPath + +include "objectpath.pxi" Copied: lxml/trunk/src/lxml/objectpath.pxi (from r46423, lxml/trunk/src/lxml/objectify.pyx) ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectpath