From scoder at codespeak.net Sat Sep 1 10:45:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 1 Sep 2007 10:45:07 +0200 (CEST) Subject: [Lxml-checkins] r46231 - lxml/trunk/doc Message-ID: <20070901084507.9E7F281B0@code0.codespeak.net> Author: scoder Date: Sat Sep 1 10:45:04 2007 New Revision: 46231 Modified: lxml/trunk/doc/objectify.txt Log: fixed test cases to reflect annotation in objectify.E factory Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Sat Sep 1 10:45:04 2007 @@ -82,6 +82,13 @@ .. _`namespace specific classes`: element_classes.html#namespace-class-lookup +To make the doctests in this document look a little nicer, we also use this: + + >>> import lxml.usedoctest + +Imported from within a doctest, this relieves us from caring about the exact +formatting of XML output. + The lxml.objectify API ====================== @@ -274,18 +281,18 @@ >>> E = objectify.E >>> root = E.root( - ... E.a(5), + ... E.a(5L), ... E.b(6.1), ... E.c(True), ... E.d("how", tell="me") ... ) >>> print etree.tostring(root, pretty_print=True) - - 5 - 6.1 - true - how + + 5 + 6.1 + true + how This allows you to write up a specific language in tags:: @@ -300,9 +307,9 @@ ... ) >>> print etree.tostring(root, pretty_print=True) - - The title - 5 + + The title + 5 From scoder at codespeak.net Sat Sep 1 10:59:47 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 1 Sep 2007 10:59:47 +0200 (CEST) Subject: [Lxml-checkins] r46232 - in lxml/trunk: . 
doc src/lxml Message-ID: <20070901085947.AC74F81B0@code0.codespeak.net> Author: scoder Date: Sat Sep 1 10:59:45 2007 New Revision: 46232 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/objectify.txt lxml/trunk/src/lxml/objectify.pyx Log: made annotation in objectify.ElementMaker optional through 'annotate' kw arg Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 1 10:59:45 2007 @@ -8,6 +8,10 @@ Features added -------------- +* Reimplemented ``objectify.E`` for better performance and improved + integration with objectify. Provides extended type support based on + registered PyTypes. + * XSLT objects now support deep copying * New ``makeSubElement()`` C-API function that allows creating a new @@ -35,8 +39,7 @@ * Schematron validation (incomplete in libxml2) -* Extended type support for ``objectify.E`` based on registered PyTypes. - Supports an additional argument to ``PyType()`` that takes a conversion +* Additional ``stringify`` argument to ``PyType()`` that takes a conversion function to strings to support setting text values from arbitrary types. * Entity support through an ``Entity`` factory and element classes. XML Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Sat Sep 1 10:59:45 2007 @@ -299,17 +299,32 @@ >>> ROOT = objectify.E.root >>> TITLE = objectify.E.title - >>> TYPE = objectify.E.type + >>> HOWMANY = getattr(objectify.E, "how-many") >>> root = ROOT( ... TITLE("The title"), - ... TYPE(5) + ... HOWMANY(5) ... ) >>> print etree.tostring(root, pretty_print=True) The title - 5 + 5 + + +``objectify.E`` is an instance of ``objectify.ElementMaker``. By default, it +creates pytype annotated Elements without a namespace. 
You can switch off the +pytype annotation by passing False to the ``annotate`` keyword argument of the +constructor. You can also pass a default namespace and an ``nsmap``:: + + >>> myE = objectify.ElementMaker(annotate=False, + ... namespace="http://my/ns", nsmap={None : "http://my/ns"}) + + >>> root = myE.root( myE.someint(2) ) + + >>> print etree.tostring(root, pretty_print=True) + + 2 Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 1 10:59:45 2007 @@ -1047,7 +1047,9 @@ cdef object _makeelement cdef object _namespace cdef object _nsmap - def __init__(self, namespace=None, nsmap=None, makeelement=None): + cdef int _annotate + def __init__(self, namespace=None, nsmap=None, annotate=True, + makeelement=None): if nsmap is None: nsmap = _DEFAULT_NSMAP self._nsmap = nsmap @@ -1055,6 +1057,7 @@ self._namespace = None else: self._namespace = "{%s}" % namespace + self._annotate = bool(annotate) if makeelement is not None: assert callable(makeelement) self._makeelement = makeelement @@ -1068,6 +1071,7 @@ element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) element_maker._tag = tag element_maker._nsmap = self._nsmap + element_maker._annotate = self._annotate element_maker._element_factory = self._makeelement return element_maker @@ -1075,6 +1079,7 @@ cdef object _tag cdef object _nsmap cdef object _element_factory + cdef int _annotate def __call__(self, *children, **attrib): cdef _ObjectifyElementMakerCaller elementMaker @@ -1088,6 +1093,7 @@ else: element = self._element_factory(self._tag, attrib, self._nsmap) + pytype_name = None has_children = 0 has_string_value = 0 for child in children: @@ -1113,7 +1119,7 @@ has_children = 1 else: if pytype_name is not None: - # concatenation makes the result a string + # concatenation always makes the result a string has_string_value = 1 pytype_name = 
_typename(child) pytype = python.PyDict_GetItem(_PYTYPE_DICT, pytype_name) @@ -1124,12 +1130,11 @@ child = str(child) _add_text(element, child) - if not has_children: + if self._annotate and not has_children: if has_string_value: cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, "str") elif pytype_name is not None: - cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, - pytype_name) + cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name) return element @@ -1911,6 +1916,10 @@ _parse = etree.parse def parse(f, parser=None): + """Parse a file or file-like object with the objectify parser. + + You can pass a different parser as second argument. + """ if parser is None: parser = objectify_parser return _parse(f, parser) From scoder at codespeak.net Sun Sep 2 17:20:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 17:20:11 +0200 (CEST) Subject: [Lxml-checkins] r46239 - lxml/trunk Message-ID: <20070902152011.513E78141@code0.codespeak.net> Author: scoder Date: Sun Sep 2 17:20:09 2007 New Revision: 46239 Modified: lxml/trunk/setup.py Log: doc clarification Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Sun Sep 2 17:20:09 2007 @@ -71,10 +71,10 @@ Running ``easy_install lxml==dev`` will install it from http://codespeak.net/svn/lxml/trunk#egg=lxml-dev -Current bug fixes for the stable version are at -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . -Running ``easy_install lxml==%(branch_version)sbugfix`` will install this -version from +After an official release of a new stable series, current bug fixes might +become available at +http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . 
Running +``easy_install lxml==%(branch_version)sbugfix`` will install this version from http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix """ % { "branch_version" : versioninfo.branch_version() }) + From scoder at codespeak.net Sun Sep 2 17:20:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 17:20:28 +0200 (CEST) Subject: [Lxml-checkins] r46240 - lxml/trunk/doc Message-ID: <20070902152028.B66EE8141@code0.codespeak.net> Author: scoder Date: Sun Sep 2 17:20:28 2007 New Revision: 46240 Modified: lxml/trunk/doc/FAQ.txt Log: ReST fix Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Sep 2 17:20:28 2007 @@ -446,7 +446,7 @@ that problems become hard to debug and even harder to reproduce in a predictable way. If you encounter crashes in one these systems, but your code runs perfectly when started by hand, the following gives you a few hints for -possible approaches to solve your specific problem:: +possible approaches to solve your specific problem: * make sure you use recent versions of libxml2, libxslt and lxml. The libxml2 developers keep fixing bugs in each release, and lxml also tries to become From scoder at codespeak.net Sun Sep 2 18:13:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 18:13:31 +0200 (CEST) Subject: [Lxml-checkins] r46241 - in lxml/trunk: . 
doc Message-ID: <20070902161331.7F1F680B8@code0.codespeak.net> Author: scoder Date: Sun Sep 2 18:13:29 2007 New Revision: 46241 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/lxml2.txt lxml/trunk/doc/main.txt Log: pre-release cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Sep 2 18:13:29 2007 @@ -2,7 +2,7 @@ lxml changelog ============== -2.0alpha1 (2007-08-31) +2.0alpha1 (2007-09-02) ====================== Features added @@ -39,8 +39,9 @@ * Schematron validation (incomplete in libxml2) -* Additional ``stringify`` argument to ``PyType()`` that takes a conversion - function to strings to support setting text values from arbitrary types. +* Additional ``stringify`` argument to ``objectify.PyType()`` takes a + conversion function to strings to support setting text values from arbitrary + types. * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Sun Sep 2 18:13:29 2007 @@ -89,7 +89,7 @@ facilitate further enhancements and an improved integration between lxml's features. -* lxml.objectify now has its own implementation of the ``E factory``. It uses +* lxml.objectify now has its own implementation of the `E factory`_. It uses the built-in type lookup mechanism of lxml.objectify, thus removing the need for an additional type registry mechanism (as previously available through the ``typemap`` parameter). @@ -104,6 +104,8 @@ bigger overlap with the XSLT code. The main benefits are improved thread safety in the XPath evaluators and Python RegExp support in standard XPath. +.. 
_`E factory`: objectify.html#tree-generation-with-the-e-factory + New modules =========== Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sun Sep 2 18:13:29 2007 @@ -138,7 +138,7 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0alpha1`_, released 2007-08-31 +The latest version is `lxml 2.0alpha1`_, released 2007-09-02 (`changes for 2.0alpha1`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions From scoder at codespeak.net Sun Sep 2 18:34:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 2 Sep 2007 18:34:07 +0200 (CEST) Subject: [Lxml-checkins] r46242 - lxml/trunk/doc Message-ID: <20070902163407.0575D817B@code0.codespeak.net> Author: scoder Date: Sun Sep 2 18:34:07 2007 New Revision: 46242 Modified: lxml/trunk/doc/lxml2.txt Log: typo Modified: lxml/trunk/doc/lxml2.txt ============================================================================== --- lxml/trunk/doc/lxml2.txt (original) +++ lxml/trunk/doc/lxml2.txt Sun Sep 2 18:34:07 2007 @@ -67,7 +67,7 @@ * The type annotations in lxml.objectify (the ``pytype`` attribute) now use ``NoneType`` for the None value as this is the correct Python type name. - Previously, lxml 1.x used a lower case ``?one``. + Previously, lxml 1.x used a lower case ``none``. * Another change in objectify regards the way it deals with ambiguous types. 
Previously, setting a value like the string ``"3"`` through normal attribute From scoder at codespeak.net Mon Sep 3 11:57:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 11:57:29 +0200 (CEST) Subject: [Lxml-checkins] r46262 - lxml/trunk/src/lxml Message-ID: <20070903095729.8D68C814C@code0.codespeak.net> Author: scoder Date: Mon Sep 3 11:57:27 2007 New Revision: 46262 Modified: lxml/trunk/src/lxml/etree.pyx Log: use list instead of dict in _TempStore to reduce overhead Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Sep 3 11:57:27 2007 @@ -172,16 +172,13 @@ cdef class _TempStore: cdef object _storage def __init__(self): - self._storage = {} + self._storage = [] cdef void add(self, obj): - python.PyDict_SetItem(self._storage, id(obj), obj) + python.PyList_Append(self._storage, obj) cdef void clear(self): - python.PyDict_Clear(self._storage) - - cdef object dictcopy(self): - return self._storage.copy() + del self._storage[:] # class for temporarily storing exceptions raised in extensions cdef class _ExceptionContext: From scoder at codespeak.net Mon Sep 3 12:35:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 12:35:21 +0200 (CEST) Subject: [Lxml-checkins] r46264 - lxml/trunk/src/lxml Message-ID: <20070903103521.65813814F@code0.codespeak.net> Author: scoder Date: Mon Sep 3 12:35:20 2007 New Revision: 46264 Modified: lxml/trunk/src/lxml/xslt.pxi Log: use separate resolver contexts for each XSLT call: exceptions and resolver temp storage must be local Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Mon Sep 3 12:35:20 2007 @@ -58,7 +58,7 @@ cdef _XSLTResolverContext _copy(self): cdef 
_XSLTResolverContext context context = _XSLTResolverContext(self._parser) - context._c_style_doc = _copyDoc(self._c_style_doc, 1) + context._c_style_doc = self._c_style_doc return context cdef xmlDoc* _xslt_resolve_stylesheet(char* c_uri, void* context): @@ -353,7 +353,10 @@ new_xslt._access_control = self._access_control new_xslt._error_log = _ErrorLog() new_xslt._context = self._context._copy() + new_xslt._xslt_resolver_context = self._xslt_resolver_context._copy() + new_xslt._xslt_resolver_context._c_style_doc = _copyDoc( + self._xslt_resolver_context._c_style_doc, 1) c_doc = _copyDoc(self._c_style.doc, 1) new_xslt._c_style = xslt.xsltParseStylesheetDoc(c_doc) @@ -365,6 +368,7 @@ def __call__(self, _input, profile_run=False, **_kw): cdef _XSLTContext context + cdef _XSLTResolverContext resolver_context cdef _Document input_doc cdef _Element root_node cdef _Document result_doc @@ -397,6 +401,9 @@ context = self._context._copy() context.register_context(transform_ctxt, input_doc) + resolver_context = self._xslt_resolver_context._copy() + transform_ctxt._private = resolver_context + c_result = self._run_transform( input_doc, c_doc, _kw, context, transform_ctxt) @@ -412,10 +419,10 @@ self._error_log.disconnect() try: - if self._xslt_resolver_context._has_raised(): + if resolver_context is not None and resolver_context._has_raised(): if c_result is not NULL: tree.xmlFreeDoc(c_result) - self._xslt_resolver_context._raise_if_stored() + resolver_context._raise_if_stored() if c_result is NULL: # last error seems to be the most accurate here @@ -431,31 +438,26 @@ message = "Error applying stylesheet" raise XSLTApplyError, message finally: - self._xslt_resolver_context.clear() + if resolver_context is not None: + resolver_context.clear() result_doc = _documentFactory(c_result, input_doc._parser) return _xsltResultTreeFactory(result_doc, self, profile_doc) cdef xmlDoc* _run_transform(self, _Document input_doc, xmlDoc* c_input_doc, - parameters, _XSLTContext context, - 
xslt.xsltTransformContext* transform_ctxt): + parameters, _XSLTContext context, + xslt.xsltTransformContext* transform_ctxt): cdef python.PyThreadState* state - cdef _XSLTResolverContext resolver_context cdef xmlDoc* c_result cdef char** params cdef Py_ssize_t i, parameter_count - resolver_context = _XSLTResolverContext(input_doc._parser) - resolver_context._c_style_doc = self._xslt_resolver_context._c_style_doc - xslt.xsltSetTransformErrorFunc(transform_ctxt, self._error_log, _receiveXSLTError) if self._access_control is not None: self._access_control._register_in_context(transform_ctxt) - transform_ctxt._private = self._xslt_resolver_context - parameter_count = python.PyDict_Size(parameters) if parameter_count > 0: # allocate space for parameters @@ -463,17 +465,21 @@ # and + 1 as array is NULL terminated params = python.PyMem_Malloc( sizeof(char*) * (parameter_count * 2 + 1)) - i = 0 - keep_ref = [] - for key, value in parameters.iteritems(): - k = _utf8(key) - python.PyList_Append(keep_ref, k) - v = _utf8(value) - python.PyList_Append(keep_ref, v) - params[i] = _cstr(k) - i = i + 1 - params[i] = _cstr(v) - i = i + 1 + try: + i = 0 + keep_ref = [] + for key, value in parameters.iteritems(): + k = _utf8(key) + python.PyList_Append(keep_ref, k) + v = _utf8(value) + python.PyList_Append(keep_ref, v) + params[i] = _cstr(k) + i = i + 1 + params[i] = _cstr(v) + i = i + 1 + except: + python.PyMem_Free(params) + raise params[i] = NULL else: params = NULL From scoder at codespeak.net Mon Sep 3 12:35:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 12:35:45 +0200 (CEST) Subject: [Lxml-checkins] r46265 - lxml/trunk Message-ID: <20070903103545.B54D7814F@code0.codespeak.net> Author: scoder Date: Mon Sep 3 12:35:45 2007 New Revision: 46265 Modified: lxml/trunk/CHANGES.txt Log: changelog update Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt 
(original) +++ lxml/trunk/CHANGES.txt Mon Sep 3 12:35:45 2007 @@ -2,6 +2,22 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Race condition in XSLT where the resolver context leaked between concurrent + XSLT calls + +Other changes +------------- + + 2.0alpha1 (2007-09-02) ====================== From scoder at codespeak.net Mon Sep 3 13:36:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:36:00 +0200 (CEST) Subject: [Lxml-checkins] r46267 - lxml/trunk/src/lxml Message-ID: <20070903113600.3CF5A814D@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:35:58 2007 New Revision: 46267 Modified: lxml/trunk/src/lxml/parser.pxi Log: ET 1.3 compatible parser version Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon Sep 3 13:35:58 2007 @@ -484,6 +484,11 @@ return _makeElement(_tag, NULL, None, self, None, None, attrib, nsmap, _extra) + property version: + "The version of the underlying XML parser." + def __get__(self): + return "libxml2 %d.%d.%d" % LIBXML_VERSION + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: """Parse unicode document, share dictionary if possible. 
""" From scoder at codespeak.net Mon Sep 3 13:43:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:43:01 +0200 (CEST) Subject: [Lxml-checkins] r46269 - lxml/trunk/src/lxml/tests Message-ID: <20070903114301.85EC280E9@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:43:00 2007 New Revision: 46269 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: additional test import of cElementTree Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 13:43:00 2007 @@ -14,6 +14,14 @@ ElementTree = None try: + from xml.etree import cElementTree # Python 2.5 +except ImportError: + try: + from celementtree import cElementTree # standard ET + except ImportError: + cElementTree = None + +try: import doctest # check if the system version has everything we need doctest.DocFileSuite From scoder at codespeak.net Mon Sep 3 13:54:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:54:40 +0200 (CEST) Subject: [Lxml-checkins] r46271 - lxml/trunk/src/lxml/tests Message-ID: <20070903115440.9B14B814E@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:54:37 2007 New Revision: 46271 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: fix import Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 13:54:37 2007 @@ -6,7 +6,7 @@ from lxml import etree try: - from xml.etree import ElementTree # Python 2.5 + from xml.etree import ElementTree # Python 2.5+ except ImportError: try: from elementtree import ElementTree # standard ET @@ -14,10 +14,10 @@ ElementTree = None try: - from xml.etree import cElementTree # Python 
2.5 + from xml.etree import cElementTree # Python 2.5+ except ImportError: try: - from celementtree import cElementTree # standard ET + import cElementTree # standard ET except ImportError: cElementTree = None From scoder at codespeak.net Mon Sep 3 13:55:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 13:55:11 +0200 (CEST) Subject: [Lxml-checkins] r46272 - lxml/trunk/src/lxml/tests Message-ID: <20070903115511.4FCBA814E@code0.codespeak.net> Author: scoder Date: Mon Sep 3 13:55:10 2007 New Revision: 46272 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: cleanup of test_elementtree.py to integrate cElementTree Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 3 13:55:10 2007 @@ -9,12 +9,17 @@ """ import unittest, doctest -import os, re, shutil, tempfile, copy +import os, re, shutil, tempfile, copy, operator -from common_imports import StringIO, etree, ElementTree -from common_imports import HelperTestCase, fileInTestDir, canonicalize +from common_imports import StringIO, etree, ElementTree, cElementTree +from common_imports import fileInTestDir, canonicalize -class ETreeTestCaseBase(HelperTestCase): +if cElementTree is not None: + if tuple([int(n) for n in + getattr(cElementTree, "VERSION", "0.0").split(".")]) <= (1,0,6): + cElementTree = None + +class ETreeTestCaseBase(unittest.TestCase): etree = None def setUp(self): @@ -75,7 +80,7 @@ self.assertEquals('one', root[0].tag) self.assertEquals('two', root[1].tag) self.assertEquals('three', root[2].tag) - self.assertRaises(IndexError, root.__getitem__, 3) + self.assertRaises(IndexError, operator.getitem, root, 3) def test_subelement(self): Element = self.etree.Element @@ -116,7 +121,7 @@ root = doc.getroot() self.assertEquals(1, len(root)) self.assertEquals('one', 
root[0].tag) - self.assertRaises(IndexError, root.__getitem__, 1) + self.assertRaises(IndexError, operator.getitem, root, 1) def test_element_indexing_with_text2(self): ElementTree = self.etree.ElementTree @@ -147,7 +152,7 @@ self.assertEquals(d, a[-1]) self.assertEquals(c, a[-2]) self.assertEquals(b, a[-3]) - self.assertRaises(IndexError, a.__getitem__, -4) + self.assertRaises(IndexError, operator.getitem, a, -4) a[-1] = e = Element('e') self.assertEquals(e, a[-1]) del a[-1] @@ -266,7 +271,7 @@ root = doc.getroot() self.assertEquals('One', root.attrib['one']) self.assertEquals('Two', root.attrib['two']) - self.assertRaises(KeyError, root.attrib.__getitem__, 'three') + self.assertRaises(KeyError, operator.getitem, root.attrib, 'three') def test_attributes2(self): ElementTree = self.etree.ElementTree @@ -917,6 +922,18 @@ self.assertXML("", a) + def test_processinginstruction(self): + # lxml.etree separates target and text + Element = self.etree.Element + SubElement = self.etree.SubElement + ProcessingInstruction = self.etree.PI + + a = Element('a') + a.append(ProcessingInstruction('foo', 'some more text')) + self.assertEquals(a[0].tag, ProcessingInstruction) + self.assertXML("", + a) + def test_pi_nonsense(self): ProcessingInstruction = self.etree.ProcessingInstruction pi = ProcessingInstruction('foo') @@ -980,7 +997,7 @@ a = Element('a') b = SubElement(a, 'b') - self.assertRaises(IndexError, a.__setitem__, 1, Element('c')) + self.assertRaises(IndexError, operator.setitem, a, 1, Element('c')) def test_setitem_tail(self): Element = self.etree.Element @@ -1583,7 +1600,7 @@ a.attrib['bar'] = 'Bar' self.assertEquals('Foo', a.attrib['foo']) del a.attrib['foo'] - self.assertRaises(KeyError, a.attrib.__getitem__, 'foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') def test_getslice(self): Element = self.etree.Element @@ -2514,6 +2531,15 @@ self.assertEquals(qname1, qname1) self.assertEquals(qname1, qname2) + def test_parser_version(self): + etree = 
self.etree + parser = etree.XMLParser() + if hasattr(parser, "version"): + # ElementTree 1.3+, cET + self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + + # helper methods + def _writeElement(self, element, encoding='us-ascii'): """Write out element for comparison. """ @@ -2592,19 +2618,33 @@ mapping["key"] = "value" self.assertEquals("value", mapping["key"]) + # assertFalse doesn't exist in Python 2.3 + try: + unittest.TestCase.assertFalse + except AttributeError: + assertFalse = unittest.TestCase.failIf -class ETreeTestCase(ETreeTestCaseBase): - etree = etree + +if etree: + class ETreeTestCase(ETreeTestCaseBase): + etree = etree if ElementTree: class ElementTreeTestCase(ETreeTestCaseBase): etree = ElementTree +if cElementTree: + class CElementTreeTestCase(ETreeTestCaseBase): + etree = cElementTree + def test_suite(): suite = unittest.TestSuite() - suite.addTests([unittest.makeSuite(ETreeTestCase)]) + if etree: + suite.addTests([unittest.makeSuite(ETreeTestCase)]) if ElementTree: suite.addTests([unittest.makeSuite(ElementTreeTestCase)]) + if cElementTree: + suite.addTests([unittest.makeSuite(CElementTreeTestCase)]) return suite if __name__ == '__main__': From scoder at codespeak.net Mon Sep 3 16:02:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 16:02:37 +0200 (CEST) Subject: [Lxml-checkins] r46276 - lxml/trunk/src/lxml/tests Message-ID: <20070903140237.7C45C8144@code0.codespeak.net> Author: scoder Date: Mon Sep 3 16:02:36 2007 New Revision: 46276 Modified: lxml/trunk/src/lxml/tests/common_imports.py lxml/trunk/src/lxml/tests/test_objectify.py lxml/trunk/src/lxml/tests/test_pyclasslookup.py Log: more test case cleanup Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Mon Sep 3 16:02:36 2007 @@ -31,12 +31,6 @@ # we need our own version 
to make it work (Python 2.3?) import local_doctest as doctest -try: - from operator import itemgetter -except ImportError: - def itemgetter(item): - return lambda obj: obj[item] - class HelperTestCase(unittest.TestCase): def parse(self, text): f = StringIO(text) Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Sep 3 16:02:36 2007 @@ -9,7 +9,6 @@ from common_imports import etree, StringIO, HelperTestCase, fileInTestDir from common_imports import SillyFileLike, canonicalize, doctest -from common_imports import itemgetter from lxml import objectify @@ -373,7 +372,7 @@ self.assertEquals("0", root.c1.c2[0].text) self.assertEquals("1", root.c1.c2[1].text) self.assertEquals("2", root.c1.c2[2].text) - self.assertRaises(IndexError, itemgetter(3), root.c1.c2) + self.assertRaises(IndexError, operator.getitem, root.c1.c2, 3) def test_child_index_neg(self): root = self.XML(xml_str) @@ -381,7 +380,7 @@ self.assertEquals("0", root.c1.c2[-3].text) self.assertEquals("1", root.c1.c2[-2].text) self.assertEquals("2", root.c1.c2[-1].text) - self.assertRaises(IndexError, itemgetter(-4), root.c1.c2) + self.assertRaises(IndexError, operator.getitem, root.c1.c2, -4) def test_child_len(self): root = self.XML(xml_str) Modified: lxml/trunk/src/lxml/tests/test_pyclasslookup.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_pyclasslookup.py (original) +++ lxml/trunk/src/lxml/tests/test_pyclasslookup.py Mon Sep 3 16:02:36 2007 @@ -9,7 +9,6 @@ from common_imports import etree, StringIO, HelperTestCase, fileInTestDir from common_imports import SillyFileLike, canonicalize, doctest -from common_imports import itemgetter from lxml.pyclasslookup import PythonElementClassLookup From scoder at codespeak.net Mon Sep 3 19:32:24 2007 From: scoder at 
codespeak.net (scoder at codespeak.net) Date: Mon, 3 Sep 2007 19:32:24 +0200 (CEST) Subject: [Lxml-checkins] r46292 - lxml/trunk/src/lxml/tests Message-ID: <20070903173224.5B117816C@code0.codespeak.net> Author: scoder Date: Mon Sep 3 19:32:23 2007 New Revision: 46292 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: API fix in ET tests Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 3 19:32:23 2007 @@ -687,7 +687,7 @@ SubElement = self.etree.SubElement el = Element('tag') - SubElement(el, 'foo', attrib={'foo':'Foo'}, baz="Baz") + SubElement(el, 'foo', {'foo':'Foo'}, baz="Baz") self.assertEquals("Baz", el[0].attrib['baz']) self.assertEquals('Foo', el[0].attrib['foo']) From scoder at codespeak.net Tue Sep 4 08:46:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 08:46:30 +0200 (CEST) Subject: [Lxml-checkins] r46296 - lxml/trunk Message-ID: <20070904064630.30E8B8173@code0.codespeak.net> Author: scoder Date: Tue Sep 4 08:46:27 2007 New Revision: 46296 Modified: lxml/trunk/version.txt Log: set version to 2.0alpha2 Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Sep 4 08:46:27 2007 @@ -1 +1 @@ -2.0alpha1 +2.0alpha2 From scoder at codespeak.net Tue Sep 4 08:47:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 08:47:00 +0200 (CEST) Subject: [Lxml-checkins] r46297 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070904064700.AB2EE8173@code0.codespeak.net> Author: scoder Date: Tue Sep 4 08:47:00 2007 New Revision: 46297 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_elementtree.py Log: ET-like feed parser interface Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 4 08:47:00 2007 @@ -8,6 +8,9 @@ Features added -------------- +* ElementTree-like feed parser interface on XMLParser and HTMLParser + (``feed()`` and ``close()`` methods) + Bugs fixed ---------- Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 08:47:00 2007 @@ -4,7 +4,14 @@ cimport htmlparser from xmlparser cimport xmlParserCtxt, xmlDict -class XMLSyntaxError(LxmlSyntaxError): +class ParseError(LxmlSyntaxError): + """Syntax error while parsing an XML document. + + For compatibility with ElementTree 1.3 and later. + """ + pass + +class XMLSyntaxError(ParseError): """Syntax error while parsing an XML document. """ pass @@ -381,6 +388,7 @@ cdef xmlParserCtxt* _parser_ctxt cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock + cdef int _feed_parser_running def __init__(self, int parse_options, remove_comments, remove_pis, context_class=_ResolverContext): @@ -489,6 +497,113 @@ def __get__(self): return "libxml2 %d.%d.%d" % LIBXML_VERSION + # feed parser interface + + def feed(self, data): + """Feeds data to the parser. The argument should be an 8-bit string + buffer containing encoded data, although Unicode is supported as long + as both string types are not mixed. + + This is the main entry point to the consumer interface of a parser. + The parser will parse as much of the XML stream as it can on each + call. 
To finish parsing, call the ``close()`` method. + + It is not possible to use the parser in any other way after calling + the ``feed()`` method. The parser can only be reset by calling + ``close()``. + """ + cdef xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len + cdef char* c_data + cdef char* c_encoding + cdef int buffer_len + cdef int error + cdef int recover + if python.PyString_Check(data): + c_encoding = NULL + c_data = _cstr(data) + py_buffer_len = python.PyString_GET_SIZE(data) + elif python.PyUnicode_Check(data): + if _UNICODE_ENCODING is NULL: + raise ParserError, \ + "Unicode parsing is not supported on this platform" + c_encoding = _UNICODE_ENCODING + c_data = python.PyUnicode_AS_DATA(data) + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) + else: + raise TypeError, "Parsing requires string data" + + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + + pctxt = self._parser_ctxt + error = 0 + if not self._feed_parser_running: + self._lockParser() + self._feed_parser_running = 1 + self._error_log.connect() + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + error = xmlparser.xmlCtxtResetPush( + pctxt, c_data, buffer_len, NULL, c_encoding) + py_buffer_len = py_buffer_len - buffer_len + + while error == 0 and py_buffer_len > 0: + c_data = c_data + buffer_len + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + py_buffer_len = py_buffer_len - buffer_len + error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + + if error: + self._feed_parser_running = 0 + try: + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + _handleParseResult(pctxt, pctxt.myDoc, None, + self._error_log, recover) + finally: + self._cleanup() + self._context.clear() + self._error_log.disconnect() + self._unlockParser() + + def close(self): + """Finishes feeding of data to this parser. 
This tells the parser to + process any remaining data in the feed buffer, and then returns the + root Element of the tree that was parsed. + + This method must be called after passing the last chunk of data into + the ``feed()`` method. It should only be called when using the feed + parser interface is used, all other usage is undefined. + """ + cdef xmlParserCtxt* pctxt + cdef xmlDoc* c_doc + cdef _Document doc + cdef int error + if not self._feed_parser_running: + raise XMLSyntaxError, "no element found" + pctxt = self._parser_ctxt + self._feed_parser_running = 0 + error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + try: + recover = self._parse_options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(pctxt, pctxt.myDoc, None, + self._error_log, recover) + finally: + self._cleanup() + self._context.clear() + self._error_log.disconnect() + self._unlockParser() + + doc = _documentFactory(c_doc, self) + return doc.getroot() + + # internal parser methods + cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: """Parse unicode document, share dictionary if possible. 
""" Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 4 08:47:00 2007 @@ -2538,6 +2538,34 @@ # ElementTree 1.3+, cET self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + def test_feed_parser(self): + parser = self.etree.XMLParser() + + parser.feed('<') + parser.feed('a test="works"/') + parser.feed('>') + + root = parser.close() + + self.assertEquals(root.tag, "root") + self.assertEquals(root[0].tag, "a") + self.assertEquals(root[0].get("test"), "works") + + def test_feed_parser_error_close_empty(self): + parser = self.etree.XMLParser() + self.assertRaises(Exception, parser.close) + + def test_feed_parser_error_close_incomplete(self): + parser = self.etree.XMLParser() + + parser.feed(' An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070904/d0e19050/attachment.htm From scoder at codespeak.net Tue Sep 4 09:22:22 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 09:22:22 +0200 (CEST) Subject: [Lxml-checkins] r46298 - in lxml/trunk: doc src/lxml Message-ID: <20070904072222.0E095817E@code0.codespeak.net> Author: scoder Date: Tue Sep 4 09:22:21 2007 New Revision: 46298 Modified: lxml/trunk/doc/parsing.txt lxml/trunk/src/lxml/parser.pxi Log: doc update on the feed parser Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Tue Sep 4 09:22:21 2007 @@ -9,8 +9,17 @@ .. contents:: .. 
1 Parsers - 2 iterparse and iterwalk - 3 Python unicode strings + 1.1 Parser options + 1.2 Parsing HTML + 1.3 Doctype information + 2 The feed parser interface + 3 iterparse and iterwalk + 3.1 Selective tag events + 3.2 Modifying the tree + 3.3 iterwalk + 4 Python unicode strings + 4.1 Serialising to Unicode strings + The usual setup procedure:: @@ -167,6 +176,45 @@ ascii +The feed parser interface +========================= + +Since lxml 2.0, the parsers have a feed parser interface that is compatible to +the `ElementTree parsers`_. You can use it to feed data into the parser in a +controlled step-by-step way. Note that you can only use one interface at a +time: the ``parse()`` or ``XML()`` functions, or the feed parser interface. + +.. _`ElementTree parsers`: http://effbot.org/elementtree/elementtree-xmlparser.htm + +To start parsing with a feed parser, just call its ``feed()`` method:: + + >>> parser = etree.XMLParser() + + >>> for data in (''): + ... parser.feed(data) + +When you are done parsing, you **must** call the ``close()`` method to +retrieve the root Element of the parse result document, and to unlock the +parser:: + + >>> root = parser.close() + + >>> print root.tag + root + >>> print root[0].tag + a + +If you do not call ``close()``, the parser will stay locked and subsequent +usages will block till the end of times. So make sure you also close it in +the exception case. + +Another way of achieving the same step-by-step parsing is by writing your own +file-like object that returns a chunk of data on each ``read()`` call. Where +the feed parser interface allows you to actively pass data chunks into the +parser, a file-like object passively responds to ``read()`` requests of the +parser itself. Depending on the data source, either way may be more natural. 
+ + iterparse and iterwalk ====================== Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 09:22:21 2007 @@ -578,7 +578,7 @@ This method must be called after passing the last chunk of data into the ``feed()`` method. It should only be called when using the feed - parser interface is used, all other usage is undefined. + parser interface, all other usage is undefined. """ cdef xmlParserCtxt* pctxt cdef xmlDoc* c_doc From scoder at codespeak.net Tue Sep 4 09:57:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 09:57:20 +0200 (CEST) Subject: [Lxml-checkins] r46299 - in lxml/trunk: . src/lxml Message-ID: <20070904075720.E19E2812C@code0.codespeak.net> Author: scoder Date: Tue Sep 4 09:57:19 2007 New Revision: 46299 Modified: lxml/trunk/Makefile lxml/trunk/src/lxml/classlookup.pxi lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/sax.py Log: docstring cleanup Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Tue Sep 4 09:57:19 2007 @@ -39,7 +39,7 @@ rm -fr doc/html/api @[ -x "`which epydoc`" ] \ && (cd src && echo "Generating API docs ..." && \ - PYTHONPATH=. epydoc -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ + PYTHONPATH=. epydoc -v -o ../doc/html/api --name lxml --url http://codespeak.net/lxml/ lxml/) \ || (echo "not generating epydoc API documentation") # XXX What should the default be? 
Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Tue Sep 4 09:57:19 2007 @@ -231,7 +231,7 @@ cdef class CustomElementClassLookup(FallbackElementClassLookup): """Element class lookup based on a subclass method. - You can inherit from this class and override the method + You can inherit from this class and override the method:: lookup(self, type, doc, namespace, name) Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 4 09:57:19 2007 @@ -266,12 +266,14 @@ _appendValue(self, _buildChildTag(self, tag), value) def __getitem__(self, key): - """Return a sibling, counting from the first child of the parent. + """Return a sibling, counting from the first child of the parent. The + method behaves like both a dict and a sequence. * If argument is an integer, returns the sibling at that position. - * If argument is a string, does the same as getattr(). This is used - to provide namespaces for element lookup. + * If argument is a string, does the same as getattr(). This can be + used to provide namespaces for element lookup, or to look up + children with special names (``text`` etc.). 
""" cdef tree.xmlNode* c_self_node cdef tree.xmlNode* c_parent Modified: lxml/trunk/src/lxml/sax.py ============================================================================== --- lxml/trunk/src/lxml/sax.py (original) +++ lxml/trunk/src/lxml/sax.py Tue Sep 4 09:57:19 2007 @@ -1,8 +1,9 @@ from xml.sax.handler import ContentHandler -from etree import ElementTree, Element, SubElement, LxmlError -from etree import XML, Comment, ProcessingInstruction +import etree +from etree import ElementTree, SubElement +from etree import Comment, ProcessingInstruction -class SaxError(LxmlError): +class SaxError(etree.LxmlError): """General SAX error. """ pass @@ -24,7 +25,7 @@ self._ns_mapping = { None : [None] } self._new_mappings = {} if makeelement is None: - makeelement = Element + makeelement = etree.Element self._makeelement = makeelement def _get_etree(self): From lxml-checkins at codespeak.net Tue Sep 4 21:30:02 2007 From: lxml-checkins at codespeak.net (Viagra.com Inc ®) Date: Tue, 4 Sep 2007 21:30:02 +0200 (CEST) Subject: [Lxml-checkins] Official Site Message-ID: <11023058544.0412311336185.567948300-5610@cimail939.msn.com> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070904/31e65509/attachment.htm From scoder at codespeak.net Tue Sep 4 21:32:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 4 Sep 2007 21:32:51 +0200 (CEST) Subject: [Lxml-checkins] r46311 - lxml/trunk/src/lxml Message-ID: <20070904193251.C0F248130@code0.codespeak.net> Author: scoder Date: Tue Sep 4 21:32:51 2007 New Revision: 46311 Modified: lxml/trunk/src/lxml/parser.pxi Log: feed parser fix Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 4 21:32:51 2007 @@ -1,4 +1,4 @@ -# XML parser that provides dictionary sharing +# Parsers for XML and HTML cimport xmlparser cimport htmlparser @@ -533,11 +533,6 @@ else: raise TypeError, "Parsing requires string data" - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - pctxt = self._parser_ctxt error = 0 if not self._feed_parser_running: @@ -546,18 +541,26 @@ self._error_log.connect() __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + error = xmlparser.xmlCtxtResetPush( - pctxt, c_data, buffer_len, NULL, c_encoding) + pctxt, c_data, buffer_len, NULL, c_encoding) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len while error == 0 and py_buffer_len > 0: - c_data = c_data + buffer_len if py_buffer_len > python.INT_MAX: buffer_len = python.INT_MAX else: buffer_len = py_buffer_len py_buffer_len = py_buffer_len - buffer_len error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + c_data = c_data + buffer_len if error: self._feed_parser_running = 0 From scoder at codespeak.net Tue Sep 4 21:33:53 2007 From: scoder at codespeak.net (scoder at 
codespeak.net) Date: Tue, 4 Sep 2007 21:33:53 +0200 (CEST) Subject: [Lxml-checkins] r46312 - lxml/trunk/src/lxml/tests Message-ID: <20070904193353.BE2CC8130@code0.codespeak.net> Author: scoder Date: Tue Sep 4 21:33:53 2007 New Revision: 46312 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test case for broken feed parser input Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 4 21:33:53 2007 @@ -2566,6 +2566,19 @@ self.assertRaises(Exception, parser.close) + def test_feed_parser_error_broken(self): + parser = self.etree.XMLParser() + + parser.feed(' Author: ianb Date: Thu Sep 6 17:40:00 2007 New Revision: 46372 Modified: lxml/trunk/src/lxml/html/clean.py Log: typo in copy Modified: lxml/trunk/src/lxml/html/clean.py ============================================================================== --- lxml/trunk/src/lxml/html/clean.py (original) +++ lxml/trunk/src/lxml/html/clean.py Thu Sep 6 17:40:00 2007 @@ -27,6 +27,7 @@ # +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- # you don't always have to have the charset set, if the page has no charset # and there's UTF7-like code in it. +# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php # This is an IE-specific construct you can have in a stylesheet to @@ -355,7 +356,7 @@ doc = fromstring(html) else: return_string = False - doc = copy.deepcopy(doc) + doc = copy.deepcopy(html) self(doc) if return_string: return tostring(doc) From lxml-checkins at codespeak.net Fri Sep 7 02:54:50 2007 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Fri, 7 Sep 2007 02:54:50 +0200 (CEST) Subject: [Lxml-checkins] apnalbdq lxml-checkins@codespeak.net Offer Message-ID: <20070907145617.23681.qmail@adsl-pool-222.123.32-32.tttmaxnet.com> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070907/39b7c150/attachment.htm From scoder at codespeak.net Mon Sep 10 14:20:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 14:20:58 +0200 (CEST) Subject: [Lxml-checkins] r46438 - lxml/trunk Message-ID: <20070910122058.A18CF810E@code0.codespeak.net> Author: scoder Date: Mon Sep 10 14:20:57 2007 New Revision: 46438 Modified: lxml/trunk/setup.py lxml/trunk/versioninfo.py Log: drop branch link in pre-releases Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Mon Sep 10 14:20:57 2007 @@ -41,6 +41,19 @@ print "Building lxml version", svn_version +branch_link = """ +After an official release of a new stable series, current bug fixes become +available at http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . +Running ``easy_install lxml==%(branch_version)sbugfix`` will install this +version from +http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix + +""" + +if versioninfo.is_pre_release(): + branch_link = "" + + extra_options.update(setupinfo.extra_setup_args()) setup( @@ -55,7 +68,7 @@ description="Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.", - long_description=(("""\ + long_description=((("""\ lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It provides safe and convenient access to these libraries using the ElementTree API. @@ -71,13 +84,7 @@ Running ``easy_install lxml==dev`` will install it from http://codespeak.net/svn/lxml/trunk#egg=lxml-dev -After an official release of a new stable series, current bug fixes might -become available at -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s . 
Running -``easy_install lxml==%(branch_version)sbugfix`` will install this version from -http://codespeak.net/svn/lxml/branch/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix - -""" % { "branch_version" : versioninfo.branch_version() }) + +""" + branch_link) % { "branch_version" : versioninfo.branch_version() }) + versioninfo.changes()), classifiers = [ versioninfo.dev_status(), Modified: lxml/trunk/versioninfo.py ============================================================================== --- lxml/trunk/versioninfo.py (original) +++ lxml/trunk/versioninfo.py Mon Sep 10 14:20:57 2007 @@ -11,6 +11,10 @@ def branch_version(): return version()[:3] +def is_pre_release(): + version_string = version() + return "dev" in version_string or "alpha" in version_string or "beta" in version_string + def svn_version(): _version = version() src_dir = get_src_dir() From scoder at codespeak.net Mon Sep 10 14:21:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 14:21:10 +0200 (CEST) Subject: [Lxml-checkins] r46439 - lxml/trunk/doc Message-ID: <20070910122110.D99C48111@code0.codespeak.net> Author: scoder Date: Mon Sep 10 14:21:10 2007 New Revision: 46439 Modified: lxml/trunk/doc/parsing.txt Log: small clarification in docs Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Mon Sep 10 14:21:10 2007 @@ -182,7 +182,8 @@ Since lxml 2.0, the parsers have a feed parser interface that is compatible to the `ElementTree parsers`_. You can use it to feed data into the parser in a controlled step-by-step way. Note that you can only use one interface at a -time: the ``parse()`` or ``XML()`` functions, or the feed parser interface. +time with each parser: the ``parse()`` or ``XML()`` functions, or the feed +parser interface. .. 
_`ElementTree parsers`: http://effbot.org/elementtree/elementtree-xmlparser.htm From scoder at codespeak.net Mon Sep 10 16:23:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 10 Sep 2007 16:23:24 +0200 (CEST) Subject: [Lxml-checkins] r46441 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20070910142324.3E714810F@code0.codespeak.net> Author: scoder Date: Mon Sep 10 16:23:22 2007 New Revision: 46441 Modified: lxml/trunk/selftest.py lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/xmlparser.pxd Log: major restructuring of the parser code to better integrate feed parser and (the new) target parser Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Mon Sep 10 16:23:22 2007 @@ -266,7 +266,8 @@ ## """ ## Test HTML parsing. -## >>> p = HTMLTreeBuilder.TreeBuilder() +## >>> # p = HTMLTreeBuilder.TreeBuilder() +## >>> p = ElementTree.HTMLParser() ## >>> p.feed("

spamegg

") ## >>> serialize(p.close()) ## '

spamegg

' Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Mon Sep 10 16:23:22 2007 @@ -13,6 +13,7 @@ cdef int strcmp(char* s1, char* s2) cdef int strncmp(char* s1, char* s2, size_t len) cdef void* memcpy(void* dest, void* src, size_t len) + cdef void* memset(void* s, int c, size_t len) cdef extern from "stdarg.h": ctypedef void *va_list Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Mon Sep 10 16:23:22 2007 @@ -94,9 +94,12 @@ cdef class _ResolverContext(_ExceptionContext): cdef _ResolverRegistry _resolvers cdef _TempStore _storage - def __init__(self, _ResolverRegistry resolvers not None): + def __init__(self, _ResolverRegistry resolvers): _ExceptionContext.__init__(self) - self._resolvers = resolvers + if resolvers is None: + self._resolvers = _ResolverRegistry() + else: + self._resolvers = resolvers self._storage = _TempStore() cdef void clear(self): Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Mon Sep 10 16:23:22 2007 @@ -88,10 +88,10 @@ cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL: cdef _ExceptionContext exc_context - cdef _FileParserContext dtd_parser + cdef _FileReaderContext dtd_parser cdef tree.xmlDtd* c_dtd exc_context = _ExceptionContext() - dtd_parser = _FileParserContext(file, exc_context) + dtd_parser = _FileReaderContext(file, exc_context) c_dtd = dtd_parser._readDtd() Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Sep 10 16:23:22 2007 @@ 
-2131,19 +2131,20 @@ ################################################################################ # Include submodules -include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.) -include "apihelpers.pxi" # Private helper functions -include "xmlerror.pxi" # Error and log handling -include "classlookup.pxi"# Element class lookup mechanisms -include "nsclasses.pxi" # Namespace implementation and registry -include "docloader.pxi" # Support for custom document loaders -include "parser.pxi" # XML Parser -include "serializer.pxi" # XML output functions -include "iterparse.pxi" # incremental XML parsing -include "xmlid.pxi" # XMLID and IDDict -include "extensions.pxi" # XPath/XSLT extension functions -include "xpath.pxi" # XPath evaluation -include "xslt.pxi" # XSL transformations +include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.) +include "apihelpers.pxi" # Private helper functions +include "xmlerror.pxi" # Error and log handling +include "classlookup.pxi" # Element class lookup mechanisms +include "nsclasses.pxi" # Namespace implementation and registry +include "docloader.pxi" # Support for custom document loaders +include "parser.pxi" # XML Parser +include "parsertarget.pxi" # ET Parser target +include "serializer.pxi" # XML output functions +include "iterparse.pxi" # incremental XML parsing +include "xmlid.pxi" # XMLID and IDDict +include "extensions.pxi" # XPath/XSLT extension functions +include "xpath.pxi" # XPath evaluation +include "xslt.pxi" # XSL transformations ################################################################################ Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Mon Sep 10 16:23:22 2007 @@ -17,7 +17,11 @@ cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size) cdef xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, 
char* encoding) cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) + cdef void htmlCtxtReset(xmlParserCtxt* ctxt) + cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) cdef int htmlParseDocument(xmlParserCtxt* ctxt) + cdef int htmlParseChunk(xmlParserCtxt* ctxt, + char* chunk, int size, int terminate) cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt, char* filename, char* encoding, Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Mon Sep 10 16:23:22 2007 @@ -48,7 +48,7 @@ c_ns = c_ns.next return count -cdef class _IterparseContext(_ResolverContext): +cdef class _IterparseContext(_ParserContext): cdef xmlparser.startElementNsSAX2Func _origSaxStart cdef xmlparser.endElementNsSAX2Func _origSaxEnd cdef _Element _root @@ -64,8 +64,8 @@ cdef char* _tag_href cdef char* _tag_name - def __init__(self, _ResolverRegistry resolvers): - _ResolverContext.__init__(self, resolvers) + def __init__(self): + _ParserContext.__init__(self) self._ns_stack = [] self._pop_ns = self._ns_stack.pop self._node_stack = [] @@ -73,22 +73,25 @@ self._events = [] self._event_index = 0 - cdef void _wrapCallbacks(self, xmlparser.xmlSAXHandler* sax): + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): "wrap original SAX2 callbacks" + cdef xmlparser.xmlSAXHandler* sax + _ParserContext._initParserContext(self, c_ctxt) + sax = c_ctxt.sax self._origSaxStart = sax.startElementNs # only override start event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_START | \ ITERPARSE_FILTER_START_NS | \ ITERPARSE_FILTER_END_NS): - sax.startElementNs = _saxStart + sax.startElementNs = _iterparseSaxStart self._origSaxEnd = sax.endElementNs # only override end event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_END | \ ITERPARSE_FILTER_END_NS): - 
sax.endElementNs = _saxEnd + sax.endElementNs = _iterparseSaxEnd cdef _setEventFilter(self, events, tag): self._event_filter = _buildIterparseEventFilter(events) @@ -184,9 +187,10 @@ cdef xmlparser.endElementNsSAX2Func _getOrigEnd(xmlparser.xmlParserCtxt* c_ctxt): return (<_IterparseContext>c_ctxt._private)._origSaxEnd -cdef void _saxStart(void* ctxt, char* localname, char* prefix, char* URI, - int nb_namespaces, char** namespaces, - int nb_attributes, int nb_defaulted, char** attributes): +cdef void _iterparseSaxStart(void* ctxt, char* localname, char* prefix, + char* URI, int nb_namespaces, char** namespaces, + int nb_attributes, int nb_defaulted, + char** attributes): # no Python in here! cdef xmlparser.xmlParserCtxt* c_ctxt cdef xmlparser.startElementNsSAX2Func origStart @@ -196,7 +200,7 @@ nb_attributes, nb_defaulted, attributes) _pushSaxStartEvent(c_ctxt, c_ctxt.node) -cdef void _saxEnd(void* ctxt, char* localname, char* prefix, char* URI): +cdef void _iterparseSaxEnd(void* ctxt, char* localname, char* prefix, char* URI): # no Python in here! 
cdef xmlparser.xmlParserCtxt* c_ctxt cdef xmlparser.endElementNsSAX2Func origEnd @@ -276,15 +280,17 @@ parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, - _IterparseContext) + None) context = <_IterparseContext>self._context context._setEventFilter(events, tag) - context._wrapCallbacks(self._parser_ctxt.sax) xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options) xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL) self._lockParser() # will not be unlocked - no other methods supported + cdef _ParserContext _createContext(self, target): + return _IterparseContext() + def __iter__(self): return self @@ -318,7 +324,8 @@ break if error != 0: self._source = None - _raiseParseError(self._parser_ctxt, self._filename, None) + _raiseParseError(self._parser_ctxt, self._filename, + self._context._error_log) if python.PyList_GET_SIZE(context._events) == 0: self.root = context._root self._source = None Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Mon Sep 10 16:23:22 2007 @@ -2,7 +2,6 @@ cimport xmlparser cimport htmlparser -from xmlparser cimport xmlParserCtxt, xmlDict class ParseError(LxmlSyntaxError): """Syntax error while parsing an XML document. @@ -26,17 +25,17 @@ LXML_HTML_PARSER LXML_ITERPARSE_PARSER -cdef class _ParserContext: +cdef class _ParserDictionaryContext: # Global parser context to share the string dictionary. # - # This class is a singleton! + # This class is a delegate singleton! # - # It creates _ParserContext objects for each thread to keep thread state, + # It creates _ParserDictionaryContext objects for each thread to keep thread state, # but those must never be used directly. Always stick to using the static # __GLOBAL_PARSER_CONTEXT as defined below the class. 
# - cdef xmlDict* _c_dict + cdef tree.xmlDict* _c_dict cdef _BaseParser _default_parser def __dealloc__(self): if self._c_dict is not NULL: @@ -49,33 +48,33 @@ cdef python.PyObject* result thread_dict = python.PyThreadState_GetDict() if thread_dict is not NULL: - python.PyDict_SetItem(thread_dict, "_ParserContext", self) + python.PyDict_SetItem(thread_dict, "_ParserDictionaryContext", self) - cdef _ParserContext _findThreadParserContext(self): - "Find (or create) the _ParserContext object for the current thread" + cdef _ParserDictionaryContext _findThreadParserContext(self): + "Find (or create) the _ParserDictionaryContext object for the current thread" cdef python.PyObject* thread_dict cdef python.PyObject* result - cdef _ParserContext context + cdef _ParserDictionaryContext context thread_dict = python.PyThreadState_GetDict() if thread_dict is NULL: return self d = thread_dict - result = python.PyDict_GetItem(d, "_ParserContext") + result = python.PyDict_GetItem(d, "_ParserDictionaryContext") if result is not NULL: return result - context = _ParserContext() - python.PyDict_SetItem(d, "_ParserContext", context) + context = _ParserDictionaryContext() + python.PyDict_SetItem(d, "_ParserDictionaryContext", context) return context cdef void setDefaultParser(self, _BaseParser parser): "Set the default parser for the current thread" - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() context._default_parser = parser cdef _BaseParser getDefaultParser(self): "Return (or create) the default parser of the current thread" - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() if context._default_parser is None: if self._default_parser is None: @@ -84,9 +83,9 @@ context._default_parser = self._default_parser._copy() return context._default_parser - cdef xmlDict* _getThreadDict(self, xmlDict* default): + cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* 
default): "Return the thread-local dict or create a new one if necessary." - cdef _ParserContext context + cdef _ParserDictionaryContext context context = self._findThreadParserContext() if context._c_dict is NULL: # thread dict not yet set up => use default or create a new one @@ -100,9 +99,9 @@ context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict) return context._c_dict - cdef void initThreadDictRef(self, xmlDict** c_dict_ref): - cdef xmlDict* c_dict - cdef xmlDict* c_thread_dict + cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref): + cdef tree.xmlDict* c_dict + cdef tree.xmlDict* c_thread_dict c_dict = c_dict_ref[0] c_thread_dict = self._getThreadDict(c_dict) if c_dict is c_thread_dict: @@ -112,7 +111,7 @@ c_dict_ref[0] = c_thread_dict xmlparser.xmlDictReference(c_thread_dict) - cdef void initParserDict(self, xmlParserCtxt* pctxt): + cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt): "Assure we always use the same string dictionary." self.initThreadDictRef(&pctxt.dict) @@ -127,11 +126,11 @@ # otherwise we'd free data that's in use => segfault self.initThreadDictRef(&result.dict) -cdef _ParserContext __GLOBAL_PARSER_CONTEXT -__GLOBAL_PARSER_CONTEXT = _ParserContext() +cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT +__GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext() __GLOBAL_PARSER_CONTEXT.initMainParserContext() -cdef int _checkThreadDict(xmlDict* c_dict): +cdef int _checkThreadDict(tree.xmlDict* c_dict): """Check that c_dict is either the local thread dictionary or the global parent dictionary. 
""" @@ -205,7 +204,7 @@ ## support for file-like objects ############################################################ -cdef class _FileParserContext: +cdef class _FileReaderContext: cdef object _filelike cdef object _url cdef object _bytes @@ -223,14 +222,15 @@ self._bytes = '' self._bytes_read = 0 - cdef xmlparser.xmlParserInput* _createParserInput(self, xmlParserCtxt* ctxt): + cdef xmlparser.xmlParserInput* _createParserInput( + self, xmlparser.xmlParserCtxt* ctxt): cdef xmlparser.xmlParserInputBuffer* c_buffer c_buffer = xmlparser.xmlAllocParserInputBuffer(0) c_buffer.context = self c_buffer.readcallback = _readFilelikeParser return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0) - cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options, + cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options, LxmlParserType parser_type): cdef python.PyThreadState* state cdef xmlDoc* result @@ -291,19 +291,19 @@ return -1 cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size): - return (<_FileParserContext>ctxt).copyToBuffer(c_buffer, c_size) + return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size) ############################################################ ## support for custom document loaders ############################################################ cdef xmlparser.xmlParserInput* _parser_resolve_from_python( - char* c_url, char* c_pubid, xmlParserCtxt* c_context, int* error): + char* c_url, char* c_pubid, xmlparser.xmlParserCtxt* c_context, int* error): # call the Python document loaders cdef xmlparser.xmlParserInput* c_input cdef _ResolverContext context cdef _InputDocument doc_ref - cdef _FileParserContext file_context + cdef _FileReaderContext file_context error[0] = 0 context = <_ResolverContext>c_context._private try: @@ -338,7 +338,7 @@ c_input = xmlparser.xmlNewInputFromFile( c_context, _cstr(doc_ref._data_bytes)) elif doc_ref._type == PARSER_DATA_FILE: - file_context = _FileParserContext(doc_ref._file, context, url) + 
file_context = _FileReaderContext(doc_ref._file, context, url) c_input = file_context._createParserInput(c_context) data = file_context @@ -347,7 +347,7 @@ return c_input cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid, - xmlParserCtxt* c_context): + xmlparser.xmlParserCtxt* c_context): # no Python objects here, may be called without thread context ! # when we declare a Python object, Pyrex will INCREF(None) ! cdef xmlparser.xmlParserInput* c_input @@ -379,42 +379,145 @@ ## Parsers ############################################################ +cdef class _ParserContext(_ResolverContext): + cdef _ErrorLog _error_log + cdef xmlparser.xmlParserCtxt* _c_ctxt + def __init__(self): + _ResolverContext.__init__(self, _ResolverRegistry()) + self._error_log = _ErrorLog() + + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): + self._c_ctxt = c_ctxt + + cdef object _handleParseResult(self, _BaseParser parser, + xmlDoc* result, filename): + cdef xmlDoc* c_doc + cdef int recover + recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER + c_doc = _handleParseResult(self, self._c_ctxt, result, + filename, recover) + return _documentFactory(c_doc, parser) + + cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser, + xmlDoc* result, filename) except NULL: + cdef int recover + recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER + return _handleParseResult(self, self._c_ctxt, result, + filename, recover) + +cdef class _InternalParserContext(_ParserContext): + """Parser context for internal single-shot parsing + """ + +cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename, + _ErrorLog error_log) except 0: + if filename is not None and \ + ctxt.lastError.domain == xmlerror.XML_FROM_IO: + if ctxt.lastError.message is not NULL: + message = "Error reading file '%s': %s" % ( + filename, (ctxt.lastError.message).strip()) + else: + message = "Error reading file '%s'" % filename + raise IOError, message + elif 
error_log: + raise XMLSyntaxError, error_log._buildExceptionMessage( + "Document is not well formed") + elif ctxt.lastError.message is not NULL: + message = (ctxt.lastError.message).strip() + if ctxt.lastError.line > 0: + message = "line %d: %s" % (ctxt.lastError.line, message) + raise XMLSyntaxError, message + else: + raise XMLSyntaxError + +cdef xmlDoc* _handleParseResult(_ParserContext context, + xmlparser.xmlParserCtxt* c_ctxt, + xmlDoc* result, filename, + int recover) except NULL: + cdef int well_formed + if c_ctxt.myDoc is not NULL: + if c_ctxt.myDoc != result: + tree.xmlFreeDoc(c_ctxt.myDoc) + c_ctxt.myDoc = NULL + + if result is not NULL: + if recover or (c_ctxt.wellFormed and \ + c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): + well_formed = 1 + elif not c_ctxt.replaceEntities and not c_ctxt.validate \ + and context is not None: + # in this mode, we ignore errors about undefined entities + for error in context._error_log.filter_from_errors(): + if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ + error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: + well_formed = 0 + break + else: + well_formed = 1 + else: + well_formed = 0 + + if well_formed: + __GLOBAL_PARSER_CONTEXT.initDocDict(result) + else: + # free broken document + tree.xmlFreeDoc(result) + result = NULL + + if context is not None and context._has_raised(): + if result is not NULL: + tree.xmlFreeDoc(result) + result = NULL + context._raise_if_stored() + + if result is NULL: + if context is not None: + _raiseParseError(c_ctxt, filename, context._error_log) + else: + _raiseParseError(c_ctxt, filename, None) + elif result.URL is NULL and filename is not None: + result.URL = tree.xmlStrdup(_cstr(filename)) + return result + + cdef class _BaseParser: cdef int _parse_options - cdef _ErrorLog _error_log - cdef readonly _ResolverRegistry resolvers - cdef _ResolverContext _context + cdef _ParserContext _context cdef LxmlParserType _parser_type - cdef xmlParserCtxt* _parser_ctxt + cdef 
xmlparser.xmlParserCtxt* _parser_ctxt cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock cdef int _feed_parser_running def __init__(self, int parse_options, remove_comments, remove_pis, - context_class=_ResolverContext): - cdef xmlParserCtxt* pctxt + target): + cdef xmlparser.xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER - pctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) elif isinstance(self, XMLParser): self._parser_type = LXML_XML_PARSER - pctxt = xmlparser.xmlNewParserCtxt() elif isinstance(self, iterparse): self._parser_type = LXML_ITERPARSE_PARSER - pctxt = xmlparser.xmlNewParserCtxt() else: raise TypeError, "This class cannot be instantiated" + self._parse_options = parse_options + + pctxt = self._newParserCtxt() self._parser_ctxt = pctxt if pctxt is NULL: python.PyErr_NoMemory() - if pctxt.sax != NULL: - if remove_comments: - pctxt.sax.comment = NULL - if remove_pis: - pctxt.sax.processingInstruction = NULL - # hard switch-off for CDATA nodes => makes them plain text - pctxt.sax.cdataBlock = NULL + + self._context = self._createContext(target) + self._context._initParserContext(pctxt) + pctxt._private = self._context + + if remove_comments: + pctxt.sax.comment = NULL + if remove_pis: + pctxt.sax.processingInstruction = NULL + # hard switch-off for CDATA nodes => makes them plain text + pctxt.sax.cdataBlock = NULL if not config.ENABLE_THREADING or \ self._parser_type == LXML_ITERPARSE_PARSER: @@ -422,10 +525,18 @@ self._parser_lock = NULL else: self._parser_lock = python.PyThread_allocate_lock() - self._error_log = _ErrorLog() - self.resolvers = _ResolverRegistry() - self._context = context_class(self.resolvers) - pctxt._private = self._context + + cdef _ParserContext _createContext(self, target): + if target is not None: + return _TargetParserContext(target) + else: + return _InternalParserContext() + + cdef xmlparser.xmlParserCtxt* _newParserCtxt(self): + if 
self._parser_type == LXML_HTML_PARSER: + return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) + else: + return xmlparser.xmlNewParserCtxt() def __dealloc__(self): if self._parser_ctxt is not NULL: @@ -434,7 +545,7 @@ python.PyThread_free_lock(self._parser_lock) cdef void _cleanup(self): - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt pctxt = self._parser_ctxt if pctxt is not NULL: if pctxt.spaceTab is not NULL: # work around bug in libxml2 @@ -458,7 +569,11 @@ property error_log: def __get__(self): - return self._error_log.copy() + return self._context._error_log.copy() + + property resolvers: + def __get__(self): + return self._context._resolvers def setElementClassLookup(self, ElementClassLookup lookup = None): "Deprecated, use ``parser.set_element_class_lookup(lookup)`` instead." @@ -497,114 +612,6 @@ def __get__(self): return "libxml2 %d.%d.%d" % LIBXML_VERSION - # feed parser interface - - def feed(self, data): - """Feeds data to the parser. The argument should be an 8-bit string - buffer containing encoded data, although Unicode is supported as long - as both string types are not mixed. - - This is the main entry point to the consumer interface of a parser. - The parser will parse as much of the XML stream as it can on each - call. To finish parsing, call the ``close()`` method. - - It is not possible to use the parser in any other way after calling - the ``feed()`` method. The parser can only be reset by calling - ``close()``. 
- """ - cdef xmlParserCtxt* pctxt - cdef Py_ssize_t py_buffer_len - cdef char* c_data - cdef char* c_encoding - cdef int buffer_len - cdef int error - cdef int recover - if python.PyString_Check(data): - c_encoding = NULL - c_data = _cstr(data) - py_buffer_len = python.PyString_GET_SIZE(data) - elif python.PyUnicode_Check(data): - if _UNICODE_ENCODING is NULL: - raise ParserError, \ - "Unicode parsing is not supported on this platform" - c_encoding = _UNICODE_ENCODING - c_data = python.PyUnicode_AS_DATA(data) - py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) - else: - raise TypeError, "Parsing requires string data" - - pctxt = self._parser_ctxt - error = 0 - if not self._feed_parser_running: - self._lockParser() - self._feed_parser_running = 1 - self._error_log.connect() - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) - - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - - error = xmlparser.xmlCtxtResetPush( - pctxt, c_data, buffer_len, NULL, c_encoding) - - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + buffer_len - - while error == 0 and py_buffer_len > 0: - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - py_buffer_len = py_buffer_len - buffer_len - error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) - c_data = c_data + buffer_len - - if error: - self._feed_parser_running = 0 - try: - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - _handleParseResult(pctxt, pctxt.myDoc, None, - self._error_log, recover) - finally: - self._cleanup() - self._context.clear() - self._error_log.disconnect() - self._unlockParser() - - def close(self): - """Finishes feeding of data to this parser. This tells the parser to - process any remaining data in the feed buffer, and then returns the - root Element of the tree that was parsed. 
- - This method must be called after passing the last chunk of data into - the ``feed()`` method. It should only be called when using the feed - parser interface, all other usage is undefined. - """ - cdef xmlParserCtxt* pctxt - cdef xmlDoc* c_doc - cdef _Document doc - cdef int error - if not self._feed_parser_running: - raise XMLSyntaxError, "no element found" - pctxt = self._parser_ctxt - self._feed_parser_running = 0 - error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) - try: - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - c_doc = _handleParseResult(pctxt, pctxt.myDoc, None, - self._error_log, recover) - finally: - self._cleanup() - self._context.clear() - self._error_log.disconnect() - self._unlockParser() - - doc = _documentFactory(c_doc, self) - return doc.getroot() - # internal parser methods cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL: @@ -612,7 +619,7 @@ """ cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover cdef Py_ssize_t py_buffer_len cdef int buffer_len @@ -625,7 +632,7 @@ buffer_len = py_buffer_len self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -642,13 +649,11 @@ self._parse_options) python.PyEval_RestoreThread(state) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, None, - self._error_log, recover) + return self._context._handleParseResultDoc(self, result, None) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len, @@ -657,12 +662,12 @@ """ cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover if c_len > python.INT_MAX: raise 
ParserError, "string is too long to parse it with libxml2" self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -676,24 +681,22 @@ pctxt, c_text, c_len, c_filename, NULL, self._parse_options) python.PyEval_RestoreThread(state) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, None, - self._error_log, recover) + return self._context._handleParseResultDoc(self, result, None) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL: cdef python.PyThreadState* state cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef int recover cdef int orig_options result = NULL self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) @@ -709,108 +712,182 @@ python.PyEval_RestoreThread(state) pctxt.options = orig_options # work around libxml2 problem - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, c_filename, - self._error_log, recover) + return self._context._handleParseResultDoc( + self, result, c_filename) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL: - cdef _FileParserContext file_context + cdef _FileReaderContext file_context cdef xmlDoc* result - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef char* c_filename cdef int recover if not filename: filename = None self._lockParser() - self._error_log.connect() + self._context._error_log.connect() try: pctxt = self._parser_ctxt 
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - file_context = _FileParserContext(filelike, self._context, filename) + file_context = _FileReaderContext(filelike, self._context, filename) result = file_context._readDoc( pctxt, self._parse_options, self._parser_type) - recover = self._parse_options & xmlparser.XML_PARSE_RECOVER - return _handleParseResult(pctxt, result, filename, - self._error_log, recover) + return self._context._handleParseResultDoc( + self, result, filename) finally: self._cleanup() self._context.clear() - self._error_log.disconnect() + self._context._error_log.disconnect() self._unlockParser() -cdef int _raiseParseError(xmlParserCtxt* ctxt, filename, - _ErrorLog error_log) except 0: - if filename is not None and \ - ctxt.lastError.domain == xmlerror.XML_FROM_IO: - if ctxt.lastError.message is not NULL: - message = "Error reading file '%s': %s" % ( - filename, (ctxt.lastError.message).strip()) +############################################################ +## ET feed parser +############################################################ + +cdef class _FeedParser(_BaseParser): + def feed(self, data): + """Feeds data to the parser. The argument should be an 8-bit string + buffer containing encoded data, although Unicode is supported as long + as both string types are not mixed. + + This is the main entry point to the consumer interface of a parser. + The parser will parse as much of the XML stream as it can on each + call. To finish parsing, call the ``close()`` method. + + It is not possible to use the parser in any other way after calling + the ``feed()`` method. The parser can only be reset by calling + ``close()``. 
+ """ + cdef xmlparser.xmlParserCtxt* pctxt + cdef Py_ssize_t py_buffer_len + cdef char* c_data + cdef char* c_encoding + cdef int buffer_len + cdef int error + cdef int recover + if python.PyString_Check(data): + c_encoding = NULL + c_data = _cstr(data) + py_buffer_len = python.PyString_GET_SIZE(data) + elif python.PyUnicode_Check(data): + if _UNICODE_ENCODING is NULL: + raise ParserError, \ + "Unicode parsing is not supported on this platform" + c_encoding = _UNICODE_ENCODING + c_data = python.PyUnicode_AS_DATA(data) + py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data) else: - message = "Error reading file '%s'" % filename - raise IOError, message - elif error_log is not None: - raise XMLSyntaxError, error_log._buildExceptionMessage( - "Document is not well formed") - elif ctxt.lastError.message is not NULL: - message = (ctxt.lastError.message).strip() - if ctxt.lastError.line > 0: - message = "line %d: %s" % (ctxt.lastError.line, message) - raise XMLSyntaxError, message - else: - raise XMLSyntaxError + raise TypeError, "Parsing requires string data" -cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result, - filename, _ErrorLog error_log, - int recover) except NULL: - cdef _ResolverContext context - cdef int well_formed - if ctxt.myDoc is not NULL: - if ctxt.myDoc != result: - tree.xmlFreeDoc(ctxt.myDoc) - ctxt.myDoc = NULL + pctxt = self._parser_ctxt + error = 0 + if not self._feed_parser_running: + self._lockParser() + self._feed_parser_running = 1 + self._context._error_log.connect() + __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - if result is not NULL: - if recover or (ctxt.wellFormed and \ - ctxt.lastError.level < xmlerror.XML_ERR_ERROR): - well_formed = 1 - elif not ctxt.replaceEntities and not ctxt.validate: - # in this mode, we ignore errors about undefined entities - for error in error_log.filter_from_errors(): - if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \ - error.type != ErrorTypes.ERR_UNDECLARED_ENTITY: - well_formed = 0 - 
break + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX else: - well_formed = 1 + buffer_len = py_buffer_len + if self._parser_type == LXML_HTML_PARSER: + error = _htmlCtxtResetPush(pctxt, c_data, buffer_len, + c_encoding, self._parse_options) + else: + error = xmlparser.xmlCtxtResetPush( + pctxt, c_data, buffer_len, NULL, c_encoding) + xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len + + while error == 0 and py_buffer_len > 0: + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + else: + buffer_len = py_buffer_len + if self._parser_type == LXML_HTML_PARSER: + error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) + else: + error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + py_buffer_len = py_buffer_len - buffer_len + c_data = c_data + buffer_len + + if error: + self._feed_parser_running = 0 + try: + self._context._handleParseResult( + self, pctxt.myDoc, None) + finally: + self._cleanup() + self._context.clear() + self._context._error_log.disconnect() + self._unlockParser() + + def close(self): + """Terminates feeding data to this parser. This tells the parser to + process any remaining data in the feed buffer, and then returns the + root Element of the tree that was parsed. + + This method must be called after passing the last chunk of data into + the ``feed()`` method. It should only be called when using the feed + parser interface, all other usage is undefined. 
+ """ + cdef xmlparser.xmlParserCtxt* pctxt + cdef xmlDoc* c_doc + cdef _Document doc + cdef int is_target_parser, error + if not self._feed_parser_running: + raise XMLSyntaxError, "no element found" + pctxt = self._parser_ctxt + self._feed_parser_running = 0 + if self._parser_type == LXML_HTML_PARSER: + error = htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) else: - well_formed = 0 + error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) + is_target_parser = isinstance(self._context, _TargetParserContext) + try: + result = self._context._handleParseResult( + self, pctxt.myDoc, None) + finally: + self._cleanup() + self._context.clear() + self._context._error_log.disconnect() + self._unlockParser() - if well_formed: - __GLOBAL_PARSER_CONTEXT.initDocDict(result) + if isinstance(result, _Document): + return (<_Document>result).getroot() else: - # free broken document - tree.xmlFreeDoc(result) - result = NULL + return result - if ctxt._private is not NULL: - context = <_ResolverContext>ctxt._private - if context._has_raised(): - if result is not NULL: - tree.xmlFreeDoc(result) - result = NULL - context._raise_if_stored() +cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt, + char* c_data, int buffer_len, + char* c_encoding, int parse_options) except -1: + cdef xmlparser.xmlParserInput* c_input_stream + # libxml2 crashes if spaceTab is not initialised + if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL: + c_ctxt.spaceTab = tree.xmlMalloc(10 * sizeof(int)) + c_ctxt.spaceMax = 10 - if result is NULL: - _raiseParseError(ctxt, filename, error_log) - elif result.URL is NULL and filename is not None: - result.URL = tree.xmlStrdup(_cstr(filename)) - return result + # libxml2 lacks an HTML push parser setup function + error = xmlparser.xmlCtxtResetPush(c_ctxt, NULL, 0, NULL, c_encoding) + if error: + return error + + # fix libxml2 setup for HTML + c_ctxt.progressive = 1 + c_ctxt.html = 1 + htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options) + + if c_data is not NULL 
and buffer_len > 0: + return htmlparser.htmlParseChunk(c_ctxt, c_data, buffer_len, 0) + return 0 + ############################################################ ## XML parser @@ -824,7 +901,7 @@ xmlparser.XML_PARSE_COMPACT ) -cdef class XMLParser(_BaseParser): +cdef class XMLParser(_FeedParser): """The XML parser. Parsers can be supplied as additional argument to various parse functions of the lxml API. A default parser is always available and can be replaced by a call to the global function @@ -848,6 +925,8 @@ * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) + You can pass a parser target as ``target`` keyword argument. + Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. This does not apply to the default parser. @@ -856,7 +935,7 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False): + remove_pis=False, target=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -880,7 +959,8 @@ if not resolve_entities: parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT - _BaseParser.__init__(self, parse_options, remove_comments, remove_pis) + _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, + target) cdef class ETCompatXMLParser(XMLParser): """An XML parser with an ElementTree compatible default setup. 
See the @@ -893,18 +973,18 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True): + remove_pis=True, target=None): XMLParser.__init__(self, attribute_defaults, dtd_validation, load_dtd, no_network, ns_clean, recover, remove_blank_text, compact, resolve_entities, remove_comments, - remove_pis) + remove_pis, target) cdef xmlDoc* _internalParseDoc(char* c_text, int options, _ResolverContext context) except NULL: # internal parser function for XSLT - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef xmlDoc* c_doc cdef int recover pctxt = xmlparser.xmlNewParserCtxt() @@ -916,7 +996,7 @@ pctxt, c_text, NULL, NULL, options) try: recover = options & xmlparser.XML_PARSE_RECOVER - c_doc = _handleParseResult(pctxt, c_doc, None, None, recover) + c_doc = _handleParseResult(None, pctxt, c_doc, None, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -924,7 +1004,7 @@ cdef xmlDoc* _internalParseDocFromFile(char* c_filename, int options, _ResolverContext context) except NULL: # internal parser function for XSLT - cdef xmlParserCtxt* pctxt + cdef xmlparser.xmlParserCtxt* pctxt cdef xmlDoc* c_doc cdef int recover pctxt = xmlparser.xmlNewParserCtxt() @@ -940,7 +1020,7 @@ filename = None else: filename = c_filename - c_doc = _handleParseResult(pctxt, c_doc, filename, None, recover) + c_doc = _handleParseResult(None, pctxt, c_doc, filename, recover) finally: xmlparser.xmlFreeParserCtxt(pctxt) return c_doc @@ -987,7 +1067,7 @@ htmlparser.HTML_PARSE_COMPACT ) -cdef class HTMLParser(_BaseParser): +cdef class HTMLParser(_FeedParser): """The HTML parser. This parser allows reading HTML into a normal XML tree. By default, it can read broken (non well-formed) HTML, depending on the capabilities of libxml2. Use the 'recover' option to switch this off. 
@@ -1000,11 +1080,14 @@ * remove_pis - discard processing instructions * compact - safe memory for short text content (default: True) + You can pass a parser target as ``target`` keyword argument. + Note that you should avoid sharing parsers between threads for performance reasons. """ def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True, remove_comments=False, remove_pis=False): + compact=True, remove_comments=False, remove_pis=False, + target=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: @@ -1016,7 +1099,8 @@ if not compact: parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT - _BaseParser.__init__(self, parse_options, remove_comments, remove_pis) + _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, + target) cdef HTMLParser __DEFAULT_HTML_PARSER __DEFAULT_HTML_PARSER = HTMLParser() Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 10 16:23:22 2007 @@ -2538,6 +2538,8 @@ # ElementTree 1.3+, cET self.assert_(re.match("[^ ]+ [0-9.]+", parser.version)) + # feed parser interface + def test_feed_parser(self): parser = self.etree.XMLParser() @@ -2579,6 +2581,81 @@ self.assertRaises(Exception, parser.close) + # parser target interface + + def test_parser_target_tag(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start") + assertFalse(attrib) + assertEquals("TAG", tag) + def end(self, tag): + events.append("end") + assertEquals("TAG", tag) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed("") + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start", "end"], 
events) + + def test_parser_target_attrib(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + for name, value in attrib.iteritems(): + assertEquals(tag + name, value) + def end(self, tag): + events.append("end-" + tag) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed('') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "start-sub", "end-sub", "end-root"], + events) + + def test_parser_target_data(self): + assertEquals = self.assertEquals + assertFalse = self.assertFalse + + events = [] + class Target(object): + def start(self, tag, attrib): + events.append("start-" + tag) + def end(self, tag): + events.append("end-" + tag) + def data(self, data): + events.append("data-" + data) + def close(self): + return "DONE" + + parser = self.etree.XMLParser(target=Target()) + + parser.feed('AB') + done = parser.close() + + self.assertEquals("DONE", done) + self.assertEquals(["start-root", "data-A", "start-sub", + "end-sub", "data-B", "end-root"], + events) + # helper methods def _writeElement(self, element, encoding='us-ascii'): Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Mon Sep 10 16:23:22 2007 @@ -19,17 +19,25 @@ char* prefix, char* URI) - ctypedef void (*cdataBlockSAXFunc)(void* ctx, - char* value, - int len) + ctypedef void (*charactersSAXFunc)(void* ctx, char* ch, int len) - ctypedef void (*commentSAXFunc)(void* ctx, - char* value) + ctypedef void (*cdataBlockSAXFunc)(void* ctx, char* value, int len) - ctypedef void (*processingInstructionSAXFunc)(void * ctx, + ctypedef void (*commentSAXFunc)(void* ctx, char* value) + + ctypedef void (*processingInstructionSAXFunc)(void* ctx, char* 
target, char* data) + ctypedef void (*internalSubsetSAXFunc)(void* ctx, + char* name, + char* externalID, + char* systemID) + + ctypedef void (*endDocumentSAXFunc)(void* ctx) + + cdef int XML_SAX2_MAGIC + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -38,11 +46,15 @@ xmlInputCloseCallback closecallback ctypedef struct xmlSAXHandler: + internalSubsetSAXFunc internalSubset startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs + charactersSAXFunc characters cdataBlockSAXFunc cdataBlock commentSAXFunc comment processingInstructionSAXFunc processingInstruction + endDocumentSAXFunc endDocument + int initialized cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) @@ -54,6 +66,8 @@ cdef void xmlDictFree(xmlDict* sub) cdef int xmlDictReference(xmlDict* dict) + cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes + ctypedef struct xmlParserCtxt: xmlDoc* myDoc xmlDict* dict @@ -64,11 +78,16 @@ int disableSAX int errNo int replaceEntities + int loadsubset int validate xmlError lastError xmlNode* node xmlSAXHandler* sax int* spaceTab + int spaceMax + int html + int progressive + int charset ctypedef enum xmlParserOption: XML_PARSE_RECOVER = 1 # recover on errors From scoder at codespeak.net Tue Sep 11 12:03:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 11 Sep 2007 12:03:55 +0200 (CEST) Subject: [Lxml-checkins] r46463 - lxml/trunk/src/lxml Message-ID: <20070911100355.A56EF80AF@code0.codespeak.net> Author: scoder Date: Tue Sep 11 12:03:54 2007 New Revision: 46463 Added: lxml/trunk/src/lxml/objectpath.pxi - copied, changed from r46423, lxml/trunk/src/lxml/objectify.pyx Modified: lxml/trunk/src/lxml/objectify.pyx Log: moved objectpath implementation to separate .pxi Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- 
lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 11 12:03:54 2007 @@ -15,9 +15,6 @@ __version__ = etree.__version__ -cdef object SubElement -SubElement = etree.SubElement - cdef object re import re cdef object __builtin__ @@ -494,7 +491,8 @@ for item in value: _appendValue(parent, tag, item) else: - new_element = SubElement(parent, tag) + new_element = cetree.makeSubElement( + parent, tag, None, None, None, None) _setElementValue(new_element, value) cdef _setElementValue(_Element element, value): @@ -1294,341 +1292,6 @@ ################################################################################ -# ObjectPath - -ctypedef struct _ObjectPath: - char* href - char* name - Py_ssize_t index - -cdef class ObjectPath: - """Immutable object that represents a compiled object path. - - Example for a path: 'root.child[1].{other}child[25]' - """ - cdef readonly object find - cdef object _path - cdef object _path_str - cdef _ObjectPath* _c_path - cdef Py_ssize_t _path_len - def __init__(self, path): - if python._isString(path): - self._path = _parseObjectPathString(path) - self._path_str = path - else: - self._path = _parseObjectPathList(path) - self._path_str = '.'.join(path) - self._path_len = python.PyList_GET_SIZE(self._path) - self._c_path = _buildObjectPathSegments(self._path) - self.find = self.__call__ - - def __dealloc__(self): - if self._c_path is not NULL: - python.PyMem_Free(self._c_path) - - def __str__(self): - return self._path_str - - def __call__(self, _Element root not None, *default): - """Follow the attribute path in the object structure and return the - target attribute value. - - If it it not found, either returns a default value (if one was passed - as second argument) or raises AttributeError. 
- """ - cdef Py_ssize_t use_default - use_default = python.PyTuple_GET_SIZE(default) - if use_default == 1: - default = python.PyTuple_GET_ITEM(default, 0) - python.Py_INCREF(default) - use_default = 1 - elif use_default > 1: - raise TypeError, "invalid number of arguments: needs one or two" - return _findObjectPath(root, self._c_path, self._path_len, - default, use_default) - - def hasattr(self, _Element root not None): - try: - _findObjectPath(root, self._c_path, self._path_len, None, 0) - except AttributeError: - return False - return True - - def setattr(self, _Element root not None, value): - """Set the value of the target element in a subtree. - - If any of the children on the path does not exist, it is created. - """ - _createObjectPath(root, self._c_path, self._path_len, 1, value) - - def addattr(self, _Element root not None, value): - """Append a value to the target element in a subtree. - - If any of the children on the path does not exist, it is created. - """ - _createObjectPath(root, self._c_path, self._path_len, 0, value) - -cdef object __MATCH_PATH_SEGMENT -__MATCH_PATH_SEGMENT = re.compile( - r"(\.?)\s*(?:\{([^}]*)\})?\s*([^.{}\[\]\s]+)\s*(?:\[\s*([-0-9]+)\s*\])?", - re.U).match - -cdef object _RELATIVE_PATH_SEGMENT -_RELATIVE_PATH_SEGMENT = (None, None, 0) - -cdef _parseObjectPathString(path): - """Parse object path string into a 'hrefOnameOhrefOnameOOO' string and an - index list. The index list is None if no index was used in the path. - """ - cdef int has_dot - new_path = [] - path = cetree.utf8(path.strip()) - if path == '.': - return [_RELATIVE_PATH_SEGMENT] - path_pos = 0 - while python.PyString_GET_SIZE(path) > 0: - match = __MATCH_PATH_SEGMENT(path, path_pos) - if match is None: - break - - dot, ns, name, index = match.groups() - if index is None or python.PyString_GET_SIZE(index) == 0: - index = 0 - else: - index = python.PyNumber_Int(index) - has_dot = _cstr(dot)[0] == c'.' 
- if python.PyList_GET_SIZE(new_path) == 0: - if has_dot: - # path '.child' => ignore root - python.PyList_Append(new_path, _RELATIVE_PATH_SEGMENT) - elif index != 0: - raise ValueError, "index not allowed on root node" - elif not has_dot: - raise ValueError, "invalid path" - python.PyList_Append(new_path, (ns, name, index)) - - path_pos = match.end() - if python.PyList_GET_SIZE(new_path) == 0 or \ - python.PyString_GET_SIZE(path) > path_pos: - raise ValueError, "invalid path" - return new_path - -cdef _parseObjectPathList(path): - """Parse object path sequence into a 'hrefOnameOhrefOnameOOO' string and - an index list. The index list is None if no index was used in the path. - """ - cdef char* index_pos - cdef char* index_end - cdef char* c_name - new_path = [] - for item in path: - item = item.strip() - if python.PyList_GET_SIZE(new_path) == 0 and item == '': - # path '.child' => ignore root - ns = name = None - index = 0 - else: - ns, name = cetree.getNsTag(item) - c_name = _cstr(name) - index_pos = cstd.strchr(c_name, c'[') - if index_pos is NULL: - index = 0 - else: - name = python.PyString_FromStringAndSize( - c_name, (index_pos - c_name)) - index_pos = index_pos + 1 - index_end = cstd.strchr(index_pos, c']') - if index_end is NULL: - raise ValueError, "index must be enclosed in []" - index = python.PyNumber_Int( - python.PyString_FromStringAndSize( - index_pos, (index_end - index_pos))) - if python.PyList_GET_SIZE(new_path) == 0 and index != 0: - raise ValueError, "index not allowed on root node" - python.PyList_Append(new_path, (ns, name, index)) - if python.PyList_GET_SIZE(new_path) == 0: - raise ValueError, "invalid path" - return new_path - -cdef _ObjectPath* _buildObjectPathSegments(path_list) except NULL: - cdef _ObjectPath* c_path - cdef _ObjectPath* c_path_segments - cdef Py_ssize_t c_len - c_len = python.PyList_GET_SIZE(path_list) - c_path_segments = <_ObjectPath*>python.PyMem_Malloc(sizeof(_ObjectPath) * - c_len) - if c_path_segments is NULL: - 
python.PyErr_NoMemory() - return NULL - c_path = c_path_segments - for href, name, index in path_list: - if href is None: - c_path[0].href = NULL - else: - c_path[0].href = _cstr(href) - if name is None: - c_path[0].name = NULL - else: - c_path[0].name = _cstr(name) - c_path[0].index = index - c_path = c_path + 1 - return c_path_segments - -cdef _findObjectPath(_Element root, _ObjectPath* c_path, Py_ssize_t c_path_len, - default_value, int use_default): - """Follow the path to find the target element. - """ - cdef tree.xmlNode* c_node - cdef char* c_href - cdef char* c_name - cdef Py_ssize_t c_index - c_node = root._c_node - c_name = c_path[0].name - c_href = c_path[0].href - if c_href is NULL or c_href[0] == c'\0': - c_href = tree._getNs(c_node) - if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError, "root element does not match: need %s, got %s" % \ - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) - - while c_node is not NULL: - c_path_len = c_path_len - 1 - if c_path_len <= 0: - return cetree.elementFactory(root._doc, c_node) - - c_path = c_path + 1 - if c_path[0].href is not NULL: - c_href = c_path[0].href # otherwise: keep parent namespace - c_name = c_path[0].name - c_index = c_path[0].index - - if c_index < 0: - c_node = c_node.last - else: - c_node = c_node.children - c_node = _findFollowingSibling(c_node, c_href, c_name, c_index) - - if use_default: - return default_value - else: - tag = cetree.namespacedNameFromNsName(c_href, c_name) - raise AttributeError, "no such child: " + tag - -cdef _createObjectPath(_Element root, _ObjectPath* c_path, - Py_ssize_t c_path_len, int replace, value): - """Follow the path to find the target element, build the missing children - as needed and set the target element to 'value'. If replace is true, an - existing value is replaced, otherwise the new value is added. 
- """ - cdef _Element child - cdef tree.xmlNode* c_node - cdef tree.xmlNode* c_child - cdef char* c_href - cdef char* c_name - cdef Py_ssize_t c_index - if c_path_len == 1: - raise TypeError, "cannot update root node" - - c_node = root._c_node - c_name = c_path[0].name - c_href = c_path[0].href - if c_href is NULL or c_href[0] == c'\0': - c_href = tree._getNs(c_node) - if not cetree.tagMatches(c_node, c_href, c_name): - raise ValueError, "root element does not match: need %s, got %s" % \ - (cetree.namespacedNameFromNsName(c_href, c_name), root.tag) - - while c_path_len > 1: - c_path_len = c_path_len - 1 - c_path = c_path + 1 - if c_path[0].href is not NULL: - c_href = c_path[0].href # otherwise: keep parent namespace - c_name = c_path[0].name - c_index = c_path[0].index - - if c_index < 0: - c_child = c_node.last - else: - c_child = c_node.children - c_child = _findFollowingSibling(c_child, c_href, c_name, c_index) - - if c_child is not NULL: - c_node = c_child - elif c_index != 0: - raise TypeError, \ - "creating indexed path attributes is not supported" - elif c_path_len == 1: - _appendValue(cetree.elementFactory(root._doc, c_node), - cetree.namespacedNameFromNsName(c_href, c_name), - value) - return - else: - child = SubElement( - cetree.elementFactory(root._doc, c_node), - cetree.namespacedNameFromNsName(c_href, c_name)) - c_node = child._c_node - - # if we get here, the entire path was already there - if replace: - element = cetree.elementFactory(root._doc, c_node) - _replaceElement(element, value) - else: - _appendValue(cetree.elementFactory(root._doc, c_node.parent), - cetree.namespacedName(c_node), value) - -cdef _buildDescendantPaths(tree.xmlNode* c_node, prefix_string): - """Returns a list of all descendant paths. - """ - tag = cetree.namespacedName(c_node) - if prefix_string: - if prefix_string[-1] != '.': - prefix_string = prefix_string + '.' 
- prefix_string = prefix_string + tag - else: - prefix_string = tag - path = [prefix_string] - path_list = [] - _recursiveBuildDescendantPaths(c_node, path, path_list) - return path_list - -cdef _recursiveBuildDescendantPaths(tree.xmlNode* c_node, path, path_list): - """Fills the list 'path_list' with all descendant paths, initial prefix - being in the list 'path'. - """ - cdef python.PyObject* dict_result - cdef tree.xmlNode* c_child - cdef char* c_href - python.PyList_Append(path_list, '.'.join(path)) - tags = {} - c_href = tree._getNs(c_node) - c_child = c_node.children - while c_child is not NULL: - while c_child.type != tree.XML_ELEMENT_NODE: - c_child = c_child.next - if c_child is NULL: - return - if c_href is tree._getNs(c_child): - tag = c_child.name - elif c_href is not NULL and tree._getNs(c_child) is NULL: - # special case: parent has namespace, child does not - tag = '{}' + c_child.name - else: - tag = cetree.namespacedName(c_child) - dict_result = python.PyDict_GetItem(tags, tag) - if dict_result is NULL: - count = 0 - else: - count = (dict_result) + 1 - python.PyDict_SetItem(tags, tag, count) - if count > 0: - tag = tag + '[%d]' % count - python.PyList_Append(path, tag) - _recursiveBuildDescendantPaths(c_child, path, path_list) - del path[-1] - c_child = c_child.next - - -################################################################################ # Type annotations cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): @@ -2058,3 +1721,9 @@ python.PyDict_SetItem(_attributes, PYTYPE_ATTRIBUTE, _pytype) return _makeElement("value", strval, _attributes, nsmap) + + +################################################################################ +# ObjectPath + +include "objectpath.pxi" Copied: lxml/trunk/src/lxml/objectpath.pxi (from r46423, lxml/trunk/src/lxml/objectify.pyx) ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectpath.pxi 
Tue Sep 11 12:03:54 2007 @@ -1,1298 +1,3 @@ -from etreepublic cimport _Document, _Element, ElementBase -from etreepublic cimport _ElementIterator, ElementClassLookup -from etreepublic cimport elementFactory, import_etree, textOf -from python cimport str, repr, isinstance, issubclass, callable, getattr -from python cimport _cstr, Py_ssize_t -cimport etreepublic as cetree -cimport python -cimport tree -cimport cstd - -cdef object etree -from lxml import etree -# initialize C-API of lxml.etree -import_etree(etree) - -__version__ = etree.__version__ - -cdef object SubElement -SubElement = etree.SubElement - -cdef object re -import re -cdef object __builtin__ -import __builtin__ -cdef object int -int = __builtin__.int -cdef object long -long = __builtin__.long -cdef object float -float = __builtin__.float -cdef object bool -bool = __builtin__.bool -cdef object pow -pow = __builtin__.pow -cdef object abs -abs = __builtin__.abs -cdef object len -len = __builtin__.len - -cdef object True -True = __builtin__.True -cdef object False -False = __builtin__.False - -cdef object AttributeError -AttributeError = __builtin__.AttributeError -cdef object TypeError -TypeError = __builtin__.TypeError -cdef object ValueError -ValueError = __builtin__.ValueError -cdef object IndexError -IndexError = __builtin__.IndexError -cdef object StopIteration -StopIteration = __builtin__.StopIteration - -cdef object IGNORABLE_ERRORS -IGNORABLE_ERRORS = (ValueError, TypeError) - -cdef object list -list = __builtin__.list -cdef object set -try: - set = __builtin__.set -except AttributeError: - from sets import Set as set - -cdef object islice -from itertools import islice - -cdef object _typename(object t): - cdef char* c_name - cdef char* s - c_name = python._fqtypename(t) - s = cstd.strrchr(c_name, c'.') - if s == NULL: - return c_name - else: - return (s+1) - -# namespace/name for "pytype" hint attribute -cdef object PYTYPE_NAMESPACE -cdef char* _PYTYPE_NAMESPACE - -cdef object 
PYTYPE_ATTRIBUTE_NAME -cdef char* _PYTYPE_ATTRIBUTE_NAME - -PYTYPE_ATTRIBUTE = None - -cdef object TREE_PYTYPE -TREE_PYTYPE = "TREE" - -def setPytypeAttributeTag(attribute_tag=None): - """Changes name and namespace of the XML attribute that holds Python type - information. - - Reset by calling without argument. - - Default: "{http://codespeak.net/lxml/objectify/pytype}pytype" - """ - global PYTYPE_ATTRIBUTE, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME - global PYTYPE_NAMESPACE, PYTYPE_ATTRIBUTE_NAME - if attribute_tag is None: - PYTYPE_NAMESPACE = "http://codespeak.net/lxml/objectify/pytype" - PYTYPE_ATTRIBUTE_NAME = "pytype" - else: - PYTYPE_NAMESPACE, PYTYPE_ATTRIBUTE_NAME = cetree.getNsTag(attribute_tag) - _PYTYPE_NAMESPACE = _cstr(PYTYPE_NAMESPACE) - _PYTYPE_ATTRIBUTE_NAME = _cstr(PYTYPE_ATTRIBUTE_NAME) - PYTYPE_ATTRIBUTE = cetree.namespacedNameFromNsName( - _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - -setPytypeAttributeTag() - - -# namespaces for XML Schema -cdef object XML_SCHEMA_NS -XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" -cdef char* _XML_SCHEMA_NS -_XML_SCHEMA_NS = _cstr(XML_SCHEMA_NS) - -cdef object XML_SCHEMA_INSTANCE_NS -XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" -cdef char* _XML_SCHEMA_INSTANCE_NS -_XML_SCHEMA_INSTANCE_NS = _cstr(XML_SCHEMA_INSTANCE_NS) - -cdef object XML_SCHEMA_INSTANCE_NIL_ATTR -XML_SCHEMA_INSTANCE_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS -cdef object XML_SCHEMA_INSTANCE_TYPE_ATTR -XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS - - -################################################################################ -# Element class for the main API - -cdef class ObjectifiedElement(ElementBase): - """Main XML Element class. - - Element children are accessed as object attributes. Multiple children - with the same name are available through a list index. 
Example: - - >>> root = etree.XML("01") - >>> second_c2 = root.c1.c2[1] - """ - def __iter__(self): - """Iterate over self and all siblings with the same tag. - """ - parent = self.getparent() - if parent is None: - return iter([self]) - return etree.ElementChildIterator(parent, tag=self.tag) - - def __str__(self): - if __RECURSIVE_STR: - return _dump(self, 0) - else: - return textOf(self._c_node) or '' - - property text: - def __get__(self): - return textOf(self._c_node) - - property __dict__: - """A fake implementation for __dict__ to support dir() etc. - - Note that this only considers the first child with a given name. - """ - def __get__(self): - cdef char* c_ns - cdef char* c_child_ns - cdef _Element child - c_ns = tree._getNs(self._c_node) - if c_ns is NULL: - tag = None - else: - tag = "{%s}*" % c_ns - children = {} - for child in etree.ElementChildIterator(self, tag=tag): - if c_ns is NULL and tree._getNs(child._c_node) is not NULL: - continue - name = child._c_node.name - if python.PyDict_GetItem(children, name) is NULL: - python.PyDict_SetItem(children, name, child) - return children - - def __len__(self): - """Count self and siblings with the same tag. - """ - cdef tree.xmlNode* c_self_node - cdef tree.xmlNode* c_node - cdef char* c_href - cdef char* c_tag - cdef Py_ssize_t count - c_self_node = self._c_node - c_tag = c_self_node.name - c_href = tree._getNs(c_self_node) - count = 1 - c_node = c_self_node.next - while c_node is not NULL: - if c_node.type == tree.XML_ELEMENT_NODE and \ - cetree.tagMatches(c_node, c_href, c_tag): - count = count + 1 - c_node = c_node.next - c_node = c_self_node.prev - while c_node is not NULL: - if c_node.type == tree.XML_ELEMENT_NODE and \ - cetree.tagMatches(c_node, c_href, c_tag): - count = count + 1 - c_node = c_node.prev - return count - - def countchildren(self): - """Return the number of children of this element, regardless of their - name. 
- """ - # copied from etree - cdef Py_ssize_t c - cdef tree.xmlNode* c_node - c = 0 - c_node = self._c_node.children - while c_node is not NULL: - if tree._isElement(c_node): - c = c + 1 - c_node = c_node.next - return c - - def __getattr__(self, tag): - """Return the (first) child with the given tag name. If no namespace - is provided, the child will be looked up in the same one as self. - """ - return _lookupChildOrRaise(self, tag) - - def __setattr__(self, tag, value): - """Set the value of the (first) child with the given tag name. If no - namespace is provided, the child will be looked up in the same one as - self. - """ - cdef _Element element - # properties are looked up /after/ __setattr__, so we must emulate them - if tag == 'text' or tag == 'pyval': - # read-only ! - raise TypeError, "attribute '%s' of '%s' objects is not writable"% \ - (tag, _typename(self)) - elif tag == 'tail': - cetree.setTailText(self._c_node, value) - return - elif tag == 'tag': - ElementBase.tag.__set__(self, value) - return - - tag = _buildChildTag(self, tag) - element = _lookupChild(self, tag) - if element is None: - _appendValue(self, tag, value) - else: - _replaceElement(element, value) - - def __delattr__(self, tag): - child = _lookupChildOrRaise(self, tag) - self.remove(child) - - def addattr(self, tag, value): - """Add a child value to the element. - - As opposed to append(), it sets a data value, not an element. - """ - _appendValue(self, _buildChildTag(self, tag), value) - - def __getitem__(self, key): - """Return a sibling, counting from the first child of the parent. The - method behaves like both a dict and a sequence. - - * If argument is an integer, returns the sibling at that position. - - * If argument is a string, does the same as getattr(). This can be - used to provide namespaces for element lookup, or to look up - children with special names (``text`` etc.). 
- """ - cdef tree.xmlNode* c_self_node - cdef tree.xmlNode* c_parent - cdef tree.xmlNode* c_node - if python._isString(key): - return _lookupChildOrRaise(self, key) - c_self_node = self._c_node - c_parent = c_self_node.parent - if c_parent is NULL: - if key == 0: - return self - else: - raise IndexError, key - if key < 0: - c_node = c_parent.last - else: - c_node = c_parent.children - c_node = _findFollowingSibling( - c_node, tree._getNs(c_self_node), c_self_node.name, key) - if c_node is NULL: - raise IndexError, key - return elementFactory(self._doc, c_node) - - def __setitem__(self, key, value): - """Set the value of a sibling, counting from the first child of the - parent. - - * If argument is an integer, sets the sibling at that position. - - * If argument is a string, does the same as setattr(). This is used - to provide namespaces for element lookup. - - * If argument is a sequence (list, tuple, etc.), assign the contained - items to the siblings. - """ - cdef _Element element - cdef _Element new_element - cdef tree.xmlNode* c_self_node - cdef tree.xmlNode* c_parent - cdef tree.xmlNode* c_node - if python._isString(key): - key = _buildChildTag(self, key) - element = _lookupChild(self, key) - if element is None: - _appendValue(self, key, value) - else: - _replaceElement(element, value) - return - - c_self_node = self._c_node - c_parent = c_self_node.parent - if c_parent is NULL: - # the 'root[i] = ...' 
case - raise TypeError, "index assignment to root element is invalid" - if key < 0: - c_node = c_parent.last - else: - c_node = c_parent.children - c_node = _findFollowingSibling( - c_node, tree._getNs(c_self_node), c_self_node.name, key) - if c_node is NULL: - raise IndexError, key - element = elementFactory(self._doc, c_node) - _replaceElement(element, value) - - def __getslice__(self, Py_ssize_t start, Py_ssize_t end): - return list(islice(self, start, end)) - - def __setslice__(self, Py_ssize_t start, Py_ssize_t end, values): - cdef _Element el - parent = self.getparent() - if parent is None: - raise TypeError, "deleting slices of root element not supported" - # replace existing items - new_items = iter(values) - del_items = iter(list(islice(self, start, end))) - try: - for el in del_items: - item = new_items.next() - _replaceElement(el, item) - except StopIteration: - remove = parent.remove - remove(el) - for el in del_items: - remove(el) - return - - # append remaining new items - tag = self.tag - for item in new_items: - _appendValue(parent, tag, item) - - def __delslice__(self, Py_ssize_t start, Py_ssize_t end): - parent = self.getparent() - if parent is None: - raise TypeError, "deleting slices of root element not supported" - remove = parent.remove - for el in list(islice(self, start, end)): - remove(el) - - def __delitem__(self, key): - parent = self.getparent() - if parent is None: - raise TypeError, "deleting items not supported by root element" - sibling = self.__getitem__(key) - parent.remove(sibling) - - def findall(self, path): - # Reimplementation of Element.findall() to make it work without child - # iteration. - xpath = etree.ETXPath(path) - return xpath(self) - - def find(self, path): - # Reimplementation of Element.find() to make it work without child - # iteration. 
- result = self.findall(path) - if isinstance(result, list) and len(result): - return result[0] - elif isinstance(result, _Element): - return result - else: - return None - - def findtext(self, path, default=None): - # Reimplementation of Element.findtext() to make it work without child - # iteration. - result = self.find(path) - if isinstance(result, _Element): - return result.text or "" - else: - return default - - def descendantpaths(self, prefix=None): - """Returns a list of object path expressions for all descendants. - """ - if prefix is not None and not python._isString(prefix): - prefix = '.'.join(prefix) - return _buildDescendantPaths(self._c_node, prefix) - -cdef tree.xmlNode* _findFollowingSibling(tree.xmlNode* c_node, - char* href, char* name, - Py_ssize_t index): - cdef tree.xmlNode* (*next)(tree.xmlNode*) - if index >= 0: - next = cetree.nextElement - else: - index = -1 - index - next = cetree.previousElement - while c_node is not NULL: - if c_node.type == tree.XML_ELEMENT_NODE and \ - cetree.tagMatches(c_node, href, name): - index = index - 1 - if index < 0: - return c_node - c_node = next(c_node) - return NULL - -cdef object _lookupChild(_Element parent, tag): - cdef tree.xmlNode* c_result - cdef tree.xmlNode* c_node - cdef char* c_href - cdef char* c_tag - ns, tag = cetree.getNsTag(tag) - c_tag = _cstr(tag) - c_node = parent._c_node - if ns is None: - c_href = tree._getNs(c_node) - else: - c_href = _cstr(ns) - c_result = _findFollowingSibling(c_node.children, c_href, c_tag, 0) - if c_result is NULL: - return None - return elementFactory(parent._doc, c_result) - -cdef object _lookupChildOrRaise(_Element parent, tag): - element = _lookupChild(parent, tag) - if element is None: - raise AttributeError, "no such child: " + \ - _buildChildTag(parent, tag) - return element - -cdef object _buildChildTag(_Element parent, tag): - cdef char* c_href - cdef char* c_tag - ns, tag = cetree.getNsTag(tag) - c_tag = _cstr(tag) - if ns is None: - c_href = 
tree._getNs(parent._c_node) - else: - c_href = _cstr(ns) - return cetree.namespacedNameFromNsName(c_href, c_tag) - -cdef object _replaceElement(_Element element, value): - cdef _Element new_element - if isinstance(value, _Element): - # deep copy the new element - new_element = cetree.deepcopyNodeToDocument( - element._doc, (<_Element>value)._c_node) - new_element.tag = element.tag - elif python.PyList_Check(value) or python.PyTuple_Check(value): - element.__setslice__(0, python.PY_SSIZE_T_MAX, value) - return - else: - new_element = element.makeelement(element.tag) - _setElementValue(new_element, value) - element.getparent().replace(element, new_element) - -cdef object _appendValue(_Element parent, tag, value): - cdef _Element new_element - if isinstance(value, _Element): - # deep copy the new element - new_element = cetree.deepcopyNodeToDocument( - parent._doc, (<_Element>value)._c_node) - new_element.tag = tag - cetree.appendChild(parent, new_element) - elif python.PyList_Check(value) or python.PyTuple_Check(value): - for item in value: - _appendValue(parent, tag, item) - else: - new_element = SubElement(parent, tag) - _setElementValue(new_element, value) - -cdef _setElementValue(_Element element, value): - cdef python.PyObject* dict_result - if value is None: - cetree.setAttributeValue( - element, XML_SCHEMA_INSTANCE_NIL_ATTR, "true") - elif isinstance(value, _Element): - _replaceElement(element, value) - else: - cetree.delAttributeFromNsName( - element._c_node, _XML_SCHEMA_INSTANCE_NS, "nil") - if python._isString(value): - pytype_name = "str" - else: - pytype_name = _typename(value) - if isinstance(value, bool): - value = _lower_bool(value) - else: - value = str(value) - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, pytype_name) - if dict_result is not NULL: - cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name) - else: - cetree.delAttributeFromNsName(element._c_node, PYTYPE_NAMESPACE, - PYTYPE_ATTRIBUTE_NAME) - 
cetree.setNodeText(element._c_node, value) - -################################################################################ -# Data type support in subclasses - -cdef class ObjectifiedDataElement(ObjectifiedElement): - """This is the base class for all data type Elements. Subclasses should - override the 'pyval' property and possibly the __str__ method. - """ - property pyval: - def __get__(self): - return textOf(self._c_node) - - def __str__(self): - return textOf(self._c_node) or '' - - def __repr__(self): - return textOf(self._c_node) or '' - - def _setText(self, s): - """For use in subclasses only. Don't use unless you know what you are - doing. - """ - cetree.setNodeText(self._c_node, s) - -cdef class NumberElement(ObjectifiedDataElement): - cdef object _type - def _setValueParser(self, function): - "Set the function that parses the Python value from a string." - self._type = function - - cdef _value(self): - return self._type(textOf(self._c_node)) - - property pyval: - def __get__(self): - return self._value() - - def __int__(self): - return int(textOf(self._c_node)) - - def __long__(self): - return long(textOf(self._c_node)) - - def __float__(self): - return float(textOf(self._c_node)) - - def __str__(self): - return str(self._type(textOf(self._c_node))) - - def __repr__(self): - return repr(self._type(textOf(self._c_node))) - -# def __oct__(self): -# def __hex__(self): - - def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _numericValueOf(self), other, op) - - def __add__(self, other): - return _numericValueOf(self) + _numericValueOf(other) - - def __sub__(self, other): - return _numericValueOf(self) - _numericValueOf(other) - - def __mul__(self, other): - return _numericValueOf(self) * _numericValueOf(other) - - def __div__(self, other): - return _numericValueOf(self) / _numericValueOf(other) - - def __truediv__(self, other): - return _numericValueOf(self) / 
_numericValueOf(other) - - def __mod__(self, other): - return _numericValueOf(self) % _numericValueOf(other) - - def __pow__(self, other, modulo): - if modulo is None: - return _numericValueOf(self) ** _numericValueOf(other) - else: - return pow(_numericValueOf(self), _numericValueOf(other), modulo) - - def __neg__(self): - return - _numericValueOf(self) - - def __pos__(self): - return + _numericValueOf(self) - - def __abs__(self): - return abs( _numericValueOf(self) ) - - def __nonzero__(self): - return _numericValueOf(self) != 0 - - def __invert__(self): - return ~ _numericValueOf(self) - - def __lshift__(self, other): - return _numericValueOf(self) << _numericValueOf(other) - - def __rshift__(self, other): - return _numericValueOf(self) >> _numericValueOf(other) - - def __and__(self, other): - return _numericValueOf(self) & _numericValueOf(other) - - def __or__(self, other): - return _numericValueOf(self) | _numericValueOf(other) - - def __xor__(self, other): - return _numericValueOf(self) ^ _numericValueOf(other) - -cdef class IntElement(NumberElement): - def _init(self): - self._type = int - -cdef class LongElement(NumberElement): - def _init(self): - self._type = long - -cdef class FloatElement(NumberElement): - def _init(self): - self._type = float - -cdef class StringElement(ObjectifiedDataElement): - """String data class. - - Note that this class does *not* support the sequence protocol of strings: - len(), iter(), str_attr[0], str_attr[0:1], etc. are *not* supported. - Instead, use the .text attribute to get a 'real' string. 
- """ - property pyval: - def __get__(self): - return textOf(self._c_node) or '' - - def __repr__(self): - return repr(textOf(self._c_node) or '') - - def strlen(self): - text = textOf(self._c_node) - if text is None: - return 0 - else: - return len(text) - - def __nonzero__(self): - text = textOf(self._c_node) - if text is None: - return False - return len(text) > 0 - - def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - return python.PyObject_RichCompare( - _strValueOf(self), other, op) - - def __add__(self, other): - text = _strValueOf(self) - other = _strValueOf(other) - if text is None: - return other - if other is None: - return text - return text + other - - def __mul__(self, other): - if isinstance(self, StringElement): - return textOf((self)._c_node) * _numericValueOf(other) - elif isinstance(other, StringElement): - return _numericValueOf(self) * textOf((other)._c_node) - else: - raise TypeError, "invalid types for * operator" - - def __mod__(self, other): - if python.PyTuple_Check(other): - l = [] - for item in other: - python.PyList_Append(l, _strValueOf(item)) - other = tuple(l) - else: - other = _strValueOf(other) - return _strValueOf(self) % other - -cdef class NoneElement(ObjectifiedDataElement): - def __str__(self): - return "None" - - def __repr__(self): - return "None" - - def __nonzero__(self): - return False - - def __richcmp__(self, other, int op): - if other is None or self is None: - return python.PyObject_RichCompare(None, None, op) - if isinstance(self, NoneElement): - return python.PyObject_RichCompare(None, other, op) - else: - return python.PyObject_RichCompare(self, None, op) - - property pyval: - def __get__(self): - return None - -cdef class BoolElement(ObjectifiedDataElement): - """Boolean type base on string values: 'true' or 'false'. 
- """ - cdef int _boolval(self) except -1: - cdef char* c_str - text = textOf(self._c_node) - if text is None: - return 0 - c_str = _cstr(text) - if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == c'F': - if c_str[1] == c'\0' or text == "false" or text.lower() == "false": - # '0' or 'f' or 'false' - return 0 - elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': - if c_str[1] == c'\0' or text == "true" or text.lower() == "true": - # '1' or 't' or 'true' - return 1 - raise ValueError, "Invalid boolean value: '%s'" % text - - def __nonzero__(self): - if self._boolval(): - return True - else: - return False - - def __richcmp__(self, other, int op): - if hasattr(other, 'pyval'): - other = other.pyval - if hasattr(self, 'pyval'): - self_val = self.pyval - else: - self_val = bool(self) - return python.PyObject_RichCompare(self_val, other, op) - - def __str__(self): - if self._boolval(): - return "True" - else: - return "False" - - def __repr__(self): - if self._boolval(): - return "True" - else: - return "False" - - property pyval: - def __get__(self): - return self.__nonzero__() - -def __checkBool(s): - if s != 'true' and s != 'false' and s != '1' and s != '0': - raise ValueError - -cdef object _strValueOf(obj): - if python._isString(obj): - return obj - if isinstance(obj, _Element): - return textOf((<_Element>obj)._c_node) - if obj is None: - return '' - return str(obj) - -cdef object _numericValueOf(obj): - if isinstance(obj, NumberElement): - return (obj)._type( - textOf((obj)._c_node)) - elif hasattr(obj, 'pyval'): - # not always numeric, but Python will raise the right exception - return obj.pyval - return obj - -################################################################################ -# Python type registry - -cdef class PyType: - """User defined type. - - Named type that contains a type check function and a type class that - inherits from ObjectifiedDataElement. 
The type check must take a string - as argument and raise ValueError or TypeError if it cannot handle the - string value. It may be None in which case it is not considered for type - guessing. - - Example:: - PyType('int', int, MyIntClass).register() - - Note that the order in which types are registered matters. The first - matching type will be used. - """ - cdef readonly object name - cdef readonly object type_check - cdef object _add_text - cdef object _type - cdef object _schema_types - def __init__(self, name, type_check, type_class, stringify=None): - if not python._isString(name): - raise TypeError, "Type name must be a string" - elif name == TREE_PYTYPE: - raise ValueError, "Invalid type name" - if type_check is not None and not callable(type_check): - raise TypeError, "Type check function must be callable (or None)" - if not issubclass(type_class, ObjectifiedDataElement): - raise TypeError, \ - "Data classes must inherit from ObjectifiedDataElement" - self.name = name - self._type = type_class - self.type_check = type_check - if stringify is None: - self._add_text = _StringValueSetter(__builtin__.str) - else: - self._add_text = _StringValueSetter(stringify) - self._schema_types = [] - - def __repr__(self): - return "PyType(%s, %s)" % (self.name, self._type.__name__) - - def register(self, before=None, after=None): - """Register the type. - - The additional keyword arguments 'before' and 'after' accept a - sequence of type names that must appear before/after the new type in - the type list. If any of them is not currently known, it is simply - ignored. Raises ValueError if the dependencies cannot be fulfilled. 
- """ - if self.type_check is not None: - for item in _TYPE_CHECKS: - if item[0] is self.type_check: - _TYPE_CHECKS.remove(item) - break - entry = (self.type_check, self) - first_pos = 0 - last_pos = -1 - if before or after: - if before is None: - before = () - elif after is None: - after = () - for i, (check, pytype) in enumerate(_TYPE_CHECKS): - if last_pos == -1 and pytype.name in before: - last_pos = i - if pytype.name in after: - first_pos = i+1 - if last_pos == -1: - _TYPE_CHECKS.append(entry) - elif first_pos > last_pos: - raise ValueError, "inconsistent before/after dependencies" - else: - _TYPE_CHECKS.insert(last_pos, entry) - - _PYTYPE_DICT[self.name] = self - for xs_type in self._schema_types: - _SCHEMA_TYPE_DICT[xs_type] = self - - def unregister(self): - if _PYTYPE_DICT.get(self.name) is self: - del _PYTYPE_DICT[self.name] - for xs_type, pytype in _SCHEMA_TYPE_DICT.items(): - if pytype is self: - del _SCHEMA_TYPE_DICT[xs_type] - if self.type_check is None: - return - try: - _TYPE_CHECKS.remove( (self.type_check, self) ) - except ValueError: - pass - - property xmlSchemaTypes: - """The list of XML Schema datatypes this Python type maps to. - - Note that this must be set before registering the type! 
- """ - def __get__(self): - return self._schema_types - def __set__(self, types): - self._schema_types = list(types) - -cdef class _StringValueSetter: - cdef object _stringify - def __init__(self, stringify): - self._stringify = stringify - - def __call__(self, elem, value): - _add_text(elem, self._stringify(value)) - - -cdef object _PYTYPE_DICT -_PYTYPE_DICT = {} - -cdef object _SCHEMA_TYPE_DICT -_SCHEMA_TYPE_DICT = {} - -cdef object _TYPE_CHECKS -_TYPE_CHECKS = [] - -cdef _lower_bool(b): - if b: - return "true" - else: - return "false" - -def __lower_bool(b): - return _lower_bool(b) - -cdef _get_pytypename(obj): - if python.PyUnicode_Check(obj): - return "str" - else: - return _typename(obj) - -def __get_pytypename(obj): - return _get_pytypename(obj) - -cdef _registerPyTypes(): - pytype = PyType('int', int, IntElement) - pytype.xmlSchemaTypes = ("int", "short", "byte", "unsignedShort", - "unsignedByte",) - - pytype.register() - - pytype = PyType('long', long, LongElement) - pytype.xmlSchemaTypes = ("integer", "nonPositiveInteger", "negativeInteger", - "long", "nonNegativeInteger", "unsignedLong", - "unsignedInt", "positiveInteger",) - pytype.register() - - pytype = PyType('float', float, FloatElement) - pytype.xmlSchemaTypes = ("double", "float") - pytype.register() - - pytype = PyType('bool', __checkBool, BoolElement, __lower_bool) - pytype.xmlSchemaTypes = ("boolean",) - pytype.register() - - pytype = PyType('str', None, StringElement) - pytype.xmlSchemaTypes = ("string", "normalizedString", "token", "language", - "Name", "NCName", "ID", "IDREF", "ENTITY", - "NMTOKEN", ) - pytype.register() - - # since lxml 2.0 - pytype = PyType('NoneType', None, NoneElement) - pytype.register() - - # backwards compatibility - pytype = PyType('none', None, NoneElement) - pytype.register() - -_registerPyTypes() - -def getRegisteredTypes(): - """Returns a list of the currently registered PyType objects. 
- - To add a new type, retrieve this list and call unregister() for all - entries. Then add the new type at a suitable position (possibly replacing - an existing one) and call register() for all entries. - - This is necessary if the new type interferes with the type check functions - of existing ones (normally only int/float/bool) and must the tried before - other types. To add a type that is not yet parsable by the current type - check functions, you can simply register() it, which will append it to the - end of the type list. - """ - types = [] - known = set() - add_to_known = known.add - for check, pytype in _TYPE_CHECKS: - name = pytype.name - if name not in known: - add_to_known(name) - python.PyList_Append(types, pytype) - for pytype in _PYTYPE_DICT.itervalues(): - name = pytype.name - if name not in known: - add_to_known(name) - python.PyList_Append(types, pytype) - return types - -cdef PyType _guessPyType(value, PyType defaulttype): - if value is None: - return None - for type_check, tested_pytype in _TYPE_CHECKS: - try: - type_check(value) - return tested_pytype - except IGNORABLE_ERRORS: - # could not be parsed as the specififed type => ignore - pass - return defaulttype - -cdef object _guessElementClass(tree.xmlNode* c_node): - value = textOf(c_node) - if value is None: - return None - if value == '': - return StringElement - - for type_check, pytype in _TYPE_CHECKS: - try: - type_check(value) - return (pytype)._type - except IGNORABLE_ERRORS: - pass - return None - -################################################################################ -# adapted ElementMaker supports registered PyTypes - -cdef class _ObjectifyElementMakerCaller # forward declaration - -cdef extern from "etree_defs.h": - # macro call to 't->tp_new()' for fast instantiation - cdef _ObjectifyElementMakerCaller NEW_ELEMENT_MAKER "PY_NEW" (object t) - -cdef class ElementMaker: - cdef object _makeelement - cdef object _namespace - cdef object _nsmap - cdef int _annotate - def 
__init__(self, namespace=None, nsmap=None, annotate=True, - makeelement=None): - if nsmap is None: - nsmap = _DEFAULT_NSMAP - self._nsmap = nsmap - if namespace is None: - self._namespace = None - else: - self._namespace = "{%s}" % namespace - self._annotate = bool(annotate) - if makeelement is not None: - assert callable(makeelement) - self._makeelement = makeelement - else: - self._makeelement = None - - def __getattr__(self, tag): - cdef _ObjectifyElementMakerCaller element_maker - if self._namespace is not None and tag[0] != "{": - tag = self._namespace + tag - element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) - element_maker._tag = tag - element_maker._nsmap = self._nsmap - element_maker._annotate = self._annotate - element_maker._element_factory = self._makeelement - return element_maker - -cdef class _ObjectifyElementMakerCaller: - cdef object _tag - cdef object _nsmap - cdef object _element_factory - cdef int _annotate - - def __call__(self, *children, **attrib): - cdef _ObjectifyElementMakerCaller elementMaker - cdef python.PyObject* pytype - cdef _Element element - cdef _Element childElement - cdef int has_children - cdef int has_string_value - if self._element_factory is None: - element = _makeElement(self._tag, None, attrib, self._nsmap) - else: - element = self._element_factory(self._tag, attrib, self._nsmap) - - pytype_name = None - has_children = 0 - has_string_value = 0 - for child in children: - if child is None: - if python.PyTuple_GET_SIZE(children) == 1: - cetree.setAttributeValue( - element, XML_SCHEMA_INSTANCE_NIL_ATTR, "true") - elif python._isString(child): - _add_text(element, child) - has_string_value = 1 - elif isinstance(child, _Element): - cetree.appendChild(element, <_Element>child) - has_children = 1 - elif isinstance(child, _ObjectifyElementMakerCaller): - elementMaker = <_ObjectifyElementMakerCaller>child - if elementMaker._element_factory is None: - cetree.makeSubElement(element, elementMaker._tag, - None, None, None, 
None) - else: - childElement = elementMaker._element_factory( - elementMaker._tag) - cetree.appendChild(element, childElement) - has_children = 1 - else: - if pytype_name is not None: - # concatenation always makes the result a string - has_string_value = 1 - pytype_name = _typename(child) - pytype = python.PyDict_GetItem(_PYTYPE_DICT, pytype_name) - if pytype is not NULL: - (pytype)._add_text(element, child) - else: - has_string_value = 1 - child = str(child) - _add_text(element, child) - - if self._annotate and not has_children: - if has_string_value: - cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, "str") - elif pytype_name is not None: - cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name) - - return element - -cdef _add_text(_Element elem, text): - cdef tree.xmlNode* c_child - c_child = cetree.findChildBackwards(elem._c_node, 0) - if c_child is not NULL: - old = cetree.tailOf(c_child) - if old is not None: - text = old + text - cetree.setTailText(c_child, text) - else: - old = cetree.textOf(elem._c_node) - if old is not None: - text = old + text - cetree.setNodeText(elem._c_node, text) - -################################################################################ -# Recursive element dumping - -cdef int __RECURSIVE_STR -__RECURSIVE_STR = 0 # default: off - -def enableRecursiveStr(on=True): - """Enable a recursively generated tree representation for str(element), - based on objectify.dump(element). - """ - global __RECURSIVE_STR - __RECURSIVE_STR = bool(on) - -def dump(_Element element not None): - """Return a recursively generated string representation of an element. 
- """ - return _dump(element, 0) - -cdef object _dump(_Element element, int indent): - indentstr = " " * indent - if isinstance(element, ObjectifiedDataElement): - value = repr(element) - else: - value = textOf(element._c_node) - if value is not None: - if python.PyString_GET_SIZE( value.strip() ) == 0: - value = None - else: - value = repr(value) - result = "%s%s = %s [%s]\n" % (indentstr, element.tag, - value, _typename(element)) - xsi_ns = "{%s}" % XML_SCHEMA_INSTANCE_NS - pytype_ns = "{%s}" % PYTYPE_NAMESPACE - for name, value in cetree.iterattributes(element, 3): - if '{' in name: - if name == PYTYPE_ATTRIBUTE: - if value == TREE_PYTYPE: - continue - else: - name = name.replace(pytype_ns, 'py:') - name = name.replace(xsi_ns, 'xsi:') - result = result + "%s * %s = %r\n" % (indentstr, name, value) - - indent = indent + 1 - for child in element.iterchildren(): - result = result + _dump(child, indent) - if indent == 1: - return result[:-1] # strip last '\n' - else: - return result - - -################################################################################ -# Pickle support - -cdef void _setupPickle(reduceFunction): - import copy_reg - copy_reg.constructor(fromstring) - copy_reg.pickle(ObjectifiedElement, reduceFunction, fromstring) - -def pickleReduce(obj): - return (fromstring, (etree.tostring(obj),)) - -_setupPickle(pickleReduce) -del pickleReduce - -################################################################################ -# Element class lookup - -cdef class ObjectifyElementClassLookup(ElementClassLookup): - """Element class lookup method that uses the objectify classes. - """ - cdef object empty_data_class - cdef object tree_class - def __init__(self, tree_class=None, empty_data_class=None): - """Lookup mechanism for objectify. - - The default Element classes can be replaced by passing subclasses of - ObjectifiedElement and ObjectifiedDataElement as keyword arguments. 
- 'tree_class' defines inner tree classes (defaults to - ObjectifiedElement), 'empty_data_class' defines the default class for - empty data elements (defauls to StringElement). - """ - self._lookup_function = _lookupElementClass - if tree_class is None: - tree_class = ObjectifiedElement - self.tree_class = tree_class - if empty_data_class is None: - empty_data_class = StringElement - self.empty_data_class = empty_data_class - -cdef object _lookupElementClass(state, _Document doc, tree.xmlNode* c_node): - cdef ObjectifyElementClassLookup lookup - cdef python.PyObject* dict_result - lookup = state - # if element has children => no data class - if cetree.findChildForwards(c_node, 0) is not NULL: - return lookup.tree_class - - # if element is defined as xsi:nil, return NoneElement class - if "true" == cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "nil"): - return NoneElement - - # check for Python type hint - value = cetree.attributeValueFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - if value is not None: - if value == TREE_PYTYPE: - return lookup.tree_class - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, value) - if dict_result is not NULL: - return (dict_result)._type - # unknown 'pyval' => try to figure it out ourself, just go on - - # check for XML Schema type hint - value = cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - - if value is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is NULL and ':' in value: - prefix, value = value.split(':', 1) - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is not NULL: - return (dict_result)._type - - # otherwise determine class based on text content type - el_class = _guessElementClass(c_node) - if el_class is not None: - return el_class - - # if element is a root node => default to tree node - if c_node.parent is NULL or not tree._isElement(c_node.parent): - return 
lookup.tree_class - - return lookup.empty_data_class - - ################################################################################ # ObjectPath @@ -1301,6 +6,7 @@ char* name Py_ssize_t index + cdef class ObjectPath: """Immutable object that represents a compiled object path. @@ -1563,9 +269,10 @@ value) return else: - child = SubElement( + child = cetree.makeSubElement( cetree.elementFactory(root._doc, c_node), - cetree.namespacedNameFromNsName(c_href, c_name)) + cetree.namespacedNameFromNsName(c_href, c_name), + None, None, None, None) c_node = child._c_node # if we get here, the entire path was already there @@ -1626,435 +333,3 @@ _recursiveBuildDescendantPaths(c_child, path, path_list) del path[-1] c_child = c_child.next - - -################################################################################ -# Type annotations - -cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): - if pytype is None: - return None - value = textOf(c_node) - try: - pytype.type_check(value) - return pytype - except IGNORABLE_ERRORS: - # could not be parsed as the specified type => ignore - pass - return None - -def annotate(element_or_tree, ignore_old=True, ignore_xsi=False, - empty_pytype=None): - """Recursively annotates the elements of an XML tree with 'pytype' - attributes. - - If the 'ignore_old' keyword argument is True (the default), current 'pytype' - attributes will be ignored and replaced. Otherwise, they will be checked - and only replaced if they no longer fit the current text value. - - Setting the keyword argument ``ignore_xsi`` to True makes the function - additionally ignore existing ``xsi:type`` annotations. The default is to - use them as a type hint. - - The default annotation of empty elements can be set with the - ``empty_pytype`` keyword argument. The default is not to annotate empty - elements. Pass 'str', for example, to make string values the default. 
- """ - cdef _Element element - element = cetree.rootNodeOrRaise(element_or_tree) - _annotate(element, 0, 1, bool(ignore_xsi), bool(ignore_old), - None, empty_pytype) - -def xsiannotate(element_or_tree, ignore_old=True, ignore_pytype=False, - empty_type=None): - """Recursively annotates the elements of an XML tree with 'xsi:type' - attributes. - - If the 'ignore_old' keyword argument is True (the default), current - 'xsi:type' attributes will be ignored and replaced. Otherwise, they will be - checked and only replaced if they no longer fit the current text value. - - Note that the mapping from Python types to XSI types is usually ambiguous. - Currently, only the first XSI type name in the corresponding PyType - definition will be used for annotation. Thus, you should consider naming - the widest type first if you define additional types. - - Setting the keyword argument ``ignore_pytype`` to True makes the function - additionally ignore existing ``pytype`` annotations. The default is to - use them as a type hint. - - The default annotation of empty elements can be set with the - ``empty_type`` keyword argument. The default is not to annotate empty - elements. Pass 'string', for example, to make string values the default. 
- """ - cdef _Element element - element = cetree.rootNodeOrRaise(element_or_tree) - _annotate(element, 1, 0, bool(ignore_old), bool(ignore_pytype), - empty_type, None) - -cdef _annotate(_Element element, int annotate_xsi, int annotate_pytype, - int ignore_xsi, int ignore_pytype, - empty_type_name, empty_pytype_name): - cdef _Document doc - cdef tree.xmlNode* c_node - cdef tree.xmlNs* c_ns - cdef python.PyObject* dict_result - cdef PyType pytype, empty_pytype, StrType, NoneType - - if not annotate_xsi and not annotate_pytype: - return - - doc = element._doc - - if empty_type_name is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, empty_type_name) - elif empty_pytype_name is not None: - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, empty_pytype_name) - else: - dict_result = NULL - if dict_result is not NULL: - empty_pytype = dict_result - else: - empty_pytype = None - - StrType = _PYTYPE_DICT.get('str') - NoneType = _PYTYPE_DICT.get('NoneType') - c_node = element._c_node - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - typename = None - pytype = None - value = None - istree = 0 - # if element is defined as xsi:nil, represent it as None - if cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": - pytype = NoneType - - if pytype is None and not ignore_xsi: - # check that old xsi type value is valid - typename = cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - if typename is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) - if dict_result is NULL and ':' in typename: - prefix, typename = typename.split(':', 1) - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) - if dict_result is not NULL: - pytype = dict_result - if pytype is not StrType: - # StrType does not have a typecheck but is the default anyway, - # so just accept it if given as type information - pytype = _check_type(c_node, pytype) - 
if pytype is None: - typename = None - - if pytype is None and not ignore_pytype: - # check that old pytype value is valid - old_value = cetree.attributeValueFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - if old_value is not None and old_value != TREE_PYTYPE: - if old_value == 'none': - # transition from lxml 1.x - old_value = "NoneType" - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, old_value) - if dict_result is not NULL: - pytype = dict_result - if pytype is not StrType: - # StrType does not have a typecheck but is the default - # anyway, so just accept it if given as type information - pytype = _check_type(c_node, pytype) - - if pytype is None: - # try to guess type - if cetree.findChildForwards(c_node, 0) is NULL: - # element has no children => data class - pytype = _guessPyType(textOf(c_node), StrType) - else: - istree = 1 - - if pytype is None: - # use default type for empty elements - if cetree.hasText(c_node): - pytype = StrType - else: - pytype = empty_pytype - if typename is None: - typename = empty_type_name - - if pytype is not None: - if typename is None: - if not istree: - if python.PyList_GET_SIZE(pytype._schema_types) > 0: - # pytype->xsi:type is a 1:n mapping - # simply take the first - typename = pytype._schema_types[0] - elif typename not in pytype._schema_types: - typename = pytype._schema_types[0] - - if annotate_xsi: - if typename is None or istree: - cetree.delAttributeFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - else: - # update or create attribute - c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _XML_SCHEMA_NS, 'xsd') - if c_ns is not NULL: - if ':' in typename: - prefix, name = typename.split(':', 1) - if c_ns.prefix is NULL or c_ns.prefix[0] == c'\0': - typename = name - elif cstd.strcmp(_cstr(prefix), c_ns.prefix) != 0: - prefix = c_ns.prefix - typename = prefix + ':' + name - elif c_ns.prefix is not NULL or c_ns.prefix[0] != c'\0': - prefix = c_ns.prefix - typename = prefix + ':' + typename - 
c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') - tree.xmlSetNsProp(c_node, c_ns, "type", _cstr(typename)) - - if annotate_pytype: - if pytype is None: - # delete attribute if it exists - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - else: - # update or create attribute - c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _PYTYPE_NAMESPACE, 'py') - tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, - _cstr(pytype.name)) - if pytype is NoneType: - c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') - tree.xmlSetNsProp(c_node, c_ns, "nil", "true") - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - -def deannotate(element_or_tree, pytype=True, xsi=True): - """Recursively de-annotate the elements of an XML tree by removing 'pytype' - and/or 'type' attributes. - - If the 'pytype' keyword argument is True (the default), 'pytype' attributes - will be removed. If the 'xsi' keyword argument is True (the default), - 'xsi:type' attributes will be removed. 
- """ - cdef _Element element - cdef tree.xmlNode* c_node - - element = cetree.rootNodeOrRaise(element_or_tree) - c_node = element._c_node - if pytype and xsi: - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - cetree.delAttributeFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - elif pytype: - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - else: - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - cetree.delAttributeFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - - -################################################################################ -# Module level parser setup - -cdef object __DEFAULT_PARSER -__DEFAULT_PARSER = etree.XMLParser(remove_blank_text=True) -__DEFAULT_PARSER.setElementClassLookup( ObjectifyElementClassLookup() ) - -cdef object objectify_parser -objectify_parser = __DEFAULT_PARSER - -def setDefaultParser(new_parser = None): - set_default_parser(new_parser) - -def set_default_parser(new_parser = None): - """Replace the default parser used by objectify's Element() and - fromstring() functions. - - The new parser must be an etree.XMLParser. - - Call without arguments to reset to the original parser. 
- """ - global objectify_parser - if new_parser is None: - objectify_parser = __DEFAULT_PARSER - elif isinstance(new_parser, etree.XMLParser): - objectify_parser = new_parser - else: - raise TypeError, "parser must inherit from lxml.etree.XMLParser" - -cdef _Element _makeElement(tag, text, attrib, nsmap): - return cetree.makeElement(tag, None, objectify_parser, text, None, attrib, nsmap) - -################################################################################ -# Module level factory functions - -cdef object _fromstring -_fromstring = etree.fromstring - -def fromstring(xml): - """Objectify specific version of the lxml.etree fromstring() function. - - NOTE: requires parser based element class lookup activated in lxml.etree! - """ - return _fromstring(xml, objectify_parser) - -XML = fromstring - -cdef object _parse -_parse = etree.parse - -def parse(f, parser=None): - """Parse a file or file-like object with the objectify parser. - - You can pass a different parser as second argument. - """ - if parser is None: - parser = objectify_parser - return _parse(f, parser) - -cdef object _DEFAULT_NSMAP -_DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, - "xsi" : XML_SCHEMA_INSTANCE_NS, - "xsd" : XML_SCHEMA_NS} - -E = ElementMaker() - -def Element(_tag, attrib=None, nsmap=None, _pytype=None, **_attributes): - """Objectify specific version of the lxml.etree Element() factory that - always creates a structural (tree) element. - - NOTE: requires parser based element class lookup activated in lxml.etree! 
- """ - if attrib is not None: - if python.PyDict_Size(_attributes): - attrib.update(_attributes) - _attributes = attrib - if _pytype is None: - _pytype = TREE_PYTYPE - if nsmap is None: - nsmap = _DEFAULT_NSMAP - _attributes[PYTYPE_ATTRIBUTE] = _pytype - return _makeElement(_tag, None, _attributes, nsmap) - -def DataElement(_value, attrib=None, nsmap=None, _pytype=None, _xsi=None, - **_attributes): - """Create a new element from a Python value and XML attributes taken from - keyword arguments or a dictionary passed as second argument. - - Automatically adds a 'pytype' attribute for the Python type of the value, - if the type can be identified. If '_pytype' or '_xsi' are among the - keyword arguments, they will be used instead. - - If the _value argument is an ObjectifiedDataElement instance, its py:pytype, - xsi:type and other attributes and nsmap are reused unless they are redefined - in attrib and/or keyword arguments. - """ - cdef python.PyObject* dict_result - if nsmap is None: - nsmap = _DEFAULT_NSMAP - if attrib is not None and attrib: - if python.PyDict_Size(_attributes): - attrib = dict(attrib) - attrib.update(_attributes) - _attributes = attrib - if isinstance(_value, ObjectifiedElement): - if _pytype is None: - if _xsi is None and not _attributes and nsmap is _DEFAULT_NSMAP: - # special case: no change! 
- return _value.__copy__() - if isinstance(_value, ObjectifiedDataElement): - # reuse existing nsmap unless redefined in nsmap parameter - temp = _value.nsmap - if temp is not None and temp: - temp = dict(temp) - temp.update(nsmap) - nsmap = temp - # reuse existing attributes unless redefined in attrib/_attributes - temp = _value.attrib - if temp is not None and temp: - temp = dict(temp) - temp.update(_attributes) - _attributes = temp - # reuse existing xsi:type or py:pytype attributes, unless provided as - # arguments - if _xsi is None and _pytype is None: - dict_result = python.PyDict_GetItem(_attributes, - XML_SCHEMA_INSTANCE_TYPE_ATTR) - if dict_result is not NULL: - _xsi = dict_result - dict_result = python.PyDict_GetItem(_attributes, PYTYPE_ATTRIBUTE) - if dict_result is not NULL: - _pytype = dict_result - - if _xsi is not None: - if ':' in _xsi: - prefix, name = _xsi.split(':', 1) - ns = nsmap.get(prefix) - if ns != XML_SCHEMA_NS: - raise ValueError, "XSD types require the XSD namespace" - elif nsmap is _DEFAULT_NSMAP: - name = _xsi - _xsi = 'xsd:' + _xsi - else: - name = _xsi - for prefix, ns in nsmap.items(): - if ns == XML_SCHEMA_NS: - if prefix is not None and prefix: - _xsi = prefix + ':' + _xsi - break - else: - raise ValueError, "XSD types require the XSD namespace" - python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_TYPE_ATTR, _xsi) - if _pytype is None: - # allow using unregistered or even wrong xsi:type names - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, _xsi) - if dict_result is NULL: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, name) - if dict_result is not NULL: - _pytype = (dict_result).name - - if _value is None and _pytype != "str": - _pytype = _pytype or "NoneType" - strval = None - elif python._isString(_value): - strval = _value - elif python.PyBool_Check(_value): - if _value: - strval = "true" - else: - strval = "false" - else: - strval = str(_value) - - if _pytype is None: - _pytype = _typename(_value) - - if 
_pytype is not None: - if _pytype == "NoneType" or _pytype == "none": - strval = None - python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_NIL_ATTR, "true") - else: - # check if type information from arguments is valid - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, _pytype) - if dict_result is not NULL: - type_check = (dict_result).type_check - if type_check is not None: - type_check(strval) - - python.PyDict_SetItem(_attributes, PYTYPE_ATTRIBUTE, _pytype) - - return _makeElement("value", strval, _attributes, nsmap) From scoder at codespeak.net Tue Sep 11 21:55:39 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 11 Sep 2007 21:55:39 +0200 (CEST) Subject: [Lxml-checkins] r46484 - in lxml/trunk: . doc src/lxml Message-ID: <20070911195539.6BCC680F0@code0.codespeak.net> Author: scoder Date: Tue Sep 11 21:55:37 2007 New Revision: 46484 Added: lxml/trunk/cython-with-GIL-simple.patch Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/tutorial.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/parser.pxi Log: cleanup in parser code, ET-compatible target parser interface (SAX-like), tutorial section on parsing Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 11 21:55:37 2007 @@ -8,6 +8,9 @@ Features added -------------- +* ElementTree-like parser target interface as described in + http://effbot.org/elementtree/elementtree-xmlparser.htm + * ElementTree-like feed parser interface on XMLParser and HTMLParser (``feed()`` and ``close()`` methods) Added: lxml/trunk/cython-with-GIL-simple.patch ============================================================================== --- (empty file) +++ lxml/trunk/cython-with-GIL-simple.patch Tue Sep 11 21:55:37 2007 @@ -0,0 +1,236 @@ +diff -r 43be72844df4 Compiler/Code.py +--- a/Compiler/Code.py Mon Sep 03 20:07:01 2007 +0200 ++++ 
b/Compiler/Code.py Mon Sep 10 20:13:13 2007 +0200 +@@ -284,6 +284,13 @@ class CCodeWriter: + # code = "((PyObject*)%s)" % code + self.put_init_to_py_none(code, entry.type) + ++ def put_py_gil_state_ensure(self, cname): ++ self.putln("PyGILState_STATE %s;" % cname) ++ self.putln("%s = PyGILState_Ensure();" % cname) ++ ++ def put_py_gil_state_release(self, cname): ++ self.putln("PyGILState_Release(%s);" % cname) ++ + def put_pymethoddef(self, entry, term): + if entry.doc: + doc_code = entry.doc_cname +diff -r 43be72844df4 Compiler/ExprNodes.py +--- a/Compiler/ExprNodes.py Mon Sep 03 20:07:01 2007 +0200 ++++ b/Compiler/ExprNodes.py Mon Sep 10 22:47:05 2007 +0200 +@@ -473,7 +473,7 @@ class ExprNode(Node): + else: # neither src nor dst are py types + # Added the string comparison, since for c types that + # is enough, but SageX gets confused when the types are +- # in different files. ++ # in different files. + if not (str(src.type) == str(dst_type) or dst_type.assignable_from(src_type)): + error(self.pos, "Cannot assign type '%s' to '%s'" % + (src.type, dst_type)) +diff -r 43be72844df4 Compiler/Naming.py +--- a/Compiler/Naming.py Mon Sep 03 20:07:01 2007 +0200 ++++ b/Compiler/Naming.py Mon Sep 10 20:13:13 2007 +0200 +@@ -53,5 +53,6 @@ stringtab_cname = pyrex_prefix + "strin + stringtab_cname = pyrex_prefix + "string_tab" + vtabslot_cname = pyrex_prefix + "vtab" + c_api_tab_cname = pyrex_prefix + "c_api_tab" ++gilstate_cname = pyrex_prefix + "state" + + extern_c_macro = pyrex_prefix.upper() + "EXTERN_C" +diff -r 43be72844df4 Compiler/Nodes.py +--- a/Compiler/Nodes.py Mon Sep 03 20:07:01 2007 +0200 ++++ b/Compiler/Nodes.py Mon Sep 10 20:13:13 2007 +0200 +@@ -282,6 +282,7 @@ class CFuncDeclaratorNode(CDeclaratorNod + # has_varargs boolean + # exception_value ConstNode + # exception_check boolean True if PyErr_Occurred check needed ++ # with_gil boolean True if GIL should be grabbed/released + + def analyse(self, return_type, env): + func_type_args = [] +@@ -317,7 +318,8 
@@ class CFuncDeclaratorNode(CDeclaratorNod + exc_check = self.exception_check + func_type = PyrexTypes.CFuncType( + return_type, func_type_args, self.has_varargs, +- exception_value = exc_val, exception_check = exc_check) ++ exception_value = exc_val, exception_check = exc_check, ++ with_gil = self.with_gil) + return self.base.analyse(func_type, env) + + +@@ -572,6 +574,8 @@ class FuncDefNode(StatNode, BlockNode): + self.generate_keyword_list(code) + # ----- Extern library function declarations + lenv.generate_library_function_declarations(code) ++ # ----- Grab GIL ++ self.generate_grab_gil(code) + # ----- Fetch arguments + self.generate_argument_parsing_code(code) + self.generate_argument_increfs(lenv, code) +@@ -623,6 +627,9 @@ class FuncDefNode(StatNode, BlockNode): + code.put_var_decrefs(lenv.var_entries, used_only = 1) + code.put_var_decrefs(lenv.arg_entries) + self.put_stararg_decrefs(code) ++ # ----- Release GIL ++ self.generate_release_gil(code) ++ # ----- Return + if not self.return_type.is_void: + retval_code = Naming.retval_cname + #if self.return_type.is_extension_type: +@@ -651,6 +658,12 @@ class FuncDefNode(StatNode, BlockNode): + code.put_var_incref(entry) + + def generate_execution_code(self, code): ++ pass ++ ++ def generate_grab_gil(self, code): ++ pass ++ ++ def generate_release_gil(self, code): + pass + + +@@ -756,7 +769,19 @@ class CFuncDefNode(FuncDefNode): + else: + error(arg.pos, "Cannot test type of extern C class " + "without type object name specification") +- ++ ++ def generate_grab_gil(self, code): ++ if self.entry.type.with_gil: ++ code.putln("") ++ code.put_py_gil_state_ensure(Naming.gilstate_cname) ++ code.putln("") ++ ++ def generate_release_gil(self, code): ++ if self.entry.type.with_gil: ++ code.putln("") ++ code.put_py_gil_state_release(Naming.gilstate_cname) ++ code.putln("") ++ + def error_value(self): + if self.return_type.is_pyobject: + return "0" +diff -r 43be72844df4 Compiler/Parsing.py +--- a/Compiler/Parsing.py Mon Sep 
03 20:07:01 2007 +0200 ++++ b/Compiler/Parsing.py Tue Sep 11 21:06:49 2007 +0200 +@@ -5,7 +5,7 @@ import os, re + import os, re + from string import join, replace + from types import ListType, TupleType +-from Scanning import PyrexScanner ++from Scanning import PyrexScanner, function_contexts + import Nodes + import ExprNodes + from ModuleNode import ModuleNode +@@ -1462,10 +1462,10 @@ def p_c_declarator(s, empty = 0, is_type + args = p_c_arg_list(s, in_pyfunc = 0, cmethod_flag = cmethod_flag) + ellipsis = p_optional_ellipsis(s) + s.expect(')') +- exc_val, exc_check = p_exception_value_clause(s) ++ options = p_c_func_options(s) + result = Nodes.CFuncDeclaratorNode(pos, + base = result, args = args, has_varargs = ellipsis, +- exception_value = exc_val, exception_check = exc_check) ++ **options) + cmethod_flag = 0 + return result + +@@ -1483,6 +1483,37 @@ def p_exception_value_clause(s): + s.next() + exc_val = p_simple_expr(s) #p_exception_value(s) + return exc_val, exc_check ++ ++def p_c_with(s): ++ if s.sy == 'with': ++ s.next() ++ return p_ident_list(s) ++ return () ++ ++def p_c_func_options(s): ++ exc_val = None ++ exc_check = 0 ++ contexts = [] ++ ++ if s.sy == 'except': ++ exc_val, exc_check = p_exception_value_clause(s) ++ contexts = p_c_with(s) ++ elif s.sy == 'with': ++ contexts = p_c_with(s) ++ exc_val, exc_check = p_exception_value_clause(s) ++ ++ for context in contexts: ++ if context not in function_contexts: ++ s.error("Unknown context: " + context) ++ return None ++ ++ ret = { ++ 'exception_value': exc_val, ++ 'exception_check': exc_check, ++ 'with_gil': 'GIL' in contexts, ++ } ++ ++ return ret + + #def p_exception_value(s): + # sign = "" +diff -r 43be72844df4 Compiler/PyrexTypes.py +--- a/Compiler/PyrexTypes.py Mon Sep 03 20:07:01 2007 +0200 ++++ b/Compiler/PyrexTypes.py Tue Sep 11 12:07:03 2007 +0200 +@@ -488,16 +488,18 @@ class CFuncType(CType): + # has_varargs boolean + # exception_value string + # exception_check boolean True if PyErr_Occurred 
check needed ++ # with_gil boolean True if GIL should be grabbed/released + + is_cfunction = 1 + + def __init__(self, return_type, args, has_varargs, +- exception_value = None, exception_check = 0): ++ exception_value = None, exception_check = 0, with_gil = False): + self.return_type = return_type + self.args = args + self.has_varargs = has_varargs + self.exception_value = exception_value + self.exception_check = exception_check ++ self.with_gil = with_gil + + def __repr__(self): + arg_reprs = map(repr, self.args) +@@ -580,6 +582,7 @@ class CFuncType(CType): + if not arg_decl_code and not pyrex: + arg_decl_code = "void" + exc_clause = "" ++ with_gil_clause = "" + if pyrex or for_display: + if self.exception_value and self.exception_check: + exc_clause = " except? %s" % self.exception_value +@@ -587,8 +590,11 @@ class CFuncType(CType): + exc_clause = " except %s" % self.exception_value + elif self.exception_check: + exc_clause = " except *" ++ if self.with_gil: ++ with_gil_clause = " with GIL" + return self.return_type.declaration_code( +- "(%s(%s)%s)" % (entity_code, arg_decl_code, exc_clause), ++ "(%s(%s)%s%s)" % (entity_code, arg_decl_code, ++ exc_clause, with_gil_clause), + for_display, dll_linkage, pyrex) + + +diff -r 43be72844df4 Compiler/Scanning.py +--- a/Compiler/Scanning.py Mon Sep 03 20:07:01 2007 +0200 ++++ b/Compiler/Scanning.py Tue Sep 11 21:05:33 2007 +0200 +@@ -138,7 +138,11 @@ reserved_words = [ + "raise", "import", "exec", "try", "except", "finally", + "while", "if", "elif", "else", "for", "in", "assert", + "and", "or", "not", "is", "in", "lambda", "from", +- "NULL", "cimport", "by" ++ "NULL", "cimport", "by", "with" ++] ++ ++function_contexts = [ # allowed arguments to the "with" option ++ "GIL" + ] + + class Method: Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Tue Sep 11 21:55:37 2007 @@ -6,24 +6,32 @@ 
Stefan Behnel This tutorial briefly overviews the main concepts of the `ElementTree API`_ as -implemented by lxml.etree, and some simple enhancements that make your life as -a programmer easier. +implemented by ``lxml.etree``, and some simple enhancements that make your +life as a programmer easier. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm#documentation .. contents:: .. - 1 Elements and ElementTrees - 1.1 The Element class - 1.2 The ElementTree class - 2 Parsing and XML literals - 2.1 The XML() function - 2.2 The parse() function - 3 Namespaces - 4 The find*() methods - 4.1 findall() - 4.2 find() - 4.3 findtext() + 1 The Element class + 1.1 Elements are lists + 1.2 Elements carry attributes + 1.3 Elements contain text + 1.4 Tree iteration + 2 The ElementTree class + 3 Parsing from strings and files + 3.1 The fromstring() function + 3.2 The XML() function + 3.3 The parse() function + 3.4 Parser objects + 3.5 Incremental parsing + 3.6 Event-driven parsing + 4 Namespaces + 5 The E-factory + 6 ElementPath + 6.1 findall() + 6.2 find() + 6.3 findtext() A common way to import ``lxml.etree`` is as follows:: @@ -380,15 +388,208 @@ made lxml lose DTD information in an input-output cycle. -Parsing files and XML literals +Parsing from strings and files ============================== +``lxml.etree`` supports parsing XML in a number of ways and from all important +sources, namely strings, files and file-like objects. The main parse +functions are ``fromstring()`` and ``parse()``, both called with the source as +first argument. By default, they use the standard parser, but you can always +pass a different parser as second argument.
+ + +The fromstring() function +------------------------- + +The ``fromstring()`` function is the easiest way to parse a string:: + + >>> some_xml_data = "<root>data</root>" + + >>> root = etree.fromstring(some_xml_data) + >>> print root.tag + root + >>> print etree.tostring(root) + <root>data</root> + + The XML() function ------------------ +The ``XML()`` function behaves like the ``fromstring()`` function, but is +commonly used to write XML literals right into the source:: + + >>> root = etree.XML("<root>data</root>") + >>> print root.tag + root + >>> print etree.tostring(root) + <root>data</root> + + The parse() function -------------------- +The ``parse()`` function is used to parse from files and file-like objects:: + + >>> some_file_like = StringIO("<root>data</root>") + + >>> tree = etree.parse(some_file_like) + + >>> print etree.tostring(tree) + <root>data</root> + +Note that ``parse()`` returns an ElementTree object, not an Element object as +the string parser functions:: + + >>> root = tree.getroot() + >>> print root.tag + root + >>> print etree.tostring(root) + <root>data</root> + + +Parser objects +-------------- + +By default, ``lxml.etree`` uses a standard parser with a default setup. If +you want to configure the parser, you can create a new instance:: + + >>> parser = etree.XMLParser(remove_blank_text=True) # lxml.etree only! + +This creates a parser that removes empty text between tags while parsing, +which can reduce the size of the tree and avoid dangling tail text if you know +that whitespace-only content is not meaningful for your data. An example:: + + >>> root = etree.XML("<root>  <a/>   <b>  </b>     </root>", parser) + + >>> print etree.tostring(root) + <root><a/><b>  </b></root> + +Note that the whitespace content inside the ``<b>`` tag was not removed, as +content at leaf elements tends to be data content (even if blank). You can +easily remove it in an additional step by traversing the tree:: + + >>> for element in root.getiterator("*"): + ... if element.text is not None and not element.text.strip(): + ...
element.text = None + + >>> print etree.tostring(root) + <root><a/><b/></root> + +See ``help(etree.XMLParser)`` to find out about the available parser options. + + +Incremental parsing +------------------- + +``lxml.etree`` provides two ways for incremental step-by-step parsing. One is +through file-like objects, where it calls the ``read()`` method repeatedly. +This is best used where the data arrives from a source like ``urllib`` or any +other file-like object that can provide data on request. Note that the parser +will block and wait until data becomes available in this case:: + + >>> class DataSource: + ... data = iter(["<roo", "t><", "a/", "><", "/root>"]) + ... def read(self, requested_size): + ... try: + ... return self.data.next() + ... except StopIteration: + ... return "" + + >>> root = etree.parse(DataSource()) + + >>> print etree.tostring(root) + <root><a/></root> + +The second way is through a feed parser interface, given by the ``feed(data)`` +and ``close()`` methods:: + + >>> parser = etree.XMLParser() + + >>> parser.feed("<roo") + >>> parser.feed("t><") + >>> parser.feed("a/") + >>> parser.feed("><") + >>> parser.feed("/root>") + + >>> root = parser.close() + + >>> print etree.tostring(root) + <root><a/></root> + +Here, you can interrupt the parsing process at any time and continue it later +on with another call to the ``feed()`` method. This comes in handy if you +want to avoid blocking calls to the parser, e.g. in frameworks like Twisted, +or whenever data comes in slowly or in chunks and you want to do other things +while waiting for the next chunk. + +You can reuse the parser by calling its ``feed()`` method again:: + + >>> parser.feed("<root/>") + >>> root = parser.close() + >>> print etree.tostring(root) + <root/> + + +Event-driven parsing +-------------------- + +Sometimes, all you need from a document is a small fraction somewhere deep +inside the tree, so parsing the whole tree into memory, traversing it and +dropping it can be too much overhead.
``lxml.etree`` supports this use case +with two event-driven parser interfaces, one that generates parser events +while building the tree (``iterparse``), and one that does not build the tree +at all, and instead calls feedback methods on a target object in a SAX-like +fashion. + +Here is a simple ``iterparse()`` example:: + + >>> some_file_like = StringIO("<root><a>data</a></root>") + + >>> for event, element in etree.iterparse(some_file_like): + ... print "%s, %4s, %s" % (event, element.tag, element.text) + end, a, data + end, root, None + +By default, ``iterparse()`` only generates events when it is done parsing an +element, but you can control this through the ``events`` keyword argument:: + + >>> some_file_like = StringIO("<root><a>data</a></root>") + + >>> for event, element in etree.iterparse(some_file_like, + ... events=("start", "end")): + ... print "%5s, %4s, %s" % (event, element.tag, element.text) + start, root, None + start, a, data + end, a, data + end, root, None + +Note that the text, tail and children of an Element are not necessarily there +yet when receiving the ``start`` event. Only the ``end`` event guarantees +that the Element has been parsed completely. It also allows you to ``clear()`` or +modify the content of an Element to save memory. + +If memory is a real bottleneck, or if building the tree is not desired at all, +the target parser interface of ``lxml.etree`` can be used. It creates +SAX-like events by calling the methods of a target object. By implementing +some or all of these methods, you can control which events are generated:: + + >>> class ParserTarget: + ... events = [] + ... def start(self, tag, attrib): + ... self.events.append(("start", tag, attrib)) + ... def close(self): + ... return self.events + + >>> parser = etree.XMLParser(target=ParserTarget()) + >>> events = etree.fromstring('<root test="true"/>', parser) + + >>> for event in events: + ... print 'event: %s - tag: %s' % (event[0], event[1]) + ... for attr, value in event[2].iteritems(): + ...
print ' * %s = %s' % (attr, value) + event: start - tag: root + * test = true + Namespaces ========== Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue Sep 11 21:55:37 2007 @@ -1992,7 +1992,10 @@ if element is not None: doc = element._doc elif file is not None: - doc = _parseDocument(file, parser) + try: + doc = _parseDocument(file, parser) + except _TargetParserResult, result_container: + return result_container.result else: c_doc = _newDoc() doc = _documentFactory(c_doc, parser) @@ -2015,8 +2018,11 @@ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() if not isinstance(parser, HTMLParser): parser = __DEFAULT_HTML_PARSER - doc = _parseMemoryDocument(text, base_url, parser) - return doc.getroot() + try: + doc = _parseMemoryDocument(text, base_url, parser) + return doc.getroot() + except _TargetParserResult, result_container: + return result_container.result def XML(text, _BaseParser parser=None, base_url=None): """Parses an XML document from a string constant. This function can be used @@ -2036,8 +2042,11 @@ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() if not isinstance(parser, XMLParser): parser = __DEFAULT_XML_PARSER - doc = _parseMemoryDocument(text, base_url, parser) - return doc.getroot() + try: + doc = _parseMemoryDocument(text, base_url, parser) + return doc.getroot() + except _TargetParserResult, result_container: + return result_container.result def fromstring(text, _BaseParser parser=None, base_url=None): """Parses an XML document from a string. 
@@ -2052,8 +2061,11 @@ cdef _Document doc if parser is None: parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() - doc = _parseMemoryDocument(text, base_url, parser) - return doc.getroot() + try: + doc = _parseMemoryDocument(text, base_url, parser) + return doc.getroot() + except _TargetParserResult, result_container: + return result_container.result def iselement(element): """Checks if an object appears to be a valid element object. @@ -2124,8 +2136,11 @@ is provided as second argument, the default parser is used. """ cdef _Document doc - doc = _parseDocument(source, parser) - return ElementTree(doc.getroot()) + try: + doc = _parseDocument(source, parser) + return ElementTree(doc.getroot()) + except _TargetParserResult, result_container: + return result_container.result ################################################################################ Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Tue Sep 11 21:55:37 2007 @@ -578,14 +578,8 @@ # lookup the function by name and call it -cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt, int nargs): - cdef python.PyGILState_STATE gil_state - gil_state = python.PyGILState_Ensure() - _call_python_xpath_function(ctxt, nargs) - python.PyGILState_Release(gil_state) - -cdef void _call_python_xpath_function(xpath.xmlXPathParserContext* ctxt, - int nargs): +cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt, + int nargs) with GIL: cdef xpath.xmlXPathContext* rctxt cdef _BaseContext context rctxt = ctxt.context Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Tue Sep 11 21:55:37 2007 @@ -261,10 +261,8 @@ cdef int copyToBuffer(self, char* c_buffer, int c_size): cdef char* c_start cdef 
Py_ssize_t byte_count, remaining - cdef python.PyGILState_STATE gil_state if self._bytes_read < 0: return 0 - gil_state = python.PyGILState_Ensure() try: byte_count = python.PyString_GET_SIZE(self._bytes) remaining = byte_count - self._bytes_read @@ -276,21 +274,18 @@ self._bytes_read = 0 if remaining == 0: self._bytes_read = -1 - python.PyGILState_Release(gil_state) return 0 if c_size > remaining: c_size = remaining c_start = _cstr(self._bytes) + self._bytes_read - python.PyGILState_Release(gil_state) self._bytes_read = self._bytes_read + c_size cstd.memcpy(c_buffer, c_start, c_size) return c_size except: self._exc_context._store_raised() - python.PyGILState_Release(gil_state) return -1 -cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size): +cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with GIL: return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size) ############################################################ @@ -298,7 +293,8 @@ ############################################################ cdef xmlparser.xmlParserInput* _parser_resolve_from_python( - char* c_url, char* c_pubid, xmlparser.xmlParserCtxt* c_context, int* error): + char* c_url, char* c_pubid, xmlparser.xmlParserCtxt* c_context, + int* error) with GIL: # call the Python document loaders cdef xmlparser.xmlParserInput* c_input cdef _ResolverContext context @@ -351,16 +347,13 @@ # no Python objects here, may be called without thread context ! # when we declare a Python object, Pyrex will INCREF(None) ! 
cdef xmlparser.xmlParserInput* c_input - cdef python.PyGILState_STATE gil_state cdef int error if c_context._private is NULL: if __DEFAULT_ENTITY_LOADER is NULL: return NULL return __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context) - gil_state = python.PyGILState_Ensure() c_input = _parser_resolve_from_python(c_url, c_pubid, c_context, &error) - python.PyGILState_Release(gil_state) if c_input is not NULL: return c_input @@ -404,10 +397,7 @@ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(self, self._c_ctxt, result, filename, recover) - -cdef class _InternalParserContext(_ParserContext): - """Parser context for internal single-shot parsing - """ + cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename, _ErrorLog error_log) except 0: @@ -530,7 +520,7 @@ if target is not None: return _TargetParserContext(target) else: - return _InternalParserContext() + return _ParserContext() cdef xmlparser.xmlParserCtxt* _newParserCtxt(self): if self._parser_type == LXML_HTML_PARSER: @@ -841,16 +831,14 @@ cdef xmlparser.xmlParserCtxt* pctxt cdef xmlDoc* c_doc cdef _Document doc - cdef int is_target_parser, error if not self._feed_parser_running: raise XMLSyntaxError, "no element found" pctxt = self._parser_ctxt self._feed_parser_running = 0 if self._parser_type == LXML_HTML_PARSER: - error = htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) + htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) else: - error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) - is_target_parser = isinstance(self._context, _TargetParserContext) + xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) try: result = self._context._handleParseResult( self, pctxt.myDoc, None) @@ -1150,7 +1138,6 @@ if recursive: state = python.PyEval_SaveThread() result = tree.xmlCopyDoc(c_doc, recursive) - _bugFixURL(c_doc, result) if recursive: python.PyEval_RestoreThread(state) __GLOBAL_PARSER_CONTEXT.initDocDict(result) @@ -1162,7 +1149,6 @@ cdef xmlDoc* result cdef xmlNode* c_node result = 
tree.xmlCopyDoc(c_doc, 0) # non recursive - _bugFixURL(c_doc, result) __GLOBAL_PARSER_CONTEXT.initDocDict(result) state = python.PyEval_SaveThread() c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive @@ -1178,14 +1164,6 @@ _copyTail(c_node.next, c_root) return c_root -cdef void _bugFixURL(xmlDoc* c_source_doc, xmlDoc* c_target_doc): - """libxml2 <= 2.6.17 had a bug that prevented it from copying the document - URL in xmlDocCopy()""" - if c_source_doc.URL is not NULL and _LIBXML_VERSION_INT < 20618: - if c_target_doc.URL is not NULL: - tree.xmlFree(c_target_doc.URL) - c_target_doc.URL = tree.xmlStrdup(c_source_doc.URL) - ############################################################ ## API level helper functions for _Document creation From scoder at codespeak.net Wed Sep 12 14:56:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 14:56:03 +0200 (CEST) Subject: [Lxml-checkins] r46501 - in lxml/trunk: . src/lxml Message-ID: <20070912125603.9D1A880FB@code0.codespeak.net> Author: scoder Date: Wed Sep 12 14:56:01 2007 New Revision: 46501 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/serializer.pxi Log: work around libxml2's failure to serialise namespace declarations Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 12 14:56:01 2007 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* lxml failed to serialise namespace declarations of elements other than the + root node of a tree + * Race condition in XSLT where the resolver context leaked between concurrent XSLT calls Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Wed Sep 12 14:56:01 2007 @@ -102,26 +102,6 @@ c_root.children = c_root.last = NULL 
tree.xmlFreeDoc(c_doc) -cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node): - """Copy the namespaces of all ancestors of c_from_node to c_to_node. - - This is used in _fakeRootDoc() to avoid loosing namespace declarations. - """ - cdef xmlNode* c_parent - cdef xmlNs* c_ns - cdef xmlNs* c_new_ns - cdef int prefix_known - c_parent = c_from_node.parent - while c_parent is not NULL and tree._isElementOrXInclude(c_parent): - c_new_ns = c_parent.nsDef - while c_new_ns is not NULL: - # check if prefix is already defined - c_ns = tree.xmlSearchNs(c_to_node.doc, c_to_node, c_new_ns.prefix) - if c_ns is NULL: - tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix) - c_new_ns = c_new_ns.next - c_parent = c_parent.parent - ################################################################################ # support for freeing tree elements when proxy objects are destroyed @@ -182,6 +162,25 @@ ################################################################################ # fix _Document references and namespaces when a node changes documents +cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node): + """Copy the namespaces of all ancestors of c_from_node to c_to_node. + """ + cdef xmlNode* c_parent + cdef xmlNs* c_ns + cdef xmlNs* c_new_ns + cdef int prefix_known + c_parent = c_from_node.parent + while c_parent is not NULL and (tree._isElementOrXInclude(c_parent) or + c_parent.type == tree.XML_DOCUMENT_NODE): + c_new_ns = c_parent.nsDef + while c_new_ns is not NULL: + # check if prefix is already defined + c_ns = tree.xmlSearchNs(c_to_node.doc, c_to_node, c_new_ns.prefix) + if c_ns is NULL: + tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix) + c_new_ns = c_new_ns.next + c_parent = c_parent.parent + cdef void moveNodeToDocument(_Document doc, xmlNode* c_element): """Fix the xmlNs pointers of a node and its subtree that were moved. 
Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Wed Sep 12 14:56:01 2007 @@ -79,14 +79,33 @@ int write_complete_document, int pretty_print): cdef xmlDoc* c_doc + cdef xmlNode* c_nsdecl_node c_doc = c_node.doc if write_xml_declaration: _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) + # write internal DTD subset, preceding PIs/comments, etc. if write_complete_document: _writeDtdToBuffer(c_buffer, c_doc, c_node.name, encoding) _writePrevSiblings(c_buffer, c_node, encoding, pretty_print) - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, pretty_print, encoding) + + # copy the node and add namespaces from parents to make libxml write them + c_nsdecl_node = tree.xmlCopyNode(c_node, 2) + _copyParentNamespaces(c_node, c_nsdecl_node) + + c_nsdecl_node.parent = c_node.parent + c_nsdecl_node.children = c_node.children + c_nsdecl_node.last = c_node.last + + # write node + tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0, + pretty_print, encoding) + + # clean up + c_nsdecl_node.children = c_nsdecl_node.last = NULL + tree.xmlFreeNode(c_nsdecl_node) + + # write tail, trailing comments, etc. 
_writeTail(c_buffer, c_node, encoding, pretty_print) if write_complete_document: _writeNextSiblings(c_buffer, c_node, encoding, pretty_print) From scoder at codespeak.net Wed Sep 12 15:09:06 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 15:09:06 +0200 (CEST) Subject: [Lxml-checkins] r46502 - lxml/trunk/src/lxml/tests Message-ID: <20070912130906.90B5480FB@code0.codespeak.net> Author: scoder Date: Wed Sep 12 15:09:06 2007 New Revision: 46502 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test cases for serialising namespace declarations Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 12 15:09:06 2007 @@ -1986,6 +1986,37 @@ del one self.assertEquals('{http://a.b.c}baz', baz.tag) + def test_ns_decl(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + self.assertEquals(["http://a.b.c"], nsdecl) + + def test_ns_decl_default(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + self.assertEquals(["http://a.b.c"], nsdecl) + + def test_ns_decl_root(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + + self.assertEquals(["http://a.b.c"], nsdecl) + def test_attribute_xmlns_move(self): Element = self.etree.Element From scoder at codespeak.net Wed Sep 12 15:24:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 15:24:48 +0200 (CEST) Subject: [Lxml-checkins] r46503 - lxml/trunk/src/lxml Message-ID: 
<20070912132448.CD9D98102@code0.codespeak.net> Author: scoder Date: Wed Sep 12 15:24:48 2007 New Revision: 46503 Modified: lxml/trunk/src/lxml/serializer.pxi Log: it's enough to work around the ns serialisation problem for nodes inside the tree, not the root node Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Wed Sep 12 15:24:48 2007 @@ -89,21 +89,25 @@ _writeDtdToBuffer(c_buffer, c_doc, c_node.name, encoding) _writePrevSiblings(c_buffer, c_node, encoding, pretty_print) - # copy the node and add namespaces from parents to make libxml write them - c_nsdecl_node = tree.xmlCopyNode(c_node, 2) - _copyParentNamespaces(c_node, c_nsdecl_node) - - c_nsdecl_node.parent = c_node.parent - c_nsdecl_node.children = c_node.children - c_nsdecl_node.last = c_node.last + c_nsdecl_node = c_node + if c_node.parent is NULL or c_node.parent.type != tree.XML_DOCUMENT_NODE: + # copy the node and add namespaces from parents + # this is required to make libxml write them + c_nsdecl_node = tree.xmlCopyNode(c_node, 2) + _copyParentNamespaces(c_node, c_nsdecl_node) + + c_nsdecl_node.parent = c_node.parent + c_nsdecl_node.children = c_node.children + c_nsdecl_node.last = c_node.last # write node tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding) - # clean up - c_nsdecl_node.children = c_nsdecl_node.last = NULL - tree.xmlFreeNode(c_nsdecl_node) + if c_nsdecl_node is not c_node: + # clean up + c_nsdecl_node.children = c_nsdecl_node.last = NULL + tree.xmlFreeNode(c_nsdecl_node) # write tail, trailing comments, etc. _writeTail(c_buffer, c_node, encoding, pretty_print) From scoder at codespeak.net Wed Sep 12 15:25:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 15:25:31 +0200 (CEST) Subject: [Lxml-checkins] r46504 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070912132531.85E1A8102@code0.codespeak.net> Author: scoder Date: Wed Sep 12 15:25:31 2007 New Revision: 46504 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/proxy.pxi lxml/branch/lxml-1.3/src/lxml/serializer.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Log: merged in namespace serialisation fix from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Wed Sep 12 15:25:31 2007 @@ -2,6 +2,19 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* lxml failed to serialise namespace declarations of elements other than the + root node of a tree + + 1.3.4 (2007-08-30) ================== Modified: lxml/branch/lxml-1.3/src/lxml/proxy.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/proxy.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/proxy.pxi Wed Sep 12 15:25:31 2007 @@ -97,26 +97,6 @@ c_root.children = c_root.last = NULL tree.xmlFreeDoc(c_doc) -cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node): - """Copy the namespaces of all ancestors of c_from_node to c_to_node. - - This is used in _fakeRootDoc() to avoid loosing namespace declarations. 
- """ - cdef xmlNode* c_parent - cdef xmlNs* c_ns - cdef xmlNs* c_new_ns - cdef int prefix_known - c_parent = c_from_node.parent - while c_parent is not NULL and tree._isElementOrXInclude(c_parent): - c_new_ns = c_parent.nsDef - while c_new_ns is not NULL: - # check if prefix is already defined - c_ns = tree.xmlSearchNs(c_to_node.doc, c_to_node, c_new_ns.prefix) - if c_ns is NULL: - tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix) - c_new_ns = c_new_ns.next - c_parent = c_parent.parent - ################################################################################ # support for freeing tree elements when proxy objects are destroyed @@ -177,6 +157,25 @@ ################################################################################ # fix _Document references and namespaces when a node changes documents +cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node): + """Copy the namespaces of all ancestors of c_from_node to c_to_node. + """ + cdef xmlNode* c_parent + cdef xmlNs* c_ns + cdef xmlNs* c_new_ns + cdef int prefix_known + c_parent = c_from_node.parent + while c_parent is not NULL and (tree._isElementOrXInclude(c_parent) or + c_parent.type == tree.XML_DOCUMENT_NODE): + c_new_ns = c_parent.nsDef + while c_new_ns is not NULL: + # check if prefix is already defined + c_ns = tree.xmlSearchNs(c_to_node.doc, c_to_node, c_new_ns.prefix) + if c_ns is NULL: + tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix) + c_new_ns = c_new_ns.next + c_parent = c_parent.parent + cdef void moveNodeToDocument(_Document doc, xmlNode* c_element): """Fix the xmlNs pointers of a node and its subtree that were moved. 
Modified: lxml/branch/lxml-1.3/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/serializer.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/serializer.pxi Wed Sep 12 15:25:31 2007 @@ -79,14 +79,37 @@ int write_complete_document, int pretty_print): cdef xmlDoc* c_doc + cdef xmlNode* c_nsdecl_node c_doc = c_node.doc if write_xml_declaration: _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) + # write internal DTD subset, preceding PIs/comments, etc. if write_complete_document: _writeDtdToBuffer(c_buffer, c_doc, c_node.name, encoding) _writePrevSiblings(c_buffer, c_node, encoding, pretty_print) - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, pretty_print, encoding) + + c_nsdecl_node = c_node + if c_node.parent is NULL or c_node.parent.type != tree.XML_DOCUMENT_NODE: + # copy the node and add namespaces from parents + # this is required to make libxml write them + c_nsdecl_node = tree.xmlCopyNode(c_node, 2) + _copyParentNamespaces(c_node, c_nsdecl_node) + + c_nsdecl_node.parent = c_node.parent + c_nsdecl_node.children = c_node.children + c_nsdecl_node.last = c_node.last + + # write node + tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0, + pretty_print, encoding) + + if c_nsdecl_node is not c_node: + # clean up + c_nsdecl_node.children = c_nsdecl_node.last = NULL + tree.xmlFreeNode(c_nsdecl_node) + + # write tail, trailing comments, etc. 
_writeTail(c_buffer, c_node, encoding, pretty_print) if write_complete_document: _writeNextSiblings(c_buffer, c_node, encoding, pretty_print) Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Wed Sep 12 15:25:31 2007 @@ -1969,6 +1969,37 @@ del one self.assertEquals('{http://a.b.c}baz', baz.tag) + def test_ns_decl(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + self.assertEquals(["http://a.b.c"], nsdecl) + + def test_ns_decl_default(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + self.assertEquals(["http://a.b.c"], nsdecl) + + def test_ns_decl_root(self): + tostring = self.etree.tostring + root = self.etree.XML( + '') + baz = root[0][0] + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + tostring(baz)) + + self.assertEquals(["http://a.b.c"], nsdecl) + def test_attribute_xmlns_move(self): Element = self.etree.Element From scoder at codespeak.net Wed Sep 12 21:53:25 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 21:53:25 +0200 (CEST) Subject: [Lxml-checkins] r46512 - lxml/trunk/src/lxml Message-ID: <20070912195325.ABC3080F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 21:53:24 2007 New Revision: 46512 Modified: lxml/trunk/src/lxml/objectify.pyx Log: support external parser in objectify.fromstring() Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Wed Sep 12 21:53:24 2007 @@ 
-1568,12 +1568,15 @@ cdef object _fromstring _fromstring = etree.fromstring -def fromstring(xml): - """Objectify specific version of the lxml.etree fromstring() function. +def fromstring(xml, parser=None): + """Objectify specific version of the lxml.etree fromstring() function + that uses the objectify parser. - NOTE: requires parser based element class lookup activated in lxml.etree! + You can pass a different parser as second argument. """ - return _fromstring(xml, objectify_parser) + if parser is None: + parser = objectify_parser + return _fromstring(xml, parser) XML = fromstring From scoder at codespeak.net Wed Sep 12 21:54:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 21:54:30 +0200 (CEST) Subject: [Lxml-checkins] r46513 - lxml/trunk Message-ID: <20070912195430.32A5880F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 21:54:29 2007 New Revision: 46513 Removed: lxml/trunk/cython-with-GIL-simple.patch Log: removed Cython patch after release of Cython 0.9.6.6 Deleted: /lxml/trunk/cython-with-GIL-simple.patch ============================================================================== --- /lxml/trunk/cython-with-GIL-simple.patch Wed Sep 12 21:54:29 2007 +++ (empty file) @@ -1,236 +0,0 @@ -diff -r 43be72844df4 Compiler/Code.py ---- a/Compiler/Code.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/Code.py Mon Sep 10 20:13:13 2007 +0200 -@@ -284,6 +284,13 @@ class CCodeWriter: - # code = "((PyObject*)%s)" % code - self.put_init_to_py_none(code, entry.type) - -+ def put_py_gil_state_ensure(self, cname): -+ self.putln("PyGILState_STATE %s;" % cname) -+ self.putln("%s = PyGILState_Ensure();" % cname) -+ -+ def put_py_gil_state_release(self, cname): -+ self.putln("PyGILState_Release(%s);" % cname) -+ - def put_pymethoddef(self, entry, term): - if entry.doc: - doc_code = entry.doc_cname -diff -r 43be72844df4 Compiler/ExprNodes.py ---- a/Compiler/ExprNodes.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/ExprNodes.py Mon Sep 
10 22:47:05 2007 +0200 -@@ -473,7 +473,7 @@ class ExprNode(Node): - else: # neither src nor dst are py types - # Added the string comparison, since for c types that - # is enough, but SageX gets confused when the types are -- # in different files. -+ # in different files. - if not (str(src.type) == str(dst_type) or dst_type.assignable_from(src_type)): - error(self.pos, "Cannot assign type '%s' to '%s'" % - (src.type, dst_type)) -diff -r 43be72844df4 Compiler/Naming.py ---- a/Compiler/Naming.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/Naming.py Mon Sep 10 20:13:13 2007 +0200 -@@ -53,5 +53,6 @@ stringtab_cname = pyrex_prefix + "strin - stringtab_cname = pyrex_prefix + "string_tab" - vtabslot_cname = pyrex_prefix + "vtab" - c_api_tab_cname = pyrex_prefix + "c_api_tab" -+gilstate_cname = pyrex_prefix + "state" - - extern_c_macro = pyrex_prefix.upper() + "EXTERN_C" -diff -r 43be72844df4 Compiler/Nodes.py ---- a/Compiler/Nodes.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/Nodes.py Mon Sep 10 20:13:13 2007 +0200 -@@ -282,6 +282,7 @@ class CFuncDeclaratorNode(CDeclaratorNod - # has_varargs boolean - # exception_value ConstNode - # exception_check boolean True if PyErr_Occurred check needed -+ # with_gil boolean True if GIL should be grabbed/released - - def analyse(self, return_type, env): - func_type_args = [] -@@ -317,7 +318,8 @@ class CFuncDeclaratorNode(CDeclaratorNod - exc_check = self.exception_check - func_type = PyrexTypes.CFuncType( - return_type, func_type_args, self.has_varargs, -- exception_value = exc_val, exception_check = exc_check) -+ exception_value = exc_val, exception_check = exc_check, -+ with_gil = self.with_gil) - return self.base.analyse(func_type, env) - - -@@ -572,6 +574,8 @@ class FuncDefNode(StatNode, BlockNode): - self.generate_keyword_list(code) - # ----- Extern library function declarations - lenv.generate_library_function_declarations(code) -+ # ----- Grab GIL -+ self.generate_grab_gil(code) - # ----- Fetch arguments - 
self.generate_argument_parsing_code(code) - self.generate_argument_increfs(lenv, code) -@@ -623,6 +627,9 @@ class FuncDefNode(StatNode, BlockNode): - code.put_var_decrefs(lenv.var_entries, used_only = 1) - code.put_var_decrefs(lenv.arg_entries) - self.put_stararg_decrefs(code) -+ # ----- Release GIL -+ self.generate_release_gil(code) -+ # ----- Return - if not self.return_type.is_void: - retval_code = Naming.retval_cname - #if self.return_type.is_extension_type: -@@ -651,6 +658,12 @@ class FuncDefNode(StatNode, BlockNode): - code.put_var_incref(entry) - - def generate_execution_code(self, code): -+ pass -+ -+ def generate_grab_gil(self, code): -+ pass -+ -+ def generate_release_gil(self, code): - pass - - -@@ -756,7 +769,19 @@ class CFuncDefNode(FuncDefNode): - else: - error(arg.pos, "Cannot test type of extern C class " - "without type object name specification") -- -+ -+ def generate_grab_gil(self, code): -+ if self.entry.type.with_gil: -+ code.putln("") -+ code.put_py_gil_state_ensure(Naming.gilstate_cname) -+ code.putln("") -+ -+ def generate_release_gil(self, code): -+ if self.entry.type.with_gil: -+ code.putln("") -+ code.put_py_gil_state_release(Naming.gilstate_cname) -+ code.putln("") -+ - def error_value(self): - if self.return_type.is_pyobject: - return "0" -diff -r 43be72844df4 Compiler/Parsing.py ---- a/Compiler/Parsing.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/Parsing.py Tue Sep 11 21:06:49 2007 +0200 -@@ -5,7 +5,7 @@ import os, re - import os, re - from string import join, replace - from types import ListType, TupleType --from Scanning import PyrexScanner -+from Scanning import PyrexScanner, function_contexts - import Nodes - import ExprNodes - from ModuleNode import ModuleNode -@@ -1462,10 +1462,10 @@ def p_c_declarator(s, empty = 0, is_type - args = p_c_arg_list(s, in_pyfunc = 0, cmethod_flag = cmethod_flag) - ellipsis = p_optional_ellipsis(s) - s.expect(')') -- exc_val, exc_check = p_exception_value_clause(s) -+ options = 
p_c_func_options(s) - result = Nodes.CFuncDeclaratorNode(pos, - base = result, args = args, has_varargs = ellipsis, -- exception_value = exc_val, exception_check = exc_check) -+ **options) - cmethod_flag = 0 - return result - -@@ -1483,6 +1483,37 @@ def p_exception_value_clause(s): - s.next() - exc_val = p_simple_expr(s) #p_exception_value(s) - return exc_val, exc_check -+ -+def p_c_with(s): -+ if s.sy == 'with': -+ s.next() -+ return p_ident_list(s) -+ return () -+ -+def p_c_func_options(s): -+ exc_val = None -+ exc_check = 0 -+ contexts = [] -+ -+ if s.sy == 'except': -+ exc_val, exc_check = p_exception_value_clause(s) -+ contexts = p_c_with(s) -+ elif s.sy == 'with': -+ contexts = p_c_with(s) -+ exc_val, exc_check = p_exception_value_clause(s) -+ -+ for context in contexts: -+ if context not in function_contexts: -+ s.error("Unknown context: " + context) -+ return None -+ -+ ret = { -+ 'exception_value': exc_val, -+ 'exception_check': exc_check, -+ 'with_gil': 'GIL' in contexts, -+ } -+ -+ return ret - - #def p_exception_value(s): - # sign = "" -diff -r 43be72844df4 Compiler/PyrexTypes.py ---- a/Compiler/PyrexTypes.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/PyrexTypes.py Tue Sep 11 12:07:03 2007 +0200 -@@ -488,16 +488,18 @@ class CFuncType(CType): - # has_varargs boolean - # exception_value string - # exception_check boolean True if PyErr_Occurred check needed -+ # with_gil boolean True if GIL should be grabbed/released - - is_cfunction = 1 - - def __init__(self, return_type, args, has_varargs, -- exception_value = None, exception_check = 0): -+ exception_value = None, exception_check = 0, with_gil = False): - self.return_type = return_type - self.args = args - self.has_varargs = has_varargs - self.exception_value = exception_value - self.exception_check = exception_check -+ self.with_gil = with_gil - - def __repr__(self): - arg_reprs = map(repr, self.args) -@@ -580,6 +582,7 @@ class CFuncType(CType): - if not arg_decl_code and not pyrex: - arg_decl_code 
= "void" - exc_clause = "" -+ with_gil_clause = "" - if pyrex or for_display: - if self.exception_value and self.exception_check: - exc_clause = " except? %s" % self.exception_value -@@ -587,8 +590,11 @@ class CFuncType(CType): - exc_clause = " except %s" % self.exception_value - elif self.exception_check: - exc_clause = " except *" -+ if self.with_gil: -+ with_gil_clause = " with GIL" - return self.return_type.declaration_code( -- "(%s(%s)%s)" % (entity_code, arg_decl_code, exc_clause), -+ "(%s(%s)%s%s)" % (entity_code, arg_decl_code, -+ exc_clause, with_gil_clause), - for_display, dll_linkage, pyrex) - - -diff -r 43be72844df4 Compiler/Scanning.py ---- a/Compiler/Scanning.py Mon Sep 03 20:07:01 2007 +0200 -+++ b/Compiler/Scanning.py Tue Sep 11 21:05:33 2007 +0200 -@@ -138,7 +138,11 @@ reserved_words = [ - "raise", "import", "exec", "try", "except", "finally", - "while", "if", "elif", "else", "for", "in", "assert", - "and", "or", "not", "is", "in", "lambda", "from", -- "NULL", "cimport", "by" -+ "NULL", "cimport", "by", "with" -+] -+ -+function_contexts = [ # allowed arguments to the "with" option -+ "GIL" - ] - - class Method: From scoder at codespeak.net Wed Sep 12 21:59:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 21:59:56 +0200 (CEST) Subject: [Lxml-checkins] r46514 - lxml/trunk/src/lxml Message-ID: <20070912195956.925C280F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 21:59:56 2007 New Revision: 46514 Added: lxml/trunk/src/lxml/parsertarget.pxi Log: parser target implementation was missing Added: lxml/trunk/src/lxml/parsertarget.pxi ============================================================================== --- (empty file) +++ lxml/trunk/src/lxml/parsertarget.pxi Wed Sep 12 21:59:56 2007 @@ -0,0 +1,192 @@ +# Parser target context (ET target interface) + +class _TargetParserResult(Exception): + # Admittedly, this is somewhat ugly, but it's the easiest way + # to push the Python level parser result through 
the parser + # machinery towards the API level functions + def __init__(self, result): + self.result = result + +cdef class _TargetParserContext(_ParserContext): + """This class maps SAX2 events to the ET parser target interface. + """ + cdef object _target + cdef object _target_start + cdef object _target_end + cdef object _target_data + cdef object _target_doctype + cdef object _target_pi + cdef object _target_comment + + def __init__(self, target): + _ParserContext.__init__(self) + self._target = target + + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): + "wrap original SAX2 callbacks" + cdef xmlparser.xmlSAXHandler* sax + _ParserContext._initParserContext(self, c_ctxt) + sax = c_ctxt.sax + cstd.memset(sax, 0, sizeof(xmlparser.xmlSAXHandler)) + try: + self._target_start = self._target.start + if self._target_start is not None: + sax.startElementNs = _targetSaxStart + except AttributeError: + pass + try: + self._target_end = self._target.end + if self._target_end is not None: + sax.endElementNs = _targetSaxEnd + except AttributeError: + pass + try: + self._target_data = self._target.data + if self._target_data is not None: + sax.characters = _targetSaxData + except AttributeError: + pass + try: + self._target_doctype = self._target.doctype + if self._target_doctype is not None: + sax.internalSubset = _targetSaxDoctype + except AttributeError: + pass + try: + self._target_pi = self._target.pi + if self._target_pi is not None: + sax.processingInstruction = _targetSaxPI + except AttributeError: + pass + try: + self._target_comment = self._target.comment + if self._target_comment is not None: + sax.startElementNs = _targetSaxStart + except AttributeError: + pass + + sax.initialized = xmlparser.XML_SAX2_MAGIC + + cdef object _handleParseResult(self, _BaseParser parser, xmlDoc* result, + filename): + self._raise_if_stored() + return self._target.close() + + cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser, + xmlDoc* result, filename) 
except NULL: + self._raise_if_stored() + raise _TargetParserResult(self._target.close()) + + +cdef void _targetSaxStart(void* ctxt, char* c_localname, char* c_prefix, + char* c_namespace, int c_nb_namespaces, + char** c_namespaces, + int c_nb_attributes, int c_nb_defaulted, + char** c_attributes) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + cdef int i + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + tag = _namespacedNameFromNsName(c_namespace, c_localname) + if c_nb_defaulted > 0: + # only add default attributes if we asked for them + if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0: + c_nb_attributes = c_nb_attributes - c_nb_defaulted + attrib = {} + for i from 0 <= i < c_nb_attributes: + name = _namespacedNameFromNsName( + c_attributes[2], c_attributes[0]) + if c_attributes[3] is NULL: + value = "" + else: + value = python.PyUnicode_DecodeUTF8( + c_attributes[3], c_attributes[4] - c_attributes[3], + "strict") + python.PyDict_SetItem(attrib, name, value) + c_attributes = c_attributes + 5 + context._target_start(tag, attrib) + except: + _handleSaxTargetException(context, c_ctxt) + +cdef void _targetSaxEnd(void* ctxt, char* c_localname, char* c_prefix, + char* c_namespace) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + tag = _namespacedNameFromNsName(c_namespace, c_localname) + context._target_end(tag) + except: + _handleSaxTargetException(context, c_ctxt) + +cdef void _targetSaxData(void* ctxt, char* c_data, int data_len) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + context._target_data( + python.PyUnicode_DecodeUTF8(c_data, data_len, NULL)) + except: 
+ _handleSaxTargetException(context, c_ctxt) + +cdef void _targetSaxDoctype(void* ctxt, char* c_name, char* c_public, + char* c_system) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + if c_public is not NULL: + public_id = funicode(c_public) + if c_system is not NULL: + system_id = funicode(c_system) + context._target_doctype( + funicode(c_name), public_id, system_id) + except: + _handleSaxTargetException(context, c_ctxt) + +cdef void _targetSaxPI(void* ctxt, char* c_target, char* c_data) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + if c_data is not NULL: + data = funicode(c_data) + context._target_pi(funicode(c_target), data) + except: + _handleSaxTargetException(context, c_ctxt) + +cdef void _targetSaxComment(void* ctxt, char* c_data, int data_len) with GIL: + cdef _TargetParserContext context + cdef xmlparser.xmlParserCtxt* c_ctxt + c_ctxt = ctxt + if c_ctxt._private is NULL: + return + context = <_TargetParserContext>c_ctxt._private + try: + context._target_comment( + python.PyUnicode_DecodeUTF8(c_data, data_len, NULL)) + except: + _handleSaxTargetException(context, c_ctxt) + +cdef void _handleSaxTargetException(_TargetParserContext context, + xmlparser.xmlParserCtxt* c_ctxt): + context._store_raised() + if c_ctxt.errNo == xmlerror.XML_ERR_OK: + c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR + c_ctxt.disableSAX = 1 From scoder at codespeak.net Wed Sep 12 22:03:38 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 22:03:38 +0200 (CEST) Subject: [Lxml-checkins] r46515 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20070912200338.6B66D80F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 22:03:37 2007 New Revision: 46515 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/compatibility.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: accept QName objects as values for attributes and element text, and replace their namespace by the resolved prefix Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 12 22:03:37 2007 @@ -8,6 +8,9 @@ Features added -------------- +* Setting a QName object as value of the .text property or as an attribute + will resolve its prefix in the respective context + * ElementTree-like parser target interface as described in http://effbot.org/elementtree/elementtree-xmlparser.htm Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Wed Sep 12 22:03:37 2007 @@ -155,6 +155,15 @@ ElementTree, you cannot pass it as a keyword argument to the Element and SubElement factories directly. +* ElementTree allows QName objects as attribute values and resolves their + prefix on serialisation (e.g. an attribute value ``QName("{myns}myname")`` + becomes "p:myname" if "p" is the namespace prefix of "myns"). lxml.etree + also allows you to set attribute values from QName instances (and also .text + values), but it resolves their prefix immediately and stores the plain text + value. So, if prefixes are modified later on, e.g. by moving a subtree to a + different tree (which reassigns the prefix mappings), the text values will + not be updated and you might end up with an undefined prefix. 
+ * etree elements can be copied using ``copy.deepcopy()`` and ``copy.copy()``, just like ElementTree's. However, ``copy.copy()`` does *not* create a shallow copy where elements are shared between trees, as this makes no sense Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Sep 12 22:03:37 2007 @@ -244,7 +244,10 @@ ns, tag = _getNsTag(key) _attributeValidOrRaise(tag) c_tag = _cstr(tag) - value = _utf8(value) + if isinstance(value, QName): + value = _resolveQNameText(element, value) + else: + value = _utf8(value) c_value = _cstr(value) if ns is None: tree.xmlSetProp(element._c_node, c_tag, c_value) @@ -413,6 +416,16 @@ tree.xmlAddNextSibling(c_node, c_text_node) return 0 +cdef _resolveQNameText(_Element element, value): + cdef xmlNs* c_ns + ns, tag = _getNsTag(value) + if ns is None: + return tag + else: + c_ns = element._doc._findOrBuildNodeNs( + element._c_node, _cstr(ns), NULL) + return '%s:%s' % (c_ns.prefix, tag) + cdef xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): if index < 0: return _findChildBackwards(c_node, -index - 1) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed Sep 12 22:03:37 2007 @@ -217,7 +217,8 @@ else: if not _isString(text_or_uri): text_or_uri = str(text_or_uri) - _tagValidOrRaise(_utf8(text_or_uri)) + tag = _getNsTag(text_or_uri)[1] + _tagValidOrRaise(tag) self.text = text_or_uri def __str__(self): return self.text @@ -739,10 +740,13 @@ """ def __get__(self): return _collectText(self._c_node.children) - + def __set__(self, value): + if isinstance(value, QName): + value = python.PyUnicode_FromEncodedObject( + _resolveQNameText(self, value), 'UTF-8', 'strict') _setNodeText(self._c_node, value) - + property tail: 
"""Text after this element's end tag, but before the next sibling element's start tag. This is either a string or the value None, if Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Wed Sep 12 22:03:37 2007 @@ -124,6 +124,15 @@ self.assertRaises(ValueError, QName, 'na me') self.assertRaises(ValueError, QName, 'test', ' name') + def test_qname_text_resolve(self): + # ET doesn't resolve QNames as text values + etree = self.etree + qname = etree.QName('http://myns', 'a') + a = etree.Element(qname, nsmap={'p' : 'http://myns'}) + a.text = qname + + self.assertEquals("p:a", a.text) + def test_attribute_set(self): Element = self.etree.Element root = Element("root") From scoder at codespeak.net Wed Sep 12 22:06:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 22:06:10 +0200 (CEST) Subject: [Lxml-checkins] r46516 - in lxml/trunk: . src/lxml/tests Message-ID: <20070912200610.36B2B80F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 22:06:09 2007 New Revision: 46516 Modified: lxml/trunk/selftest.py lxml/trunk/selftest2.py lxml/trunk/src/lxml/tests/test_elementtree.py Log: loads of test cases for QName resolving and serialisation Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Wed Sep 12 22:06:09 2007 @@ -23,6 +23,7 @@ def fix_compatibility(xml_data): xml_data = re.sub('\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"', '', xml_data) + xml_data = xml_data.replace(' />', '/>') return xml_data def serialize(elem, encoding=None): @@ -437,65 +438,65 @@ 'text' """ -## def encoding(): -## r""" -## Test encoding issues. 
-## >>> elem = ElementTree.Element("tag") -## >>> elem.text = u"abc" -## >>> serialize(elem) -## 'abc' -## >>> serialize(elem, "utf-8") -## 'abc' -## >>> serialize(elem, "us-ascii") -## 'abc' -## >>> serialize(elem, "iso-8859-1") -## "\nabc" + >>> elem = ElementTree.Element("tag") + >>> elem.text = u"abc" + >>> serialize(elem) + 'abc' + >>> serialize(elem, "utf-8") + 'abc' + >>> serialize(elem, "us-ascii") + 'abc' + >>> serialize(elem, "ISO-8859-1") + "\nabc" -## >>> elem.text = "<&\"\'>" -## >>> serialize(elem) -## '<&"\'>' -## >>> serialize(elem, "utf-8") -## '<&"\'>' -## >>> serialize(elem, "us-ascii") # cdata characters -## '<&"\'>' -## >>> serialize(elem, "iso-8859-1") -## '\n<&"\'>' + >>> elem.text = "<&\"\'>" + >>> serialize(elem) + '<&"\'>' + >>> serialize(elem, "utf-8") + '<&"\'>' + >>> serialize(elem, "us-ascii") # cdata characters + '<&"\'>' + >>> serialize(elem, "ISO-8859-1") + '\n<&"\'>' ## >>> elem.attrib["key"] = "<&\"\'>" ## >>> elem.text = None ## >>> serialize(elem) -## '' +## '' ## >>> serialize(elem, "utf-8") -## '' +## '' ## >>> serialize(elem, "us-ascii") -## '' +## '' ## >>> serialize(elem, "iso-8859-1") -## '\n' +## '\n' -## >>> elem.text = u'\xe5\xf6\xf6<>' -## >>> elem.attrib.clear() -## >>> serialize(elem) -## 'åöö<>' -## >>> serialize(elem, "utf-8") -## '\xc3\xa5\xc3\xb6\xc3\xb6<>' -## >>> serialize(elem, "us-ascii") -## 'åöö<>' -## >>> serialize(elem, "iso-8859-1") -## "\n\xe5\xf6\xf6<>" + >>> elem.text = u'\xe5\xf6\xf6<>' + >>> elem.attrib.clear() + >>> serialize(elem) + 'åöö<>' + >>> serialize(elem, "utf-8") + '\xc3\xa5\xc3\xb6\xc3\xb6<>' + >>> serialize(elem, "us-ascii") + 'åöö<>' + >>> serialize(elem, "ISO-8859-1") + "\n\xe5\xf6\xf6<>" ## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' ## >>> elem.text = None ## >>> serialize(elem) -## '' +## '' ## >>> serialize(elem, "utf-8") -## '' +## '' ## >>> serialize(elem, "us-ascii") -## '' -## >>> serialize(elem, "iso-8859-1") -## '\n' +## '' +## >>> serialize(elem, "ISO-8859-1") +## '\n' -## 
""" + """ ENTITY_XML = """\ >> elem = ElementTree.XML("") >>> serialize(elem) # 1.1 '' - -# '' # ElementTree produces an extra blank """ def namespace(): @@ -560,33 +559,35 @@ """ -## def qname(): -## """ -## Test QName handling. +def qname(): + """ + Test QName handling. + + 1) decorated tags -## 1) decorated tags + >>> elem = ElementTree.Element("{uri}tag") + >>> serialize(elem) # 1.1 + '' + >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag")) + >>> serialize(elem) # 1.2 + '' + >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag")) + >>> serialize(elem) # 1.3 + '' -## >>> elem = ElementTree.Element("{uri}tag") -## >>> serialize(elem) # 1.1 -## '' -## >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag")) -## >>> serialize(elem) # 1.2 -## '' -## >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag")) -## >>> serialize(elem) # 1.3 -## '' +# ns/attribute order ... ## 2) decorated attributes ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "value" ## >>> serialize(elem) # 2.1 -## '' +## '' ## >>> elem.clear() ## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value" ## >>> serialize(elem) # 2.2 -## '' +## '' ## 3) decorated values are not converted by default, but the ## QName wrapper can be used for values @@ -594,12 +595,12 @@ ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = "{uri}value" ## >>> serialize(elem) # 3.1 -## '' +## '' ## >>> elem.clear() ## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value") ## >>> serialize(elem) # 3.2 -## '' +## '' ## >>> elem.clear() ## >>> subelem = ElementTree.Element("tag") @@ -607,9 +608,9 @@ ## >>> elem.append(subelem) ## >>> elem.append(subelem) ## >>> serialize(elem) # 3.3 -## '' +## '' -## """ + """ def xpath_tokenizer(p): """ Modified: lxml/trunk/selftest2.py ============================================================================== --- lxml/trunk/selftest2.py (original) +++ lxml/trunk/selftest2.py Wed Sep 12 22:06:09 2007 @@ -22,7 +22,7 @@ tree.write(file, encoding) 
else: tree.write(file) - return file.getvalue() + return file.getvalue().replace(' />', '/>') def summarize(elem): return elem.tag @@ -161,13 +161,13 @@ ## >>> elem.attrib["key"] = "<&\"\'>" ## >>> elem.text = None ## >>> serialize(elem) -## '' +## '' ## >>> serialize(elem, "utf-8") -## '' +## '' ## >>> serialize(elem, "us-ascii") -## '' +## '' ## >>> serialize(elem, "iso-8859-1") -## '\n' +## '\n' >>> elem.text = u'\xe5\xf6\xf6<>' >>> elem.attrib.clear() @@ -183,13 +183,13 @@ ## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' ## >>> elem.text = None ## >>> serialize(elem) -## '' +## '' ## >>> serialize(elem, "utf-8") -## '' +## '' ## >>> serialize(elem, "us-ascii") -## '' +## '' ## >>> serialize(elem, "iso-8859-1") -## '\n' +## '\n' """ @@ -290,25 +290,28 @@ ['{http://effbot.org/ns}tag', '{http://effbot.org/ns}tag', '{http://effbot.org/ns}tag'] """ -# XXX gives a segfault +# XXX only deep copying is supported -## def copy(): -## """ -## Test copy handling (etc). +def copy(): + """ + Test copy handling (etc). 
-## >>> import copy -## >>> e1 = unserialize("hello") -## >>> e2 = copy.copy(e1) -## >>> e3 = copy.deepcopy(e1) -## >>> e1.find("foo").tag = "bar" -## >>> serialize(e1) -## 'hello' -## >>> serialize(e2) -## 'hello' -## >>> serialize(e3) -## 'hello' + >>> import copy + >>> e1 = unserialize("hello") + >>> # e2 = copy.copy(e1) + >>> e3 = copy.deepcopy(e1) + >>> e1.find("foo").tag = "bar" -## """ + >>> serialize(e1).replace(' ', '') + 'hello' + +## >>> serialize(e2).replace(' ', '') +## 'hello' + + >>> serialize(e3).replace(' ', '') + 'hello' + + """ def attrib(): """ Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 12 22:06:09 2007 @@ -1986,7 +1986,7 @@ del one self.assertEquals('{http://a.b.c}baz', baz.tag) - def test_ns_decl(self): + def test_ns_decl_tostring(self): tostring = self.etree.tostring root = self.etree.XML( '') @@ -1996,7 +1996,7 @@ tostring(baz)) self.assertEquals(["http://a.b.c"], nsdecl) - def test_ns_decl_default(self): + def test_ns_decl_tostring_default(self): tostring = self.etree.tostring root = self.etree.XML( '') @@ -2006,7 +2006,7 @@ tostring(baz)) self.assertEquals(["http://a.b.c"], nsdecl) - def test_ns_decl_root(self): + def test_ns_decl_tostring_root(self): tostring = self.etree.tostring root = self.etree.XML( '') @@ -2016,6 +2016,19 @@ tostring(baz)) self.assertEquals(["http://a.b.c"], nsdecl) + + def test_ns_decl_tostring_element(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + root = Element("foo") + bar = SubElement(root, "{http://a.b.c}bar") + baz = SubElement(bar, "{http://a.b.c}baz") + + nsdecl = re.findall("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']", + self.etree.tostring(baz)) + + self.assertEquals(["http://a.b.c"], nsdecl) def test_attribute_xmlns_move(self): Element = self.etree.Element @@ -2562,6 
+2575,59 @@ self.assertEquals(qname1, qname1) self.assertEquals(qname1, qname2) + def test_qname_attribute_getset(self): + etree = self.etree + qname = etree.QName('myns', 'a') + + a = etree.Element(qname) + a.set(qname, "value") + + self.assertEquals(a.get(qname), "value") + self.assertEquals(a.get("{myns}a"), "value") + + def test_qname_attrib(self): + etree = self.etree + qname = etree.QName('myns', 'a') + + a = etree.Element(qname) + a.attrib[qname] = "value" + + self.assertEquals(a.attrib[qname], "value") + self.assertEquals(a.attrib.get(qname), "value") + + self.assertEquals(a.attrib["{myns}a"], "value") + self.assertEquals(a.attrib.get("{myns}a"), "value") + + def test_qname_attribute_resolve(self): + etree = self.etree + qname = etree.QName('http://myns', 'a') + a = etree.Element(qname) + a.set(qname, qname) + + self.assertXML( + '', + a) + + def test_qname_attribute_resolve_new(self): + etree = self.etree + qname = etree.QName('http://myns', 'a') + a = etree.Element('a') + a.set('a', qname) + + self.assertXML( + '', + a) + + def test_qname_attrib_resolve(self): + etree = self.etree + qname = etree.QName('http://myns', 'a') + a = etree.Element(qname) + a.attrib[qname] = qname + + self.assertXML( + '', + a) + def test_parser_version(self): etree = self.etree parser = etree.XMLParser() From scoder at codespeak.net Wed Sep 12 22:08:14 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 22:08:14 +0200 (CEST) Subject: [Lxml-checkins] r46517 - lxml/trunk/src/lxml Message-ID: <20070912200814.BF2AC80F1@code0.codespeak.net> Author: scoder Date: Wed Sep 12 22:08:14 2007 New Revision: 46517 Modified: lxml/trunk/src/lxml/etree.pyx Log: small optimisation for _Document creation Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed Sep 12 22:08:14 2007 @@ -382,9 +382,13 @@ if node_ns_utf is not 
None: self._setNodeNs(c_node, _cstr(node_ns_utf)) +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _Document NEW_DOCUMENT "PY_NEW" (object t) + cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser): cdef _Document result - result = _Document() + result = NEW_DOCUMENT(_Document) result._c_doc = c_doc result._ns_counter = 0 if parser is None: From scoder at codespeak.net Wed Sep 12 22:12:08 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 22:12:08 +0200 (CEST) Subject: [Lxml-checkins] r46518 - in lxml/trunk: doc src/lxml/tests Message-ID: <20070912201208.E70A480FF@code0.codespeak.net> Author: scoder Date: Wed Sep 12 22:12:08 2007 New Revision: 46518 Added: lxml/trunk/src/lxml/tests/test_schematron.py Modified: lxml/trunk/doc/validation.txt Log: some test cases for schematron support Modified: lxml/trunk/doc/validation.txt ============================================================================== --- lxml/trunk/doc/validation.txt (original) +++ lxml/trunk/doc/validation.txt Wed Sep 12 22:12:08 2007 @@ -241,7 +241,7 @@ ... ... ... - ... The sum is not 100%. + ... Sum is not 100%. ... ... ... 
Added: lxml/trunk/src/lxml/tests/test_schematron.py ============================================================================== --- (empty file) +++ lxml/trunk/src/lxml/tests/test_schematron.py Wed Sep 12 22:12:08 2007 @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +""" +Test cases related to Schematron parsing and validation +""" + +import unittest, sys + +from common_imports import etree, doctest, HelperTestCase, fileInTestDir + +class ETreeSchematronTestCase(HelperTestCase): + def test_schematron(self): + tree_valid = self.parse('') + tree_invalid = self.parse('') + schema = self.parse('''\ + + + + BBB element is not present + CCC element is not present + + + + + BBB element is not present + CCC element is not present + There is an extra element + + + +''') + schema = etree.Schematron(schema) + self.assert_(schema.validate(tree_valid)) + self.assert_(not schema.validate(tree_invalid)) + + def test_schematron_elementtree_error(self): + self.assertRaises(ValueError, etree.Schematron, etree.ElementTree()) + + def test_schematron_invalid_schema(self): + schema = self.parse('''\ + + + + +''') + self.assertRaises(etree.SchematronParseError, + etree.Schematron, schema) + + def test_schematron_invalid_schema_empty(self): + schema = self.parse('''\ + +''') + self.assertRaises(etree.SchematronParseError, + etree.Schematron, schema) + + def test_schematron_invalid_schema_namespace(self): + # segfault + schema = self.parse('''\ + +''') + self.assertRaises(etree.SchematronParseError, + etree.Schematron, schema) + + +def test_suite(): + suite = unittest.TestSuite() + suite.addTests([unittest.makeSuite(ETreeSchematronTestCase)]) + suite.addTests( + [doctest.DocFileSuite('../../../doc/validation.txt')]) + return suite + +if __name__ == '__main__': + print 'to test use test.py %s' % __file__ From scoder at codespeak.net Wed Sep 12 22:43:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 12 Sep 2007 22:43:48 +0200 (CEST) Subject: [Lxml-checkins] r46520 - 
lxml/trunk/src/lxml Message-ID: <20070912204348.0B48880F0@code0.codespeak.net> Author: scoder Date: Wed Sep 12 22:43:47 2007 New Revision: 46520 Modified: lxml/trunk/src/lxml/extensions.pxi Log: windows compile fix Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Wed Sep 12 22:43:47 2007 @@ -22,7 +22,7 @@ # forward declarations -ctypedef int _register_function(void* ctxt, name_utf, ns_uri_utf) +ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf) cdef class _ExsltRegExp ################################################################################ From scoder at codespeak.net Thu Sep 13 11:15:42 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 11:15:42 +0200 (CEST) Subject: [Lxml-checkins] r46532 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20070913091542.31E2B80C3@code0.codespeak.net> Author: scoder Date: Thu Sep 13 11:15:41 2007 New Revision: 46532 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: itertext() method on elements Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Sep 13 11:15:41 2007 @@ -8,6 +8,8 @@ Features added -------------- +* ``itertext()`` method on Elements + * Setting a QName object as value of the .text property or as an attribute will resolve its prefix in the respective context Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu Sep 13 11:15:41 2007 @@ -1092,6 +1092,17 @@ """ return ElementDepthFirstIterator(self, tag) + def itertext(self, tag=None, 
with_tail=True): + """Iterates over the text content of a subtree. + + You can pass the ``tag`` keyword argument to restrict text content to + a specific tag name. + + You can set the ``with_tail`` keyword argument to ``False`` to skip + over tail text. + """ + return ElementTextIterator(self, tag, with_tail) + def makeelement(self, _tag, attrib=None, nsmap=None, **_extra): """Creates a new element associated with the same document. """ @@ -1897,6 +1908,36 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return NULL +cdef class ElementTextIterator: + """Iterates over the text content of a subtree. + + You can pass the ``tag`` keyword argument to restrict text content to a + specific tag name. + + You can set the ``with_tail`` keyword argument to ``False`` to skip over + tail text. + """ + cdef object _nextEvent + def __init__(self, _Element element not None, tag=None, with_tail=True): + if with_tail: + events = ("start", "end") + else: + events = ("start",) + self._nextEvent = iterwalk(element, events=events, tag=tag).next + + def __iter__(self): + return self + + def __next__(self): + cdef _Element element + while result is None: + event, element = self._nextEvent() + if event == "start": + result = element.text + else: + result = element.tail + return result + cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL: cdef xmlNode* c_node c_node = tree.xmlNewDocNode(c_doc, NULL, _cstr(name_utf), NULL) Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Thu Sep 13 11:15:41 2007 @@ -444,6 +444,7 @@ cdef _Element _end_node(self): cdef _Element node + cdef int i, ns_count node, ns_count = self._pop_node() if self._event_filter & ITERPARSE_FILTER_END: if self._tag_tuple is None or \ Modified: lxml/trunk/src/lxml/tests/test_etree.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Sep 13 11:15:41 2007 @@ -1420,6 +1420,15 @@ [a, b, c], list(a.getiterator('*'))) + def test_itertext(self): + # ET 1.3+ + XML = self.etree.XML + root = XML("RTEXTATAILCTEXTCTAIL") + + text = list(root.itertext()) + self.assertEquals(["RTEXT", "ATAIL", "CTEXT", "CTAIL"], + text) + def test_findall_ns(self): XML = self.etree.XML root = XML('') From scoder at codespeak.net Thu Sep 13 12:47:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 12:47:27 +0200 (CEST) Subject: [Lxml-checkins] r46534 - lxml/trunk Message-ID: <20070913104727.8AF3580A3@code0.codespeak.net> Author: scoder Date: Thu Sep 13 12:47:24 2007 New Revision: 46534 Modified: lxml/trunk/TODO.txt Log: TODO cleanup Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Thu Sep 13 12:47:24 2007 @@ -56,10 +56,8 @@ * clean up (and remove?) duplicated API for extension functions -* remove first 'context' argument from extension functions - -* always use ns-prefixed type names in objectify's ``xsi:type`` attributes - * follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores) * clean support for entities (is the Entity element class enough?) + +* implement 'position' property on ParseError exception From scoder at codespeak.net Thu Sep 13 12:52:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 12:52:26 +0200 (CEST) Subject: [Lxml-checkins] r46535 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070913105226.54C3980C8@code0.codespeak.net> Author: scoder Date: Thu Sep 13 12:52:26 2007 New Revision: 46535 Modified: lxml/trunk/CHANGES.txt lxml/trunk/selftest.py lxml/trunk/src/lxml/_elementpath.py lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: ET 1.3 compatibility updates: iterfind(), new ElementPath implementation, updated selftest.py, fix for itertext() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Sep 13 12:52:26 2007 @@ -8,6 +8,9 @@ Features added -------------- +* ``iterfind()`` method on Elements returns an iterator equivalent to + ``findall()`` + * ``itertext()`` method on Elements * Setting a QName object as value of the .text property or as an attribute Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Thu Sep 13 12:52:26 2007 @@ -1,4 +1,4 @@ -# $Id: selftest.py 2193 2004-12-05 18:03:00Z fredrik $ +# $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $ # -*- coding: iso-8859-1 -*- # elementtree selftest program @@ -14,6 +14,7 @@ from lxml import etree as ElementTree from lxml import _elementpath as ElementPath from lxml import ElementInclude +ET = ElementTree #from elementtree import ElementTree #from elementtree import ElementPath @@ -26,14 +27,11 @@ xml_data = xml_data.replace(' />', '/>') return xml_data -def serialize(elem, encoding=None): +def serialize(elem, **options): import StringIO file = StringIO.StringIO() tree = ElementTree.ElementTree(elem) - if encoding: - tree.write(file, encoding) - else: - tree.write(file) + tree.write(file, **options) return fix_compatibility( file.getvalue() ) def summarize(elem): @@ -106,18 +104,21 @@ # -------------------------------------------------------------------- # element tree tests 
-## def sanity(): -## """ -## >>> from elementtree.ElementTree import * -## >>> from elementtree.ElementInclude import * -## >>> from elementtree.ElementPath import * -## >>> from elementtree.HTMLTreeBuilder import * -## >>> from elementtree.SimpleXMLTreeBuilder import * -## >>> from elementtree.SimpleXMLWriter import * -## >>> from elementtree.TidyHTMLTreeBuilder import * -## >>> from elementtree.TidyTools import * -## >>> from elementtree.XMLTreeBuilder import * -## """ +def sanity(): + """ + >>> from elementtree.ElementTree import * + >>> from elementtree.ElementInclude import * + >>> from elementtree.ElementPath import * + >>> from elementtree.HTMLTreeBuilder import * + >>> from elementtree.SimpleXMLTreeBuilder import * + >>> from elementtree.SimpleXMLWriter import * + >>> from elementtree.TidyHTMLTreeBuilder import * + >>> from elementtree.TidyTools import * + >>> from elementtree.XMLTreeBuilder import * + """ + +# doesn't work with lxml.etree +del sanity def interface(): """ @@ -129,38 +130,41 @@ >>> check_element_tree(tree) """ -## def simplefind(): -## """ -## Test find methods using the elementpath fallback. +def simplefind(): + """ + Test find methods using the elementpath fallback. 
-## >>> CurrentElementPath = ElementTree.ElementPath -## >>> ElementTree.ElementPath = ElementTree._SimpleElementPath() -## >>> elem = SAMPLE_XML -## >>> elem.find("tag").tag -## 'tag' -## >>> ElementTree.ElementTree(elem).find("tag").tag -## 'tag' -## >>> elem.findtext("tag") -## 'text' -## >>> elem.findtext("tog") -## >>> elem.findtext("tog", "default") -## 'default' -## >>> ElementTree.ElementTree(elem).findtext("tag") -## 'text' -## >>> summarize_list(elem.findall("tag")) -## ['tag', 'tag'] -## >>> summarize_list(elem.findall(".//tag")) -## ['tag', 'tag', 'tag'] + >>> CurrentElementPath = ElementTree.ElementPath + >>> ElementTree.ElementPath = ElementTree._SimpleElementPath() + >>> elem = SAMPLE_XML + >>> elem.find("tag").tag + 'tag' + >>> ElementTree.ElementTree(elem).find("tag").tag + 'tag' + >>> elem.findtext("tag") + 'text' + >>> elem.findtext("tog") + >>> elem.findtext("tog", "default") + 'default' + >>> ElementTree.ElementTree(elem).findtext("tag") + 'text' + >>> summarize_list(elem.findall("tag")) + ['tag', 'tag'] + >>> summarize_list(elem.findall(".//tag")) + ['tag', 'tag', 'tag'] -## Path syntax doesn't work in this case. + Path syntax doesn't work in this case. 
-## >>> elem.find("section/tag") -## >>> elem.findtext("section/tag") -## >>> elem.findall("section/tag") -## [] + >>> elem.find("section/tag") + >>> elem.findtext("section/tag") + >>> elem.findall("section/tag") + [] -## >>> ElementTree.ElementPath = CurrentElementPath -## """ + >>> ElementTree.ElementPath = CurrentElementPath + """ + +# doesn't work with lxml.etree +del simplefind def find(): """ @@ -216,10 +220,31 @@ ['tag', 'tag', 'tag'] >>> summarize_list(elem.findall("././tag")) ['tag', 'tag'] + +## >>> summarize_list(elem.findall(".//tag[@class]")) +## ['tag', 'tag', 'tag'] +## >>> summarize_list(elem.findall(".//tag[@class='a']")) +## ['tag'] +## >>> summarize_list(elem.findall(".//tag[@class='b']")) +## ['tag', 'tag'] +## >>> summarize_list(elem.findall(".//tag[@id]")) +## ['tag'] +## >>> summarize_list(elem.findall(".//section[tag]")) +## ['section'] +## >>> summarize_list(elem.findall(".//section[element]")) +## [] +## >>> summarize_list(elem.findall("../tag")) +## [] +## >>> summarize_list(elem.findall("section/../tag")) +## ['tag', 'tag'] +## >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag")) +## ['tag', 'tag'] + + FIXME: ET's Path module handles this case incorrectly; this gives + a warning in 1.3, and the behaviour will be modified in 1.4. + >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag")) ['tag', 'tag'] - >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag")) - ['tag', 'tag'] """ def bad_find(): @@ -230,15 +255,9 @@ >>> elem.findall("/tag") Traceback (most recent call last): SyntaxError: cannot use absolute path on element - >>> elem.findall("../tag") - Traceback (most recent call last): - SyntaxError: unsupported path syntax (..) 
>>> elem.findall("section//") Traceback (most recent call last): - SyntaxError: path cannot end with // - >>> elem.findall("tag[tag]") - Traceback (most recent call last): - SyntaxError: expected path separator ([) + SyntaxError: invalid path """ def parsefile(): @@ -261,6 +280,12 @@ texttail + +## +## text +## texttail +## +## """ ## def parsehtml(): @@ -282,6 +307,12 @@ >>> element = ElementTree.fromstring("text") >>> ElementTree.ElementTree(element).write(sys.stdout) text + +## >>> sequence = ["", "text"] +## >>> element = ElementTree.fromstringlist(sequence) +## >>> ElementTree.ElementTree(element).write(sys.stdout) +## text + >>> print ElementTree.tostring(element) text @@ -426,6 +457,11 @@ >>> ElementTree.SubElement(elem, "subtag").text = "subtext" >>> serialize(elem) 'textsubtext' + +## Test tag suppression +## >>> elem.tag = None +## >>> serialize(elem) +## 'textsubtext' """ def writestring(): @@ -446,58 +482,95 @@ >>> elem.text = u"abc" >>> serialize(elem) 'abc' - >>> serialize(elem, "utf-8") + >>> serialize(elem, encoding="utf-8") 'abc' - >>> serialize(elem, "us-ascii") + >>> serialize(elem, encoding="us-ascii") 'abc' - >>> serialize(elem, "ISO-8859-1") + >>> serialize(elem, encoding="ISO-8859-1") "\nabc" >>> elem.text = "<&\"\'>" >>> serialize(elem) '<&"\'>' - >>> serialize(elem, "utf-8") + >>> serialize(elem, encoding="utf-8") '<&"\'>' - >>> serialize(elem, "us-ascii") # cdata characters + >>> serialize(elem, encoding="us-ascii") # cdata characters '<&"\'>' - >>> serialize(elem, "ISO-8859-1") + >>> serialize(elem, encoding="ISO-8859-1") '\n<&"\'>' ## >>> elem.attrib["key"] = "<&\"\'>" ## >>> elem.text = None ## >>> serialize(elem) ## '' -## >>> serialize(elem, "utf-8") +## >>> serialize(elem, encoding="utf-8") ## '' -## >>> serialize(elem, "us-ascii") +## >>> serialize(elem, encoding="us-ascii") ## '' -## >>> serialize(elem, "iso-8859-1") +## >>> serialize(elem, encoding="iso-8859-1") ## '\n' >>> elem.text = u'\xe5\xf6\xf6<>' >>> elem.attrib.clear() >>> 
serialize(elem) 'åöö<>' - >>> serialize(elem, "utf-8") + >>> serialize(elem, encoding="utf-8") '\xc3\xa5\xc3\xb6\xc3\xb6<>' - >>> serialize(elem, "us-ascii") + >>> serialize(elem, encoding="us-ascii") 'åöö<>' - >>> serialize(elem, "ISO-8859-1") + >>> serialize(elem, encoding="ISO-8859-1") "\n\xe5\xf6\xf6<>" ## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' ## >>> elem.text = None ## >>> serialize(elem) ## '' -## >>> serialize(elem, "utf-8") +## >>> serialize(elem, encoding="utf-8") ## '' -## >>> serialize(elem, "us-ascii") +## >>> serialize(elem, encoding="us-ascii") ## '' -## >>> serialize(elem, "ISO-8859-1") +## >>> serialize(elem, encoding="ISO-8859-1") ## '\n' """ +def methods(): + r""" + Test serialization methods. + + >>> e = ET.XML("") + >>> e.tail = "\n" + >>> serialize(e) + '\n' + >>> serialize(e, method=None) + '\n' + >>> serialize(e, method="xml") + '\n' + >>> serialize(e, method="html") + '\n' + >>> serialize(e, method="text") + '1 < 2\n' + + """ + +# doesn't work with lxml.etree +del methods + +def iterators(): + """ + Test iterators. + + >>> e = ET.XML("this is a paragraph...") + >>> summarize_list(e.iter()) + ['html', 'body', 'i'] + >>> summarize_list(e.find("body").iter()) + ['body', 'i'] + >>> "".join(e.itertext()) + 'this is a paragraph...' + >>> "".join(e.find("body").itertext()) + 'this is a paragraph.' + """ + ENTITY_XML = """\ @@ -506,40 +579,56 @@ &entity; """ -## def entity(): -## """ -## Test entity handling. +def entity(): + """ + Test entity handling. 
-## 1) bad entities + 1) bad entities -## >>> ElementTree.XML("&entity;") -## Traceback (most recent call last): -## ExpatError: undefined entity: line 1, column 10 + >>> ElementTree.XML("&entity;") + Traceback (most recent call last): + ExpatError: undefined entity: line 1, column 10 -## >>> ElementTree.XML(ENTITY_XML) -## Traceback (most recent call last): -## ExpatError: undefined entity &entity;: line 5, column 10 + >>> ElementTree.XML(ENTITY_XML) + Traceback (most recent call last): + ExpatError: undefined entity &entity;: line 5, column 10 -## (add more tests here) + (add more tests here) -## """ + """ + +# doesn't work with lxml.etree +del entity -def xmllang(): +def error(xml): """ - This appears to be a problem; in underlying libxml2? - - 1) xml namespace + Test error handling. + + >>> error("foo").position + (1, 0) + >>> error("&foo;").position + (1, 5) + >>> error("foobar<").position + (1, 6) - >>> elem = ElementTree.XML("") - >>> serialize(elem) # 1.1 - '' """ - + try: + ET.XML(xml) + except ET.ParseError: + return sys.exc_value + +# doesn't work with lxml.etree +del error + def namespace(): """ Test namespace issues. 
+ 1) xml namespace + >>> elem = ElementTree.XML("") + >>> serialize(elem) # 1.1 + '' 2) other "well-known" namespaces @@ -634,15 +723,15 @@ >>> xpath_tokenizer("/doc/chapter[5]/section[2]") ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']'] >>> xpath_tokenizer("chapter//para") - ['chapter', '/', '/', 'para'] + ['chapter', '//', 'para'] >>> xpath_tokenizer("//para") - ['/', '/', 'para'] + ['//', 'para'] >>> xpath_tokenizer("//olist/item") - ['/', '/', 'olist', '/', 'item'] + ['//', 'olist', '/', 'item'] >>> xpath_tokenizer(".") ['.'] >>> xpath_tokenizer(".//para") - ['.', '/', '/', 'para'] + ['.', '//', 'para'] >>> xpath_tokenizer("..") ['..'] >>> xpath_tokenizer("../@lang") @@ -658,7 +747,7 @@ >>> xpath_tokenizer("./spam.egg") ['.', '/', 'spam.egg'] >>> xpath_tokenizer(".//{http://spam}egg") - ['.', '/', '/', '{http://spam}egg'] + ['.', '//', '{http://spam}egg'] """ out = [] for op, tag in ElementPath.xpath_tokenizer(p): @@ -811,70 +900,76 @@ # # xmlwriter -## def xmlwriter(): -## r""" -## >>> file = StringIO.StringIO() -## >>> w = SimpleXMLWriter.XMLWriter(file) -## >>> html = w.start("html") -## >>> x = w.start("head") -## >>> w.element("title", "my document") -## >>> w.data("\n") -## >>> w.element("meta", name="hello", value="goodbye") -## >>> w.data("\n") -## >>> w.end() -## >>> x = w.start("body") -## >>> w.element("h1", "this is a heading") -## >>> w.data("\n") -## >>> w.element("p", u"this is a paragraph") -## >>> w.data("\n") -## >>> w.element("p", u"reserved characters: <&>") -## >>> w.data("\n") -## >>> w.element("p", u"detta ?r ocks? ett stycke") -## >>> w.data("\n") -## >>> w.close(html) -## >>> print file.getvalue() -## my document -## -##

this is a heading

-##

this is a paragraph

-##

reserved characters: <&>

-##

detta är också ett stycke

-## -## """ +def xmlwriter(): + r""" + >>> file = StringIO.StringIO() + >>> w = SimpleXMLWriter.XMLWriter(file) + >>> html = w.start("html") + >>> x = w.start("head") + >>> w.element("title", "my document") + >>> w.data("\n") + >>> w.element("meta", name="hello", value="goodbye") + >>> w.data("\n") + >>> w.end() + >>> x = w.start("body") + >>> w.element("h1", "this is a heading") + >>> w.data("\n") + >>> w.element("p", u"this is a paragraph") + >>> w.data("\n") + >>> w.element("p", u"reserved characters: <&>") + >>> w.data("\n") + >>> w.element("p", u"detta ?r ocks? ett stycke") + >>> w.data("\n") + >>> w.close(html) + >>> print file.getvalue() + my document + +

this is a heading

+

this is a paragraph

+

reserved characters: <&>

+

detta är också ett stycke

+ + """ + +# doesn't work with lxml.etree +del xmlwriter # -------------------------------------------------------------------- # reported bugs -## def bug_xmltoolkit21(): -## """ -## marshaller gives obscure errors for non-string values +def bug_xmltoolkit21(): + """ + marshaller gives obscure errors for non-string values -## >>> elem = ElementTree.Element(123) -## >>> serialize(elem) # tag -## Traceback (most recent call last): -## TypeError: cannot serialize 123 (type int) -## >>> elem = ElementTree.Element("elem") -## >>> elem.text = 123 -## >>> serialize(elem) # text -## Traceback (most recent call last): -## TypeError: cannot serialize 123 (type int) -## >>> elem = ElementTree.Element("elem") -## >>> elem.tail = 123 -## >>> serialize(elem) # tail -## Traceback (most recent call last): -## TypeError: cannot serialize 123 (type int) -## >>> elem = ElementTree.Element("elem") -## >>> elem.set(123, "123") -## >>> serialize(elem) # attribute key -## Traceback (most recent call last): -## TypeError: cannot serialize 123 (type int) -## >>> elem = ElementTree.Element("elem") -## >>> elem.set("123", 123) -## >>> serialize(elem) # attribute value -## Traceback (most recent call last): -## TypeError: cannot serialize 123 (type int) + >>> elem = ElementTree.Element(123) + >>> serialize(elem) # tag + Traceback (most recent call last): + TypeError: cannot serialize 123 (type int) + >>> elem = ElementTree.Element("elem") + >>> elem.text = 123 + >>> serialize(elem) # text + Traceback (most recent call last): + TypeError: cannot serialize 123 (type int) + >>> elem = ElementTree.Element("elem") + >>> elem.tail = 123 + >>> serialize(elem) # tail + Traceback (most recent call last): + TypeError: cannot serialize 123 (type int) + >>> elem = ElementTree.Element("elem") + >>> elem.set(123, "123") + >>> serialize(elem) # attribute key + Traceback (most recent call last): + TypeError: cannot serialize 123 (type int) + >>> elem = ElementTree.Element("elem") + >>> elem.set("123", 123) 
+ >>> serialize(elem) # attribute value + Traceback (most recent call last): + TypeError: cannot serialize 123 (type int) -## """ + """ + +# doesn't work with lxml.etree +del bug_xmltoolkit21 def bug_xmltoolkit25(): """ @@ -898,92 +993,199 @@ ['tbody'] """ -## def bug_xmltoolkitX1(): -## """ -## dump() doesn't flush the output buffer +def bug_xmltoolkitX1(): + """ + dump() doesn't flush the output buffer -## >>> tree = ElementTree.XML("
") -## >>> ElementTree.dump(tree); sys.stdout.write("tail") -##
-## tail -## """ + >>> tree = ElementTree.XML("
") + >>> ElementTree.dump(tree); sys.stdout.write("tail") +
+ tail + """ -## def bug_xmltoolkit39(): -## """ -## non-ascii element and attribute names doesn't work +# doesn't work with lxml.etree +del bug_xmltoolkitX1 -## >>> tree = ElementTree.XML("") -## >>> ElementTree.tostring(tree, "utf-8") -## '' - -## >>> tree = ElementTree.XML("") -## >>> tree.attrib -## {u'\\xe4ttr': u'v\\xe4lue'} -## >>> ElementTree.tostring(tree, "utf-8") -## '' - -## >>> tree = ElementTree.XML("text") -## >>> ElementTree.tostring(tree, "utf-8") -## 'text' - -## >>> tree = ElementTree.Element(u"t?g") -## >>> ElementTree.tostring(tree, "utf-8") -## '' - -## >>> tree = ElementTree.Element("tag") -## >>> tree.set(u"?ttr", u"v?lue") -## >>> ElementTree.tostring(tree, "utf-8") -## '' +def bug_xmltoolkit39(): + """ + non-ascii element and attribute names doesn't work -## """ + >>> tree = ElementTree.XML("") + >>> ElementTree.tostring(tree, "utf-8") + '' -## def bug_xmltoolkit45(): -## """ -## problems parsing mixed unicode/non-ascii html documents + >>> tree = ElementTree.XML("") + >>> tree.attrib + {u'\\xe4ttr': u'v\\xe4lue'} + >>> ElementTree.tostring(tree, "utf-8") + '' -## latin-1 text -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

v?lue

") -## >>> serialize(p.close()) -## '

välue

' + >>> tree = ElementTree.XML("text") + >>> ElementTree.tostring(tree, "utf-8") + 'text' -## utf-8 text -## >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8") -## >>> p.feed("

v\xc3\xa4lue

") -## >>> serialize(p.close()) -## '

välue

' + >>> tree = ElementTree.Element(u"t?g") + >>> ElementTree.tostring(tree, "utf-8") + '' -## utf-8 text using meta tag -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

v\xc3\xa4lue

") -## >>> serialize(p.close().find("p")) -## '

välue

' - -## latin-1 character references -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

välue

") -## >>> serialize(p.close()) -## '

välue

' + >>> tree = ElementTree.Element("tag") + >>> tree.set(u"?ttr", u"v?lue") + >>> ElementTree.tostring(tree, "utf-8") + '' -## latin-1 character entities -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

välue

") -## >>> serialize(p.close()) -## '

välue

' + """ -## mixed latin-1 text and unicode entities -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

”v?lue”

") -## >>> serialize(p.close()) -## '

”välue”

' +# doesn't work with lxml.etree +del bug_xmltoolkit39 -## mixed unicode and latin-1 entities -## >>> p = HTMLTreeBuilder.TreeBuilder() -## >>> p.feed("

”välue”

") -## >>> serialize(p.close()) -## '

”välue”

' +def bug_xmltoolkit45(): + """ + problems parsing mixed unicode/non-ascii html documents -## """ + latin-1 text + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

v?lue

") + >>> serialize(p.close()) + '

välue

' + + utf-8 text + >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8") + >>> p.feed("

v\xc3\xa4lue

") + >>> serialize(p.close()) + '

välue

' + + utf-8 text using meta tag + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

v\xc3\xa4lue

") + >>> serialize(p.close().find("p")) + '

välue

' + + latin-1 character references + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

välue

") + >>> serialize(p.close()) + '

välue

' + + latin-1 character entities + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

välue

") + >>> serialize(p.close()) + '

välue

' + + mixed latin-1 text and unicode entities + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

”v?lue”

") + >>> serialize(p.close()) + '

”välue”

' + + mixed unicode and latin-1 entities + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

”välue”

") + >>> serialize(p.close()) + '

”välue”

' + + """ + +# doesn't work with lxml.etree +del bug_xmltoolkit45 + +def bug_xmltoolkit46(): + """ + problems parsing open BR tags + + >>> p = HTMLTreeBuilder.TreeBuilder() + >>> p.feed("

key
value

") + >>> serialize(p.close()) + '

key
value

' + + """ + +# doesn't work with lxml.etree +del bug_xmltoolkit46 + +def bug_xmltoolkit54(): + """ + problems handling internally defined entities + + >>> e = ElementTree.XML("]>&ldots;") + >>> serialize(e) + '' + """ + +# doesn't work with lxml.etree +del bug_xmltoolkit54 + +def bug_xmltoolkit55(): + """ + make sure we're reporting the first error, not the last + + >>> e = ElementTree.XML("&ldots;&ndots;&rdots;") + Traceback (most recent call last): + ParseError: undefined entity &ldots;: line 1, column 36 + """ + +# doesn't work with lxml.etree +del bug_xmltoolkit55 + +def bug_200708_version(): + """ + >>> parser = ET.XMLParser() + >>> parser.version + 'Expat 2.0.0' + >>> parser.feed(open("samples/simple.xml").read()) + >>> print serialize(parser.close()) + + text + texttail + + + """ + +# doesn't work with lxml.etree +del bug_200708_version + +def bug_200708_newline(): + r""" + + Preserve newlines in attributes. + + >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n") + >>> ET.tostring(e) + '' + >>> ET.XML(ET.tostring(e)).get("text") + 'def _f():\n return 3\n' + >>> ET.tostring(ET.XML(ET.tostring(e))) + '' + """ + +# doesn't work with lxml.etree +del bug_200708_newline + +def bug_200709_default_namespace(): + """ + + >>> e = ET.Element("{default}elem") + >>> s = ET.SubElement(e, "{default}elem") + >>> serialize(e, default_namespace="default") # 1 + '' + + >>> e = ET.Element("{default}elem") + >>> s = ET.SubElement(e, "{default}elem") + >>> s = ET.SubElement(e, "{not-default}elem") + >>> serialize(e, default_namespace="default") # 2 + '' + + >>> e = ET.Element("{default}elem") + >>> s = ET.SubElement(e, "{default}elem") + >>> s = ET.SubElement(e, "elem") # unprefixed name + >>> serialize(e, default_namespace="default") # 3 + Traceback (most recent call last): + ValueError: cannot use non-qualified names with default_namespace option + + """ + +# doesn't work with lxml.etree +del bug_200709_default_namespace # 
-------------------------------------------------------------------- Modified: lxml/trunk/src/lxml/_elementpath.py ============================================================================== --- lxml/trunk/src/lxml/_elementpath.py (original) +++ lxml/trunk/src/lxml/_elementpath.py Thu Sep 13 12:52:26 2007 @@ -1,4 +1,6 @@ -# This file is taken from ElementTree directly, unchanged beyond this line. +# +# ElementTree +# $Id: ElementPath.py 3276 2007-09-12 06:52:30Z fredrik $ # # limited xpath support for element trees # @@ -6,8 +8,9 @@ # 2003-05-23 fl created # 2003-05-28 fl added support for // etc # 2003-08-27 fl fixed parsing of periods in element names +# 2007-09-10 fl new selection engine # -# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved. +# Copyright (c) 2003-2007 by Fredrik Lundh. All rights reserved. # # fredrik at pythonware.com # http://www.pythonware.com @@ -15,7 +18,7 @@ # -------------------------------------------------------------------- # The ElementTree toolkit is # -# Copyright (c) 1999-2004 by Fredrik Lundh +# Copyright (c) 1999-2007 by Fredrik Lundh # # By obtaining, using, and/or copying this software and/or its # associated documentation, you agree that you have read, understood, @@ -49,146 +52,178 @@ import re xpath_tokenizer = re.compile( - "(::|\.\.|\(\)|[/.*:\[\]\(\)@=])|((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|\s+" + "(" + "'[^']*'|\"[^\"]*\"|" + "::|" + "//?|" + "\.\.|" + "\(\)|" + "[/.*:\[\]\(\)@=])|" + "((?:\{[^}]+\})?[^/:\[\]\(\)@=\s]+)|" + "\s+" ).findall -class xpath_descendant_or_self: - pass - -## -# Wrapper for a compiled XPath. - -class Path: - - ## - # Create an Path instance from an XPath expression. 
- - def __init__(self, path): - tokens = xpath_tokenizer(path) - # the current version supports 'path/path'-style expressions only - self.path = [] - self.tag = None - if tokens and tokens[0][0] == "/": - raise SyntaxError("cannot use absolute path on element") - while tokens: - op, tag = tokens.pop(0) - if tag or op == "*": - self.path.append(tag or op) - elif op == ".": - pass - elif op == "/": - self.path.append(xpath_descendant_or_self()) - continue - else: - raise SyntaxError("unsupported path syntax (%s)" % op) - if tokens: - op, tag = tokens.pop(0) - if op != "/": - raise SyntaxError( - "expected path separator (%s)" % (op or tag) - ) - if self.path and isinstance(self.path[-1], xpath_descendant_or_self): - raise SyntaxError("path cannot end with //") - if len(self.path) == 1 and isinstance(self.path[0], type("")): - self.tag = self.path[0] - - ## - # Find first matching object. - - def find(self, element): - tag = self.tag - if tag is None: - nodeset = self.findall(element) - if not nodeset: - return None - return nodeset[0] - for elem in element: - if elem.tag == tag: - return elem - return None - - ## - # Find text for first matching object. - - def findtext(self, element, default=None): - tag = self.tag - if tag is None: - nodeset = self.findall(element) - if not nodeset: - return default - return nodeset[0].text or "" - for elem in element: - if elem.tag == tag: - return elem.text or "" - return default - - ## - # Find all matching objects. 
- - def findall(self, element): - nodeset = [element] - index = 0 - while 1: - try: - path = self.path[index] - index = index + 1 - except IndexError: - return nodeset - set = [] - if isinstance(path, xpath_descendant_or_self): - try: - tag = self.path[index] - if not isinstance(tag, type("")): - tag = None - else: - index = index + 1 - except IndexError: - tag = None # invalid path - for node in nodeset: - new = list(node.getiterator(tag)) - if new and new[0] is node: - set.extend(new[1:]) - else: - set.extend(new) +def prepare_tag(next, token): + tag = token[1] + def select(context, result): + for elem in result: + for e in elem: + if e.tag == tag: + yield e + return select + +def prepare_star(next, token): + def select(context, result): + for elem in result: + for e in elem: + yield e + return select + +def prepare_dot(next, token): + def select(context, result): + for elem in result: + yield elem + return select + +def prepare_iter(next, token): + token = next() + if token[0] == "*": + tag = "*" + elif not token[0]: + tag = token[1] + else: + raise SyntaxError + def select(context, result): + for elem in result: + for e in elem.iter(tag): + if e is not elem: + yield e + return select + +def prepare_dot_dot(next, token): + def select(context, result): + parent_map = context.parent_map + if parent_map is None: + context.parent_map = parent_map = {} + for p in context.root.iter(): + for e in p: + parent_map[e] = p + for elem in result: + if elem in parent_map: + yield parent_map[elem] + return select + +def prepare_predicate(next, token): + # this one should probably be refactored... 
+ token = next() + if token[0] == "@": + # attribute + token = next() + if token[0]: + raise SyntaxError("invalid attribute predicate") + key = token[1] + token = next() + if token[0] == "]": + def select(context, result): + for elem in result: + if elem.get(key) is not None: + yield elem + elif token[0] == "=": + value = next()[0] + if value[:1] == "'" or value[:1] == '"': + value = value[1:-1] else: - for node in nodeset: - for node in node: - if path == "*" or node.tag == path: - set.append(node) - if not set: - return [] - nodeset = set + raise SyntaxError("invalid comparision target") + token = next() + def select(context, result): + for elem in result: + if elem.get(key) == value: + yield elem + if token[0] != "]": + raise SyntaxError("invalid attribute predicate") + elif not token[0]: + tag = token[1] + token = next() + if token[0] != "]": + raise SyntaxError("invalid node predicate") + def select(context, result): + for elem in result: + if elem.find(tag) is not None: + yield elem + else: + raise SyntaxError("invalid predicate") + return select + +ops = { + "": prepare_tag, + "*": prepare_star, + ".": prepare_dot, + "..": prepare_dot_dot, + "//": prepare_iter, + "[": prepare_predicate, + } _cache = {} -## -# (Internal) Compile path. +class _SelectorContext: + parent_map = None + def __init__(self, root): + self.root = root -def _compile(path): - p = _cache.get(path) - if p is not None: - return p - p = Path(path) - if len(_cache) >= 100: - _cache.clear() - _cache[path] = p - return p +# -------------------------------------------------------------------- ## # Find first matching object. -def find(element, path): - return _compile(path).find(element) +def find(elem, path): + try: + return iterfind(elem, path).next() + except StopIteration: + return None ## -# Find text for first matching object. +# Find all matching objects. 
-def findtext(element, path, default=None): - return _compile(path).findtext(element, default) +def findall(elem, path): + return list(iterfind(elem, path)) -## -# Find all matching objects. +def iterfind(elem, path): + # compile selector pattern + try: + selector = _cache[path] + except KeyError: + if len(_cache) > 100: + _cache.clear() + if path[:1] == "/": + raise SyntaxError("cannot use absolute path on element") + stream = iter(xpath_tokenizer(path)) + next = stream.next; token = next() + selector = [] + while 1: + try: + selector.append(ops[token[0]](next, token)) + except StopIteration: + raise SyntaxError("invalid path") + try: + token = next() + if token[0] == "/": + token = next() + except StopIteration: + break + _cache[path] = selector + # execute selector pattern + result = [elem] + context = _SelectorContext(elem) + for select in selector: + result = select(context, result) + return result -def findall(element, path): - return _compile(path).findall(element) +## +# Find text for first matching object. +def findtext(elem, path, default=None): + try: + elem = iterfind(elem, path).next() + return elem.text + except StopIteration: + return default Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu Sep 13 12:52:26 2007 @@ -1130,6 +1130,13 @@ path = (path).text return _elementpath.findall(self, path) + def iterfind(self, path): + """Iterates over all matching subelements, by tag name or path. + """ + if isinstance(path, QName): + path = (path).text + return _elementpath.iterfind(self, path) + def xpath(self, _path, namespaces=None, extensions=None, **_variables): """Evaluate an xpath expression using the element as context node. """ @@ -1423,8 +1430,8 @@ return root.iter(tag) def iter(self, tag=None): - """Creates an iterator for the root element. 
The iterator loops over all elements - in this tree, in document order. + """Creates an iterator for the root element. The iterator loops over + all elements in this tree, in document order. """ root = self.getroot() if root is None: @@ -1432,7 +1439,8 @@ return root.iter(tag) def find(self, path): - """Finds the first toplevel element with given tag. Same as getroot().find(path). + """Finds the first toplevel element with given tag. Same as + getroot().find(path). """ self._assertHasRoot() root = self.getroot() @@ -1441,7 +1449,8 @@ return root.find(path) def findtext(self, path, default=None): - """Finds the element text for the first toplevel element with given tag. Same as getroot().findtext(path) + """Finds the text for the first element matching the ElementPath + expression. Same as getroot().findtext(path) """ self._assertHasRoot() root = self.getroot() @@ -1450,14 +1459,25 @@ return root.findtext(path, default) def findall(self, path): - """Finds all toplevel elements with the given tag. Same as getroot().findall(path). + """Finds all elements matching the ElementPath expression. Same as + getroot().findall(path). """ self._assertHasRoot() root = self.getroot() if path[:1] == "/": path = "." + path return root.findall(path) - + + def iterfind(self, path): + """Iterates over all elements matching the ElementPath expression. + Same as getroot().finditer(path). + """ + self._assertHasRoot() + root = self.getroot() + if path[:1] == "/": + path = "." + path + return root.iterfind(path) + def xpath(self, _path, namespaces=None, extensions=None, **_variables): """XPath evaluate in context of document. @@ -1918,11 +1938,13 @@ tail text. 
""" cdef object _nextEvent + cdef _Element _start_element def __init__(self, _Element element not None, tag=None, with_tail=True): if with_tail: events = ("start", "end") else: events = ("start",) + self._start_element = element self._nextEvent = iterwalk(element, events=events, tag=tag).next def __iter__(self): @@ -1931,10 +1953,10 @@ def __next__(self): cdef _Element element while result is None: - event, element = self._nextEvent() + event, element = self._nextEvent() # raises StopIteration if event == "start": result = element.text - else: + elif element is not self._start_element: result = element.tail return result Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Sep 13 12:52:26 2007 @@ -1429,6 +1429,15 @@ self.assertEquals(["RTEXT", "ATAIL", "CTEXT", "CTAIL"], text) + def test_itertext_child(self): + # ET 1.3+ + XML = self.etree.XML + root = XML("RTEXTATAILCTEXTCTAIL") + + text = list(root[2].itertext()) + self.assertEquals(["CTEXT"], + text) + def test_findall_ns(self): XML = self.etree.XML root = XML('') From scoder at codespeak.net Thu Sep 13 13:07:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 13:07:17 +0200 (CEST) Subject: [Lxml-checkins] r46536 - lxml/trunk Message-ID: <20070913110717.02DA480C8@code0.codespeak.net> Author: scoder Date: Thu Sep 13 13:07:17 2007 New Revision: 46536 Modified: lxml/trunk/selftest.py Log: cleanup in selftest.py, use 'del function' instead of commenting out stuff Modified: lxml/trunk/selftest.py ============================================================================== --- lxml/trunk/selftest.py (original) +++ lxml/trunk/selftest.py Thu Sep 13 13:07:17 2007 @@ -47,11 +47,11 @@ SAMPLE_XML = ElementTree.XML(""" - text - -
- subtext -
+ text + +
+ subtext +
""") @@ -110,16 +110,22 @@ >>> from elementtree.ElementInclude import * >>> from elementtree.ElementPath import * >>> from elementtree.HTMLTreeBuilder import * - >>> from elementtree.SimpleXMLTreeBuilder import * >>> from elementtree.SimpleXMLWriter import * - >>> from elementtree.TidyHTMLTreeBuilder import * >>> from elementtree.TidyTools import * - >>> from elementtree.XMLTreeBuilder import * """ # doesn't work with lxml.etree del sanity +def version(): + """ + >>> ElementTree.VERSION + '1.3a2' + """ + +# doesn't work with lxml.etree +del version + def interface(): """ Test element tree interface. @@ -130,6 +136,28 @@ >>> check_element_tree(tree) """ +def simpleops(): + """ + >>> elem = ElementTree.XML("") + >>> serialize(elem) + '' + >>> e = ElementTree.Element("tag2") + >>> elem.append(e) + >>> serialize(elem) + '' + >>> elem.remove(e) + >>> serialize(elem) + '' + >>> elem.insert(0, e) + >>> serialize(elem) + '' + >>> elem.remove(e) + >>> elem.extend([e]) + >>> serialize(elem) + '' + >>> elem.remove(e) + """ + def simplefind(): """ Test find methods using the elementpath fallback. 
@@ -220,25 +248,24 @@ ['tag', 'tag', 'tag'] >>> summarize_list(elem.findall("././tag")) ['tag', 'tag'] - -## >>> summarize_list(elem.findall(".//tag[@class]")) -## ['tag', 'tag', 'tag'] -## >>> summarize_list(elem.findall(".//tag[@class='a']")) -## ['tag'] -## >>> summarize_list(elem.findall(".//tag[@class='b']")) -## ['tag', 'tag'] -## >>> summarize_list(elem.findall(".//tag[@id]")) -## ['tag'] -## >>> summarize_list(elem.findall(".//section[tag]")) -## ['section'] -## >>> summarize_list(elem.findall(".//section[element]")) -## [] -## >>> summarize_list(elem.findall("../tag")) -## [] -## >>> summarize_list(elem.findall("section/../tag")) -## ['tag', 'tag'] -## >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag")) -## ['tag', 'tag'] + >>> summarize_list(elem.findall(".//tag[@class]")) + ['tag', 'tag', 'tag'] + >>> summarize_list(elem.findall(".//tag[@class='a']")) + ['tag'] + >>> summarize_list(elem.findall(".//tag[@class='b']")) + ['tag', 'tag'] + >>> summarize_list(elem.findall(".//tag[@id]")) + ['tag'] + >>> summarize_list(elem.findall(".//section[tag]")) + ['section'] + >>> summarize_list(elem.findall(".//section[element]")) + [] + >>> summarize_list(elem.findall("../tag")) + [] + >>> summarize_list(elem.findall("section/../tag")) + ['tag', 'tag'] + >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag")) + ['tag', 'tag'] FIXME: ET's Path module handles this case incorrectly; this gives a warning in 1.3, and the behaviour will be modified in 1.4. @@ -288,16 +315,19 @@ ## """ -## def parsehtml(): -## """ -## Test HTML parsing. - -## >>> # p = HTMLTreeBuilder.TreeBuilder() -## >>> p = ElementTree.HTMLParser() -## >>> p.feed("

spamegg

") -## >>> serialize(p.close()) -## '

spamegg

' -## """ +def parsehtml(): + """ + Test HTML parsing. + + >>> # p = HTMLTreeBuilder.TreeBuilder() + >>> p = ElementTree.HTMLParser() + >>> p.feed("

spamegg

") + >>> serialize(p.close()) + '

spamegg

' + """ + +# doesn't work with lxml.etree +del parsehtml def parseliteral(): r""" @@ -331,21 +361,24 @@ 'body' """ -## def simpleparsefile(): -## """ -## Test the xmllib-based parser. - -## >>> from elementtree import SimpleXMLTreeBuilder -## >>> parser = SimpleXMLTreeBuilder.TreeBuilder() -## >>> tree = ElementTree.parse("samples/simple.xml", parser) -## >>> normalize_crlf(tree) -## >>> tree.write(sys.stdout) -## -## text -## texttail -## -## -## """ +def simpleparsefile(): + """ + Test the xmllib-based parser. + + >>> from elementtree import SimpleXMLTreeBuilder + >>> parser = SimpleXMLTreeBuilder.TreeBuilder() + >>> tree = ElementTree.parse("samples/simple.xml", parser) + >>> normalize_crlf(tree) + >>> tree.write(sys.stdout) + + text + texttail + + + """ + +# doesn't work with lxml.etree +del simpleparsefile def iterparse(): """ @@ -414,39 +447,42 @@ """ -## def fancyparsefile(): -## """ -## Test the "fancy" parser. - -## Sanity check. -## >>> from elementtree import XMLTreeBuilder -## >>> parser = XMLTreeBuilder.FancyTreeBuilder() -## >>> tree = ElementTree.parse("samples/simple.xml", parser) -## >>> normalize_crlf(tree) -## >>> tree.write(sys.stdout) -## -## text -## texttail -## -## - -## Callback check. -## >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder): -## ... def start(self, elem): -## ... print "START", elem.tag -## ... def end(self, elem): -## ... print "END", elem.tag -## >>> parser = MyFancyParser() -## >>> tree = ElementTree.parse("samples/simple.xml", parser) -## START root -## START element -## END element -## START element -## END element -## START empty-element -## END empty-element -## END root -## """ +def fancyparsefile(): + """ + Test the "fancy" parser. + + Sanity check. + >>> from elementtree import XMLTreeBuilder + >>> parser = XMLTreeBuilder.FancyTreeBuilder() + >>> tree = ElementTree.parse("samples/simple.xml", parser) + >>> normalize_crlf(tree) + >>> tree.write(sys.stdout) + + text + texttail + + + + Callback check. 
+ >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder): + ... def start(self, elem): + ... print "START", elem.tag + ... def end(self, elem): + ... print "END", elem.tag + >>> parser = MyFancyParser() + >>> tree = ElementTree.parse("samples/simple.xml", parser) + START root + START element + END element + START element + END element + START empty-element + END empty-element + END root + """ + +# doesn't work with lxml.etree +del fancyparsefile def writefile(): """ From scoder at codespeak.net Thu Sep 13 15:10:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 15:10:03 +0200 (CEST) Subject: [Lxml-checkins] r46537 - in lxml/trunk: doc src/lxml src/lxml/tests Message-ID: <20070913131003.0B7E580C3@code0.codespeak.net> Author: scoder Date: Thu Sep 13 15:10:02 2007 New Revision: 46537 Modified: lxml/trunk/doc/tutorial.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py Log: ET 1.3 compatibility fixes: getiterator() returns a list, bool(element) writes a warning Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Thu Sep 13 15:10:02 2007 @@ -122,16 +122,15 @@ >>> print child.tag child1 + >>> print len(root) + 3 + >>> for child in root: ... print child.tag child1 child2 child3 - >>> if root: - ... print "root has children!" - root has children! - >>> root.insert(0, etree.Element("child0")) >>> start = root[:1] >>> end = root[-1:] @@ -148,10 +147,17 @@ child1 child2 -Note how the last element was *moved* to a different position in the last -example. This is a difference from the original ElementTree (and from lists), -where elements can sit in multiple positions of any number of trees. In -lxml.etree, elements can only sit in one position of one tree at a time. 
+Prior to ElementTree 1.3 and lxml 2.0, you could also check the truth value of +an Element to see if it has children, i.e. if the list of children is empty. +This is no longer supported as people tend to find it surprising that a +non-None reference to an existing Element can evaluate to False. Instead, use +``len(element)``, which is both more explicit and less error prone. + +Note in the examples that the last element was *moved* to a different position +in the last example. This is a difference from the original ElementTree (and +from lists), where elements can sit in multiple positions of any number of +trees. In lxml.etree, elements can only sit in one position of one tree at a +time. If you want to *copy* an element to a different position, consider creating an independent *deep copy* using the ``copy`` module from Python's standard Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu Sep 13 15:10:02 2007 @@ -865,6 +865,13 @@ def __nonzero__(self): cdef xmlNode* c_node + import warnings + warnings.warn( + "The behavior of this method will change in future versions. " + "Use specific 'len(elem)' or 'elem is not None' test instead.", + FutureWarning + ) + # emulate old behaviour c_node = _findChildBackwards(self._c_node, 0) return c_node != NULL @@ -1066,8 +1073,8 @@ return _elementTreeFactory(self._doc, None) def getiterator(self, tag=None): - """Iterate over all elements in the subtree in document order (depth - first pre-order), starting with this element. + """Returns a sequence of all elements in the subtree in document order + (depth first pre-order), starting with this element. Can be restricted to find only elements with a specific tag or from a namespace. 
@@ -1075,10 +1082,11 @@ You can also pass the Element, Comment, ProcessingInstruction and Entity factory functions to look only for the specific element type. - Note that this method is deprecated in favour of the ``el.iter()`` - method. In new code, use it only for backwards compatibility. + Note that this method previously returned an iterator, which diverged + from the original ElementTree behaviour. If you want an efficient + iterator, use the ``el.iter()`` method instead. """ - return ElementDepthFirstIterator(self, tag) + return list(ElementDepthFirstIterator(self, tag)) def iter(self, tag=None): """Iterate over all elements in the subtree in document order (depth Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Sep 13 15:10:02 2007 @@ -2524,7 +2524,8 @@ self.assert_(btree.getroot() is atree.getroot()) self.assertEquals('Foo', atree.getroot().text) - def test_element_boolean(self): + def _test_element_boolean(self): + # deprecated as of ET 1.3/lxml 2.0 etree = self.etree e = etree.Element('foo') self.assertEquals(False, bool(e)) From scoder at codespeak.net Thu Sep 13 19:38:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 19:38:16 +0200 (CEST) Subject: [Lxml-checkins] r46551 - lxml/trunk/src/lxml Message-ID: <20070913173816.5A35580C1@code0.codespeak.net> Author: scoder Date: Thu Sep 13 19:38:16 2007 New Revision: 46551 Modified: lxml/trunk/src/lxml/etree.pyx Log: doc fix - not sure if we should remove getchildren() Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Thu Sep 13 19:38:16 2007 @@ -984,10 +984,18 @@ return _collectAttributes(self._c_node, 3) def getchildren(self): - 
"""Returns all subelements. The elements are returned in document order. + """Returns all direct children. The elements are returned in document + order. """ cdef xmlNode* c_node cdef int ret +# ET 1.3 stops supporting this ... +## import warnings +## warnings.warn( +## "This method will be removed in future versions. " +## "Use 'list(elem)' or iteration over elem instead.", +## DeprecationWarning +## ) result = [] c_node = self._c_node.children while c_node is not NULL: From scoder at codespeak.net Thu Sep 13 19:38:59 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 19:38:59 +0200 (CEST) Subject: [Lxml-checkins] r46552 - lxml/trunk/src/lxml/tests Message-ID: <20070913173859.7801480C1@code0.codespeak.net> Author: scoder Date: Thu Sep 13 19:38:58 2007 New Revision: 46552 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: prefer 3rd party imports of ET over xml.etree Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Thu Sep 13 19:38:58 2007 @@ -6,18 +6,19 @@ from lxml import etree try: - from xml.etree import ElementTree # Python 2.5+ + from elementtree import ElementTree # standard ET + print ElementTree.VERSION except ImportError: try: - from elementtree import ElementTree # standard ET + from xml.etree import ElementTree # Python 2.5+ except ImportError: ElementTree = None try: - from xml.etree import cElementTree # Python 2.5+ + import cElementTree # standard ET except ImportError: try: - import cElementTree # standard ET + from xml.etree import cElementTree # Python 2.5+ except ImportError: cElementTree = None @@ -31,6 +32,14 @@ # we need our own version to make it work (Python 2.3?) 
import local_doctest as doctest +try: + sorted +except NameError: + def sorted(seq, **kwargs): + seq = list(seq) + seq.sort(**kwargs) + return seq + class HelperTestCase(unittest.TestCase): def parse(self, text): f = StringIO(text) From scoder at codespeak.net Thu Sep 13 19:39:32 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 19:39:32 +0200 (CEST) Subject: [Lxml-checkins] r46553 - lxml/trunk/src/lxml/tests Message-ID: <20070913173932.85A0680C1@code0.codespeak.net> Author: scoder Date: Thu Sep 13 19:39:31 2007 New Revision: 46553 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test: fix problem with unpredictable attribute serialisation order Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Sep 13 19:39:31 2007 @@ -447,12 +447,15 @@ def test_attribute_str(self): XML = self.etree.XML + + expected = "{'{http://ns.codespeak.net/test}baz': 'Baz', 'bar': 'Bar'}" + alternative = "{'bar': 'Bar', '{http://ns.codespeak.net/test}baz': 'Baz'}" root = XML('') - # XXX hope this is not dependent on unpredictable attribute order - self.assertEquals( - "{'{http://ns.codespeak.net/test}baz': 'Baz', 'bar': 'Bar'}", - str(root.attrib)) + try: + self.assertEquals(expected, str(root.attrib)) + except AssertionError: + self.assertEquals(alternative, str(root.attrib)) def test_attribute_has_key(self): XML = self.etree.XML From scoder at codespeak.net Thu Sep 13 19:40:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 19:40:57 +0200 (CEST) Subject: [Lxml-checkins] r46554 - lxml/trunk/src/lxml Message-ID: <20070913174057.EFDBC80C1@code0.codespeak.net> Author: scoder Date: Thu Sep 13 19:40:57 2007 New Revision: 46554 Modified: lxml/trunk/src/lxml/iterparse.pxi Log: small cleanup in iterwalk(): avoid testing 
truth value of Element Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Thu Sep 13 19:40:57 2007 @@ -391,6 +391,7 @@ return self def __next__(self): + cdef xmlNode* c_child cdef _Element node cdef _Element next_node cdef int ns_count @@ -399,13 +400,12 @@ ns_count = 0 # find next node while self._index >= 0: - node_tuple = python.PyList_GET_ITEM(self._node_stack, self._index) - python.Py_INCREF(node_tuple) # fix borrowed reference for Pyrex! - node = python.PyTuple_GET_ITEM(node_tuple, 0) - python.Py_INCREF(node) # fix borrowed reference for Pyrex! - if node: + node = self._node_stack[self._index][0] + + c_child = _findChildForwards(node._c_node, 0) + if c_child is not NULL: # try children - next_node = node[0] + next_node = _elementFactory(node._doc, c_child) else: # back off next_node = None From scoder at codespeak.net Thu Sep 13 20:01:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 13 Sep 2007 20:01:29 +0200 (CEST) Subject: [Lxml-checkins] r46555 - lxml/trunk/src/lxml/tests Message-ID: <20070913180129.5AAA180C1@code0.codespeak.net> Author: scoder Date: Thu Sep 13 20:01:28 2007 New Revision: 46555 Modified: lxml/trunk/src/lxml/tests/common_imports.py Log: forgotten debug output Modified: lxml/trunk/src/lxml/tests/common_imports.py ============================================================================== --- lxml/trunk/src/lxml/tests/common_imports.py (original) +++ lxml/trunk/src/lxml/tests/common_imports.py Thu Sep 13 20:01:28 2007 @@ -7,7 +7,6 @@ try: from elementtree import ElementTree # standard ET - print ElementTree.VERSION except ImportError: try: from xml.etree import ElementTree # Python 2.5+ From scoder at codespeak.net Fri Sep 14 10:43:46 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 10:43:46 +0200 (CEST) Subject: 
[Lxml-checkins] r46567 - lxml/trunk/doc Message-ID: <20070914084346.9B6EE80EB@code0.codespeak.net> Author: scoder Date: Fri Sep 14 10:43:46 2007 New Revision: 46567 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ: mention mod_python option that usually helps people get things working Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Sep 14 10:43:46 2007 @@ -459,6 +459,10 @@ case. This is especially a problem under MacOS-X when newer library versions were installed in addition to the outdated system libraries. +* if you use ``mod_python``, try setting this option: + + PythonInterpreter main_interpreter + * compile lxml without threading support by running ``setup.py`` with the ``--without-threading`` option. While this might be slower in certain scenarios on multi-processor systems, it *might* also keep your application From scoder at codespeak.net Fri Sep 14 11:58:42 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 11:58:42 +0200 (CEST) Subject: [Lxml-checkins] r46570 - lxml/trunk/src/lxml Message-ID: <20070914095842.1A78980EB@code0.codespeak.net> Author: scoder Date: Fri Sep 14 11:58:41 2007 New Revision: 46570 Modified: lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/parsertarget.pxi Log: fixed copying of _ParserContext for threads Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Sep 14 11:58:41 2007 @@ -372,6 +372,13 @@ ## Parsers ############################################################ +cdef class _ParserContext(_ResolverContext) +cdef class _TargetParserContext(_ParserContext) + +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _ParserContext NEW_PARSER_CONTEXT "PY_NEW" (object t) + cdef class 
_ParserContext(_ResolverContext): cdef _ErrorLog _error_log cdef xmlparser.xmlParserCtxt* _c_ctxt @@ -379,6 +386,12 @@ _ResolverContext.__init__(self, _ResolverRegistry()) self._error_log = _ErrorLog() + cdef _ParserContext _copy(self): + cdef _ParserContext context + context = self.__class__() + context._resolvers = self._resolvers._copy() + return context + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): self._c_ctxt = c_ctxt @@ -517,10 +530,12 @@ self._parser_lock = python.PyThread_allocate_lock() cdef _ParserContext _createContext(self, target): - if target is not None: - return _TargetParserContext(target) - else: + cdef _TargetParserContext context + if target is None: return _ParserContext() + context = _TargetParserContext() + context._setTarget(target) + return context cdef xmlparser.xmlParserCtxt* _newParserCtxt(self): if self._parser_type == LXML_HTML_PARSER: @@ -582,8 +597,7 @@ parser = self.__class__() parser._parse_options = self._parse_options parser._class_lookup = self._class_lookup - parser.resolvers = self.resolvers._copy() - parser._context = _ResolverContext(parser.resolvers) + parser._context = self._context._copy() parser._parser_ctxt._private = parser._context return parser Modified: lxml/trunk/src/lxml/parsertarget.pxi ============================================================================== --- lxml/trunk/src/lxml/parsertarget.pxi (original) +++ lxml/trunk/src/lxml/parsertarget.pxi Fri Sep 14 11:58:41 2007 @@ -18,10 +18,15 @@ cdef object _target_pi cdef object _target_comment - def __init__(self, target): - _ParserContext.__init__(self) + cdef void _setTarget(self, target): self._target = target + cdef _ParserContext _copy(self): + cdef _TargetParserContext context + context = _ParserContext._copy(self) + context._setTarget(self._target) + return context + cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): "wrap original SAX2 callbacks" cdef xmlparser.xmlSAXHandler* sax From scoder at 
codespeak.net Fri Sep 14 12:05:44 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 12:05:44 +0200 (CEST) Subject: [Lxml-checkins] r46573 - lxml/trunk/doc Message-ID: <20070914100544.C6E2880FB@code0.codespeak.net> Author: scoder Date: Fri Sep 14 12:05:44 2007 New Revision: 46573 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ link to ML thread on mod_python Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Sep 14 12:05:44 2007 @@ -463,6 +463,10 @@ PythonInterpreter main_interpreter + There was a discussion on the mailing list about this problem: + + http://comments.gmane.org/gmane.comp.python.lxml.devel/2942 + * compile lxml without threading support by running ``setup.py`` with the ``--without-threading`` option. While this might be slower in certain scenarios on multi-processor systems, it *might* also keep your application From scoder at codespeak.net Fri Sep 14 13:35:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 13:35:55 +0200 (CEST) Subject: [Lxml-checkins] r46576 - in lxml/trunk: . doc Message-ID: <20070914113555.517AA80F1@code0.codespeak.net> Author: scoder Date: Fri Sep 14 13:35:54 2007 New Revision: 46576 Modified: lxml/trunk/INSTALL.txt lxml/trunk/doc/FAQ.txt Log: warning about using libxml2 2.6.27 with XPath Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Fri Sep 14 13:35:54 2007 @@ -11,7 +11,7 @@ * libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html - If you want to use XPath reliably, try to avoid libxml2 2.6.27. + If you want to use XPath, do not use libxml2 2.6.27. * libxslt 1.1.15 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Sep 14 13:35:54 2007 @@ -176,6 +176,11 @@ It really depends on your application, but the rule of thumb is: more recent versions contain less bugs and provide more features. +* Do not use libxml2 2.6.27 if you want to use XPath (including XSLT). You + will get crashes when XPath errors occur during the evaluation (e.g. for + unknown functions). This happens inside the evaluation call to libxml2, so + there is nothing that lxml can do about it. + * Try to use versions of both libraries that were released together. At least the libxml2 version should not be older than the libxslt version. From scoder at codespeak.net Fri Sep 14 13:36:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 13:36:30 +0200 (CEST) Subject: [Lxml-checkins] r46577 - in lxml/branch/lxml-1.3: . doc Message-ID: <20070914113630.8D2A880F1@code0.codespeak.net> Author: scoder Date: Fri Sep 14 13:36:30 2007 New Revision: 46577 Modified: lxml/branch/lxml-1.3/INSTALL.txt lxml/branch/lxml-1.3/doc/FAQ.txt Log: warning about using libxml2 2.6.27 with XPath Modified: lxml/branch/lxml-1.3/INSTALL.txt ============================================================================== --- lxml/branch/lxml-1.3/INSTALL.txt (original) +++ lxml/branch/lxml-1.3/INSTALL.txt Fri Sep 14 13:36:30 2007 @@ -11,7 +11,7 @@ * libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html - If you want to use XPath reliably, try to avoid libxml2 2.6.27. + If you want to use XPath, do not use libxml2 2.6.27. * libxslt 1.1.15 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Fri Sep 14 13:36:30 2007 @@ -175,6 +175,11 @@ It really depends on your application, but the rule of thumb is: more recent versions contain less bugs and provide more features. +* Do not use libxml2 2.6.27 if you want to use XPath (including XSLT). You + will get crashes when XPath errors occur during the evaluation (e.g. for + unknown functions). This happens inside the evaluation call to libxml2, so + there is nothing that lxml can do about it. + * Try to use versions of both libraries that were released together. At least the libxml2 version should not be older than the libxslt version. From scoder at codespeak.net Fri Sep 14 13:37:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 14 Sep 2007 13:37:29 +0200 (CEST) Subject: [Lxml-checkins] r46578 - lxml/branch/lxml-1.3 Message-ID: <20070914113729.1AF3580F1@code0.codespeak.net> Author: scoder Date: Fri Sep 14 13:37:28 2007 New Revision: 46578 Modified: lxml/branch/lxml-1.3/version.txt Log: version Modified: lxml/branch/lxml-1.3/version.txt ============================================================================== --- lxml/branch/lxml-1.3/version.txt (original) +++ lxml/branch/lxml-1.3/version.txt Fri Sep 14 13:37:28 2007 @@ -1 +1 @@ -1.3.4 +1.3.5 From scoder at codespeak.net Sat Sep 15 12:02:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 12:02:16 +0200 (CEST) Subject: [Lxml-checkins] r46620 - lxml/trunk Message-ID: <20070915100216.65EF980C7@code0.codespeak.net> Author: scoder Date: Sat Sep 15 12:02:15 2007 New Revision: 46620 Modified: lxml/trunk/valgrind-python.supp Log: some more suppressions Modified: lxml/trunk/valgrind-python.supp 
============================================================================== --- lxml/trunk/valgrind-python.supp (original) +++ lxml/trunk/valgrind-python.supp Sat Sep 15 12:02:15 2007 @@ -222,3 +222,19 @@ sigaction(act) fun:__libc_sigaction } + +{ + ld + Memcheck:Cond + obj:/lib/ld-2.6.so + obj:/lib/ld-2.6.so + obj:* +} + +{ + ld + Memcheck:Addr4 + obj:/lib/ld-2.6.so + obj:/lib/ld-2.6.so + obj:* +} From scoder at codespeak.net Sat Sep 15 12:05:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 12:05:12 +0200 (CEST) Subject: [Lxml-checkins] r46621 - lxml/trunk/src/lxml Message-ID: <20070915100512.D900A80AE@code0.codespeak.net> Author: scoder Date: Sat Sep 15 12:05:02 2007 New Revision: 46621 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/objectify.pyx Log: cleanup in init code (requires Cython) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 12:05:02 2007 @@ -1,17 +1,13 @@ cimport tree, python, config from tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs -from python cimport isinstance, issubclass, hasattr, getattr, callable -from python cimport iter, repr, str, _cstr, _isString, Py_ssize_t +from python cimport issubclass, callable +from python cimport _cstr, _isString cimport xpath cimport xinclude cimport c14n cimport cstd import __builtin__ -cdef object True -cdef object False -True = __builtin__.True -False = __builtin__.False cdef object set try: @@ -19,14 +15,6 @@ except AttributeError: from sets import Set as set -cdef object id -id = __builtin__.id -cdef object super -super = __builtin__.super - -cdef object StopIteration -StopIteration = __builtin__.StopIteration - del __builtin__ cdef object _elementpath Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- 
lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 15 12:05:02 2007 @@ -1,8 +1,7 @@ from etreepublic cimport _Document, _Element, ElementBase from etreepublic cimport _ElementIterator, ElementClassLookup from etreepublic cimport elementFactory, import_etree, textOf -from python cimport str, repr, isinstance, issubclass, callable, getattr -from python cimport _cstr, Py_ssize_t +from python cimport callable, issubclass, _cstr cimport etreepublic as cetree cimport python cimport tree @@ -17,50 +16,19 @@ cdef object re import re + cdef object __builtin__ import __builtin__ -cdef object int -int = __builtin__.int -cdef object long -long = __builtin__.long -cdef object float -float = __builtin__.float -cdef object bool -bool = __builtin__.bool -cdef object pow -pow = __builtin__.pow -cdef object abs -abs = __builtin__.abs -cdef object len -len = __builtin__.len - -cdef object True -True = __builtin__.True -cdef object False -False = __builtin__.False - -cdef object AttributeError -AttributeError = __builtin__.AttributeError -cdef object TypeError -TypeError = __builtin__.TypeError -cdef object ValueError -ValueError = __builtin__.ValueError -cdef object IndexError -IndexError = __builtin__.IndexError -cdef object StopIteration -StopIteration = __builtin__.StopIteration - -cdef object IGNORABLE_ERRORS -IGNORABLE_ERRORS = (ValueError, TypeError) -cdef object list -list = __builtin__.list cdef object set try: set = __builtin__.set except AttributeError: from sets import Set as set +cdef object IGNORABLE_ERRORS +IGNORABLE_ERRORS = (ValueError, TypeError) + cdef object islice from itertools import islice From scoder at codespeak.net Sat Sep 15 12:05:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 12:05:27 +0200 (CEST) Subject: [Lxml-checkins] r46622 - lxml/trunk/src/lxml/tests Message-ID: <20070915100527.8E92180AE@code0.codespeak.net> Author: scoder Date: Sat Sep 15 12:05:27 2007 New Revision: 
46622 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: small test fix Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 12:05:27 2007 @@ -2156,7 +2156,7 @@ def test_iterparse_large(self): iterparse = self.etree.iterparse CHILD_COUNT = 12345 - f = StringIO('' + ('test'*CHILD_COUNT) + '') + f = StringIO('%s' % ('test'*CHILD_COUNT)) i = 0 for key in iterparse(f): From scoder at codespeak.net Sat Sep 15 12:06:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 12:06:23 +0200 (CEST) Subject: [Lxml-checkins] r46623 - lxml/trunk/src/lxml Message-ID: <20070915100623.6BD9380AE@code0.codespeak.net> Author: scoder Date: Sat Sep 15 12:06:23 2007 New Revision: 46623 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: small optimisation Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat Sep 15 12:06:23 2007 @@ -821,12 +821,20 @@ Returns None if not a file object. 
""" # file instances have a name attribute - if hasattr(source, 'name'): + try: return source.name + except AttributeError: + pass # gzip file instances have a filename attribute - if hasattr(source, 'filename'): + try: return source.filename - # urllib2 - if hasattr(source, 'geturl'): - return source.geturl() - return None + except AttributeError: + pass + # urllib2 provides a geturl() method + try: + geturl = source.geturl + except AttributeError: + # can't determine filename + return None + else: + return geturl() From scoder at codespeak.net Sat Sep 15 17:13:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 17:13:01 +0200 (CEST) Subject: [Lxml-checkins] r46641 - lxml/trunk/src/lxml Message-ID: <20070915151301.D099980C0@code0.codespeak.net> Author: scoder Date: Sat Sep 15 17:13:00 2007 New Revision: 46641 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/objectify.pyx Log: adapt more imports to Cython Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 17:13:00 2007 @@ -1,7 +1,6 @@ cimport tree, python, config from tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs -from python cimport issubclass, callable -from python cimport _cstr, _isString +from python cimport callable, _cstr, _isString cimport xpath cimport xinclude cimport c14n Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 15 17:13:00 2007 @@ -1,7 +1,7 @@ from etreepublic cimport _Document, _Element, ElementBase from etreepublic cimport _ElementIterator, ElementClassLookup from etreepublic cimport elementFactory, import_etree, textOf -from python cimport callable, issubclass, _cstr +from python cimport callable, _cstr 
cimport etreepublic as cetree cimport python cimport tree @@ -806,7 +806,7 @@ self._type = type_class self.type_check = type_check if stringify is None: - self._add_text = _StringValueSetter(__builtin__.str) + self._add_text = _StringValueSetter(str) else: self._add_text = _StringValueSetter(stringify) self._schema_types = [] From scoder at codespeak.net Sat Sep 15 17:29:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 17:29:26 +0200 (CEST) Subject: [Lxml-checkins] r46642 - in lxml/trunk/src/lxml: . tests Message-ID: <20070915152926.E701080AF@code0.codespeak.net> Author: scoder Date: Sat Sep 15 17:29:26 2007 New Revision: 46642 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py Log: ET 1.3 compat: getchildren() will go away, findall() might return an iterator (or maybe not) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 17:29:26 2007 @@ -976,13 +976,12 @@ """ cdef xmlNode* c_node cdef int ret -# ET 1.3 stops supporting this ... -## import warnings -## warnings.warn( -## "This method will be removed in future versions. " -## "Use 'list(elem)' or iteration over elem instead.", -## DeprecationWarning -## ) + import warnings + warnings.warn( + "This method will be removed in future versions. 
" + "Use 'list(elem)' or iteration over elem instead.", + DeprecationWarning + ) result = [] c_node = self._c_node.children while c_node is not NULL: Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 17:29:26 2007 @@ -650,19 +650,19 @@ def test_findall(self): XML = self.etree.XML root = XML('') - self.assertEquals(len(root.findall("c")), 1) - self.assertEquals(len(root.findall(".//c")), 2) - self.assertEquals(len(root.findall(".//b")), 3) - self.assertEquals(len(root.findall(".//b")[0]), 1) - self.assertEquals(len(root.findall(".//b")[1]), 0) - self.assertEquals(len(root.findall(".//b")[2]), 0) + self.assertEquals(len(list(root.findall("c"))), 1) + self.assertEquals(len(list(root.findall(".//c"))), 2) + self.assertEquals(len(list(root.findall(".//b"))), 3) + self.assertEquals(len(list(root.findall(".//b"))[0]), 1) + self.assertEquals(len(list(root.findall(".//b"))[1]), 0) + self.assertEquals(len(list(root.findall(".//b"))[2]), 0) def test_findall_ns(self): XML = self.etree.XML root = XML('') - self.assertEquals(len(root.findall(".//{X}b")), 2) - self.assertEquals(len(root.findall(".//b")), 3) - self.assertEquals(len(root.findall("b")), 2) + self.assertEquals(len(list(root.findall(".//{X}b"))), 2) + self.assertEquals(len(list(root.findall(".//b"))), 3) + self.assertEquals(len(list(root.findall("b"))), 2) def test_element_with_attributes_keywords(self): Element = self.etree.Element @@ -1385,8 +1385,8 @@ '', a) self.assertEquals('b2', b.tail) - - def test_getchildren(self): + + def _test_getchildren(self): Element = self.etree.Element SubElement = self.etree.SubElement From scoder at codespeak.net Sat Sep 15 17:31:14 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 17:31:14 +0200 (CEST) Subject: [Lxml-checkins] r46643 - 
lxml/trunk/src/lxml Message-ID: <20070915153114.3B9C880AF@code0.codespeak.net> Author: scoder Date: Sat Sep 15 17:31:13 2007 New Revision: 46643 Modified: lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/xslt.pxi Log: cleanup: remove need for internal parsing functions in XSLT doc resolver code Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat Sep 15 17:31:13 2007 @@ -375,15 +375,11 @@ cdef class _ParserContext(_ResolverContext) cdef class _TargetParserContext(_ParserContext) -cdef extern from "etree_defs.h": - # macro call to 't->tp_new()' for fast instantiation - cdef _ParserContext NEW_PARSER_CONTEXT "PY_NEW" (object t) - cdef class _ParserContext(_ResolverContext): cdef _ErrorLog _error_log cdef xmlparser.xmlParserCtxt* _c_ctxt def __init__(self): - _ResolverContext.__init__(self, _ResolverRegistry()) + _ResolverContext.__init__(self, None) self._error_log = _ErrorLog() cdef _ParserContext _copy(self): @@ -394,6 +390,7 @@ cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): self._c_ctxt = c_ctxt + c_ctxt._private = self cdef object _handleParseResult(self, _BaseParser parser, xmlDoc* result, filename): @@ -513,7 +510,6 @@ self._context = self._createContext(target) self._context._initParserContext(pctxt) - pctxt._private = self._context if remove_comments: pctxt.sax.comment = NULL @@ -983,50 +979,6 @@ resolve_entities, remove_comments, remove_pis, target) -cdef xmlDoc* _internalParseDoc(char* c_text, int options, - _ResolverContext context) except NULL: - # internal parser function for XSLT - cdef xmlparser.xmlParserCtxt* pctxt - cdef xmlDoc* c_doc - cdef int recover - pctxt = xmlparser.xmlNewParserCtxt() - if pctxt is NULL: - return NULL - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - pctxt._private = context - c_doc = xmlparser.xmlCtxtReadDoc( - pctxt, c_text, NULL, NULL, options) - try: 
- recover = options & xmlparser.XML_PARSE_RECOVER - c_doc = _handleParseResult(None, pctxt, c_doc, None, recover) - finally: - xmlparser.xmlFreeParserCtxt(pctxt) - return c_doc - -cdef xmlDoc* _internalParseDocFromFile(char* c_filename, int options, - _ResolverContext context) except NULL: - # internal parser function for XSLT - cdef xmlparser.xmlParserCtxt* pctxt - cdef xmlDoc* c_doc - cdef int recover - pctxt = xmlparser.xmlNewParserCtxt() - if pctxt is NULL: - return NULL - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - pctxt._private = context - c_doc = xmlparser.xmlCtxtReadFile( - pctxt, c_filename, NULL, options) - try: - recover = options & xmlparser.XML_PARSE_RECOVER - if c_filename is NULL: - filename = None - else: - filename = c_filename - c_doc = _handleParseResult(None, pctxt, c_doc, filename, recover) - finally: - xmlparser.xmlFreeParserCtxt(pctxt) - return c_doc - cdef XMLParser __DEFAULT_XML_PARSER __DEFAULT_XML_PARSER = XMLParser() Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sat Sep 15 17:31:13 2007 @@ -69,43 +69,44 @@ return _copyDoc(c_doc, 1) return NULL -cdef xmlDoc* _xslt_resolve_from_python(char* c_uri, void* context, +cdef xmlDoc* _xslt_resolve_from_python(char* c_uri, void* c_context, int parse_options, int* error): # call the Python document loaders - cdef _XSLTResolverContext resolver_context + cdef _XSLTResolverContext context cdef _ResolverRegistry resolvers cdef _InputDocument doc_ref cdef xmlDoc* c_doc error[0] = 0 - resolver_context = <_XSLTResolverContext>context - resolvers = resolver_context._resolvers + context = <_XSLTResolverContext>c_context try: + resolvers = context._resolvers uri = funicode(c_uri) - doc_ref = resolvers.resolve(uri, None, resolver_context) + doc_ref = resolvers.resolve(uri, None, context) c_doc = NULL if doc_ref is not None: if doc_ref._type == 
PARSER_DATA_STRING: - c_doc = _internalParseDoc( - _cstr(doc_ref._data_bytes), parse_options, - resolver_context) + c_doc = _parseDoc( + doc_ref._data_bytes, None, context._parser) elif doc_ref._type == PARSER_DATA_FILENAME: - c_doc = _internalParseDocFromFile( - _cstr(doc_ref._data_bytes), parse_options, - resolver_context) + if python.PyUnicode_Check(doc_ref._data_bytes): + filename = _utf8(doc_ref._data_bytes) + else: + filename = doc_ref._data_bytes + c_doc = _parseDocFromFile(filename, context._parser) elif doc_ref._type == PARSER_DATA_FILE: + filename = _getFilenameForFile(doc_ref._file) data = doc_ref._file.read() - c_doc = _internalParseDoc( - _cstr(data), parse_options, - resolver_context) + c_doc = _parseDoc( + data, filename, context._parser) elif doc_ref._type == PARSER_DATA_EMPTY: c_doc = _newDoc() if c_doc is not NULL and c_doc.URL is NULL: c_doc.URL = tree.xmlStrdup(c_uri) return c_doc except: - resolver_context._store_raised() + context._store_raised() error[0] = 1 return NULL From scoder at codespeak.net Sat Sep 15 17:31:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 17:31:51 +0200 (CEST) Subject: [Lxml-checkins] r46644 - lxml/trunk/benchmark Message-ID: <20070915153151.924BF80AF@code0.codespeak.net> Author: scoder Date: Sat Sep 15 17:31:51 2007 New Revision: 46644 Modified: lxml/trunk/benchmark/bench_etree.py Log: benchmark for list(element) Modified: lxml/trunk/benchmark/bench_etree.py ============================================================================== --- lxml/trunk/benchmark/bench_etree.py (original) +++ lxml/trunk/benchmark/bench_etree.py Sat Sep 15 17:31:51 2007 @@ -195,6 +195,9 @@ def bench_root_getchildren(self, root): root.getchildren() + def bench_root_list_children(self, root): + list(root) + @children def bench_getchildren(self, children): for child in children: From scoder at codespeak.net Sat Sep 15 19:05:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 
Sep 2007 19:05:57 +0200 (CEST) Subject: [Lxml-checkins] r46646 - in lxml/trunk: . src/lxml Message-ID: <20070915170557.971AA80C4@code0.codespeak.net> Author: scoder Date: Sat Sep 15 19:05:56 2007 New Revision: 46646 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/schematron.pxi Log: memory leak in Schematron Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 19:05:56 2007 @@ -25,6 +25,8 @@ Bugs fixed ---------- +* Memory leak in Schematron + * lxml failed to serialise namespace declarations of elements other than the root node of a tree Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Sat Sep 15 19:05:56 2007 @@ -107,6 +107,8 @@ self._c_schema = schematron.xmlSchematronParse(parser_ctxt) schematron.xmlSchematronFreeParserCtxt(parser_ctxt) + if c_doc is not NULL: + tree.xmlFreeDoc(c_doc) if self._c_schema is NULL: raise SchematronParseError, "Document is not a valid Schematron schema" _Validator.__init__(self) From scoder at codespeak.net Sat Sep 15 19:30:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 19:30:01 +0200 (CEST) Subject: [Lxml-checkins] r46647 - lxml/trunk/src/lxml/tests Message-ID: <20070915173001.AB79C80CE@code0.codespeak.net> Author: scoder Date: Sat Sep 15 19:30:01 2007 New Revision: 46647 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: compat fix in ET test Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 19:30:01 2007 @@ -1970,9 +1970,14 @@ self.assertEquals( 'Bar', a.get('{%s}bar' % ns2)) - 
self.assertXML( - '' % (ns, ns2), - a) + try: + self.assertXML( + '' % (ns, ns2), + a) + except AssertionError: + self.assertXML( + '' % (ns2, ns), + a) def test_ns_move(self): Element = self.etree.Element From scoder at codespeak.net Sat Sep 15 19:30:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 19:30:58 +0200 (CEST) Subject: [Lxml-checkins] r46648 - in lxml/trunk: . doc Message-ID: <20070915173058.F314E80C4@code0.codespeak.net> Author: scoder Date: Sat Sep 15 19:30:58 2007 New Revision: 46648 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/tutorial.txt Log: getchildren() deprecated Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 19:30:58 2007 @@ -36,6 +36,11 @@ Other changes ------------- +* ``element.getchildren()`` deprecated (ElementTree 1.3 compatible behaviour) + +* ``element.getiterator()`` returns a list, use ``element.iter()`` to retrieve + an iterator (ElementTree 1.3 compatible behaviour) + 2.0alpha1 (2007-09-02) ====================== Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Sat Sep 15 19:30:58 2007 @@ -125,6 +125,8 @@ >>> print len(root) 3 + >>> children = list(root) + >>> for child in root: ... print child.tag child1 @@ -173,20 +175,6 @@ >>> print [ c.tag for c in root ] ['child3', 'child1', 'child2'] -To retrieve a 'real' Python list of all children (or a *shallow copy* of the -element children list), you can call the ``getchildren()`` method:: - - >>> children = root.getchildren() - - >>> print type(children) is type([]) - True - - >>> for child in children: - ... 
print child.tag - child3 - child1 - child2 - The way up in the tree is provided through the ``getparent()`` method:: >>> root is root[0].getparent() # lxml.etree only! From scoder at codespeak.net Sat Sep 15 19:38:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 19:38:16 +0200 (CEST) Subject: [Lxml-checkins] r46649 - lxml/trunk/src/lxml/tests Message-ID: <20070915173816.3243680BD@code0.codespeak.net> Author: scoder Date: Sat Sep 15 19:38:15 2007 New Revision: 46649 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: whitespace Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 19:38:15 2007 @@ -1978,7 +1978,7 @@ self.assertXML( '' % (ns2, ns), a) - + def test_ns_move(self): Element = self.etree.Element ElementTree = self.etree.ElementTree From scoder at codespeak.net Sat Sep 15 21:12:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:12:27 +0200 (CEST) Subject: [Lxml-checkins] r46650 - lxml/trunk/src/lxml Message-ID: <20070915191227.783DC80B3@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:12:25 2007 New Revision: 46650 Modified: lxml/trunk/src/lxml/etree.pyx Log: un-deprecate getchildren() - still helpful in objectify Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 21:12:25 2007 @@ -976,12 +976,6 @@ """ cdef xmlNode* c_node cdef int ret - import warnings - warnings.warn( - "This method will be removed in future versions. 
" - "Use 'list(elem)' or iteration over elem instead.", - DeprecationWarning - ) result = [] c_node = self._c_node.children while c_node is not NULL: From scoder at codespeak.net Sat Sep 15 21:14:42 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:14:42 +0200 (CEST) Subject: [Lxml-checkins] r46651 - lxml/trunk/src/lxml Message-ID: <20070915191442.D7F6B80B3@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:14:42 2007 New Revision: 46651 Modified: lxml/trunk/src/lxml/docloader.pxi lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/xslt.pxi Log: rewrite of _ParserContext init code - use functions instead of __init__ Modified: lxml/trunk/src/lxml/docloader.pxi ============================================================================== --- lxml/trunk/src/lxml/docloader.pxi (original) +++ lxml/trunk/src/lxml/docloader.pxi Sat Sep 15 21:14:42 2007 @@ -94,14 +94,15 @@ cdef class _ResolverContext(_ExceptionContext): cdef _ResolverRegistry _resolvers cdef _TempStore _storage - def __init__(self, _ResolverRegistry resolvers): - _ExceptionContext.__init__(self) - if resolvers is None: - self._resolvers = _ResolverRegistry() - else: - self._resolvers = resolvers - self._storage = _TempStore() cdef void clear(self): _ExceptionContext.clear(self) self._storage.clear() + +cdef _initResolverContext(_ResolverContext context, + _ResolverRegistry resolvers): + if resolvers is None: + context._resolvers = _ResolverRegistry() + else: + context._resolvers = resolvers + context._storage = _TempStore() Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Sat Sep 15 21:14:42 2007 @@ -65,7 +65,6 @@ cdef char* _tag_name def __init__(self): - _ParserContext.__init__(self) self._ns_stack = [] self._pop_ns = self._ns_stack.pop self._node_stack = [] Modified: 
lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat Sep 15 21:14:42 2007 @@ -378,14 +378,11 @@ cdef class _ParserContext(_ResolverContext): cdef _ErrorLog _error_log cdef xmlparser.xmlParserCtxt* _c_ctxt - def __init__(self): - _ResolverContext.__init__(self, None) - self._error_log = _ErrorLog() cdef _ParserContext _copy(self): cdef _ParserContext context context = self.__class__() - context._resolvers = self._resolvers._copy() + _initParserContext(context, self._resolvers._copy(), NULL) return context cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt): @@ -407,7 +404,15 @@ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER return _handleParseResult(self, self._c_ctxt, result, filename, recover) - + +cdef _initParserContext(_ParserContext context, + _ResolverRegistry resolvers, + xmlparser.xmlParserCtxt* c_ctxt): + _initResolverContext(context, resolvers) + if c_ctxt is not NULL: + context._initParserContext(c_ctxt) + context._error_log = _ErrorLog() + cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename, _ErrorLog error_log) except 0: @@ -509,7 +514,7 @@ python.PyErr_NoMemory() self._context = self._createContext(target) - self._context._initParserContext(pctxt) + _initParserContext(self._context, None, pctxt) if remove_comments: pctxt.sax.comment = NULL @@ -594,7 +599,7 @@ parser._parse_options = self._parse_options parser._class_lookup = self._class_lookup parser._context = self._context._copy() - parser._parser_ctxt._private = parser._context + parser._context._initParserContext(parser._parser_ctxt) return parser def copy(self): Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sat Sep 15 21:14:42 2007 @@ -50,17 +50,20 @@ cdef class 
_XSLTResolverContext(_ResolverContext): cdef xmlDoc* _c_style_doc cdef _BaseParser _parser - def __init__(self, _BaseParser parser not None): - _ResolverContext.__init__(self, parser.resolvers) - self._parser = parser - self._c_style_doc = NULL cdef _XSLTResolverContext _copy(self): cdef _XSLTResolverContext context - context = _XSLTResolverContext(self._parser) + context = _XSLTResolverContext() + _initXSLTResolverContext(context, self._parser) context._c_style_doc = self._c_style_doc return context +cdef _initXSLTResolverContext(_XSLTResolverContext context, + _BaseParser parser): + _initResolverContext(context, parser.resolvers) + context._parser = parser + context._c_style_doc = NULL + cdef xmlDoc* _xslt_resolve_stylesheet(char* c_uri, void* context): cdef xmlDoc* c_doc c_doc = (<_XSLTResolverContext>context)._c_style_doc @@ -300,7 +303,8 @@ c_doc.URL = tree.xmlStrdup(_cstr(doc_url_utf)) self._error_log = _ErrorLog() - self._xslt_resolver_context = _XSLTResolverContext(doc._parser) + self._xslt_resolver_context = _XSLTResolverContext() + _initXSLTResolverContext(self._xslt_resolver_context, doc._parser) # keep a copy in case we need to access the stylesheet via 'document()' self._xslt_resolver_context._c_style_doc = _copyDoc(c_doc, 1) c_doc._private = self._xslt_resolver_context From scoder at codespeak.net Sat Sep 15 21:15:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:15:07 +0200 (CEST) Subject: [Lxml-checkins] r46652 - lxml/trunk Message-ID: <20070915191507.38A9C80B6@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:15:06 2007 New Revision: 46652 Modified: lxml/trunk/CHANGES.txt Log: un-deprecate getchildren() - still helpful in objectify Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 21:15:06 2007 @@ -36,8 +36,6 @@ Other changes ------------- -* ``element.getchildren()`` 
deprecated (ElementTree 1.3 compatible behaviour) - * ``element.getiterator()`` returns a list, use ``element.iter()`` to retrieve an iterator (ElementTree 1.3 compatible behaviour) From scoder at codespeak.net Sat Sep 15 21:27:38 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:27:38 +0200 (CEST) Subject: [Lxml-checkins] r46653 - lxml/trunk Message-ID: <20070915192738.7F68880B3@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:27:38 2007 New Revision: 46653 Modified: lxml/trunk/Makefile Log: longer stack traces from valgrind Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Sat Sep 15 21:27:38 2007 @@ -21,7 +21,7 @@ PYTHONPATH=src $(PYTHON) selftest2.py valgrind_test_inplace: inplace - valgrind --tool=memcheck --leak-check=full --suppressions=valgrind-python.supp \ + valgrind --tool=memcheck --leak-check=full --num-callers=30 --suppressions=valgrind-python.supp \ $(PYTHON) test.py bench_inplace: inplace From scoder at codespeak.net Sat Sep 15 21:29:54 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:29:54 +0200 (CEST) Subject: [Lxml-checkins] r46654 - in lxml/trunk: . 
src/lxml Message-ID: <20070915192954.C45B180B3@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:29:54 2007 New Revision: 46654 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/schematron.pxi Log: reverted schematron fix - can crash Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 21:29:54 2007 @@ -25,8 +25,6 @@ Bugs fixed ---------- -* Memory leak in Schematron - * lxml failed to serialise namespace declarations of elements other than the root node of a tree Modified: lxml/trunk/src/lxml/schematron.pxi ============================================================================== --- lxml/trunk/src/lxml/schematron.pxi (original) +++ lxml/trunk/src/lxml/schematron.pxi Sat Sep 15 21:29:54 2007 @@ -107,8 +107,6 @@ self._c_schema = schematron.xmlSchematronParse(parser_ctxt) schematron.xmlSchematronFreeParserCtxt(parser_ctxt) - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) if self._c_schema is NULL: raise SchematronParseError, "Document is not a valid Schematron schema" _Validator.__init__(self) From scoder at codespeak.net Sat Sep 15 21:40:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:40:56 +0200 (CEST) Subject: [Lxml-checkins] r46657 - lxml/trunk/src/lxml Message-ID: <20070915194056.F142980AD@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:40:56 2007 New Revision: 46657 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/objectify.pyx Log: comment on deprecation of getchildren(), copied over to objectify where it is still needed anyway Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 21:40:56 2007 @@ -973,6 +973,9 @@ def getchildren(self): """Returns all direct children. 
The elements are returned in document order. + + Note that this method has been deprecated as of ElementTree 1.3. New + code should use ``list(element)`` or simply iterate over elements. """ cdef xmlNode* c_node cdef int ret Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 15 21:40:56 2007 @@ -188,6 +188,23 @@ c_node = c_node.next return c + def getchildren(self): + """Returns a sequence of all direct children. The elements are + returned in document order. + """ + cdef xmlNode* c_node + cdef int ret + result = [] + c_node = self._c_node.children + while c_node is not NULL: + if _isElement(c_node): + ret = python.PyList_Append( + result, _elementFactory(self._doc, c_node)) + if ret: + raise + c_node = c_node.next + return result + def __getattr__(self, tag): """Return the (first) child with the given tag name. If no namespace is provided, the child will be looked up in the same one as self. From scoder at codespeak.net Sat Sep 15 21:41:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 21:41:20 +0200 (CEST) Subject: [Lxml-checkins] r46658 - in lxml/trunk: . 
doc Message-ID: <20070915194120.DF38F80AD@code0.codespeak.net> Author: scoder Date: Sat Sep 15 21:41:20 2007 New Revision: 46658 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt Log: prepare release of 2.0alpha2 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 21:41:20 2007 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.0alpha2 (2007-09-15) +====================== Features added -------------- Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sat Sep 15 21:41:20 2007 @@ -138,8 +138,8 @@ .. _`lxml at the Python Package Index`: http://pypi.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 2.0alpha1`_, released 2007-09-02 -(`changes for 2.0alpha1`_). `Older versions`_ are listed below. +The latest version is `lxml 2.0alpha2`_, released 2007-09-15 +(`changes for 2.0alpha2`_). `Older versions`_ are listed below. .. _`Older versions`: #old-versions @@ -199,6 +199,8 @@ Old Versions ------------ +* `lxml 2.0alpha1`_, released 2007-09-02 (`changes for 2.0alpha1`_) + * `lxml 1.3.4`_, released 2007-08-30 (`changes for 1.3.4`_) * `lxml 1.3.3`_, released 2007-07-26 (`changes for 1.3.3`_) @@ -245,6 +247,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.0alpha2`: lxml-2.0alpha2.tgz .. _`lxml 2.0alpha1`: lxml-2.0alpha1.tgz .. _`lxml 1.3.4`: lxml-1.3.4.tgz .. _`lxml 1.3.3`: lxml-1.3.3.tgz @@ -269,6 +272,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.0alpha2`: changes-2.0alpha2.html .. _`changes for 2.0alpha1`: changes-2.0alpha1.html .. _`changes for 1.3.4`: changes-1.3.4.html .. 
_`changes for 1.3.3`: changes-1.3.3.html From scoder at codespeak.net Sat Sep 15 22:14:41 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 22:14:41 +0200 (CEST) Subject: [Lxml-checkins] r46659 - lxml/trunk/src/lxml Message-ID: <20070915201441.0668680B3@code0.codespeak.net> Author: scoder Date: Sat Sep 15 22:14:40 2007 New Revision: 46659 Modified: lxml/trunk/src/lxml/objectify.pyx Log: compile fixes Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Sat Sep 15 22:14:40 2007 @@ -192,14 +192,14 @@ """Returns a sequence of all direct children. The elements are returned in document order. """ - cdef xmlNode* c_node + cdef tree.xmlNode* c_node cdef int ret result = [] c_node = self._c_node.children while c_node is not NULL: - if _isElement(c_node): + if tree._isElement(c_node): ret = python.PyList_Append( - result, _elementFactory(self._doc, c_node)) + result, cetree.elementFactory(self._doc, c_node)) if ret: raise c_node = c_node.next From scoder at codespeak.net Sat Sep 15 23:44:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 15 Sep 2007 23:44:55 +0200 (CEST) Subject: [Lxml-checkins] r46666 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070915214455.2A15680B9@code0.codespeak.net> Author: scoder Date: Sat Sep 15 23:44:54 2007 New Revision: 46666 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: support for selection output method on serialisation Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Sep 15 23:44:54 2007 @@ -8,6 +8,10 @@ Features added -------------- +* ``ET.write()``, ``tostring()`` and ``tounicode()`` now accept a keyword + argument ``method`` that can be one of 'xml' (or None), 'html' or 'text' to + serialise as XML, HTML or plain text content. + * ``iterfind()`` method on Elements returns an iterator equivalent to ``findall()`` Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 23:44:54 2007 @@ -1374,11 +1374,14 @@ return self._doc._parser return None - def write(self, file, encoding=None, + def write(self, file, encoding=None, method="xml", pretty_print=False, xml_declaration=None): """Write the tree to a file or file-like object. - + Defaults to ASCII encoding and writing a declaration as needed. + + The keyword argument 'method' selects the output method: 'xml' or + 'html'. 
""" cdef int c_write_declaration self._assertHasRoot() @@ -1394,7 +1397,7 @@ encoding = encoding.upper() c_write_declaration = encoding not in \ ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8') - _tofilelike(file, self._context_node, encoding, + _tofilelike(file, self._context_node, encoding, method, c_write_declaration, 1, bool(pretty_print)) def getpath(self, _Element element not None): @@ -2148,7 +2151,7 @@ """ _dumpToFile(sys.stdout, elem._c_node, bool(pretty_print)) -def tostring(element_or_tree, encoding=None, +def tostring(element_or_tree, encoding=None, method="xml", xml_declaration=None, pretty_print=False): """Serialize an element to an encoded string representation of its XML tree. @@ -2159,6 +2162,8 @@ compatible encoding will enable a declaration by default. The keyword argument 'pretty_print' (bool) enables formatted XML. + + The keyword argument 'method' selects the output method: 'xml' or 'html'. """ cdef int write_declaration cdef int c_pretty_print @@ -2173,15 +2178,15 @@ encoding = 'ASCII' if isinstance(element_or_tree, _Element): - return _tostring(<_Element>element_or_tree, - encoding, write_declaration, 0, c_pretty_print) + return _tostring(<_Element>element_or_tree, encoding, method, + write_declaration, 0, c_pretty_print) elif isinstance(element_or_tree, _ElementTree): return _tostring((<_ElementTree>element_or_tree)._context_node, - encoding, write_declaration, 1, c_pretty_print) + encoding, method, write_declaration, 1, c_pretty_print) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) -def tounicode(element_or_tree, pretty_print=False): +def tounicode(element_or_tree, method="xml", pretty_print=False): """Serialize an element to the Python unicode representation of its XML tree. @@ -2190,14 +2195,16 @@ further treatment. The keyword argument 'pretty_print' (bool) enables formatted XML. + + The keyword argument 'method' selects the output method: 'xml' or 'html'. 
""" cdef int c_pretty_print c_pretty_print = bool(pretty_print) if isinstance(element_or_tree, _Element): - return _tounicode(<_Element>element_or_tree, 0, c_pretty_print) + return _tounicode(<_Element>element_or_tree, method, 0, c_pretty_print) elif isinstance(element_or_tree, _ElementTree): return _tounicode((<_ElementTree>element_or_tree)._context_node, - 1, c_pretty_print) + method, 1, c_pretty_print) else: raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree) Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Sat Sep 15 23:44:54 2007 @@ -363,8 +363,12 @@ return firstnode elif isinstance(firstnode, _Element): c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node) - s = funicode(c_text) - tree.xmlFree(c_text) + if c_text is NULL: + python.PyErr_NoMemory() + try: + s = funicode(c_text) + finally: + tree.xmlFree(c_text) return s else: return str(firstnode) Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Sat Sep 15 23:44:54 2007 @@ -1,21 +1,75 @@ # XML serialization and output functions -cdef _tostring(_Element element, encoding, +cdef enum _OutputMethods: + OUTPUT_METHOD_XML + OUTPUT_METHOD_HTML + OUTPUT_METHOD_TEXT + +cdef int _findOutputMethod(method) except -1: + if method is None: + return OUTPUT_METHOD_XML + method = method.lower() + if method == "xml": + return OUTPUT_METHOD_XML + if method == "html": + return OUTPUT_METHOD_HTML + if method == "text": + return OUTPUT_METHOD_TEXT + raise ValueError, "unknown output method %r" % method + +cdef _textToString(xmlNode* c_node, encoding): + cdef python.PyThreadState* state + cdef char* c_text + state = python.PyEval_SaveThread() + c_text = 
tree.xmlNodeGetContent(c_node) + python.PyEval_RestoreThread(state) + if c_text is NULL: + python.PyErr_NoMemory() + + try: + if _hasTail(c_node): + tail = _collectText(c_node.next) + if tail: + text = c_text + tail + else: + text = c_text + else: + text = c_text + finally: + tree.xmlFree(c_text) + + if encoding is None: + return text + encoding = encoding.upper() + if encoding == 'UTF-8' or encoding == 'ASCII': + return text + + text = python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict') + return python.PyUnicode_AsEncodedString(text, encoding, 'strict') + +cdef _tostring(_Element element, encoding, method, int write_xml_declaration, int write_complete_document, int pretty_print): - "Serialize an element to an encoded string representation of its XML tree." + """Serialize an element to an encoded string representation of its XML + tree. + """ cdef python.PyThreadState* state cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlBuffer* c_result_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc cdef char* c_version + cdef int c_method if element is None: return None if encoding is None: c_enc = NULL else: - c_enc = encoding + encoding = _utf8(encoding) + c_enc = _cstr(encoding) + c_method = _findOutputMethod(method) + if c_method == OUTPUT_METHOD_TEXT: + return _textToString(element._c_node, encoding) # it is necessary to *and* find the encoding handler *and* use # encoding during output enchandler = tree.xmlFindCharEncodingHandler(c_enc) @@ -29,7 +83,7 @@ try: state = python.PyEval_SaveThread() - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, write_xml_declaration, write_complete_document, pretty_print) tree.xmlOutputBufferFlush(c_buffer) @@ -45,19 +99,27 @@ tree.xmlOutputBufferClose(c_buffer) return result -cdef _tounicode(_Element element, int write_complete_document, int pretty_print): - "Serialize an element to the Python unicode representation of its XML tree." 
+cdef _tounicode(_Element element, method, + int write_complete_document, int pretty_print): + """Serialize an element to the Python unicode representation of its XML + tree. + """ cdef python.PyThreadState* state cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlBuffer* c_result_buffer + cdef int c_method if element is None: return None + c_method = _findOutputMethod(method) + if c_method == OUTPUT_METHOD_TEXT: + text = _textToString(element._c_node, None) + return python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict') c_buffer = tree.xmlAllocOutputBuffer(NULL) if c_buffer is NULL: raise LxmlError, "Failed to create output buffer" try: state = python.PyEval_SaveThread() - _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0, + _writeNodeToBuffer(c_buffer, element._c_node, NULL, c_method, 0, write_complete_document, pretty_print) tree.xmlOutputBufferFlush(c_buffer) python.PyEval_RestoreThread(state) @@ -74,14 +136,14 @@ return result cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, - xmlNode* c_node, char* encoding, + xmlNode* c_node, char* encoding, int c_method, int write_xml_declaration, int write_complete_document, int pretty_print): cdef xmlDoc* c_doc cdef xmlNode* c_nsdecl_node c_doc = c_node.doc - if write_xml_declaration: + if write_xml_declaration and c_method == OUTPUT_METHOD_XML: _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) # write internal DTD subset, preceding PIs/comments, etc. 
@@ -101,8 +163,12 @@ c_nsdecl_node.last = c_node.last # write node - tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0, - pretty_print, encoding) + if c_method == OUTPUT_METHOD_XML: + tree.xmlNodeDumpOutput( + c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding) + else: + tree.htmlNodeDumpFormatOutput( + c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print) if c_nsdecl_node is not c_node: # clean up @@ -244,7 +310,7 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FilelikeWriter>ctxt).close() -cdef _tofilelike(f, _Element element, encoding, +cdef _tofilelike(f, _Element element, encoding, method, int write_xml_declaration, int write_doctype, int pretty_print): cdef python.PyThreadState* state @@ -255,7 +321,17 @@ if encoding is None: c_enc = NULL else: - c_enc = encoding + encoding = _utf8(encoding) + c_enc = _cstr(encoding) + c_method = _findOutputMethod(method) + if c_method == OUTPUT_METHOD_TEXT: + if _isString(f): + f = open(f, 'wb') + f.write(_textToString(element._c_node, encoding)) + f.close() + else: + f.write(_textToString(element._c_node, encoding)) + return enchandler = tree.xmlFindCharEncodingHandler(c_enc) if enchandler is NULL: raise LookupError, python.PyString_FromFormat( @@ -275,7 +351,7 @@ tree.xmlCharEncCloseFunc(enchandler) raise TypeError, "File or filename expected, got '%s'" % type(f) - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, write_xml_declaration, write_doctype, pretty_print) tree.xmlOutputBufferClose(c_buffer) tree.xmlCharEncCloseFunc(enchandler) Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 23:44:54 2007 @@ -716,6 +716,47 @@ self.assertEquals( 'This is a test.' 
% (i, i), canonicalize(data)) + + def test_write_method_html(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + SubElement = self.etree.SubElement + + html = Element('html') + body = SubElement(html, 'body') + p = SubElement(body, 'p') + p.text = "html" + SubElement(p, 'br').tail = "test" + + tree = ElementTree(element=html) + f = StringIO() + tree.write(f, method="html") + data = f.getvalue() + + self.assertEquals('

html
test

', + data) + + def test_write_method_text(self): + ElementTree = self.etree.ElementTree + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + a.text = "A" + a.tail = "tail" + b = SubElement(a, 'b') + b.text = "B" + b.tail = "TAIL" + c = SubElement(a, 'c') + c.text = "C" + + tree = ElementTree(element=a) + f = StringIO() + tree.write(f, method="text") + data = f.getvalue() + + self.assertEquals('ABTAILCtail', + data) def test_write_fail(self): ElementTree = self.etree.ElementTree @@ -2099,6 +2140,37 @@ self.assert_(tostring(b) == 'Foo' or tostring(b) == 'Foo') + def test_tostring_method_html(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + html = Element('html') + body = SubElement(html, 'body') + p = SubElement(body, 'p') + p.text = "html" + SubElement(p, 'br').tail = "test" + + self.assertEquals('

html
test

', + tostring(html, method="html")) + + def test_tostring_method_text(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + a.text = "A" + a.tail = "tail" + b = SubElement(a, 'b') + b.text = "B" + b.tail = "TAIL" + c = SubElement(a, 'c') + c.text = "C" + + self.assertEquals('ABTAILCtail', + tostring(a, method="text")) + def test_iterparse(self): iterparse = self.etree.iterparse f = StringIO('') Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Sep 15 23:44:54 2007 @@ -1761,6 +1761,25 @@ result = tostring(a, pretty_print=True) self.assertEquals(result, "\n \n \n") + def test_tostring_method_text_encoding(self): + tostring = self.etree.tostring + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('a') + a.text = "A" + a.tail = "tail" + b = SubElement(a, 'b') + b.text = "B" + b.tail = u"S?k p? nettet" + c = SubElement(a, 'c') + c.text = "C" + + result = tostring(a, method="text", encoding="UTF-16") + + self.assertEquals(u'ABS?k p? 
nettetCtail'.encode("UTF-16"), + result) + def test_tounicode(self): tounicode = self.etree.tounicode Element = self.etree.Element Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat Sep 15 23:44:54 2007 @@ -234,6 +234,11 @@ cdef char* xmlBuildURI(char* href, char* base) cdef int xmlValidateNCName(char* value, int space) +cdef extern from "libxml/HTMLtree.h": + cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf, + xmlDoc* doc, xmlNode* cur, + char* encoding, int format) + cdef extern from "libxml/valid.h": cdef xmlAttr* xmlGetID(xmlDoc* doc, char* ID) cdef void xmlDumpNotationTable(xmlBuffer* buffer, xmlNotationTable* table) From lxml-checkins at codespeak.net Sun Sep 16 09:14:53 2007 From: lxml-checkins at codespeak.net (Viagra.com Inc ®) Date: Sun, 16 Sep 2007 09:14:53 +0200 (CEST) Subject: [Lxml-checkins] September -70% OFF Message-ID: <20030506091439.15411.qmail@ABTS-NCR-Dynamic-201.42.162.122.airtelbroadband.in> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070916/17b875fd/attachment.htm From scoder at codespeak.net Mon Sep 17 18:02:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Sep 2007 18:02:57 +0200 (CEST) Subject: [Lxml-checkins] r46696 - lxml/trunk/src/lxml/html/tests Message-ID: <20070917160257.4B0CB811B@code0.codespeak.net> Author: scoder Date: Mon Sep 17 18:02:55 2007 New Revision: 46696 Modified: lxml/trunk/src/lxml/html/tests/test_css_select.txt Log: removed test dependency on lxml.html Modified: lxml/trunk/src/lxml/html/tests/test_css_select.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_css_select.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_css_select.txt Mon Sep 17 18:02:55 2007 @@ -2,8 +2,8 @@ all our selections, and a function make querying simpler: >>> from lxml.cssselect import CSSSelector - >>> from lxml.html import document_fromstring - >>> doc = document_fromstring(''' + >>> from lxml.etree import HTML + >>> doc = HTML(''' ... ...
... From scoder at codespeak.net Mon Sep 17 18:04:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Sep 2007 18:04:20 +0200 (CEST) Subject: [Lxml-checkins] r46697 - lxml/trunk/src/lxml Message-ID: <20070917160420.D0BC7811B@code0.codespeak.net> Author: scoder Date: Mon Sep 17 18:04:20 2007 New Revision: 46697 Modified: lxml/trunk/src/lxml/serializer.pxi Log: cleanup in text serialiser Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Mon Sep 17 18:04:20 2007 @@ -26,17 +26,13 @@ if c_text is NULL: python.PyErr_NoMemory() - try: - if _hasTail(c_node): - tail = _collectText(c_node.next) - if tail: - text = c_text + tail - else: - text = c_text - else: - text = c_text - finally: - tree.xmlFree(c_text) + text = c_text + tree.xmlFree(c_text) + + if _hasTail(c_node): + tail = _collectText(c_node.next) + if tail: + text = text + tail if encoding is None: return text @@ -326,7 +322,8 @@ c_method = _findOutputMethod(method) if c_method == OUTPUT_METHOD_TEXT: if _isString(f): - f = open(f, 'wb') + filename8 = _encodeFilename(f) + f = open(filename8, 'wb') f.write(_textToString(element._c_node, encoding)) f.close() else: From scoder at codespeak.net Mon Sep 17 18:06:54 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Sep 2007 18:06:54 +0200 (CEST) Subject: [Lxml-checkins] r46698 - in lxml/trunk/src/lxml: html/tests tests Message-ID: <20070917160654.EA496811B@code0.codespeak.net> Author: scoder Date: Mon Sep 17 18:06:54 2007 New Revision: 46698 Added: lxml/trunk/src/lxml/tests/css_shakespear.html - copied unchanged from r46695, lxml/trunk/src/lxml/html/tests/css_shakespear.html lxml/trunk/src/lxml/tests/test_css.py - copied unchanged from r46695, lxml/trunk/src/lxml/html/tests/test_css.py lxml/trunk/src/lxml/tests/test_css.txt - copied unchanged from r46695, 
lxml/trunk/src/lxml/html/tests/test_css.txt lxml/trunk/src/lxml/tests/test_css_select.txt - copied unchanged from r46696, lxml/trunk/src/lxml/html/tests/test_css_select.txt Removed: lxml/trunk/src/lxml/html/tests/css_shakespear.html lxml/trunk/src/lxml/html/tests/test_css.py lxml/trunk/src/lxml/html/tests/test_css.txt lxml/trunk/src/lxml/html/tests/test_css_select.txt Log: moved CSS tests from lxml.html to main level Deleted: /lxml/trunk/src/lxml/html/tests/css_shakespear.html ============================================================================== --- /lxml/trunk/src/lxml/html/tests/css_shakespear.html Mon Sep 17 18:06:54 2007 +++ (empty file) @@ -1,526 +0,0 @@ - - - - - - - - - -
-
-

As You Like It

-
- - by William Shakespeare - - -
-
- -

ACT I, SCENE III. A room in the palace.

- -
-
Enter CELIA and ROSALIND
- -
- -
CELIA
- -
-
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
- -
- -
ROSALIND
- -
-
Not one to throw at a dog.
- -
- -
CELIA
- -
-
No, thy words are too precious to be cast away upon
- -
curs; throw some of them at me; come, lame me with reasons.
- -
- -
ROSALIND
- -
CELIA
- -
-
But is all this for your father?
- -
- -
-
Then there were two cousins laid up; when the one
-
should be lamed with reasons and the other mad
- -
without any.
-
- -
ROSALIND
- -
-
No, some of it is for my child's father. O, how
- -
full of briers is this working-day world!
- -
- -
CELIA
- -
- -
They are but burs, cousin, thrown upon thee in
-
holiday foolery: if we walk not in the trodden
- -
paths our very petticoats will catch them.
- -
- -
ROSALIND
- -
-
I could shake them off my coat: these burs are in my heart.
-
- -
CELIA
- -
-
Hem them away.
- -
- -
ROSALIND
- -
- -
I would try, if I could cry 'hem' and have him.
-
- -
CELIA
- -
-
Come, come, wrestle with thy affections.
- -
- -
ROSALIND
-
-
O, they take the part of a better wrestler than myself!
- -
- -
CELIA
- -
- -
O, a good wish upon you! you will try in time, in
-
despite of a fall. But, turning these jests out of
-
service, let us talk in good earnest: is it
- -
possible, on such a sudden, you should fall into so
- -
strong a liking with old Sir Rowland's youngest son?
- -
- -
ROSALIND
-
-
The duke my father loved his father dearly.
- -
- -
CELIA
- -
- -
Doth it therefore ensue that you should love his son
- -
dearly? By this kind of chase, I should hate him,
- -
for my father hated his father dearly; yet I hate
- -
not Orlando.
- -
- -
ROSALIND
- -
- -
No, faith, hate him not, for my sake.
- -
- -
CELIA
- -
-
Why should I not? doth he not deserve well?
- -
- -
ROSALIND
- -
-
Let me love him for that, and do you love him
-
because I do. Look, here comes the duke.
-
- -
CELIA
- -
- -
With his eyes full of anger.
-
Enter DUKE FREDERICK, with Lords
-
- -
DUKE FREDERICK
- -
- -
Mistress, dispatch you with your safest haste
- -
And get you from our court.
-
- -
ROSALIND
- -
- -
Me, uncle?
- -
- -
DUKE FREDERICK
-
-
You, cousin
- -
Within these ten days if that thou be'st found
- -
So near our public court as twenty miles,
- -
Thou diest for it.
- -
- -
ROSALIND
- -
- -
I do beseech your grace,
- -
Let me the knowledge of my fault bear with me:
-
If with myself I hold intelligence
- -
Or have acquaintance with mine own desires,
- -
If that I do not dream or be not frantic,--
- -
As I do trust I am not--then, dear uncle,
- -
Never so much as in a thought unborn
- -
Did I offend your highness.
- -
- -
DUKE FREDERICK
- -
-
Thus do all traitors:
- -
If their purgation did consist in words,
- -
They are as innocent as grace itself:
- -
Let it suffice thee that I trust thee not.
- -
- -
ROSALIND
- -
- -
Yet your mistrust cannot make me a traitor:
- -
Tell me whereon the likelihood depends.
- -
- -
DUKE FREDERICK
-
- -
Thou art thy father's daughter; there's enough.
- -
- -
ROSALIND
- -
-
So was I when your highness took his dukedom;
-
So was I when your highness banish'd him:
- -
Treason is not inherited, my lord;
- -
Or, if we did derive it from our friends,
- -
What's that to me? my father was no traitor:
- -
Then, good my liege, mistake me not so much
-
To think my poverty is treacherous.
- -
- -
CELIA
-
- -
Dear sovereign, hear me speak.
- -
- -
DUKE FREDERICK
- -
-
Ay, Celia; we stay'd her for your sake,
-
Else had she with her father ranged along.
- -
- -
CELIA
- -
- -
I did not then entreat to have her stay;
-
It was your pleasure and your own remorse:
-
I was too young that time to value her;
- -
But now I know her: if she be a traitor,
- -
Why so am I; we still have slept together,
- -
Rose at an instant, learn'd, play'd, eat together,
-
And wheresoever we went, like Juno's swans,
- -
Still we went coupled and inseparable.
-
- -
DUKE FREDERICK
- -
-
She is too subtle for thee; and her smoothness,
-
Her very silence and her patience
-
Speak to the people, and they pity her.
-
Thou art a fool: she robs thee of thy name;
- -
And thou wilt show more bright and seem more virtuous
- -
When she is gone. Then open not thy lips:
-
Firm and irrevocable is my doom
-
Which I have pass'd upon her; she is banish'd.
-
- -
CELIA
- -
-
Pronounce that sentence then on me, my liege:
-
I cannot live out of her company.
-
- -
DUKE FREDERICK
- -
-
You are a fool. You, niece, provide yourself:
- -
If you outstay the time, upon mine honour,
-
And in the greatness of my word, you die.
-
Exeunt DUKE FREDERICK and Lords
-
- -
CELIA
-
- -
O my poor Rosalind, whither wilt thou go?
- -
Wilt thou change fathers? I will give thee mine.
-
I charge thee, be not thou more grieved than I am.
- -
- -
ROSALIND
- -
- -
I have more cause.
-
- -
CELIA
- -
-
Thou hast not, cousin;
- -
Prithee be cheerful: know'st thou not, the duke
- -
Hath banish'd me, his daughter?
- -
- -
ROSALIND
-
-
That he hath not.
- -
- -
CELIA
- -
- -
No, hath not? Rosalind lacks then the love
- -
Which teacheth thee that thou and I am one:
-
Shall we be sunder'd? shall we part, sweet girl?
- -
No: let my father seek another heir.
- -
Therefore devise with me how we may fly,
- -
Whither to go and what to bear with us;
-
And do not seek to take your change upon you,
-
To bear your griefs yourself and leave me out;
- -
For, by this heaven, now at our sorrows pale,
- -
Say what thou canst, I'll go along with thee.
- -
- -
ROSALIND
-
- -
Why, whither shall we go?
- -
- -
CELIA
- -
-
To seek my uncle in the forest of Arden.
-
- -
ROSALIND
- -
- -
Alas, what danger will it be to us,
- -
Maids as we are, to travel forth so far!
-
Beauty provoketh thieves sooner than gold.
- -
- -
CELIA
- -
-
I'll put myself in poor and mean attire
- -
And with a kind of umber smirch my face;
-
The like do you: so shall we pass along
- -
And never stir assailants.
- -
- -
ROSALIND
-
- -
Were it not better,
- -
Because that I am more than common tall,
- -
That I did suit me all points like a man?
- -
A gallant curtle-axe upon my thigh,
- -
A boar-spear in my hand; and--in my heart
- -
Lie there what hidden woman's fear there will--
- -
We'll have a swashing and a martial outside,
- -
As many other mannish cowards have
- -
That do outface it with their semblances.
- -
- -
CELIA
- -
- -
What shall I call thee when thou art a man?
-
- -
ROSALIND
- -
-
I'll have no worse a name than Jove's own page;
- -
And therefore look you call me Ganymede.
- -
But what will you be call'd?
- -
- -
CELIA
- -
- -
Something that hath a reference to my state
-
No longer Celia, but Aliena.
- -
- -
ROSALIND
-
- -
But, cousin, what if we assay'd to steal
- -
The clownish fool out of your father's court?
- -
Would he not be a comfort to our travel?
- -
- -
CELIA
- -
- -
He'll go along o'er the wide world with me;
- -
Leave me alone to woo him. Let's away,
-
And get our jewels and our wealth together,
- -
Devise the fittest time and safest way
- -
To hide us from pursuit that will be made
- -
After my flight. Now go we in content
- -
To liberty and not to banishment.
-
Exeunt
- -
- -
-
-
- - - \ No newline at end of file Deleted: /lxml/trunk/src/lxml/html/tests/test_css.py ============================================================================== --- /lxml/trunk/src/lxml/html/tests/test_css.py Mon Sep 17 18:06:54 2007 +++ (empty file) @@ -1,118 +0,0 @@ -import unittest -from lxml.tests.common_imports import doctest -from lxml import html -from lxml import cssselect -import os - -doc_fn = os.path.join(os.path.dirname(__file__), - 'css_shakespear.html') - -# Data borrowed from http://mootools.net/slickspeed/ - -class CSSTestCase(unittest.TestCase): - - selectors = [ - ## Changed from original; probably because I'm only searching the body - #('*', 252), - ('*', 246), - ('div:only-child', 22), # ? - ## Changed from original, because the original doesn't make sense. - ## There really aren't that many occurrances of 'celia' - #('div:contains(CELIA)', 243), - ('div:contains(CELIA)', 30), - ('div:nth-child(even)', 106), - ('div:nth-child(2n)', 106), - ('div:nth-child(odd)', 137), - ('div:nth-child(2n+1)', 137), - ('div:nth-child(n)', 243), - ('div:last-child', 53), - ('div:first-child', 51), - ('div > div', 242), - ('div + div', 190), - ('div ~ div', 190), - ('body', 1), - ('body div', 243), - ('div', 243), - ('div div', 242), - ('div div div', 241), - ('div, div, div', 243), - ('div, a, span', 243), - ('.dialog', 51), - ('div.dialog', 51), - ('div .dialog', 51), - ('div.character, div.dialog', 99), - ('#speech5', 1), - ('div#speech5', 1), - ('div #speech5', 1), - ('div.scene div.dialog', 49), - ('div#scene1 div.dialog div', 142), - ('#scene1 #speech1', 1), - ('div[class]', 103), - ('div[class=dialog]', 50), - ('div[class^=dia]', 51), - ('div[class$=log]', 50), - ('div[class*=sce]', 1), - ('div[class|=dialog]', 50), # ? Seems right - ('div[class!=madeup]', 243), # ? Seems right - ('div[class~=dialog]', 51), # ? 
Seems right - ] - - def __init__(self, index): - self.index = index - unittest.TestCase.__init__(self) - - @classmethod - def all(cls): - for i in range(len(cls.selectors)): - yield cls(i) - - def runTest(self): - f = open(doc_fn, 'rb') - c = f.read() - f.close() - doc = html.document_fromstring(c) - body = doc.xpath('//body')[0] - bad = [] - selector, count = self.selectors[self.index] - xpath = cssselect.css_to_xpath(cssselect.parse(selector)) - try: - results = body.xpath(xpath) - except Exception, e: - e.args = ("%s for xpath %r" % (e, xpath)) - raise - found = {} - for item in results: - if item in found: - assert 0, ( - "Element shows up multiple times: %r" % item) - found[item] = None - if isinstance(results, basestring): - assert 0, ( - "Got string result (%r), not element, for xpath %r" - % (results[:20], str(xpath))) - if len(results) != count: - #if self.shortDescription() == 'div.character, div.dialog': - # import pdb; pdb.set_trace() - assert 0, ( - "Did not get expected results (%s) instead %s for xpath %r" - % (count, len(results), str(xpath))) - - def shortDescription(self): - return self.selectors[self.index][0] - -def unique(s): - found = {} - result = [] - for item in s: - if item in found: - continue - found[item] = None - result.append(s) - return result - -def test_suite(): - suite = unittest.TestSuite() - for fn in 'test_css.txt', 'test_css_select.txt': - suite.addTests([doctest.DocFileSuite(fn)]) - suite.addTests(list(CSSTestCase.all())) - return suite Deleted: /lxml/trunk/src/lxml/html/tests/test_css.txt ============================================================================== --- /lxml/trunk/src/lxml/html/tests/test_css.txt Mon Sep 17 18:06:54 2007 +++ (empty file) @@ -1,132 +0,0 @@ -A quick test of tokenizing: - - >>> from lxml.cssselect import tokenize, parse - >>> def ptok(s): - ... for item in tokenize(s): - ... 
print repr(item) - >>> ptok('E > f[a~="y\\"x"]') - Symbol(u'E', 0) - Token(u'>', 2) - Symbol(u'f', 4) - Token(u'[', 5) - Symbol(u'a', 6) - Token(u'~=', 7) - String(u'y"x', 9) - Token(u']', 15) - -Then of parsing: - - >>> parse('div, td.foo, div.bar span') - Or([Element[div], Class[Element[td].foo], CombinedSelector[Class[Element[div].bar] Element[span]]]) - >>> parse('div > p') - CombinedSelector[Element[div] > Element[p]] - >>> parse('td:first') - Pseudo[Element[td]:first] - >>> parse('a[name]') - Attrib[Element[a][name]] - >>> parse('a[rel="include"]') - Attrib[Element[a][rel = String(u'include', 6)]] - >>> parse('a[hreflang |= \'en\']') - Attrib[Element[a][hreflang |= String(u'en', 14)]] - >>> parse('div:nth-child(10)') - Function[Element[div]:nth-child(10)] - >>> parse('div:nth-of-type(10)') - Function[Element[div]:nth-of-type(10)] - >>> parse('label:only') - Pseudo[Element[label]:only] - >>> parse('a:lang(fr)') - Function[Element[a]:lang(Element[fr])] - >>> parse('div:contains("foo")') - Function[Element[div]:contains(String(u'foo', 13))] - >>> parse('div#foobar') - Hash[Element[div]#foobar] - >>> parse('div:not(div.foo)') - Function[Element[div]:not(Class[Element[div].foo])] - >>> parse('td ~ th') - CombinedSelector[Element[td] ~ Element[th]] - -Now of translation: - - >>> def xpath(css): - ... 
print parse(css).xpath() - >>> xpath('*') - * - >>> xpath('E') - e - >>> xpath('E[foo]') - e[@foo] - >>> xpath('E[foo="bar"]') - e[@foo = 'bar'] - >>> xpath('E[foo~="bar"]') - e[contains(concat(' ', normalize-space(@foo), ' '), ' bar ')] - >>> xpath('E[foo^="bar"]') - e[starts-with(@foo, 'bar')] - >>> xpath('E[foo$="bar"]') - e[substring(@foo, string-length(@foo)-2) = 'bar'] - >>> xpath('E[foo*="bar"]') - e[contains(@foo, 'bar')] - >>> xpath('E[hreflang|="en"]') - e[@hreflang = 'en' or starts-with(@hreflang, 'en-')] - >>> #xpath('E:root') - >>> xpath('E:nth-child(1)') - */*[name() = 'e' and (position() = 1)] - >>> xpath('E:nth-last-child(1)') - */*[name() = 'e' and (position() = last() - 1)] - >>> xpath('E:nth-last-child(2n+2)') - */*[name() = 'e' and ((position() +2) mod -2 = 0 and position() < (last() -2))] - >>> xpath('E:nth-of-type(1)') - */e[position() = 1] - >>> xpath('E:nth-last-of-type(1)') - */e[position() = last() - 1] - >>> xpath('E:first-child') - */*[name() = 'e' and (position() = 1)] - >>> xpath('E:last-child') - */*[name() = 'e' and (position() = last())] - >>> xpath('E:first-of-type') - */e[position() = 1] - >>> xpath('E:last-of-type') - */e[position() = last()] - >>> xpath('E:only-child') - */*[name() = 'e' and (last() = 1)] - >>> xpath('E:only-of-type') - e[last() = 1] - >>> xpath('E:empty') - e[not(*) and not(normalize-space())] - >>> xpath('E:contains("foo")') - e[contains(css:lower-case(string(.)), 'foo')] - >>> xpath('E.warning') - e[contains(concat(' ', normalize-space(@class), ' '), ' warning ')] - >>> xpath('E#myid') - e[@id = 'myid'] - >>> xpath('E:not(:contains("foo"))') - e[not(contains(css:lower-case(string(.)), 'foo'))] - >>> xpath('E F') - e/descendant::f - >>> xpath('E > F') - e/f - >>> xpath('E + F') - e/following-sibling::*[name() = 'f' and (position() = 1)] - >>> xpath('E ~ F') - e/following-sibling::f - >>> xpath('div#container p') - div[@id = 'container']/descendant::p - >>> xpath('p *:only-of-type') - Traceback (most recent 
call last): - ... - NotImplementedError: *:only-of-type is not implemented - -Then of parse_series: - - >>> from lxml.cssselect import parse_series - >>> parse_series('1n+3') - (1, 3) - >>> parse_series('n-5') - (1, -5) - >>> parse_series('odd') - (2, 1) - >>> parse_series('3n') - (3, 0) - >>> parse_series('n') - (1, 0) - >>> parse_series('5') - (0, 5) Deleted: /lxml/trunk/src/lxml/html/tests/test_css_select.txt ============================================================================== --- /lxml/trunk/src/lxml/html/tests/test_css_select.txt Mon Sep 17 18:06:54 2007 +++ (empty file) @@ -1,150 +0,0 @@ -This is a test of CSS selectors. We setup a document we'll use for -all our selections, and a function make querying simpler: - - >>> from lxml.cssselect import CSSSelector - >>> from lxml.etree import HTML - >>> doc = HTML(''' - ... - ...
- ... - ... - ... link - ...
    - ...
  1. content
  2. - ...
  3. - ...
    - ...
    - ...
  4. - ...
  5. - ...
  6. - ...
  7. - ...
  8. - ...
  9. - ...
- ...

- ... hi there - ... guy

- ...
    - ...
- ...
- ...
- ... ''') - >>> order = {} - >>> for count, el in enumerate(doc.getiterator()): - ... order[el] = count - >>> def select_ids(selector): - ... items = CSSSelector(selector)(doc) - ... if not items: - ... return 'empty' - ... items = CSSSelector(selector)(doc) - ... items.sort(key=lambda el: order[el]) - ... return ', '.join([el.get('id', 'nil') for el in items]) - >>> def pcss(main, *selectors): - ... result = select_ids(main) - ... for selector in selectors: - ... sel_result = select_ids(selector) - ... if sel_result != result: - ... print 'Selector %r returns %s' % (selector, sel_result) - ... print result - -Now, the tests: - - >>> pcss('*') # doctest: +ELLIPSIS - nil, nil, nil, outer-div, ... foobar-span - >>> pcss('div') - outer-div, li-div, foobar-div - >>> pcss('a[name]') - name-anchor - >>> pcss('a[rel]') - tag-anchor, nofollow-anchor - >>> pcss('a[rel="tag"]') - tag-anchor - >>> pcss('a[href*="localhost"]') - tag-anchor - >>> pcss('a[href^="http"]') - tag-anchor, nofollow-anchor - >>> pcss('a[href^="http:"]') - tag-anchor - >>> pcss('a[href$="org"]') - nofollow-anchor - >>> pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') - foobar-div - >>> pcss('div[foobar~="cd"]') - empty - >>> pcss('*[lang|="en"]', '*[lang|="en-US"]') - second-li - >>> pcss('*[lang|="e"]') - empty - >>> pcss('li:nth-child(3)') - third-li - >>> pcss('li:nth-child(10)') - empty - >>> pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)') - second-li, fourth-li, sixth-li - >>> pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') - first-li, third-li, fifth-li, seventh-li - >>> pcss('li:nth-child(2n+4)') - fourth-li, sixth-li - >>> # FIXME: I'm not 100% sure this is right: - >>> pcss('li:nth-child(3n+1)') - first-li, fourth-li, seventh-li - >>> # FIXME: I'm not sure if nth-last-child(1) or nth-last-child(1) - >>> # should be equivalent to nth-last-child() - >>> pcss('li:nth-last-child()', 'li:nth-last-child(0)') - seventh-li - >>> pcss('li:nth-last-child(2n)', 
'li:nth-last-child(even)') - second-li, fourth-li, sixth-li - >>> pcss('li:nth-last-child(2n+2)') - second-li, fourth-li - >>> pcss('ol:first-of-type') - first-ol - >>> pcss('ol:nth-child(1)') - empty - >>> pcss('ol:nth-of-type(2)') - second-ol - >>> # FIXME: like above, (1) or (2)? - >>> pcss('ol:nth-last-of-type(1)') - first-ol - >>> pcss('span:only-child') - foobar-span - >>> pcss('li div:only-child') - li-div - >>> pcss('div *:only-child') - foobar-span - >>> pcss('p *:only-of-type') - Traceback (most recent call last): - ... - NotImplementedError: *:only-of-type is not implemented - >>> pcss('p:only-of-type') - paragraph - >>> pcss('a:empty') - name-anchor - >>> pcss('li:empty') - third-li, fourth-li, fifth-li, sixth-li, seventh-li - >>> pcss('*:contains("link")') - nil, nil, outer-div, tag-anchor, nofollow-anchor - >>> pcss('*:contains("E")') - nil, nil, outer-div, first-ol, first-li, paragraph, p-em - >>> pcss('.a', '.b', '*.a', 'ol.a') - first-ol - >>> pcss('.c', '*.c') - first-ol, third-li, fourth-li - >>> pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') - third-li, fourth-li - >>> pcss('#first-li', 'li#first-li', '*#first-li') - first-li - >>> # Need some tests of :not() - >>> pcss('li div', 'li > div', 'div div') - li-div - >>> pcss('div > div') - empty - >>> pcss('div + div') - foobar-div - >>> pcss('a ~ a') - tag-anchor, nofollow-anchor - >>> pcss('a[rel="tag"] ~ a') - nofollow-anchor - >>> pcss('ol#first-ol li:last-child', 'ol#first-ol *:last-child') - seventh-li From scoder at codespeak.net Mon Sep 17 20:30:15 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 17 Sep 2007 20:30:15 +0200 (CEST) Subject: [Lxml-checkins] r46701 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20070917183015.C6F078126@code0.codespeak.net> Author: scoder Date: Mon Sep 17 20:30:14 2007 New Revision: 46701 Modified: lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: public annotate() function, keep TREE pytype annotation if required, do not ignore old types by default Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Mon Sep 17 20:30:14 2007 @@ -51,8 +51,8 @@ PYTYPE_ATTRIBUTE = None -cdef object TREE_PYTYPE -TREE_PYTYPE = "TREE" +cdef object TREE_PYTYPE_NAME +TREE_PYTYPE_NAME = "TREE" def setPytypeAttributeTag(attribute_tag=None): """Changes name and namespace of the XML attribute that holds Python type @@ -812,11 +812,10 @@ def __init__(self, name, type_check, type_class, stringify=None): if not python._isString(name): raise TypeError, "Type name must be a string" - elif name == TREE_PYTYPE: - raise ValueError, "Invalid type name" if type_check is not None and not callable(type_check): raise TypeError, "Type check function must be callable (or None)" - if not issubclass(type_class, ObjectifiedDataElement): + if name != TREE_PYTYPE_NAME and \ + not issubclass(type_class, ObjectifiedDataElement): raise TypeError, \ "Data classes must inherit from ObjectifiedDataElement" self.name = name @@ -839,6 +838,8 @@ the type list. If any of them is not currently known, it is simply ignored. Raises ValueError if the dependencies cannot be fulfilled. 
""" + if self.name == TREE_PYTYPE_NAME: + raise ValueError, "Cannot register tree type" if self.type_check is not None: for item in _TYPE_CHECKS: if item[0] is self.type_check: @@ -962,6 +963,10 @@ pytype = PyType('none', None, NoneElement) pytype.register() +# non-registered PyType for inner tree elements +cdef object TREE_PYTYPE +TREE_PYTYPE = PyType(TREE_PYTYPE_NAME, None, ObjectifiedElement) + _registerPyTypes() def getRegisteredTypes(): @@ -1173,7 +1178,7 @@ for name, value in cetree.iterattributes(element, 3): if '{' in name: if name == PYTYPE_ATTRIBUTE: - if value == TREE_PYTYPE: + if value == TREE_PYTYPE_NAME: continue else: name = name.replace(pytype_ns, 'py:') @@ -1245,7 +1250,7 @@ value = cetree.attributeValueFromNsName( c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) if value is not None: - if value == TREE_PYTYPE: + if value == TREE_PYTYPE_NAME: return lookup.tree_class dict_result = python.PyDict_GetItem(_PYTYPE_DICT, value) if dict_result is not NULL: @@ -1291,7 +1296,7 @@ pass return None -def annotate(element_or_tree, ignore_old=True, ignore_xsi=False, +def pyannotate(element_or_tree, ignore_old=False, ignore_xsi=False, empty_pytype=None): """Recursively annotates the elements of an XML tree with 'pytype' attributes. @@ -1313,7 +1318,7 @@ _annotate(element, 0, 1, bool(ignore_xsi), bool(ignore_old), None, empty_pytype) -def xsiannotate(element_or_tree, ignore_old=True, ignore_pytype=False, +def xsiannotate(element_or_tree, ignore_old=False, ignore_pytype=False, empty_type=None): """Recursively annotates the elements of an XML tree with 'xsi:type' attributes. @@ -1340,6 +1345,43 @@ _annotate(element, 1, 0, bool(ignore_old), bool(ignore_pytype), empty_type, None) +def annotate(element_or_tree, ignore_old=True, ignore_xsi=False, + empty_pytype=None, empty_type=None, annotate_xsi=0, + annotate_pytype=1): + """Recursively annotates the elements of an XML tree with 'xsi:type' + and/or 'py:pytype' attributes. 
+ + If the 'ignore_old' keyword argument is True (the default), current + 'py:pytype' attributes will be ignored for the type annotation. Set to False + if you want to reuse existing 'py:pytype' information (iff appropriate for the + element text value). + + If the 'ignore_xsi' keyword argument is False (the default), existing + 'xsi:type' attributes will be used for the type annotation, if they fit the + element text values. + + Note that the mapping from Python types to XSI types is usually ambiguous. + Currently, only the first XSI type name in the corresponding PyType + definition will be used for annotation. Thus, you should consider naming + the widest type first if you define additional types. + + The default 'py:pytype' annotation of empty elements can be set with the + ``empty_pytype`` keyword argument. Pass 'str', for example, to make + string values the default. + + The default 'xsi:type' annotation of empty elements can be set with the + ``empty_type`` keyword argument. The default is not to annotate empty + elements. Pass 'string', for example, to make string values the default. + + The keyword arguments 'annotate_xsi' (default: 0) and 'annotate_pytype' + (default: 1) control which kind(s) of annotation to use.
+ """ + cdef _Element element + element = cetree.rootNodeOrRaise(element_or_tree) + _annotate(element, annotate_xsi, annotate_pytype, bool(ignore_xsi), + bool(ignore_old), empty_type, empty_pytype) + + cdef _annotate(_Element element, int annotate_xsi, int annotate_pytype, int ignore_xsi, int ignore_pytype, empty_type_name, empty_pytype_name): @@ -1384,34 +1426,45 @@ typename = cetree.attributeValueFromNsName( c_node, _XML_SCHEMA_INSTANCE_NS, "type") if typename is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + dict_result = python.PyDict_GetItem( + _SCHEMA_TYPE_DICT, typename) if dict_result is NULL and ':' in typename: prefix, typename = typename.split(':', 1) - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + dict_result = python.PyDict_GetItem( + _SCHEMA_TYPE_DICT, typename) if dict_result is not NULL: pytype = dict_result if pytype is not StrType: - # StrType does not have a typecheck but is the default anyway, - # so just accept it if given as type information + # StrType does not have a typecheck but is the default + # anyway, so just accept it if given as type + # information pytype = _check_type(c_node, pytype) if pytype is None: typename = None if pytype is None and not ignore_pytype: # check that old pytype value is valid - old_value = cetree.attributeValueFromNsName( + old_pytypename = cetree.attributeValueFromNsName( c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - if old_value is not None and old_value != TREE_PYTYPE: - if old_value == 'none': - # transition from lxml 1.x - old_value = "NoneType" - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, old_value) - if dict_result is not NULL: - pytype = dict_result - if pytype is not StrType: - # StrType does not have a typecheck but is the default - # anyway, so just accept it if given as type information - pytype = _check_type(c_node, pytype) + if old_pytypename is not None: + if old_pytypename == TREE_PYTYPE_NAME: + if cetree.findChild(c_node, 0) is 
NULL: + # only case where we should keep it, + # everything else is clear enough + pytype = TREE_PYTYPE + else: + if old_pytypename == 'none': + # transition from lxml 1.x + old_pytypename = "NoneType" + dict_result = python.PyDict_GetItem( + _PYTYPE_DICT, old_pytypename) + if dict_result is not NULL: + pytype = dict_result + if pytype is not StrType: + # StrType does not have a typecheck but is the + # default anyway, so just accept it if given as + # type information + pytype = _check_type(c_node, pytype) if pytype is None: # try to guess type @@ -1595,7 +1648,7 @@ attrib.update(_attributes) _attributes = attrib if _pytype is None: - _pytype = TREE_PYTYPE + _pytype = TREE_PYTYPE_NAME if nsmap is None: nsmap = _DEFAULT_NSMAP _attributes[PYTYPE_ATTRIBUTE] = _pytype Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Sep 17 20:30:14 2007 @@ -17,6 +17,7 @@ XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS +TREE_PYTYPE = "TREE" DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, "xsi" : XML_SCHEMA_INSTANCE_NS, "xsd" : XML_SCHEMA_NS} @@ -1069,9 +1070,9 @@ self.assertEquals("xsd:boolean", child_types[ 4]) self.assertEquals(None, child_types[ 5]) self.assertEquals(None, child_types[ 6]) - self.assertEquals("xsd:int", child_types[ 7]) - self.assertEquals("xsd:int", child_types[ 8]) - self.assertEquals("xsd:int", child_types[ 9]) + self.assertEquals("xsd:double", child_types[ 7]) + self.assertEquals("xsd:float", child_types[ 8]) + self.assertEquals("xsd:string", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) From scoder at codespeak.net 
Tue Sep 18 10:28:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 10:28:17 +0200 (CEST) Subject: [Lxml-checkins] r46709 - lxml/trunk/src/lxml/tests Message-ID: <20070918082817.642CF810B@code0.codespeak.net> Author: scoder Date: Tue Sep 18 10:28:12 2007 New Revision: 46709 Modified: lxml/trunk/src/lxml/tests/test_objectify.py Log: new type annotation tests by Holger Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Tue Sep 18 10:28:12 2007 @@ -1039,7 +1039,7 @@ self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) - def test_xsitype_annotation(self): + def test_pytype_xsitype_annotation(self): XML = self.XML root = XML(u'''\ 2 ''') - objectify.xsiannotate(root) + objectify.annotate(root, ignore_old=False, ignore_xsi=False, + annotate_xsi=1, annotate_pytype=1) + + # check py annotations + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("NoneType", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("str", child_types[10]) + self.assertEquals("float", child_types[11]) + self.assertEquals("long", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + child_xsitypes = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) + for c in root.iterchildren() ] + # check xsi annotations child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) for c in root.iterchildren() ] self.assertEquals("xsd:int", 
child_types[ 0]) @@ -1079,7 +1103,7 @@ self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) - def test_xsitype_annotation_use_old(self): + def test_xsiannotate_use_old(self): XML = self.XML root = XML(u'''\ + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.pyannotate(root, ignore_old=True) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("NoneType", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("int", child_types[10]) + self.assertEquals("int", child_types[11]) + self.assertEquals("int", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + def test_pyannotate_empty(self): + XML = self.XML + root = XML(u'''\ + + + + ''') + objectify.pyannotate(root) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals(None, child_types[0]) + + objectify.annotate(root, empty_pytype="str") + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("str", child_types[0]) + + def test_pyannotate_use_old(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.pyannotate(root) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + 
self.assertEquals("NoneType", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("str", child_types[10]) + self.assertEquals("float", child_types[11]) + self.assertEquals("long", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + def test_xsiannotate_ignore_old(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.xsiannotate(root, ignore_old=True) + + child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) + for c in root.iterchildren() ] + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:double", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:int", child_types[ 7]) + self.assertEquals("xsd:int", child_types[ 8]) + self.assertEquals("xsd:int", child_types[ 9]) + self.assertEquals("xsd:string", child_types[10]) + self.assertEquals("xsd:double", child_types[11]) + self.assertEquals("xsd:integer", child_types[12]) + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) def test_deannotate(self): From scoder at codespeak.net Tue Sep 18 10:53:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 10:53:23 +0200 (CEST) Subject: [Lxml-checkins] r46712 - lxml/trunk/src/lxml Message-ID: <20070918085323.A49E98071@code0.codespeak.net> Author: scoder Date: Tue Sep 18 10:53:13 2007 New Revision: 46712 Modified: lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/etreepublic.pxd lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/public-api.pxi Log: new C-API function hasChild(), some 
cleanup to use it Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue Sep 18 10:53:13 2007 @@ -426,6 +426,9 @@ element._c_node, _cstr(ns), NULL) return '%s:%s' % (c_ns.prefix, tag) +cdef int _hasChild(xmlNode* c_node): + return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL + cdef xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): if index < 0: return _findChildBackwards(c_node, -index - 1) Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue Sep 18 10:53:13 2007 @@ -859,8 +859,7 @@ FutureWarning ) # emulate old behaviour - c_node = _findChildBackwards(self._c_node, 0) - return c_node != NULL + return bool(_hasChild(self._c_node)) def __contains__(self, element): cdef xmlNode* c_node Modified: lxml/trunk/src/lxml/etreepublic.pxd ============================================================================== --- lxml/trunk/src/lxml/etreepublic.pxd (original) +++ lxml/trunk/src/lxml/etreepublic.pxd Tue Sep 18 10:53:13 2007 @@ -128,6 +128,9 @@ ########################################################################## # XML node helper functions + # check if the element has at least one child + cdef int hasChild(tree.xmlNode* c_node) + # find child element number 'index' (supports negative indexes) cdef tree.xmlNode* findChild(tree.xmlNode* c_node, python.Py_ssize_t index) Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 18 10:53:13 2007 @@ -1238,7 +1238,7 @@ cdef python.PyObject* dict_result lookup = state # if element has children => no data class - if 
cetree.findChildForwards(c_node, 0) is not NULL: + if cetree.hasChild(c_node): return lookup.tree_class # if element is defined as xsi:nil, return NoneElement class @@ -1448,7 +1448,7 @@ c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) if old_pytypename is not None: if old_pytypename == TREE_PYTYPE_NAME: - if cetree.findChild(c_node, 0) is NULL: + if not cetree.hasChild(c_node): # only case where we should keep it, # everything else is clear enough pytype = TREE_PYTYPE @@ -1468,7 +1468,7 @@ if pytype is None: # try to guess type - if cetree.findChildForwards(c_node, 0) is NULL: + if not cetree.hasChild(c_node): # element has no children => data class pytype = _guessPyType(textOf(c_node), StrType) else: Modified: lxml/trunk/src/lxml/public-api.pxi ============================================================================== --- lxml/trunk/src/lxml/public-api.pxi (original) +++ lxml/trunk/src/lxml/public-api.pxi Tue Sep 18 10:53:13 2007 @@ -106,6 +106,9 @@ char* c_href, char* c_name): return _delAttributeFromNsName(c_element, c_href, c_name) +cdef public int hasChild(xmlNode* c_node): + return _hasChild(c_node) + cdef public xmlNode* findChild(xmlNode* c_node, Py_ssize_t index): return _findChild(c_node, index) From scoder at codespeak.net Tue Sep 18 11:20:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 11:20:30 +0200 (CEST) Subject: [Lxml-checkins] r46714 - lxml/trunk Message-ID: <20070918092030.989CD813E@code0.codespeak.net> Author: scoder Date: Tue Sep 18 11:20:15 2007 New Revision: 46714 Modified: lxml/trunk/CHANGES.txt Log: changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 18 11:20:15 2007 @@ -2,6 +2,27 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +* New C-API function ``hasChild()`` to test for children + +* 
``annotate()`` function in objectify can annotate with Python types and XSI + types in one step. Accompanied by ``xsiannotate()`` and ``pyannotate()``. + +Bugs fixed +---------- + +Other changes +------------- + +* Type annotation in objectify now preserves the already annotated type by + default to prevent losing type information that is already there. + + 2.0alpha2 (2007-09-15) ====================== From scoder at codespeak.net Tue Sep 18 11:24:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 11:24:37 +0200 (CEST) Subject: [Lxml-checkins] r46715 - lxml/trunk/src/lxml/tests Message-ID: <20070918092437.B4866813E@code0.codespeak.net> Author: scoder Date: Tue Sep 18 11:24:29 2007 New Revision: 46715 Modified: lxml/trunk/src/lxml/tests/test_elementtree.py Log: test case for UTF-8 BOM Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Sep 18 11:24:29 2007 @@ -2473,6 +2473,15 @@ tree = self.etree.XML(isoxml) self.assertEquals(utext, tree.text) + def test_encoding_utf8_bom(self): + utext = u'S?k p? nettet' + uxml = u'' + \ + u'

%s

' % utext + bom = '\xEF\xBB\xBF' + xml = bom + uxml.encode("utf-8") + tree = etree.XML(xml) + self.assertEquals(utext, tree.text) + def test_encoding_8bit_parse_stringio(self): utext = u'S?k p? nettet' uxml = u'

%s

' % utext From scoder at codespeak.net Tue Sep 18 12:31:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 12:31:21 +0200 (CEST) Subject: [Lxml-checkins] r46717 - lxml/trunk/src/lxml Message-ID: <20070918103121.4D279813A@code0.codespeak.net> Author: scoder Date: Tue Sep 18 12:31:20 2007 New Revision: 46717 Modified: lxml/trunk/src/lxml/objectify.pyx Log: iterfind() in objectify Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Sep 18 12:31:20 2007 @@ -363,6 +363,12 @@ sibling = self.__getitem__(key) parent.remove(sibling) + def iterfind(self, path): + # Reimplementation of Element.iterfind() to make it work without child + # iteration. + xpath = etree.ETXPath(path) + return iter(xpath(self)) + def findall(self, path): # Reimplementation of Element.findall() to make it work without child # iteration. 
From scoder at codespeak.net Tue Sep 18 12:48:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 12:48:00 +0200 (CEST) Subject: [Lxml-checkins] r46718 - lxml/trunk/src/lxml Message-ID: <20070918104800.DFECC8132@code0.codespeak.net> Author: scoder Date: Tue Sep 18 12:48:00 2007 New Revision: 46718 Modified: lxml/trunk/src/lxml/xslt.pxi Log: avoid XPath instantiation on startup Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Tue Sep 18 12:48:00 2007 @@ -598,10 +598,16 @@ cdef object _REPLACE_PI_HREF _REPLACE_PI_HREF = _RE_PI_HREF.sub -cdef XPath _findStylesheetByID -_findStylesheetByID = XPath( - "//xsl:stylesheet[@xml:id = $id]", - {"xsl":"http://www.w3.org/1999/XSL/Transform"}) +cdef XPath __findStylesheetByID +__findStylesheetByID = None + +cdef _findStylesheetByID(_Document doc, id): + global __findStylesheetByID + if __findStylesheetByID is None: + __findStylesheetByID = XPath( + "//xsl:stylesheet[@xml:id = $id]", + {"xsl" : "http://www.w3.org/1999/XSL/Transform"}) + return __findStylesheetByID(doc, id=id) cdef class _XSLTProcessingInstruction(PIBase): def parseXSL(self, parser=None): @@ -647,7 +653,7 @@ return _elementTreeFactory(result_node._doc, result_node) # try XPath search - root = _findStylesheetByID(self._doc, id=funicode(c_href)) + root = _findStylesheetByID(self._doc, funicode(c_href)) if not root: raise ValueError, "reference to non-existing embedded stylesheet" elif len(root) > 1: From scoder at codespeak.net Tue Sep 18 12:50:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 18 Sep 2007 12:50:07 +0200 (CEST) Subject: [Lxml-checkins] r46719 - lxml/trunk/src/lxml Message-ID: <20070918105007.187EB8078@code0.codespeak.net> Author: scoder Date: Tue Sep 18 12:50:06 2007 New Revision: 46719 Modified: lxml/trunk/src/lxml/xpath.pxi Log: warn about 
libxml2 2.6.27 XPath bug Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Tue Sep 18 12:50:06 2007 @@ -91,6 +91,12 @@ cdef void _setupDict(self, xpath.xmlXPathContext* xpathCtxt): __GLOBAL_PARSER_CONTEXT.initXPathParserDict(xpathCtxt) +cdef int _XPATH_VERSION_WARNING_REQUIRED +if _LIBXML_VERSION_INT == 20627: + _XPATH_VERSION_WARNING_REQUIRED = 1 +else: + _XPATH_VERSION_WARNING_REQUIRED = 0 + cdef class _XPathEvaluatorBase: cdef xpath.xmlXPathContext* _xpathCtxt cdef _XPathContext _context @@ -98,6 +104,12 @@ cdef _ErrorLog _error_log def __init__(self, namespaces, extensions, enable_regexp): + global _XPATH_VERSION_WARNING_REQUIRED + if _XPATH_VERSION_WARNING_REQUIRED: + _XPATH_VERSION_WARNING_REQUIRED = 0 + import warnings + warnings.warn("This version of libxml2 has a known XPath bug. " + \ + "Use it at your own risk.") self._error_log = _ErrorLog() self._context = _XPathContext(namespaces, extensions, enable_regexp, None) From lxml-checkins at codespeak.net Tue Sep 18 21:34:14 2007 From: lxml-checkins at codespeak.net (Viagra.com Inc) Date: Tue, 18 Sep 2007 21:34:14 +0200 (CEST) Subject: [Lxml-checkins] September 70% OFF Message-ID: <20070918103555.4293.qmail@xdsl-355.elblag.dialog.net.pl> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070918/44868afa/attachment.htm From scoder at codespeak.net Wed Sep 19 14:48:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 19 Sep 2007 14:48:26 +0200 (CEST) Subject: [Lxml-checkins] r46730 - lxml/trunk/doc Message-ID: <20070919124826.82E0D816D@code0.codespeak.net> Author: scoder Date: Wed Sep 19 14:48:25 2007 New Revision: 46730 Modified: lxml/trunk/doc/parsing.txt lxml/trunk/doc/tutorial.txt Log: getiterator() -> iter() Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Wed Sep 19 14:48:25 2007 @@ -269,7 +269,7 @@ -------------------- As an extension over ElementTree, lxml.etree accepts a ``tag`` keyword -argument just like ``element.getiterator(tag)``. This restricts events to a +argument just like ``element.iter(tag)``. This restricts events to a specific tag or namespace:: >>> context = etree.iterparse(StringIO(xml), tag="element") Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Wed Sep 19 14:48:25 2007 @@ -310,7 +310,7 @@ Child 3 - >>> for element in root.getiterator(): + >>> for element in root.iter(): ... print element.tag, '-', element.text root - None child - Child 1 @@ -318,9 +318,9 @@ another - Child 3 If you know you are only interested in a single tag, you can pass its name to -``getiterator()`` to have it filter for you:: +``iter()`` to have it filter for you:: - >>> for element in root.getiterator("child"): + >>> for element in root.iter("child"): ... print element.tag, '-', element.text child - Child 1 child - Child 2 @@ -462,7 +462,7 @@ content at leaf elements tends to be data content (even if blank). 
You can easily remove it in an additional step by traversing the tree:: - >>> for element in root.getiterator("*"): + >>> for element in root.iter("*"): ... if element.text is not None and not element.text.strip(): ... element.text = None From scoder at codespeak.net Wed Sep 19 14:49:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 19 Sep 2007 14:49:50 +0200 (CEST) Subject: [Lxml-checkins] r46731 - lxml/trunk/src/lxml Message-ID: <20070919124950.DCCF8816D@code0.codespeak.net> Author: scoder Date: Wed Sep 19 14:49:50 2007 New Revision: 46731 Modified: lxml/trunk/src/lxml/objectify.pyx Log: fix unicode annotation in DataElement Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Wed Sep 19 14:49:50 2007 @@ -925,15 +925,12 @@ def __lower_bool(b): return _lower_bool(b) -cdef _get_pytypename(obj): +cdef _pytypename(obj): if python.PyUnicode_Check(obj): return "str" else: return _typename(obj) -def __get_pytypename(obj): - return _get_pytypename(obj) - cdef _registerPyTypes(): pytype = PyType('int', int, IntElement) pytype.xmlSchemaTypes = ("int", "short", "byte", "unsignedShort", @@ -1751,8 +1748,8 @@ strval = str(_value) if _pytype is None: - _pytype = _typename(_value) - + _pytype = _pytypename(_value) + if _pytype is not None: if _pytype == "NoneType" or _pytype == "none": strval = None From scoder at codespeak.net Wed Sep 19 14:50:18 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 19 Sep 2007 14:50:18 +0200 (CEST) Subject: [Lxml-checkins] r46732 - lxml/trunk/src/lxml/tests Message-ID: <20070919125018.69DAF816D@code0.codespeak.net> Author: scoder Date: Wed Sep 19 14:50:10 2007 New Revision: 46732 Modified: lxml/trunk/src/lxml/tests/test_objectify.py Log: extended objectify test cases Modified: lxml/trunk/src/lxml/tests/test_objectify.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Wed Sep 19 14:50:10 2007 @@ -614,6 +614,64 @@ self.assert_(isinstance(value, objectify.StringElement)) self.assertEquals(value, "3.20") + def test_type_ustr(self): + Element = self.Element + SubElement = self.etree.SubElement + root = Element("{objectified}root") + root.s = u"test" + self.assert_(isinstance(root.s, objectify.StringElement)) + + def test_type_ustr_intliteral(self): + Element = self.Element + SubElement = self.etree.SubElement + root = Element("{objectified}root") + root.s = u"3" + self.assert_(isinstance(root.s, objectify.StringElement)) + + def test_type_ustr_floatliteral(self): + Element = self.Element + SubElement = self.etree.SubElement + root = Element("{objectified}root") + root.s = u"3.72" + self.assert_(isinstance(root.s, objectify.StringElement)) + + def test_type_ustr_mul(self): + Element = self.Element + SubElement = self.etree.SubElement + root = Element("{objectified}root") + root.s = u"test" + + self.assertEquals(u"test" * 5, root.s * 5) + self.assertEquals(5 * u"test", 5 * root.s) + + self.assertRaises(TypeError, operator.mul, root.s, u"honk") + self.assertRaises(TypeError, operator.mul, u"honk", root.s) + + def test_type_ustr_add(self): + Element = self.Element + SubElement = self.etree.SubElement + root = Element("{objectified}root") + root.s = u"test" + + s = u"toast" + self.assertEquals(u"test" + s, root.s + s) + self.assertEquals(s + u"test", s + root.s) + + def test_data_element_ustr(self): + value = objectify.DataElement(u"test") + self.assert_(isinstance(value, objectify.StringElement)) + self.assertEquals(value, u"test") + + def test_data_element_ustr_intliteral(self): + value = objectify.DataElement("3") + self.assert_(isinstance(value, objectify.StringElement)) + self.assertEquals(value, u"3") + + def 
test_data_element_ustr_floatliteral(self): + value = objectify.DataElement(u"3.20") + self.assert_(isinstance(value, objectify.StringElement)) + self.assertEquals(value, u"3.20") + def test_type_int(self): Element = self.Element SubElement = self.etree.SubElement @@ -957,6 +1015,7 @@ 42 300 2 + ''') objectify.annotate(root) @@ -976,6 +1035,7 @@ self.assertEquals("int", child_types[10]) self.assertEquals("int", child_types[11]) self.assertEquals("int", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1017,6 +1077,7 @@ 42 300 2 + ''') objectify.annotate(root, ignore_old=False) @@ -1036,6 +1097,7 @@ self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) self.assertEquals("long", child_types[12]) + self.assertEquals(TREE_PYTYPE, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1057,6 +1119,7 @@ 42 300 2 + ''') objectify.annotate(root, ignore_old=False, ignore_xsi=False, @@ -1078,6 +1141,7 @@ self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) self.assertEquals("long", child_types[12]) + self.assertEquals(TREE_PYTYPE, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1100,6 +1164,7 @@ self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1121,6 +1186,7 @@ 42 300 2 + ''') objectify.xsiannotate(root, ignore_old=False) @@ -1140,6 +1206,7 @@ self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) + self.assertEquals(None, child_types[13]) def test_pyannotate_ignore_old(self): XML = self.XML @@ -1159,6 +1226,7 @@ 42 300 2 + ''') objectify.pyannotate(root, ignore_old=True) @@ -1178,6 +1246,7 
@@ self.assertEquals("int", child_types[10]) self.assertEquals("int", child_types[11]) self.assertEquals("int", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1219,6 +1288,7 @@ 42 300 2 + ''') objectify.pyannotate(root) @@ -1238,6 +1308,7 @@ self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) self.assertEquals("long", child_types[12]) + self.assertEquals(TREE_PYTYPE, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1259,6 +1330,7 @@ 42 300 2 + ''') objectify.xsiannotate(root, ignore_old=True) @@ -1278,6 +1350,7 @@ self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1299,6 +1372,7 @@ 42 300 2 + ''') objectify.deannotate(root) @@ -1327,6 +1401,7 @@ 42 300 2 + ''') objectify.xsiannotate(root) @@ -1347,6 +1422,7 @@ self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1372,6 +1448,7 @@ 42 300 2 + ''') objectify.annotate(root) @@ -1392,6 +1469,7 @@ self.assertEquals("int", child_types[10]) self.assertEquals("int", child_types[11]) self.assertEquals("int", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -1417,6 +1495,7 @@ 42 300 2 + ''') objectify.annotate(root) @@ -1437,6 +1516,7 @@ self.assertEquals("xsd:string", child_types[10]) self.assertEquals("xsd:float", child_types[11]) self.assertEquals("xsd:long", child_types[12]) + self.assertEquals(None, child_types[13]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) From scoder at codespeak.net Wed Sep 19 
14:50:53 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 19 Sep 2007 14:50:53 +0200 (CEST) Subject: [Lxml-checkins] r46733 - lxml/trunk Message-ID: <20070919125053.6A248816D@code0.codespeak.net> Author: scoder Date: Wed Sep 19 14:50:48 2007 New Revision: 46733 Modified: lxml/trunk/CHANGES.txt Log: changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 19 14:50:48 2007 @@ -16,6 +16,8 @@ Bugs fixed ---------- +* Type annotation for unicode strings in DataElement + Other changes ------------- From scoder at codespeak.net Wed Sep 19 20:31:04 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 19 Sep 2007 20:31:04 +0200 (CEST) Subject: [Lxml-checkins] r46745 - lxml/trunk Message-ID: <20070919183104.B265D816A@code0.codespeak.net> Author: scoder Date: Wed Sep 19 20:31:03 2007 New Revision: 46745 Modified: lxml/trunk/CHANGES.txt Log: changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 19 20:31:03 2007 @@ -16,7 +16,7 @@ Bugs fixed ---------- -* Type annotation for unicode strings in DataElement +* Type annotation for unicode strings in ``DataElement()`` Other changes ------------- From scoder at codespeak.net Thu Sep 20 14:13:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 20 Sep 2007 14:13:50 +0200 (CEST) Subject: [Lxml-checkins] r46759 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070920121350.21BDD819B@code0.codespeak.net> Author: scoder Date: Thu Sep 20 14:13:48 2007 New Revision: 46759 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/dtd.pxi lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_elementtree.py lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/tree.pxd Log: 'encoding' kw argument in parsers to override document encoding Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Sep 20 14:13:48 2007 @@ -8,6 +8,9 @@ Features added -------------- +* Parsers accept an ``encoding`` keyword argument that overrides the encoding + of the parsed documents. + * New C-API function ``hasChild()`` to test for children * ``annotate()`` function in objectify can annotate with Python types and XSI Modified: lxml/trunk/src/lxml/dtd.pxi ============================================================================== --- lxml/trunk/src/lxml/dtd.pxi (original) +++ lxml/trunk/src/lxml/dtd.pxi Thu Sep 20 14:13:48 2007 @@ -91,7 +91,7 @@ cdef _FileReaderContext dtd_parser cdef tree.xmlDtd* c_dtd exc_context = _ExceptionContext() - dtd_parser = _FileReaderContext(file, exc_context) + dtd_parser = _FileReaderContext(file, exc_context, None, None) c_dtd = dtd_parser._readDtd() Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Thu Sep 20 14:13:48 2007 @@ -239,6 +239,9 @@ * remove_blank_text - discard blank text nodes * remove_comments - discard comments * remove_pis - discard processing instructions + + Other keyword arguments: + * encoding - override the document encoding """ cdef object _source cdef object _filename @@ -246,9 +249,10 @@ 
def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, remove_blank_text=False, - remove_comments=False, remove_pis=False): + remove_comments=False, remove_pis=False, encoding=None): cdef _IterparseContext context cdef char* c_filename + cdef char* c_encoding cdef int parse_options if not hasattr(source, 'read'): self._filename = _encodeFilename(source) @@ -279,12 +283,18 @@ parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, - None) + None, encoding) + + if self._default_encoding is None: + c_encoding = NULL + else: + c_encoding = _cstr(self._default_encoding) context = <_IterparseContext>self._context context._setEventFilter(events, tag) xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options) - xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL) + xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, + c_filename, c_encoding) self._lockParser() # will not be unlocked - no other methods supported cdef _ParserContext _createContext(self, target): Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Sep 20 14:13:48 2007 @@ -206,14 +206,16 @@ cdef class _FileReaderContext: cdef object _filelike + cdef object _encoding cdef object _url cdef object _bytes cdef _ExceptionContext _exc_context cdef cstd.size_t _bytes_read cdef char* _c_url - def __init__(self, filelike, exc_context, url=None): + def __init__(self, filelike, exc_context, url, encoding): self._exc_context = exc_context self._filelike = filelike + self._encoding = encoding self._url = url if url is None: self._c_url = NULL @@ -234,15 +236,22 @@ LxmlParserType parser_type): cdef python.PyThreadState* state cdef xmlDoc* result + cdef char* c_encoding + + if self._encoding is 
None: + c_encoding = NULL + else: + c_encoding = _cstr(self._encoding) + state = python.PyEval_SaveThread() if parser_type == LXML_XML_PARSER: result = xmlparser.xmlCtxtReadIO( ctxt, _readFilelikeParser, NULL, self, - self._c_url, NULL, options) + self._c_url, c_encoding, options) else: result = htmlparser.htmlCtxtReadIO( ctxt, _readFilelikeParser, NULL, self, - self._c_url, NULL, options) + self._c_url, c_encoding, options) python.PyEval_RestoreThread(state) return result @@ -493,9 +502,11 @@ cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock cdef int _feed_parser_running + cdef object _default_encoding def __init__(self, int parse_options, remove_comments, remove_pis, - target): + target, encoding): + cdef int c_encoding cdef xmlparser.xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER @@ -516,6 +527,16 @@ self._context = self._createContext(target) _initParserContext(self._context, None, pctxt) + if encoding is None: + self._default_encoding = None + else: + encoding = _utf8(encoding) + c_encoding = tree.xmlParseCharEncoding(_cstr(encoding)) + if c_encoding == tree.XML_CHAR_ENCODING_ERROR or \ + c_encoding == tree.XML_CHAR_ENCODING_NONE: + raise LookupError, "unknown encoding: '%s'" % encoding + self._default_encoding = encoding + if remove_comments: pctxt.sax.comment = NULL if remove_pis: @@ -669,6 +690,7 @@ cdef xmlDoc* result cdef xmlparser.xmlParserCtxt* pctxt cdef int recover + cdef char* c_encoding if c_len > python.INT_MAX: raise ParserError, "string is too long to parse it with libxml2" self._lockParser() @@ -677,13 +699,20 @@ pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + if self._default_encoding is None: + c_encoding = NULL + else: + c_encoding = _cstr(self._default_encoding) + state = python.PyEval_SaveThread() if self._parser_type == LXML_HTML_PARSER: result = htmlparser.htmlCtxtReadMemory( - pctxt, c_text, c_len, c_filename, NULL, self._parse_options) 
+ pctxt, c_text, c_len, c_filename, + c_encoding, self._parse_options) else: result = xmlparser.xmlCtxtReadMemory( - pctxt, c_text, c_len, c_filename, NULL, self._parse_options) + pctxt, c_text, c_len, c_filename, + c_encoding, self._parse_options) python.PyEval_RestoreThread(state) return self._context._handleParseResultDoc(self, result, None) @@ -699,6 +728,7 @@ cdef xmlparser.xmlParserCtxt* pctxt cdef int recover cdef int orig_options + cdef char* c_encoding result = NULL self._lockParser() self._context._error_log.connect() @@ -706,14 +736,19 @@ pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) + if self._default_encoding is None: + c_encoding = NULL + else: + c_encoding = _cstr(self._default_encoding) + orig_options = pctxt.options state = python.PyEval_SaveThread() if self._parser_type == LXML_HTML_PARSER: result = htmlparser.htmlCtxtReadFile( - pctxt, c_filename, NULL, self._parse_options) + pctxt, c_filename, c_encoding, self._parse_options) else: result = xmlparser.xmlCtxtReadFile( - pctxt, c_filename, NULL, self._parse_options) + pctxt, c_filename, c_encoding, self._parse_options) python.PyEval_RestoreThread(state) pctxt.options = orig_options # work around libxml2 problem @@ -738,7 +773,8 @@ try: pctxt = self._parser_ctxt __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - file_context = _FileReaderContext(filelike, self._context, filename) + file_context = _FileReaderContext(filelike, self._context, + filename, self._default_encoding) result = file_context._readDoc( pctxt, self._parse_options, self._parser_type) @@ -928,7 +964,9 @@ * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) - You can pass a parser target as ``target`` keyword argument. + Other keyword arguments: + * encoding - override the document encoding + * target - a parser target object that will receive the parse events Note that you should avoid sharing parsers between threads. 
While this is not harmful, it is more efficient to use separate parsers. This does not @@ -938,7 +976,7 @@ load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=False, - remove_pis=False, target=None): + remove_pis=False, target=None, encoding=None): cdef int parse_options parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -963,26 +1001,34 @@ parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, - target) + target, encoding) cdef class ETCompatXMLParser(XMLParser): """An XML parser with an ElementTree compatible default setup. See the XMLParser class for details. - This parser defaults to removing processing instructions and comments from - the tree. + This parser has ``remove_comments`` and ``remove_pis`` enabled by default + and thus ignores comments and processing instructions. """ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True, remove_comments=True, - remove_pis=True, target=None): + remove_pis=True, target=None, encoding=None): XMLParser.__init__(self, - attribute_defaults, dtd_validation, - load_dtd, no_network, ns_clean, - recover, remove_blank_text, compact, - resolve_entities, remove_comments, - remove_pis, target) + attribute_defaults=attribute_defaults, + dtd_validation=dtd_validation, + load_dtd=load_dtd, + no_network=no_network, + ns_clean=ns_clean, + recover=recover, + remove_blank_text=remove_blank_text, + compact=compact, + resolve_entities=resolve_entities, + remove_comments=remove_comments, + remove_pis=remove_pis, + target=target, + encoding=encoding) cdef XMLParser __DEFAULT_XML_PARSER @@ -1039,14 +1085,16 @@ * remove_pis - discard processing instructions * compact - safe memory for short text content (default: True) - You can pass a 
parser target as ``target`` keyword argument. + Other keyword arguments: + * encoding - override the document encoding + * target - a parser target object that will receive the parse events Note that you should avoid sharing parsers between threads for performance reasons. """ def __init__(self, recover=True, no_network=True, remove_blank_text=False, compact=True, remove_comments=False, remove_pis=False, - target=None): + target=None, encoding=None): cdef int parse_options parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: @@ -1059,7 +1107,7 @@ parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, - target) + target, encoding) cdef HTMLParser __DEFAULT_HTML_PARSER __DEFAULT_HTML_PARSER = HTMLParser() Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Thu Sep 20 14:13:48 2007 @@ -2377,10 +2377,43 @@ self.assertEquals(u'S?k p? nettet'.encode('iso-8859-1'), result) - # raise error on wrong (left-over?) encoding declaration in unicode strings + def test_parse_encoding_8bit_explicit(self): + XMLParser = self.etree.XMLParser + + text = u'S?k p? nettet' + xml_latin1 = (u'%s' % text).encode('iso-8859-1') + + self.assertRaises(self.etree.ParseError, + self.etree.parse, + StringIO(xml_latin1)) + + tree = self.etree.parse(StringIO(xml_latin1), + XMLParser(encoding="iso-8859-1")) + a = tree.getroot() + self.assertEquals(a.text, text) + + def test_parse_encoding_8bit_override(self): + XMLParser = self.etree.XMLParser + + text = u'S?k p? 
nettet' + wrong_declaration = "" + xml_latin1 = (u'%s%s' % (wrong_declaration, text) + ).encode('iso-8859-1') + + self.assertRaises(self.etree.ParseError, + self.etree.parse, + StringIO(xml_latin1)) + + tree = self.etree.parse(StringIO(xml_latin1), + XMLParser(encoding="iso-8859-1")) + a = tree.getroot() + self.assertEquals(a.text, text) + def _test_wrong_unicode_encoding(self): + # raise error on wrong encoding declaration in unicode strings XML = self.etree.XML - test_utf = u'S?k p? nettet' + test_utf = (u'' + \ + u'S?k p? nettet') self.assertRaises(SyntaxError, XML, test_utf) def test_encoding_write_default_encoding(self): Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Sep 20 14:13:48 2007 @@ -368,6 +368,26 @@ 8, len(events)) + def test_iterparse_encoding_8bit_override(self): + text = u'S?k p? nettet' + wrong_declaration = "" + xml_latin1 = (u'%s%s' % (wrong_declaration, text) + ).encode('iso-8859-1') + + self.assertRaises(self.etree.ParseError, + list, self.etree.iterparse(StringIO(xml_latin1))) + + iterator = self.etree.iterparse(StringIO(xml_latin1), + encoding="iso-8859-1") + self.assertEquals(1, len(list(iterator))) + + a = iterator.root + self.assertEquals(a.text, text) + + def test_parser_encoding_unknown(self): + self.assertRaises( + LookupError, self.etree.XMLParser, encoding="hopefully unknown") + def test_iterwalk_tag(self): iterwalk = self.etree.iterwalk root = self.etree.XML('') Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Thu Sep 20 14:13:48 2007 @@ -46,6 +46,37 @@ self.assertRaises(self.etree.XMLSyntaxError, parse, f, parser) + def test_parse_encoding_8bit_explicit(self): 
+ text = u'S?k p? nettet' + html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1') + + tree = self.etree.parse( + StringIO(html_latin1), + self.etree.HTMLParser(encoding="iso-8859-1")) + p = tree.find("//p") + self.assertEquals(p.text, text) + + def test_parse_encoding_8bit_override(self): + text = u'S?k p? nettet' + wrong_head = ''' + + + ''' + html_latin1 = (u'%s

%s

' % (wrong_head, + text) + ).encode('iso-8859-1') + + self.assertRaises(self.etree.ParseError, + self.etree.parse, + StringIO(html_latin1)) + + tree = self.etree.parse( + StringIO(html_latin1), + self.etree.HTMLParser(encoding="iso-8859-1")) + p = tree.find("//p") + self.assertEquals(p.text, text) + def test_module_HTML_broken(self): element = self.etree.HTML(self.broken_html_str) self.assertEqual(self.etree.tostring(element), Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Thu Sep 20 14:13:48 2007 @@ -40,6 +40,7 @@ cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler) cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len) cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) + cdef xmlCharEncoding xmlParseCharEncoding(char* name) cdef extern from "libxml/chvalid.h": cdef int xmlIsChar_ch(char c) From lxml-checkins at codespeak.net Thu Sep 20 22:27:59 2007 From: lxml-checkins at codespeak.net (Viagra.com Inc) Date: Thu, 20 Sep 2007 22:27:59 +0200 (CEST) Subject: [Lxml-checkins] Lovers package at discount price! Message-ID: <20070920112931.3344.qmail@host30-221-dynamic.11-79-r.retail.telecomitalia.it> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070920/69e10e78/attachment.htm From scoder at codespeak.net Fri Sep 21 08:44:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 21 Sep 2007 08:44:23 +0200 (CEST) Subject: [Lxml-checkins] r46774 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070921064423.C8E058111@code0.codespeak.net> Author: scoder Date: Fri Sep 21 08:44:22 2007 New Revision: 46774 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/htmlparser.pxd lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_htmlparser.py lxml/trunk/src/lxml/xmlparser.pxd Log: preliminary HTML support for iterparse Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Sep 21 08:44:22 2007 @@ -8,6 +8,10 @@ Features added -------------- +* ``iterparse()`` accepts an ``html`` boolean keyword argument for + parsing with the HTML parser (note that this interface may be + subject to change) + * Parsers accept an ``encoding`` keyword argument that overrides the encoding of the parsed documents. Modified: lxml/trunk/src/lxml/htmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/htmlparser.pxd (original) +++ lxml/trunk/src/lxml/htmlparser.pxd Fri Sep 21 08:44:22 2007 @@ -1,6 +1,6 @@ from tree cimport xmlDoc, xmlDict from tree cimport xmlInputReadCallback, xmlInputCloseCallback -from xmlparser cimport xmlParserCtxt +from xmlparser cimport xmlParserCtxt, xmlSAXHandler from xmlerror cimport xmlError cdef extern from "libxml/HTMLparser.h": @@ -16,6 +16,10 @@ cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size) cdef xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, char* encoding) + cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax, + void* user_data, + char* chunk, int size, + char* filename, int enc) cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) cdef void htmlCtxtReset(xmlParserCtxt* ctxt) cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) Modified: lxml/trunk/src/lxml/iterparse.pxi 
============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Sep 21 08:44:22 2007 @@ -51,6 +51,8 @@ cdef class _IterparseContext(_ParserContext): cdef xmlparser.startElementNsSAX2Func _origSaxStart cdef xmlparser.endElementNsSAX2Func _origSaxEnd + cdef xmlparser.startElementSAXFunc _origSaxStartNoNs + cdef xmlparser.endElementSAXFunc _origSaxEndNoNs cdef _Element _root cdef _Document _doc cdef int _event_filter @@ -78,19 +80,23 @@ _ParserContext._initParserContext(self, c_ctxt) sax = c_ctxt.sax self._origSaxStart = sax.startElementNs + self._origSaxStartNoNs = sax.startElement # only override start event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_START | \ ITERPARSE_FILTER_START_NS | \ ITERPARSE_FILTER_END_NS): - sax.startElementNs = _iterparseSaxStart + sax.startElementNs = _iterparseSaxStart + sax.startElement = _iterparseSaxStartNoNs self._origSaxEnd = sax.endElementNs + self._origSaxEndNoNs = sax.endElement # only override end event handler if needed if self._event_filter == 0 or \ self._event_filter & (ITERPARSE_FILTER_END | \ ITERPARSE_FILTER_END_NS): sax.endElementNs = _iterparseSaxEnd + sax.endElement = _iterparseSaxEndNoNs cdef _setEventFilter(self, events, tag): self._event_filter = _buildIterparseEventFilter(events) @@ -183,9 +189,15 @@ cdef xmlparser.startElementNsSAX2Func _getOrigStart(xmlparser.xmlParserCtxt* c_ctxt): return (<_IterparseContext>c_ctxt._private)._origSaxStart +cdef xmlparser.startElementSAXFunc _getOrigStartNoNs(xmlparser.xmlParserCtxt* c_ctxt): + return (<_IterparseContext>c_ctxt._private)._origSaxStartNoNs + cdef xmlparser.endElementNsSAX2Func _getOrigEnd(xmlparser.xmlParserCtxt* c_ctxt): return (<_IterparseContext>c_ctxt._private)._origSaxEnd +cdef xmlparser.endElementSAXFunc _getOrigEndNoNs(xmlparser.xmlParserCtxt* c_ctxt): + return 
(<_IterparseContext>c_ctxt._private)._origSaxEndNoNs + cdef void _iterparseSaxStart(void* ctxt, char* localname, char* prefix, char* URI, int nb_namespaces, char** namespaces, int nb_attributes, int nb_defaulted, @@ -208,6 +220,24 @@ origEnd = _getOrigEnd(c_ctxt) origEnd(ctxt, localname, prefix, URI) +cdef void _iterparseSaxStartNoNs(void* ctxt, char* name, char** attributes): + # no Python in here! + cdef xmlparser.xmlParserCtxt* c_ctxt + cdef xmlparser.startElementSAXFunc origStart + c_ctxt = ctxt + origStart = _getOrigStartNoNs(c_ctxt) + origStart(ctxt, name, attributes) + _pushSaxStartEvent(c_ctxt, c_ctxt.node) + +cdef void _iterparseSaxEndNoNs(void* ctxt, char* name): + # no Python in here! + cdef xmlparser.xmlParserCtxt* c_ctxt + cdef xmlparser.endElementSAXFunc origEnd + c_ctxt = ctxt + _pushSaxEndEvent(c_ctxt, c_ctxt.node) + origEnd = _getOrigEndNoNs(c_ctxt) + origEnd(ctxt, name) + cdef class iterparse(_BaseParser): """Incremental parser. Parses XML into a tree and generates tuples (event, element) in a SAX-like fashion. 
``event`` is any of 'start', @@ -235,7 +265,7 @@ * attribute_defaults - read default attributes from DTD * dtd_validation - validate (if DTD is available) * load_dtd - use DTD for parsing - * no_network - prevent network access + * no_network - prevent network access for related files * remove_blank_text - discard blank text nodes * remove_comments - discard comments * remove_pis - discard processing instructions @@ -246,10 +276,12 @@ cdef object _source cdef object _filename cdef readonly object root + cdef int _html def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, remove_blank_text=False, - remove_comments=False, remove_pis=False, encoding=None): + remove_comments=False, remove_pis=False, encoding=None, + html=False): cdef _IterparseContext context cdef char* c_filename cdef char* c_encoding @@ -267,6 +299,19 @@ c_filename = NULL self._source = source + if html: + self._html = 1 + if 'start' in events: + if 'end' in events: + events = ('start', 'end') + else: + events = ('start',) + elif 'end' in events: + events = ('end',) + else: + events = () + else: + self._html = 0 parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -284,22 +329,39 @@ _BaseParser.__init__(self, parse_options, remove_comments, remove_pis, None, encoding) - - if self._default_encoding is None: - c_encoding = NULL - else: - c_encoding = _cstr(self._default_encoding) - + context = <_IterparseContext>self._context context._setEventFilter(events, tag) - xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options) - xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, - c_filename, c_encoding) self._lockParser() # will not be unlocked - no other methods supported cdef _ParserContext _createContext(self, target): return _IterparseContext() + cdef xmlparser.xmlParserCtxt* _newParserCtxt(self): + cdef xmlparser.xmlParserCtxt* c_ctxt + cdef char* c_filename + if self._filename is not None: + c_filename = 
_cstr(self._filename) + else: + c_filename = NULL + if self._html: + c_ctxt = htmlparser.htmlCreatePushParserCtxt( + NULL, NULL, NULL, 0, c_filename, self._default_encoding_int) + if c_ctxt is not NULL: + htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options) + else: + c_ctxt = xmlparser.xmlCreatePushParserCtxt( + NULL, NULL, NULL, 0, c_filename) + if c_ctxt is not NULL: + xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options) + if self._default_encoding_int != tree.XML_CHAR_ENCODING_NONE: + xmlparser.xmlSwitchEncoding( + c_ctxt, self._default_encoding_int) + return c_ctxt + + def copy(self): + raise TypeError, "iterparse parsers cannot be copied" + def __iter__(self): return self @@ -324,11 +386,21 @@ self._source = None raise TypeError, "reading file objects must return plain strings" elif data: - error = xmlparser.xmlParseChunk( - self._parser_ctxt, _cstr(data), - python.PyString_GET_SIZE(data), 0) + if self._html: + error = htmlparser.htmlParseChunk( + self._parser_ctxt, _cstr(data), + python.PyString_GET_SIZE(data), 0) + else: + error = xmlparser.xmlParseChunk( + self._parser_ctxt, _cstr(data), + python.PyString_GET_SIZE(data), 0) else: - error = xmlparser.xmlParseChunk(self._parser_ctxt, NULL, 0, 1) + if self._html: + error = htmlparser.htmlParseChunk( + self._parser_ctxt, NULL, 0, 1) + else: + error = xmlparser.xmlParseChunk( + self._parser_ctxt, NULL, 0, 1) self._source = None break if error != 0: Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Sep 21 08:44:22 2007 @@ -503,6 +503,7 @@ cdef python.PyThread_type_lock _parser_lock cdef int _feed_parser_running cdef object _default_encoding + cdef int _default_encoding_int def __init__(self, int parse_options, remove_comments, remove_pis, target, encoding): @@ -519,16 +520,9 @@ self._parse_options = parse_options - pctxt = self._newParserCtxt() - 
self._parser_ctxt = pctxt - if pctxt is NULL: - python.PyErr_NoMemory() - - self._context = self._createContext(target) - _initParserContext(self._context, None, pctxt) - if encoding is None: self._default_encoding = None + self._default_encoding_int = tree.XML_CHAR_ENCODING_NONE else: encoding = _utf8(encoding) c_encoding = tree.xmlParseCharEncoding(_cstr(encoding)) @@ -536,6 +530,15 @@ c_encoding == tree.XML_CHAR_ENCODING_NONE: raise LookupError, "unknown encoding: '%s'" % encoding self._default_encoding = encoding + self._default_encoding_int = c_encoding + + pctxt = self._newParserCtxt() + self._parser_ctxt = pctxt + if pctxt is NULL: + python.PyErr_NoMemory() + + self._context = self._createContext(target) + _initParserContext(self._context, None, pctxt) if remove_comments: pctxt.sax.comment = NULL @@ -544,9 +547,7 @@ # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL - if not config.ENABLE_THREADING or \ - self._parser_type == LXML_ITERPARSE_PARSER: - # no threading + if not config.ENABLE_THREADING: self._parser_lock = NULL else: self._parser_lock = python.PyThread_allocate_lock() @@ -955,7 +956,7 @@ * attribute_defaults - read default attributes from DTD * dtd_validation - validate (if DTD is available) * load_dtd - use DTD for parsing - * no_network - prevent network access (default: True) + * no_network - prevent network access for related files (default: True) * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes @@ -1079,7 +1080,7 @@ Available boolean keyword arguments: * recover - try hard to parse through broken HTML (default: True) - * no_network - prevent network access (default: True) + * no_network - prevent network access for related files (default: True) * remove_blank_text - discard empty text nodes * remove_comments - discard comments * remove_pis - discard processing instructions Modified: 
lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Fri Sep 21 08:44:22 2007 @@ -140,6 +140,36 @@ self.assertRaises(self.etree.XMLSyntaxError, self.etree.parse, StringIO(self.broken_html_str)) + def test_html_iterparse(self): + iterparse = self.etree.iterparse + f = StringIO( + 'TITLE

P

') + + iterator = iterparse(f, html=True) + self.assertEquals(None, iterator.root) + + events = list(iterator) + root = iterator.root + self.assert_(root is not None) + self.assertEquals( + [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]), + ('end', root[1]), ('end', root)], + events) + + def test_html_iterparse_file(self): + iterparse = self.etree.iterparse + iterator = iterparse(fileInTestDir("css_shakespear.html"), + html=True) + + self.assertEquals(None, iterator.root) + events = list(iterator) + root = iterator.root + self.assert_(root is not None) + self.assertEquals(249, len(events)) + self.assertEquals( + [], + [ event for (event, element) in events if event != 'end' ]) + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(HtmlParserTestCaseBase)]) Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Fri Sep 21 08:44:22 2007 @@ -19,6 +19,10 @@ char* prefix, char* URI) + ctypedef void (*startElementSAXFunc)(void* ctx, char* name, char** atts) + + ctypedef void (*endElementSAXFunc)(void* ctx, char* name) + ctypedef void (*charactersSAXFunc)(void* ctx, char* ch, int len) ctypedef void (*cdataBlockSAXFunc)(void* ctx, char* value, int len) @@ -49,6 +53,8 @@ internalSubsetSAXFunc internalSubset startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs + startElementSAXFunc startElement + endElementSAXFunc endElement charactersSAXFunc characters cdataBlockSAXFunc cdataBlock commentSAXFunc comment @@ -170,3 +176,4 @@ cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt, char* filename) cdef void xmlFreeInputStream(xmlParserInput* input) + cdef int xmlSwitchEncoding(xmlParserCtxt* ctxt, int enc) From scoder at codespeak.net Fri Sep 21 09:34:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 21 Sep 2007 09:34:01 +0200 (CEST) 
Subject: [Lxml-checkins] r46775 - in lxml/trunk: . doc Message-ID: <20070921073401.D8DE48150@code0.codespeak.net> Author: scoder Date: Fri Sep 21 09:34:00 2007 New Revision: 46775 Modified: lxml/trunk/doc/build.txt lxml/trunk/setup.py Log: require Cython 0.9.6.6 Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Fri Sep 21 09:34:00 2007 @@ -33,11 +33,11 @@ be an lxml developer, you do need a working Cython installation. You can use EasyInstall_ to install it:: - easy_install Cython==0.9.6.5 + easy_install Cython==0.9.6.6 .. _EasyInstall: http://peak.telecommunity.com/DevCenter/EasyInstall -lxml currently requires Cython 0.9.6.5, but it should work with later +lxml currently requires Cython 0.9.6.6, but it should work with later versions. Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Fri Sep 21 09:34:00 2007 @@ -16,7 +16,7 @@ except pkg_resources.VersionConflict: from ez_setup import use_setuptools use_setuptools(version="0.6c5") - #pkg_resources.require("Cython==0.9.6.5") + #pkg_resources.require("Cython==0.9.6.6") from setuptools import setup extra_options["zip_safe"] = False except ImportError: From scoder at codespeak.net Fri Sep 21 10:16:44 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 21 Sep 2007 10:16:44 +0200 (CEST) Subject: [Lxml-checkins] r46776 - lxml/trunk/src/lxml Message-ID: <20070921081644.2FD948097@code0.codespeak.net> Author: scoder Date: Fri Sep 21 10:16:42 2007 New Revision: 46776 Modified: lxml/trunk/src/lxml/objectify.pyx Log: public function objectify.pytypename() Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx 
Fri Sep 21 10:16:42 2007 @@ -926,11 +926,16 @@ return _lower_bool(b) cdef _pytypename(obj): - if python.PyUnicode_Check(obj): + if python._isString(obj): return "str" else: return _typename(obj) +def pytypename(obj): + """Find the name of the corresponding PyType for a Python object. + """ + return _pytypename(obj) + cdef _registerPyTypes(): pytype = PyType('int', int, IntElement) pytype.xmlSchemaTypes = ("int", "short", "byte", "unsignedShort", From ianb at codespeak.net Fri Sep 21 19:31:55 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 21 Sep 2007 19:31:55 +0200 (CEST) Subject: [Lxml-checkins] r46801 - lxml/trunk/src/lxml/html Message-ID: <20070921173155.30C468162@code0.codespeak.net> Author: ianb Date: Fri Sep 21 19:31:53 2007 New Revision: 46801 Modified: lxml/trunk/src/lxml/html/setmixin.py Log: Because list() calls len(obj), implementing __len__ in terms of list() causes infinite recursion Modified: lxml/trunk/src/lxml/html/setmixin.py ============================================================================== --- lxml/trunk/src/lxml/html/setmixin.py (original) +++ lxml/trunk/src/lxml/html/setmixin.py Fri Sep 21 19:31:53 2007 @@ -5,7 +5,10 @@ """ def __len__(self): - return len(list(self)) + length = 0 + for item in self: + length += 1 + return length def __contains__(self, item): for has_item in self: From ianb at codespeak.net Fri Sep 21 19:33:15 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 21 Sep 2007 19:33:15 +0200 (CEST) Subject: [Lxml-checkins] r46802 - lxml/trunk/src/lxml/html Message-ID: <20070921173315.0CF8D8162@code0.codespeak.net> Author: ianb Date: Fri Sep 21 19:33:14 2007 New Revision: 46802 Modified: lxml/trunk/src/lxml/html/__init__.py Log: Some fixes to forms: ignore unnamed inputs when setting form.fields; don't try to delete keys when setting form.fields; be more careful about updating CheckboxGroup.value. 
Also, export more symbols in lxml.html.__all__ Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Fri Sep 21 19:33:14 2007 @@ -13,9 +13,11 @@ from lxml.html._dictmixin import DictMixin import sets -__all__ = ['document_fromstring', 'tostring', 'Element', 'defs', - 'find_rel_links', 'find_class', 'make_links_absolute', - 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] +__all__ = [ + 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', + 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) @@ -583,10 +585,11 @@ prev_keys.remove(key) self.fields[key] = value for key in prev_keys: - # FIXME: but right now I don't even allow - # deleting, and I'm not sure what it would - # mean if I did. - del self.fields[key] + if key is None: + # Case of an unnamed input; these aren't really + # expressed in form_values() anyway. 
+ continue + self.fields[key] = None fields = property(fields__get, fields__set, doc=fields__get.__doc__) @@ -1040,7 +1043,11 @@ return CheckboxValues(self) def value__set(self, value): self.value.clear() - self.value |= value + if not hasattr(value, '__iter__'): + raise ValueError( + "A CheckboxGroup (name=%r) must be set to a sequence (not %r)" + % (self[0].name, value)) + self.value.update(value) def value__del(self): self.value.clear() value = property(value__get, value__set, value__del, doc=value__get.__doc__) From ianb at codespeak.net Fri Sep 21 19:33:50 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 21 Sep 2007 19:33:50 +0200 (CEST) Subject: [Lxml-checkins] r46803 - lxml/trunk/doc Message-ID: <20070921173350.2C2CD8167@code0.codespeak.net> Author: ianb Date: Fri Sep 21 19:33:49 2007 New Revision: 46803 Modified: lxml/trunk/doc/lxmlhtml.txt Log: Fixed all the parsing references. Fix builder import. Added section on parsing. Added docs on forms. Modified: lxml/trunk/doc/lxmlhtml.txt ============================================================================== --- lxml/trunk/doc/lxmlhtml.txt (original) +++ lxml/trunk/doc/lxmlhtml.txt Fri Sep 21 19:33:49 2007 @@ -29,6 +29,35 @@ Parsing HTML fragments ---------------------- +There are several functions available to parse HTML: + +``parse(filename_url_or_file)``: + Parses the named file or url, or if the object has a ``.read()`` + method, parses from that. + + If you give a URL, or if the object has a ``.geturl()`` method (as + file-like objects from ``urllib.urlopen()`` have), then that URL + is used as the base URL. + +``document_fromstring(string)``: + Parses a document from the given string. This always creates a + correct HTML document, which means the parent node is ``<html>``, + and there is a body and possibly a head. + +``fragment_fromstring(string, create_parent=False)``: + Returns an HTML fragment from a string.
The fragment must contain + just a single element, unless ``create_parent`` is given; + e.g., ``fragment_fromstring(string, create_parent='div')`` will + wrap the element in a ``<div>``. + ``fragments_fromstring(string)``: + Returns a list of the elements found in the fragment. + +``fromstring(string)``: + Returns ``document_fromstring`` or ``fragment_fromstring``, based + on whether the string looks like a full document, or just a + fragment. + HTML Element Methods ==================== @@ -71,6 +100,16 @@ selector expression. (Note that ``.xpath(expr)`` is also available as on all lxml elements.) +``.label``: + Returns the corresponding ``