From jholg at codespeak.net Sat Jan 2 14:44:10 2010 From: jholg at codespeak.net (jholg at codespeak.net) Date: Sat, 2 Jan 2010 14:44:10 +0100 (CET) Subject: [Lxml-checkins] r70380 - lxml/branch/iso-schematron/src/lxml/tests Message-ID: <20100102134410.1C79D168039@codespeak.net> Author: jholg Date: Sat Jan 2 14:44:10 2010 New Revision: 70380 Added: lxml/branch/iso-schematron/src/lxml/tests/test.sch Log: added test.sch (forgotten commit) Added: lxml/branch/iso-schematron/src/lxml/tests/test.sch ============================================================================== --- (empty file) +++ lxml/branch/iso-schematron/src/lxml/tests/test.sch Sat Jan 2 14:44:10 2010 @@ -0,0 +1,8 @@ + + + mandatory number_of_entries tests + + [ERROR] number_of_entries () must equal the number of entries/entry elements () + + + From jholg at codespeak.net Sat Jan 2 14:48:53 2010 From: jholg at codespeak.net (jholg at codespeak.net) Date: Sat, 2 Jan 2010 14:48:53 +0100 (CET) Subject: [Lxml-checkins] r70381 - lxml/trunk/src/lxml/tests Message-ID: <20100102134853.9ACE5168039@codespeak.net> Author: jholg Date: Sat Jan 2 14:48:53 2010 New Revision: 70381 Added: lxml/trunk/src/lxml/tests/test.sch - copied unchanged from r70380, lxml/branch/iso-schematron/src/lxml/tests/test.sch Log: Merged addition of test.sch (was a forgotten commit): svn merge -r70379:HEAD http://codespeak.net/svn/lxml/branch/iso-schematron From jholg at codespeak.net Sat Jan 2 17:18:21 2010 From: jholg at codespeak.net (jholg at codespeak.net) Date: Sat, 2 Jan 2010 17:18:21 +0100 (CET) Subject: [Lxml-checkins] r70382 - in lxml/branch/iso-schematron/src/lxml: isoschematron tests Message-ID: <20100102161821.81278168032@codespeak.net> Author: jholg Date: Sat Jan 2 17:18:20 2010 New Revision: 70382 Modified: lxml/branch/iso-schematron/src/lxml/isoschematron/__init__.py lxml/branch/iso-schematron/src/lxml/tests/test_isoschematron.py Log: Changes: * stylesheet_params now raises error for None args, with test * Schematron 
class docstring update Modified: lxml/branch/iso-schematron/src/lxml/isoschematron/__init__.py ============================================================================== --- lxml/branch/iso-schematron/src/lxml/isoschematron/__init__.py (original) +++ lxml/branch/iso-schematron/src/lxml/isoschematron/__init__.py Sat Jan 2 17:18:20 2010 @@ -69,10 +69,15 @@ def stylesheet_params(**kwargs): """Convert keyword args to a dictionary of stylesheet parameters. - Conversion follows these rules: + XSL stylesheet parameters must be XPath expressions, i.e.: + * string expressions, like "'5'" + * simple (number) expressions, like "5" + * valid XPath expressions, like "/a/b/text()" + This function converts native Python keyword arguments to stylesheet + parameters following these rules: If an arg is a string wrap it with XSLT.strparam(). If an arg is an XPath object use its path string. - If arg is None ignore the parameter. + If arg is None raise TypeError. Else convert arg to string. """ result = {} @@ -80,7 +85,7 @@ if isinstance(val, basestring): val = _etree.XSLT.strparam(val) elif val is None: - continue + raise TypeError('None not allowed as a stylesheet parameter') elif not isinstance(val, _etree.XPath): val = unicode(val) result[key] = val @@ -93,14 +98,11 @@ stylesheet arguments. kwargsDict entries with a value of None are ignored. """ - if paramsDict: - # beware of changing mutable default arg - paramsDict = dict(paramsDict) - for k, v in kwargsDict.items(): - if v is not None: # None values do not override - paramsDict[k] = v - else: - paramsDict = kwargsDict + # beware of changing mutable default arg + paramsDict = dict(paramsDict) + for k, v in kwargsDict.items(): + if v is not None: # None values do not override + paramsDict[k] = v paramsDict = stylesheet_params(**paramsDict) return paramsDict @@ -120,9 +122,12 @@ 3) Compile the schematron schema to XSLT The ``include`` and ``expand`` keyword arguments can be used to switch off steps 1) and 2). 
- To set parameters for steps 1), 2) and 3) hand dictionaries containing xslt - parameters to the keyword arguments ``include_params``, ``expand_params`` - or ``compile_params``. + To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the + keyword arguments ``include_params``, ``expand_params`` or + ``compile_params``. + For convenience, the compile-step parameter ``phase`` is also exposed as a + keyword argument ``phase``. This takes precedence if the parameter is also + given in the parameter dictionary. If ``store_schematron`` is set to True, the (included-and-expanded) schematron document tree is stored and available through the ``schematron`` property. Modified: lxml/branch/iso-schematron/src/lxml/tests/test_isoschematron.py ============================================================================== --- lxml/branch/iso-schematron/src/lxml/tests/test_isoschematron.py (original) +++ lxml/branch/iso-schematron/src/lxml/tests/test_isoschematron.py Sat Jan 2 17:18:20 2010 @@ -762,6 +762,21 @@ self.assert_(relaxng(tree_invalid), relaxng.error_log) self.assert_(not schematron(tree_invalid)) + def test_schematron_invalid_args(self): + schema = self.parse('''\ + + + mandatory number_of_entries tests + + [ERROR] number_of_entries () must equal the number of entries/entry elements () + + + +''') + # handing phase as keyword arg will *not* raise the type error + self.assertRaises(TypeError, isoschematron.Schematron, schema, + compile_params={'phase': None}) + #TODO: test xslt parameters for inclusion, expand & compile steps (?) From scoder at codespeak.net Mon Jan 4 16:11:58 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 4 Jan 2010 16:11:58 +0100 (CET) Subject: [Lxml-checkins] r70396 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20100104151158.30155168032@codespeak.net> Author: scoder Date: Mon Jan 4 16:11:57 2010 New Revision: 70396 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5403 at lenny: sbehnel | 2010-01-04 16:11:30 +0100 fix bug #502976: reject invalid XML characters also after unicode sequences Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jan 4 16:11:57 2010 @@ -42,6 +42,9 @@ Bugs fixed ---------- +* Invalid XML text characters were not rejected by the API when they + appeared in unicode strings directly after non-ASCII characters. + * ``lxml.cssselect`` did not distinguish between ``x[attr="val"]`` and ``x [attr="val"]`` (with a space). The latter now matches the attribute independent of the element. Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Mon Jan 4 16:11:57 2010 @@ -1299,11 +1299,11 @@ cdef bint is_non_ascii = 0 while s < c_end: if s[0] & 0x80: - # skip the entire multi byte sequence - while s[0] & 0x80: + # skip over multi byte sequences + while s < c_end and s[0] & 0x80: s += 1 is_non_ascii = 1 - elif not tree.xmlIsChar_ch(s[0]): + if s < c_end and not tree.xmlIsChar_ch(s[0]): return -1 # invalid! 
s += 1 return is_non_ascii Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Mon Jan 4 16:11:57 2010 @@ -2553,6 +2553,25 @@ self.assertRaises(ValueError, Element, _str('ha\x02ho')) + def test_unicode_byte_invalid_sequence(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(ValueError, setattr, a, "text", + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, setattr, a, "text", + _str('ha\u1234\x02ho')) + + self.assertRaises(ValueError, setattr, a, "tail", + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, setattr, a, "tail", + _str('ha\u1234\x02ho')) + + self.assertRaises(ValueError, Element, + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, Element, + _str('ha\u1234\x02ho')) + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring From lxml-checkins at codespeak.net Sat Jan 9 21:05:52 2010 From: lxml-checkins at codespeak.net (VIAGRA Best Supplier (c)) Date: Sat, 9 Jan 2010 21:05:52 +0100 (CET) Subject: [Lxml-checkins] Visitor lxml-checkins's personal 80% OFF Message-ID: <20100109200552.4568D498447@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20100109/4b5a154f/attachment.htm From scoder at codespeak.net Thu Jan 14 09:52:19 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jan 2010 09:52:19 +0100 (CET) Subject: [Lxml-checkins] r70571 - in lxml/trunk: . 
doc Message-ID: <20100114085219.F38E216803B@codespeak.net> Author: scoder Date: Thu Jan 14 09:52:18 2010 New Revision: 70571 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5405 at lenny: sbehnel | 2010-01-13 13:15:37 +0100 FAQ update Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Jan 14 09:52:18 2010 @@ -817,6 +817,22 @@ tree. If you now call a serialization function to pretty print this tree, lxml can add fresh whitespace to the XML tree to indent it. +Note that the ``remove_blank_text`` option also uses a heuristic if it +has no definite knowledge about the document's ignorable whitespace. +It will keep blank text nodes that appear after non-blank text nodes +at the same level. This is to prevent document-style XML from +breaking. + +If you want to be sure all blank text is removed, you have to use +either a DTD to tell the parser which whitespace it can safely ignore, +or remove the ignorable whitespace manually after parsing, e.g. by +setting all tail text to None:: + +.. sourcecode:: python + + for element in root.iter(): + element.tail = None + Fredrik Lundh also has a Python-level function for indenting XML by appending whitespace to tags. It can be found on his `element library`_ recipe page. From scoder at codespeak.net Thu Jan 14 09:52:24 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jan 2010 09:52:24 +0100 (CET) Subject: [Lxml-checkins] r70572 - in lxml/trunk: . 
doc Message-ID: <20100114085224.4668716803D@codespeak.net> Author: scoder Date: Thu Jan 14 09:52:22 2010 New Revision: 70572 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5406 at lenny: sbehnel | 2010-01-14 09:33:52 +0100 doc reference to python-docx package Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Thu Jan 14 09:52:22 2010 @@ -189,6 +189,7 @@ * OpenXMLlib_, a library for handling OpenXML document meta data * Pycoon_, a WSGI web development framework based on XML pipelines * PyQuery_, a query framework for XML/HTML, similar to jQuery for JavaScript +* `python-docx`_, a package for handling Microsoft's Word OpenXML format * Rambler_, a meta search engine that aggregates different data sources * rdfadict_, an RDFa parser with a simple dictionary-like interface. @@ -219,6 +220,7 @@ .. _rdfadict: http://pypi.python.org/pypi/rdfadict .. _z3c.rml: http://pypi.python.org/pypi/z3c.rml .. _zif.sedna: http://pypi.python.org/pypi/zif.sedna +.. _`python-docx`: http://github.com/mikemaccana/python-docx .. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244 .. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 From scoder at codespeak.net Thu Jan 21 13:22:35 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jan 2010 13:22:35 +0100 (CET) Subject: [Lxml-checkins] r70741 - in lxml/trunk: . 
src/lxml/html Message-ID: <20100121122235.6EE13168026@codespeak.net> Author: scoder Date: Thu Jan 21 13:22:33 2010 New Revision: 70741 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/html5parser.py Log: r5409 at lenny: sbehnel | 2010-01-21 13:22:23 +0100 do not require XHTMLParser in html5lib Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Jan 21 13:22:33 2010 @@ -42,6 +42,9 @@ Bugs fixed ---------- +* Support recent versions of html5lib by not requiring its + ``XHTMLParser`` in ``htmlparser.py`` anymore. + * Invalid XML text characters were not rejected by the API when they appeared in unicode strings directly after non-ASCII characters. Modified: lxml/trunk/src/lxml/html/html5parser.py ============================================================================== --- lxml/trunk/src/lxml/html/html5parser.py (original) +++ lxml/trunk/src/lxml/html/html5parser.py Thu Jan 21 13:22:33 2010 @@ -3,7 +3,7 @@ """ import urllib -from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser +from html5lib import HTMLParser as _HTMLParser from lxml import etree from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE from lxml.html._html5builder import TreeBuilder @@ -22,11 +22,18 @@ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder) -class XHTMLParser(_XHTMLParser): - """An html5lib XHTML Parser with lxml as tree.""" +try: + from html5lib import XHTMLParser as _XHTMLParser +except ImportError: + pass +else: + class XHTMLParser(_XHTMLParser): + """An html5lib XHTML Parser with lxml as tree.""" - def __init__(self, strict=False): - _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder) + def __init__(self, strict=False): + _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder) + + xhtml_parser = XHTMLParser() def _find_tag(tree, tag): @@ -161,4 +168,3 @@ html_parser 
= HTMLParser() -xhtml_parser = XHTMLParser() From scoder at codespeak.net Sun Jan 24 19:36:05 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jan 2010 19:36:05 +0100 (CET) Subject: [Lxml-checkins] r70799 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20100124183605.435BF31813B@codespeak.net> Author: scoder Date: Sun Jan 24 19:36:04 2010 New Revision: 70799 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/readonlytree.pxi lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xslt.pxd lxml/trunk/src/lxml/xsltext.pxi Log: r5411 at lenny: sbehnel | 2010-01-24 19:32:39 +0100 extended XSLT extension element API, original patch by Marat Dakota Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Jan 24 19:36:04 2010 @@ -8,6 +8,12 @@ Features added -------------- +* New parameter ``output_parent`` to ``XSLTExtension.apply_templates()`` + to append the resulting content directly to an output element. + +* ``XSLTExtension.process_children()`` to process the content of the + XSLT extension element itself. + * Enable path caching in ElementPath (``el.find*()``) to avoid parsing overhead. @@ -42,6 +48,9 @@ Bugs fixed ---------- +* Using XSLT extension elements around the root of the output document + crashed. + * Support recent versions of html5lib by not requiring its ``XHTMLParser`` in ``htmlparser.py`` anymore. 
Modified: lxml/trunk/src/lxml/readonlytree.pxi ============================================================================== --- lxml/trunk/src/lxml/readonlytree.pxi (original) +++ lxml/trunk/src/lxml/readonlytree.pxi Sun Jan 24 19:36:04 2010 @@ -307,7 +307,6 @@ self._assertNode() return _collectAttributes(self._c_node, 3) - cdef extern from "etree_defs.h": # macro call to 't->tp_new()' for fast instantiation cdef _ReadOnlyProxy NEW_RO_PROXY "PY_NEW" (object t) @@ -352,6 +351,62 @@ tree.xmlFreeNode(c_node) del sourceProxy._dependent_proxies[:] +# opaque wrapper around non-element nodes, e.g. the document node +# +# This class does not imply any restrictions on modifiability or +# read-only status of the node, so use with caution. + +cdef class _OpaqueNodeWrapper: + cdef tree.xmlNode* _c_node + def __init__(self): + raise TypeError, u"This type cannot be instatiated from Python" + +cdef class _OpaqueDocumentWrapper(_OpaqueNodeWrapper): + cdef int _assertNode(self) except -1: + u"""This is our way of saying: this proxy is invalid! + """ + assert self._c_node is not NULL, u"Proxy invalidated!" + return 0 + + cpdef append(self, other_element): + u"""Append a copy of an Element to the list of children. + """ + cdef xmlNode* c_next + cdef xmlNode* c_node + self._assertNode() + c_node = _roNodeOf(other_element) + if c_node.type == tree.XML_ELEMENT_NODE: + if tree.xmlDocGetRootElement(self._c_node) is not NULL: + raise ValueError, u"cannot append, document already has a root element" + elif c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE): + raise TypeError, u"unsupported element type for top-level node: %d" % c_node.type + c_node = _copyNodeToDoc(c_node, self._c_node) + c_next = c_node.next + tree.xmlAddChild(self._c_node, c_node) + _moveTail(c_next, c_node) + + def extend(self, elements): + u"""Append a copy of all Elements from a sequence to the list of + children. 
+ """ + self._assertNode() + for element in elements: + self.append(element) + +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _OpaqueNodeWrapper NEW_OPAQUE_NODE_PROXY "PY_NEW" (object t) + +cdef _OpaqueNodeWrapper _newOpaqueAppendOnlyNodeWrapper(xmlNode* c_node): + cdef _OpaqueNodeWrapper node + if c_node.type in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE): + node = NEW_OPAQUE_NODE_PROXY(_OpaqueDocumentWrapper) + else: + node = NEW_OPAQUE_NODE_PROXY(_OpaqueNodeWrapper) + node._c_node = c_node + return node + +# element proxies that allow restricted modification cdef class _ModifyContentOnlyProxy(_ReadOnlyProxy): u"""A read-only proxy that allows changing the text content. @@ -453,6 +508,8 @@ el = NEW_RO_PROXY(_ModifyContentOnlyPIProxy) elif c_node.type == tree.XML_COMMENT_NODE: el = NEW_RO_PROXY(_ModifyContentOnlyProxy) + else: + raise TypeError("Unsupported element type: %d" % c_node.type) el._c_node = c_node _initReadOnlyProxy(el, source_proxy) return el @@ -463,8 +520,25 @@ c_node = (<_Element>element)._c_node elif isinstance(element, _ReadOnlyProxy): c_node = (<_ReadOnlyProxy>element)._c_node + elif isinstance(element, _OpaqueNodeWrapper): + c_node = (<_OpaqueNodeWrapper>element)._c_node + else: + raise TypeError, u"invalid argument type %s" % type(element) + + if c_node is NULL: + raise TypeError, u"invalid element" + return c_node + +cdef xmlNode* _nonRoNodeOf(element) except NULL: + cdef xmlNode* c_node + if isinstance(element, _Element): + c_node = (<_Element>element)._c_node + elif isinstance(element, _AppendOnlyElementProxy): + c_node = (<_AppendOnlyElementProxy>element)._c_node + elif isinstance(element, _OpaqueNodeWrapper): + c_node = (<_OpaqueNodeWrapper>element)._c_node else: - raise TypeError, u"invalid value to append()" + raise TypeError, u"invalid argument type %s" % type(element) if c_node is NULL: raise TypeError, u"invalid element" Modified: lxml/trunk/src/lxml/tests/test_xslt.py 
============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Sun Jan 24 19:36:04 2010 @@ -1473,6 +1473,168 @@ self.assertEquals(self._rootstring(result), _bytes('YXYZ')) + def test_extension_element_apply_templates_target_node(self): + tree = self.parse('B') + style = self.parse('''\ + + + XY + + + XYZ +''') + + class MyExt(etree.XSLTExtension): + def execute(self, context, self_node, input_node, output_parent): + for child in self_node: + self.apply_templates(context, child, output_parent) + + extensions = { ('testns', 'myext') : MyExt() } + + result = tree.xslt(style, extensions=extensions) + self.assertEquals(self._rootstring(result), + _bytes('YXYZ')) + + def test_extension_element_apply_templates_target_node_doc(self): + tree = self.parse('B') + style = self.parse('''\ + + + XY + + TEST + XYZ + TEST +''') + + class MyExt(etree.XSLTExtension): + def execute(self, context, self_node, input_node, output_parent): + for child in self_node: + self.apply_templates(context, child, output_parent) + + extensions = { ('testns', 'myext') : MyExt() } + + result = tree.xslt(style, extensions=extensions) + self.assertEquals(etree.tostring(result), + _bytes('XYZ')) + + def test_extension_element_process_children(self): + tree = self.parse('E') + style = self.parse('''\ + + + yo + + + + + + + + + + + + + + +''') + + class MyExt(etree.XSLTExtension): + def execute(self, context, self_node, input_node, output_parent): + el = etree.Element('MY') + self.process_children(context, el) + output_parent.append(el) + + extensions = { ('testns', 'myext') : MyExt() } + + result = tree.xslt(style, extensions=extensions) + self.assertEquals(self._rootstring(result), + _bytes('E')) + + def test_extension_element_process_children_to_append_only(self): + tree = self.parse('') + style = self.parse('''\ + + + + + + +''') + + class MyExt(etree.XSLTExtension): + def execute(self, 
context, self_node, input_node, output_parent): + self.process_children(context, output_parent) + + extensions = { ('testns', 'myext') : MyExt() } + + result = tree.xslt(style, extensions=extensions) + self.assertEquals(self._rootstring(result), + _bytes('')) + + def test_extension_element_process_children_to_read_only_raise(self): + tree = self.parse('') + style = self.parse('''\ + + + + + + +''') + + class MyExt(etree.XSLTExtension): + def execute(self, context, self_node, input_node, output_parent): + self.process_children(context, self_node) + + extensions = { ('testns', 'myext') : MyExt() } + + self.assertRaises(TypeError, tree.xslt, style, extensions=extensions) + + def test_extension_element_process_children_with_subextension_element(self): + tree = self.parse('') + style = self.parse('''\ + + + + + + +''') + + class MyExt(etree.XSLTExtension): + callback_call_counter = 0 + def execute(self, context, self_node, input_node, output_parent): + self.callback_call_counter += 1 + el = etree.Element('MY', n=str(self.callback_call_counter)) + self.process_children(context, el) + output_parent.append(el) + + extensions = { ('testns', 'myext') : MyExt() } + + result = tree.xslt(style, extensions=extensions) + self.assertEquals(self._rootstring(result), + _bytes('')) + def test_extension_element_raise(self): tree = self.parse('B') style = self.parse('''\ Modified: lxml/trunk/src/lxml/xslt.pxd ============================================================================== --- lxml/trunk/src/lxml/xslt.pxd (original) +++ lxml/trunk/src/lxml/xslt.pxd Sun Jan 24 19:36:04 2010 @@ -30,10 +30,13 @@ xmlNode* node xmlDoc* output xmlNode* insert + xmlNode* inst xsltTransformState state ctypedef struct xsltStackElem + ctypedef struct xsltTemplate + cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) nogil cdef void xsltFreeStylesheet(xsltStylesheet* sheet) nogil @@ -84,6 +87,10 @@ cdef xsltTransformContext* xsltNewTransformContext(xsltStylesheet* style, xmlDoc* doc) nogil cdef 
void xsltFreeTransformContext(xsltTransformContext* context) nogil + cdef void xsltApplyOneTemplate(xsltTransformContext* ctxt, + xmlNode* contextNode, xmlNode* list, + xsltTemplate* templ, + xsltStackElem* params) nogil cdef extern from "libxslt/xsltutils.h": cdef int xsltSaveResultToString(char** doc_txt_ptr, Modified: lxml/trunk/src/lxml/xsltext.pxi ============================================================================== --- lxml/trunk/src/lxml/xsltext.pxi (original) +++ lxml/trunk/src/lxml/xsltext.pxi Sun Jan 24 19:36:04 2010 @@ -12,30 +12,40 @@ content. To this end, the `input_node` provides read-only access to the current node in the input document, and the `self_node` points to the extension element in the stylesheet. + + Note that the `output_parent` parameter may be `None` if there + is no parent element in the current context (e.g. no content + was added to the output tree yet). """ pass - def apply_templates(self, _XSLTContext context not None, node): - u"""apply_templates(self, context, node) + def apply_templates(self, _XSLTContext context not None, node, output_parent=None): + u"""apply_templates(self, context, node, output_parent=None) Call this method to retrieve the result of applying templates to an element. The return value is a list of elements or text strings that were generated by the XSLT processor. + + If you pass an Element as `output_parent` parameter, the result + will instead be appended to the element (including attributes + etc.) and the return value will be `None`. This is a safe way + to generate content into the output document directly, without + having to take care of special values like text or attributes. 
""" cdef xmlNode* c_parent cdef xmlNode* c_node - cdef xmlNode* c_next cdef xmlNode* c_context_node - cdef _ReadOnlyProxy proxy - cdef list results c_context_node = _roNodeOf(node) #assert c_context_node.doc is context._xsltContext.node.doc, \ # "switching input documents during transformation is not currently supported" - c_parent = tree.xmlNewDocNode( - context._xsltCtxt.output, NULL, "fake-parent", NULL) + if output_parent is not None: + c_parent = _nonRoNodeOf(output_parent) + else: + c_parent = tree.xmlNewDocNode( + context._xsltCtxt.output, NULL, "fake-parent", NULL) c_node = context._xsltCtxt.insert context._xsltCtxt.insert = c_parent @@ -43,27 +53,79 @@ context._xsltCtxt, c_context_node, NULL) context._xsltCtxt.insert = c_node - results = [] # or maybe _collectAttributes(c_parent, 2) ? - c_node = c_parent.children + if output_parent is not None: + return None + try: - while c_node is not NULL: - c_next = c_node.next - if c_node.type == tree.XML_TEXT_NODE: - results.append(funicode(c_node.content)) - elif c_node.type == tree.XML_ELEMENT_NODE: - proxy = _newReadOnlyProxy( - context._extension_element_proxy, c_node) - results.append(proxy) - # unlink node and make sure it will be freed later on - tree.xmlUnlinkNode(c_node) - proxy.free_after_use() - else: - raise TypeError, \ - u"unsupported XSLT result type: %d" % c_node.type - c_node = c_next + return self._collectXSLTResultContent(context, c_parent) + finally: + # free all intermediate nodes that will not be freed by proxies + tree.xmlFreeNode(c_parent) + + def process_children(self, _XSLTContext context not None, output_parent=None): + u"""process_children(self, context, output_parent=None) + + Call this method to process the XSLT content of the extension + element itself. + + The return value is a list of elements or text strings that + were generated by the XSLT processor. 
+ + If you pass an Element as `output_parent` parameter, the result + will instead be appended to the element (including attributes + etc.) and the return value will be `None`. This is a safe way + to generate content into the output document directly, without + having to take care of special values like text or attributes. + """ + cdef xmlNode* c_parent + cdef xslt.xsltTransformContext* c_ctxt = context._xsltCtxt + cdef xmlNode* c_old_output_parent = c_ctxt.insert + + # output_parent node is used for adding results instead of + # elements list used in apply_templates, that's easier and allows to + # use attributes added to extension element with . + + if output_parent is not None: + c_parent = _nonRoNodeOf(output_parent) + else: + c_parent = tree.xmlNewDocNode( + context._xsltCtxt.output, NULL, "fake-parent", NULL) + + c_ctxt.insert = _nonRoNodeOf(output_parent) + xslt.xsltApplyOneTemplate(c_ctxt, + c_ctxt.node, c_ctxt.inst.children, NULL, NULL) + c_ctxt.insert = c_old_output_parent + + if output_parent is not None: + return None + + try: + return self._collectXSLTResultContent(context, c_parent) finally: # free all intermediate nodes that will not be freed by proxies tree.xmlFreeNode(c_parent) + + cdef _collectXSLTResultContent(self, _XSLTContext context, xmlNode* c_parent): + cdef xmlNode* c_node + cdef xmlNode* c_next + cdef _ReadOnlyProxy proxy + cdef list results = [] # or maybe _collectAttributes(c_parent, 2) ? 
+ c_node = c_parent.children + while c_node is not NULL: + c_next = c_node.next + if c_node.type == tree.XML_TEXT_NODE: + results.append(funicode(c_node.content)) + elif c_node.type == tree.XML_ELEMENT_NODE: + proxy = _newReadOnlyProxy( + context._extension_element_proxy, c_node) + results.append(proxy) + # unlink node and make sure it will be freed later on + tree.xmlUnlinkNode(c_node) + proxy.free_after_use() + else: + raise TypeError, \ + u"unsupported XSLT result type: %d" % c_node.type + c_node = c_next return results @@ -82,7 +144,8 @@ cdef python.PyObject* dict_result cdef char* c_uri cdef xmlNode* c_node - cdef _ReadOnlyProxy context_node = None, self_node = None, output_parent + cdef _ReadOnlyProxy context_node = None, self_node = None + cdef object output_parent # not restricted to ro-nodes c_uri = _getNs(c_inst_node) if c_uri is NULL: # not allowed, and should never happen @@ -103,7 +166,11 @@ try: # build the context proxy nodes self_node = _newReadOnlyProxy(None, c_inst_node) - output_parent = _newAppendOnlyProxy(self_node, c_ctxt.insert) + if _isElement(c_ctxt.insert): + output_parent = _newAppendOnlyProxy(self_node, c_ctxt.insert) + else: + # may be the document node or other stuff + output_parent = _newOpaqueAppendOnlyNodeWrapper(c_ctxt.insert) if c_context_node.type in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE): c_node = tree.xmlDocGetRootElement(c_context_node) From scoder at codespeak.net Mon Jan 25 10:26:59 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jan 2010 10:26:59 +0100 (CET) Subject: [Lxml-checkins] r70814 - lxml/trunk Message-ID: <20100125092659.C54751680F7@codespeak.net> Author: scoder Date: Mon Jan 25 10:26:58 2010 New Revision: 70814 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5413 at lenny: sbehnel | 2010-01-25 10:15:10 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt 
(original) +++ lxml/trunk/CHANGES.txt Mon Jan 25 10:26:58 2010 @@ -14,6 +14,12 @@ * ``XSLTExtension.process_children()`` to process the content of the XSLT extension element itself. +* ISO-Schematron support based on the de-facto Schematron reference + 'skeleton implementation'. + +* XSLT objects now take XPath object as ``__call__`` stylesheet + parameters. + * Enable path caching in ElementPath (``el.find*()``) to avoid parsing overhead. @@ -40,11 +46,6 @@ * Target parsers show their target object in the ``.target`` property (compatible with ElementTree). -* XSLT objects now take XPath object as __call__ stylesheet parameters - -* ISO-Schematron support based on the de-facto Schematron reference - 'skeleton implementation' - Bugs fixed ---------- From scoder at codespeak.net Mon Jan 25 10:27:03 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jan 2010 10:27:03 +0100 (CET) Subject: [Lxml-checkins] r70815 - in lxml/trunk: . src/lxml Message-ID: <20100125092703.DC84E1680F8@codespeak.net> Author: scoder Date: Mon Jan 25 10:27:02 2010 New Revision: 70815 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx Log: r5414 at lenny: sbehnel | 2010-01-25 10:26:52 +0100 docstring updates Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Mon Jan 25 10:27:02 2010 @@ -2604,7 +2604,7 @@ compatible encoding will enable a declaration by default. You can also serialise to a Unicode string without declaration by - passing the ``unicode`` function as encoding. + passing the ``unicode`` function as encoding (or ``str`` in Py3). The keyword argument 'pretty_print' (bool) enables formatted XML. @@ -2673,6 +2673,8 @@ Serialize an element to the Python unicode representation of its XML tree. + :deprecated: use ``tostring(el, encoding=unicode)`` instead. 
+ Note that the result does not carry an XML encoding declaration and is therefore not necessarily suited for serialization to byte streams without further treatment. @@ -2685,8 +2687,6 @@ You can prevent the tail text of the element from being serialised by passing the boolean ``with_tail`` option. This has no impact on the tail text of children, which will always be serialised. - - :deprecated: use ``tostring(el, encoding=unicode)`` instead. """ if isinstance(element_or_tree, _Element): return _tostring(<_Element>element_or_tree, _unicode, method, @@ -2705,6 +2705,20 @@ Return an ElementTree object loaded with source elements. If no parser is provided as second argument, the default parser is used. + The ``source`` can be any of the following: + + - a file name/path + - a file object + - a file-like object + - a URL using the HTTP or FTP protocol + + To parse from a string, use the ``fromstring()`` function instead. + + Note that it is generally faster to parse from a file path or URL + than from an open file object or file-like object. Transparent + decompression from gzip compressed sources is supported (unless + explicitly disabled in libxml2). + The ``base_url`` keyword allows setting a URL for the document when parsing from a file-like object. This is needed when looking up external entities (DTD, XInclude, ...) with relative paths. From scoder at codespeak.net Mon Jan 25 10:33:08 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jan 2010 10:33:08 +0100 (CET) Subject: [Lxml-checkins] r70816 - in lxml/trunk: . 
doc Message-ID: <20100125093308.6182B1680F4@codespeak.net> Author: scoder Date: Mon Jan 25 10:33:07 2010 New Revision: 70816 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/docstructure.py Log: r5417 at lenny: sbehnel | 2010-01-25 10:32:52 +0100 removed 'what's new in 2.0' page Modified: lxml/trunk/doc/docstructure.py ============================================================================== --- lxml/trunk/doc/docstructure.py (original) +++ lxml/trunk/doc/docstructure.py Mon Jan 25 10:33:07 2010 @@ -1,6 +1,6 @@ SITE_STRUCTURE = [ - ('lxml', ('main.txt', 'intro.txt', '../INSTALL.txt', 'lxml2.txt', + ('lxml', ('main.txt', 'intro.txt', '../INSTALL.txt', # 'lxml2.txt', 'performance.txt', 'compatibility.txt', 'FAQ.txt')), ('Developing with lxml', ('tutorial.txt', '@API reference', 'api.txt', 'parsing.txt', From scoder at codespeak.net Mon Jan 25 10:35:39 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jan 2010 10:35:39 +0100 (CET) Subject: [Lxml-checkins] r70817 - in lxml/trunk: . 
src/lxml/isoschematron Message-ID: <20100125093539.DAF291680F7@codespeak.net> Author: scoder Date: Mon Jan 25 10:35:39 2010 New Revision: 70817 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/isoschematron/__init__.py Log: r5419 at lenny: sbehnel | 2010-01-25 10:35:25 +0100 docstring fix Modified: lxml/trunk/src/lxml/isoschematron/__init__.py ============================================================================== --- lxml/trunk/src/lxml/isoschematron/__init__.py (original) +++ lxml/trunk/src/lxml/isoschematron/__init__.py Mon Jan 25 10:35:39 2010 @@ -114,10 +114,12 @@ Built on the Schematron language 'reference' skeleton pure-xslt implementation, the validator is created as an XSLT 1.0 stylesheet using these steps: - 0) (Extract from XML Schema or RelaxNG schema) - 1) Process inclusions - 2) Process abstract patterns - 3) Compile the schematron schema to XSLT + + 0) (Extract from XML Schema or RelaxNG schema) + 1) Process inclusions + 2) Process abstract patterns + 3) Compile the schematron schema to XSLT + The ``include`` and ``expand`` keyword arguments can be used to switch off steps 1) and 2). To set parameters for steps 1), 2) and 3) hand dictionaries containing xslt From scoder at codespeak.net Wed Jan 27 18:10:46 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 27 Jan 2010 18:10:46 +0100 (CET) Subject: [Lxml-checkins] r70928 - in lxml/trunk: . 
doc Message-ID: <20100127171046.B07C5168075@codespeak.net> Author: scoder Date: Wed Jan 27 18:10:43 2010 New Revision: 70928 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r5421 at lenny: sbehnel | 2010-01-27 18:10:35 +0100 fix PDF building for docutils 0.6 Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Wed Jan 27 18:10:43 2010 @@ -43,8 +43,25 @@ DOCUMENT_CLASS = r""" \documentclass[10pt,english]{report} \usepackage[a4paper]{geometry} +\usepackage{tabularx} +\usepackage{ifthen} +\usepackage[pdftex]{graphicx} \parindent0pt \parskip1ex + +%%% Fallback definitions for Docutils-specific commands + +% providelength (provide a length variable and set default, if it is new) +\providecommand*{\DUprovidelength}[2]{ + \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{} +} + +% docinfo (width of docinfo table) +\DUprovidelength{\DUdocinfowidth}{0.9\textwidth} + +% titlereference role +\providecommand*{\DUroletitlereference}[1]{\textsl{#1}} + """ PYGMENTS_IMPORT = r""" @@ -86,7 +103,10 @@ doc = file(src, 'r') out = file(dest, "w") for line in doc: - if line.startswith('%% generator') or line.startswith('% generated by '): + if line.startswith('%% generator') \ + or line.startswith('% generated by ') \ + or '\\begin{document}' in line \ + or '\\makeindex' in line: break if line.startswith('%') or \ r'\documentclass' in line or \ @@ -107,11 +127,11 @@ counter_no = 0 -def tex_postprocess(src, dest, want_header = False, process_line=noop): +def tex_postprocess(src_path, dest_path, want_header=False, process_line=noop): """ Postprocessing of the LaTeX file generated from ReST. - Reads file src and saves to dest only the true content + Reads file src_path and saves to dest_path only the true content (without the document header and final) - so it is suitable to be used as part of the longer document. 
@@ -127,22 +147,27 @@ counter_no = counter_no + 1 counter_text = "listcnt%d" % counter_no - search_title = re.compile(r'\\title{([^}]*)}').search - skipping = re.compile(r'(\\end{document}|\\tableofcontents)').search + search_title = re.compile(r'\\title{([^{}]*(?:{[^}]*})*)}').search + skipping = re.compile(r'(\\end{document}|\\tableofcontents|^%)').search + + src = file(src_path) + dest = file(dest_path, "w") - src = file(src) - dest = file(dest, "w") + src_text = src.read() + src.close() - iter_lines = iter(src.readlines()) + title = search_title(src_text) + if title: + # remove any commands from the title + title = re.sub(r'\\\w+({[^}]*})?', '', title.group(1)) + + iter_lines = iter(src_text.splitlines()) for l in iter_lines: l = process_line(l) if not l: continue if want_header: add_header_line(replace_rst_macros('', l)) - m = search_title(l) - if m: - title = m.group(0) if l.startswith("\\maketitle"): break @@ -155,10 +180,10 @@ break elif "listcnt0" in l: l = l.replace("listcnt0", counter_text) - dest.write(l) + dest.write(l + '\n') if not title: - raise Exception("Bueee, no title") + raise Exception("Bueee, no title in %s" % src_path) return title, header def publish(dirname, lxml_path, release): @@ -205,7 +230,6 @@ return replace_docinternal_hyperrefs(r'\hyperref[\1]', line) # Building pages - have_epydoc_macros = False for section, text_files in SITE_STRUCTURE: for filename in text_files: if filename.startswith('@'): @@ -270,9 +294,7 @@ for hln in header: if hln.startswith(r"\documentclass"): #hln = hln.replace('article', 'book') - hln = DOCUMENT_CLASS - elif hln.startswith("%% generator ") or hln.startswith("% generated "): - master.write(EPYDOC_IMPORT) + hln = DOCUMENT_CLASS + EPYDOC_IMPORT elif hln.startswith(r"\begin{document}"): # pygments and epydoc support master.write(PYGMENTS_IMPORT) @@ -285,7 +307,7 @@ elif hln.startswith("pdftitle"): hln = replace_content( r'{%s}' % book_title, hln) - master.write(hln) + master.write(hln + '\n') 
master.write("\\setcounter{page}{2}\n") master.write("\\tableofcontents\n") From scoder at codespeak.net Thu Jan 28 16:37:01 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 28 Jan 2010 16:37:01 +0100 (CET) Subject: [Lxml-checkins] r70954 - in lxml/trunk: . src/lxml Message-ID: <20100128153701.50114168079@codespeak.net> Author: scoder Date: Thu Jan 28 16:37:00 2010 New Revision: 70954 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx Log: r5423 at lenny: sbehnel | 2010-01-28 16:36:53 +0100 code simplification, docstring clarification for el.nsmap Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Thu Jan 28 16:37:00 2010 @@ -887,6 +887,8 @@ # not in ElementTree, read-only property nsmap: u"""Namespace prefix->URI mapping known in the context of this Element. + + Note that changing the returned dict has no effect on the Element. 
""" def __get__(self): cdef xmlNode* c_node @@ -900,8 +902,7 @@ prefix = None else: prefix = funicode(c_ns.prefix) - if prefix not in nsmap: - nsmap[prefix] = funicode(c_ns.href) + nsmap[prefix] = funicode(c_ns.href) c_ns = c_ns.next c_node = c_node.parent return nsmap From scoder at codespeak.net Fri Jan 29 15:48:03 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jan 2010 15:48:03 +0100 (CET) Subject: [Lxml-checkins] r70972 - lxml/trunk/src/lxml Message-ID: <20100129144803.80B7F16810C@codespeak.net> Author: scoder Date: Fri Jan 29 15:48:02 2010 New Revision: 70972 Modified: lxml/trunk/src/lxml/lxml.etree.pyx Log: reverted last el.nsmap change, more docstring updates Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Jan 29 15:48:02 2010 @@ -886,7 +886,9 @@ # not in ElementTree, read-only property nsmap: - u"""Namespace prefix->URI mapping known in the context of this Element. + u"""Namespace prefix->URI mapping known in the context of this + Element. This includes all namespace declarations of the + parents. Note that changing the returned dict has no effect on the Element. """ @@ -902,7 +904,8 @@ prefix = None else: prefix = funicode(c_ns.prefix) - nsmap[prefix] = funicode(c_ns.href) + if prefix not in nsmap: + nsmap[prefix] = funicode(c_ns.href) c_ns = c_ns.next c_node = c_node.parent return nsmap From scoder at codespeak.net Fri Jan 29 16:17:06 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jan 2010 16:17:06 +0100 (CET) Subject: [Lxml-checkins] r70973 - in lxml/trunk: . 
src/lxml Message-ID: <20100129151706.D39FE1683D2@codespeak.net> Author: scoder Date: Fri Jan 29 16:17:06 2010 New Revision: 70973 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tree.pxd Log: r5425 at lenny: sbehnel | 2010-01-29 15:37:06 +0100 new serialisation constants Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri Jan 29 16:17:06 2010 @@ -296,6 +296,9 @@ XML_SAVE_NO_DECL = 2 # drop the xml declaration (2.6.21) XML_SAVE_NO_EMPTY = 4 # no empty tags (2.6.22) XML_SAVE_NO_XHTML = 8 # disable XHTML1 specific rules (2.6.22) + XML_SAVE_XHTML = 16 # force XHTML1 specific rules (2.7.2) + XML_SAVE_AS_XML = 32 # force XML serialization on HTML doc (2.7.2) + XML_SAVE_AS_HTML = 64 # force HTML serialization on XML doc (2.7.2) cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding, int options) nogil From scoder at codespeak.net Fri Jan 29 16:17:09 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jan 2010 16:17:09 +0100 (CET) Subject: [Lxml-checkins] r70974 - lxml/trunk Message-ID: <20100129151709.ABA2D1683D3@codespeak.net> Author: scoder Date: Fri Jan 29 16:17:09 2010 New Revision: 70974 Modified: lxml/trunk/ (props changed) lxml/trunk/TODO.txt Log: r5426 at lenny: sbehnel | 2010-01-29 15:37:25 +0100 todos Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Fri Jan 29 16:17:09 2010 @@ -8,8 +8,6 @@ In general ---------- -* test namespaces more in-depth - * more testing on multi-threading * better exception messages for XPath and schemas based on error log, @@ -26,6 +24,8 @@ * use per-call or per-thread error logs in XSLT/XPath/etc. 
to keep the messages separate, especially in exceptions +* add 'nsmap' parameter to cleanup_namespaces() + QName ----- From scoder at codespeak.net Fri Jan 29 16:17:20 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jan 2010 16:17:20 +0100 (CET) Subject: [Lxml-checkins] r70975 - in lxml/trunk: . src/lxml/tests Message-ID: <20100129151720.81D3C1683D2@codespeak.net> Author: scoder Date: Fri Jan 29 16:17:20 2010 New Revision: 70975 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_etree.py Log: r5429 at lenny: sbehnel | 2010-01-29 15:50:17 +0100 cleanup Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jan 29 16:17:20 2010 @@ -1993,12 +1993,8 @@ r = re.copy() r.update(rs) - self.assertEquals( - re, - e.nsmap) - self.assertEquals( - r, - s.nsmap) + self.assertEquals(re, e.nsmap) + self.assertEquals(r, s.nsmap) def test_getiterator_filter_namespace(self): Element = self.etree.Element From scoder at codespeak.net Fri Jan 29 16:17:24 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jan 2010 16:17:24 +0100 (CET) Subject: [Lxml-checkins] r70976 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20100129151724.94A761683D3@codespeak.net> Author: scoder Date: Fri Jan 29 16:17:24 2010 New Revision: 70976 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/api.txt lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5430 at lenny: sbehnel | 2010-01-29 16:15:53 +0100 support overriding the DOCTYPE on serialisation Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jan 29 16:17:24 2010 @@ -8,6 +8,9 @@ Features added -------------- +* On serialisation, the new ``doctype`` parameter can be used to + override the DOCTYPE (internal subset) of the document. + * New parameter ``output_parent`` to ``XSLTExtension.apply_templates()`` to append the resulting content directly to an output element. Modified: lxml/trunk/doc/api.txt ============================================================================== --- lxml/trunk/doc/api.txt (original) +++ lxml/trunk/doc/api.txt Fri Jan 29 16:17:24 2010 @@ -434,6 +434,27 @@ ... lxml.etree.XMLSyntaxError: ... +Since version 2.3, the serialisation can override the internal subset +of the document with a user provided DOCTYPE: + +.. sourcecode:: pycon + + >>> xml = '<!DOCTYPE root>\n<root/>' + >>> tree = etree.parse(StringIO(xml)) + + >>> print(etree.tostring(tree)) + <!DOCTYPE root> + <root/> + + >>> print(etree.tostring(tree, + ... doctype='<!DOCTYPE root SYSTEM "/tmp/test.dtd">')) + <!DOCTYPE root SYSTEM "/tmp/test.dtd"> + <root/> + +The content will be encoded, but otherwise copied verbatimly into the +output stream. It is therefore left to the user to take care for a +correct doctype format, including the name of the root node.
+ CDATA ----- Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Jan 29 16:17:24 2010 @@ -1659,7 +1659,7 @@ def write(self, file, *, encoding=None, method=u"xml", pretty_print=False, xml_declaration=None, with_tail=True, - standalone=None, compression=0): + standalone=None, docstring=None, compression=0): u"""write(self, file, encoding=None, method="xml", pretty_print=False, xml_declaration=None, with_tail=True, standalone=None, compression=0) @@ -1702,7 +1702,7 @@ is_standalone = 0 if compression is None or compression < 0: compression = 0 - _tofilelike(file, self._context_node, encoding, method, + _tofilelike(file, self._context_node, encoding, docstring, method, write_declaration, 1, pretty_print, with_tail, is_standalone, compression) @@ -2594,10 +2594,10 @@ def tostring(element_or_tree, *, encoding=None, method=u"xml", xml_declaration=None, pretty_print=False, with_tail=True, - standalone=None): + standalone=None, doctype=None): u"""tostring(element_or_tree, encoding=None, method="xml", xml_declaration=None, pretty_print=False, with_tail=True, - standalone=None) + standalone=None, doctype=None) Serialize an element to an encoded string representation of its XML tree. @@ -2618,6 +2618,10 @@ Passing a boolean value to the ``standalone`` option will output an XML declaration with the corresponding ``standalone`` flag. + The ``doctype`` option allows passing in a plain string that will + be serialised before the XML tree. Note that passing in non + well-formed content here will make the XML output non well-formed. + You can prevent the tail text of the element from being serialised by passing the boolean ``with_tail`` option. This has no impact on the tail text of children, which will always be serialised. 
@@ -2647,13 +2651,13 @@ is_standalone = 0 if isinstance(element_or_tree, _Element): - return _tostring(<_Element>element_or_tree, encoding, method, + return _tostring(<_Element>element_or_tree, encoding, doctype, method, write_declaration, 0, pretty_print, with_tail, is_standalone) elif isinstance(element_or_tree, _ElementTree): return _tostring((<_ElementTree>element_or_tree)._context_node, - encoding, method, write_declaration, 1, pretty_print, - with_tail, is_standalone) + encoding, doctype, method, write_declaration, 1, + pretty_print, with_tail, is_standalone) else: raise TypeError, u"Type '%s' cannot be serialized." % \ python._fqtypename(element_or_tree) @@ -2670,9 +2674,9 @@ return [tostring(element_or_tree, *args, **kwargs)] def tounicode(element_or_tree, *, method=u"xml", pretty_print=False, - with_tail=True): + with_tail=True, doctype=None): u"""tounicode(element_or_tree, method="xml", pretty_print=False, - with_tail=True) + with_tail=True, doctype=None) Serialize an element to the Python unicode representation of its XML tree. @@ -2693,12 +2697,12 @@ on the tail text of children, which will always be serialised. """ if isinstance(element_or_tree, _Element): - return _tostring(<_Element>element_or_tree, _unicode, method, + return _tostring(<_Element>element_or_tree, _unicode, doctype, method, 0, 0, pretty_print, with_tail, -1) elif isinstance(element_or_tree, _ElementTree): return _tostring((<_ElementTree>element_or_tree)._context_node, - _unicode, method, 0, 1, pretty_print, with_tail, - -1) + _unicode, doctype, method, 0, 1, pretty_print, + with_tail, -1) else: raise TypeError, u"Type '%s' cannot be serialized." 
% \ type(element_or_tree) Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Fri Jan 29 16:17:24 2010 @@ -73,7 +73,7 @@ return text -cdef _tostring(_Element element, encoding, method, +cdef _tostring(_Element element, encoding, doctype, method, bint write_xml_declaration, bint write_complete_document, bint pretty_print, bint with_tail, int standalone): u"""Serialize an element to an encoded string representation of its XML @@ -84,6 +84,7 @@ cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc cdef char* c_version + cdef char* c_doctype cdef int c_method cdef int error_result if element is None: @@ -96,6 +97,11 @@ else: encoding = _utf8(encoding) c_enc = _cstr(encoding) + if doctype is None: + c_doctype = NULL + else: + doctype = _utf8(doctype) + c_doctype = _cstr(doctype) # it is necessary to *and* find the encoding handler *and* use # encoding during output enchandler = tree.xmlFindCharEncodingHandler(c_enc) @@ -109,7 +115,7 @@ return python.PyErr_NoMemory() with nogil: - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_doctype, c_method, write_xml_declaration, write_complete_document, pretty_print, with_tail, standalone) tree.xmlOutputBufferFlush(c_buffer) @@ -149,8 +155,8 @@ # low-level serialisation functions cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer, - xmlNode* c_node, char* encoding, int c_method, - bint write_xml_declaration, + xmlNode* c_node, char* encoding, char* c_doctype, + int c_method, bint write_xml_declaration, bint write_complete_document, bint pretty_print, bint with_tail, int standalone) nogil: @@ -160,9 +166,14 @@ if write_xml_declaration and c_method == OUTPUT_METHOD_XML: _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding, standalone) + if c_doctype is not NULL: + 
tree.xmlOutputBufferWrite(c_buffer, cstd.strlen(c_doctype), c_doctype) + tree.xmlOutputBufferWriteString(c_buffer, "\n") + # write internal DTD subset, preceding PIs/comments, etc. if write_complete_document: - _writeDtdToBuffer(c_buffer, c_doc, c_node.name, encoding) + if c_doctype is NULL: + _writeDtdToBuffer(c_buffer, c_doc, c_node.name, encoding) _writePrevSiblings(c_buffer, c_node, encoding, pretty_print) c_nsdecl_node = c_node @@ -180,12 +191,12 @@ c_nsdecl_node.last = c_node.last # write node - if c_method == OUTPUT_METHOD_XML: - tree.xmlNodeDumpOutput( - c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding) - else: + if c_method == OUTPUT_METHOD_HTML: tree.htmlNodeDumpFormatOutput( c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print) + else: + tree.xmlNodeDumpOutput( + c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding) if c_nsdecl_node is not c_node: # clean up @@ -353,7 +364,7 @@ cdef int _closeFilelikeWriter(void* ctxt): return (<_FilelikeWriter>ctxt).close() -cdef _tofilelike(f, _Element element, encoding, method, +cdef _tofilelike(f, _Element element, encoding, doctype, method, bint write_xml_declaration, bint write_doctype, bint pretty_print, bint with_tail, int standalone, int compression): @@ -362,12 +373,18 @@ cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc + cdef char* c_doctype cdef int error_result if encoding is None: c_enc = NULL else: encoding = _utf8(encoding) c_enc = _cstr(encoding) + if doctype is None: + c_doctype = NULL + else: + doctype = _utf8(doctype) + c_doctype = _cstr(doctype) c_method = _findOutputMethod(method) if c_method == OUTPUT_METHOD_TEXT: data = _textToString(element._c_node, encoding, with_tail) @@ -411,7 +428,7 @@ raise TypeError, \ u"File or filename expected, got '%s'" % funicode(python._fqtypename(f)) - _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, + _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_doctype, c_method, 
write_xml_declaration, write_doctype, pretty_print, with_tail, standalone) error_result = c_buffer.error Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jan 29 16:17:24 2010 @@ -2439,6 +2439,17 @@ tree = etree.parse(BytesIO(xml)) self.assertEquals(xml, etree.tostring(tree)) + def test_doctype_output_override(self): + etree = self.etree + pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN" + sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" + doctype_string = _bytes('<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_id)) + + xml = _bytes('<!DOCTYPE root>\n<root/>') + tree = etree.parse(BytesIO(xml)) + self.assertEquals(xml.replace('<!DOCTYPE root>', doctype_string), + etree.tostring(tree, doctype=doctype_string)) + def test_xml_base(self): etree = self.etree root = etree.XML(_bytes("<root/>"), base_url="http://no/such/url") From scoder at codespeak.net Sat Jan 30 20:41:17 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 20:41:17 +0100 (CET) Subject: [Lxml-checkins] r70995 - in lxml/trunk: .
src/lxml Message-ID: <20100130194117.D7E21168078@codespeak.net> Author: scoder Date: Sat Jan 30 20:41:17 2010 New Revision: 70995 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi Log: r5435 at lenny: sbehnel | 2010-01-30 17:55:15 +0100 comment Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Sat Jan 30 20:41:17 2010 @@ -466,6 +466,7 @@ if python.PyUnicode_Check(obj): obj = _utf8(obj) if python.PyBytes_Check(obj): + # libxml2 copies the string value return xpath.xmlXPathNewCString(_cstr(obj)) if python.PyBool_Check(obj): return xpath.xmlXPathNewBoolean(obj) From scoder at codespeak.net Sat Jan 30 20:41:21 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 20:41:21 +0100 (CET) Subject: [Lxml-checkins] r70996 - in lxml/trunk: . src/lxml Message-ID: <20100130194121.36E921680AB@codespeak.net> Author: scoder Date: Sat Jan 30 20:41:20 2010 New Revision: 70996 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/classlookup.pxi Log: r5436 at lenny: sbehnel | 2010-01-30 20:41:08 +0100 fix crash when instantiating custom element classes from lxml.objectify Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Jan 30 20:41:20 2010 @@ -52,6 +52,9 @@ Bugs fixed ---------- +* Manually instantiating the custom element classes in + ``lxml.objectify`` could crash. + * Using XSLT extension elements around the root of the output document crashed. 
Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Sat Jan 30 20:41:20 2010 @@ -45,20 +45,22 @@ cdef bint is_html = 0 cdef _BaseParser parser cdef _Element last_child + # don't use normal attribute access as it might be overridden + _getattr = object.__getattribute__ try: - namespace = _utf8(self.NAMESPACE) + namespace = _utf8(_getattr(self, 'NAMESPACE')) except AttributeError: namespace = None try: - ns, tag = _getNsTag(self.TAG) + ns, tag = _getNsTag(_getattr(self, 'TAG')) if ns is not None: namespace = ns except AttributeError: - tag = _utf8(self.__class__.__name__) - if '.' in tag: - tag = tag.split('.')[-1] + tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__')) + if b'.' in tag: + tag = tag.split(b'.')[-1] try: - parser = self.PARSER + parser = _getattr(self, 'PARSER') except AttributeError: parser = None for child in children: @@ -69,7 +71,7 @@ is_html = 1 if namespace is None: try: - is_html = self.HTML + is_html = _getattr(self, 'HTML') except AttributeError: pass _initNewElement(self, is_html, tag, namespace, parser, From scoder at codespeak.net Sat Jan 30 20:47:09 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 20:47:09 +0100 (CET) Subject: [Lxml-checkins] r70997 - in lxml/branch/lxml-2.2: . 
src/lxml Message-ID: <20100130194709.14408168078@codespeak.net> Author: scoder Date: Sat Jan 30 20:47:08 2010 New Revision: 70997 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/INSTALL.txt (props changed) lxml/branch/lxml-2.2/src/lxml/classlookup.pxi Log: trunk merge: fix crash when instantiating custom element classes from lxml.objectify Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Sat Jan 30 20:47:08 2010 @@ -13,6 +13,8 @@ Bugs fixed ---------- +* Manually instantiating the custom element classes in + ``lxml.objectify`` could crash. * lxml.html.open_http_urllib() did not work in Python 3. Modified: lxml/branch/lxml-2.2/src/lxml/classlookup.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/classlookup.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/classlookup.pxi Sat Jan 30 20:47:08 2010 @@ -45,20 +45,22 @@ cdef bint is_html = 0 cdef _BaseParser parser cdef _Element last_child + # don't use normal attribute access as it might be overridden + _getattr = object.__getattribute__ try: - namespace = _utf8(self.NAMESPACE) + namespace = _utf8(_getattr(self, 'NAMESPACE')) except AttributeError: namespace = None try: - ns, tag = _getNsTag(self.TAG) + ns, tag = _getNsTag(_getattr(self, 'TAG')) if ns is not None: namespace = ns except AttributeError: - tag = _utf8(self.__class__.__name__) - if '.' in tag: - tag = tag.split('.')[-1] + tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__')) + if b'.' 
in tag: + tag = tag.split(b'.')[-1] try: - parser = self.PARSER + parser = _getattr(self, 'PARSER') except AttributeError: parser = None for child in children: @@ -69,7 +71,7 @@ is_html = 1 if namespace is None: try: - is_html = self.HTML + is_html = _getattr(self, 'HTML') except AttributeError: pass _initNewElement(self, is_html, tag, namespace, parser, From scoder at codespeak.net Sat Jan 30 21:02:36 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:02:36 +0100 (CET) Subject: [Lxml-checkins] r70998 - in lxml/branch/lxml-2.2: . doc src/lxml src/lxml/html src/lxml/tests Message-ID: <20100130200236.B3BC2318139@codespeak.net> Author: scoder Date: Sat Jan 30 21:02:33 2010 New Revision: 70998 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/INSTALL.txt (props changed) lxml/branch/lxml-2.2/doc/FAQ.txt lxml/branch/lxml-2.2/doc/build.txt lxml/branch/lxml-2.2/doc/mklatex.py lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi lxml/branch/lxml-2.2/src/lxml/cleanup.pxi lxml/branch/lxml-2.2/src/lxml/extensions.pxi lxml/branch/lxml-2.2/src/lxml/html/html5parser.py lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py Log: trunk merges up to rev 70996 Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Sat Jan 30 21:02:33 2010 @@ -13,9 +13,16 @@ Bugs fixed ---------- + +* Support recent versions of html5lib by not requiring its + ``XHTMLParser`` in ``htmlparser.py`` anymore. + * Manually instantiating the custom element classes in ``lxml.objectify`` could crash. +* Invalid XML text characters were not rejected by the API when they + appeared in unicode strings directly after non-ASCII characters. + * lxml.html.open_http_urllib() did not work in Python 3. 
* The functions ``strip_tags()`` and ``strip_elements()`` in Modified: lxml/branch/lxml-2.2/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/FAQ.txt (original) +++ lxml/branch/lxml-2.2/doc/FAQ.txt Sat Jan 30 21:02:33 2010 @@ -816,6 +816,22 @@ tree. If you now call a serialization function to pretty print this tree, lxml can add fresh whitespace to the XML tree to indent it. +Note that the ``remove_blank_text`` option also uses a heuristic if it +has no definite knowledge about the document's ignorable whitespace. +It will keep blank text nodes that appear after non-blank text nodes +at the same level. This is to prevent document-style XML from +breaking. + +If you want to be sure all blank text is removed, you have to use +either a DTD to tell the parser which whitespace it can safely ignore, +or remove the ignorable whitespace manually after parsing, e.g. by +setting all tail text to None:: + +.. sourcecode:: python + + for element in root.iter(): + element.tail = None + Fredrik Lundh also has a Python-level function for indenting XML by appending whitespace to tags. It can be found on his `element library`_ recipe page. Modified: lxml/branch/lxml-2.2/doc/build.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/build.txt (original) +++ lxml/branch/lxml-2.2/doc/build.txt Sat Jan 30 21:02:33 2010 @@ -43,7 +43,7 @@ *Only* if you are interested in building lxml from a Subversion checkout (e.g. to test a bug fix that has not been release yet) or if -want to be an lxml developer, then you do need a working Cython +you want to be an lxml developer, then you do need a working Cython installation. 
You can use EasyInstall_ to install it:: easy_install Cython==0.11 Modified: lxml/branch/lxml-2.2/doc/mklatex.py ============================================================================== --- lxml/branch/lxml-2.2/doc/mklatex.py (original) +++ lxml/branch/lxml-2.2/doc/mklatex.py Sat Jan 30 21:02:33 2010 @@ -43,8 +43,25 @@ DOCUMENT_CLASS = r""" \documentclass[10pt,english]{report} \usepackage[a4paper]{geometry} +\usepackage{tabularx} +\usepackage{ifthen} +\usepackage[pdftex]{graphicx} \parindent0pt \parskip1ex + +%%% Fallback definitions for Docutils-specific commands + +% providelength (provide a length variable and set default, if it is new) +\providecommand*{\DUprovidelength}[2]{ + \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{} +} + +% docinfo (width of docinfo table) +\DUprovidelength{\DUdocinfowidth}{0.9\textwidth} + +% titlereference role +\providecommand*{\DUroletitlereference}[1]{\textsl{#1}} + """ PYGMENTS_IMPORT = r""" @@ -86,7 +103,10 @@ doc = file(src, 'r') out = file(dest, "w") for line in doc: - if line.startswith('%% generator') or line.startswith('% generated by '): + if line.startswith('%% generator') \ + or line.startswith('% generated by ') \ + or '\\begin{document}' in line \ + or '\\makeindex' in line: break if line.startswith('%') or \ r'\documentclass' in line or \ @@ -107,11 +127,11 @@ counter_no = 0 -def tex_postprocess(src, dest, want_header = False, process_line=noop): +def tex_postprocess(src_path, dest_path, want_header=False, process_line=noop): """ Postprocessing of the LaTeX file generated from ReST. - Reads file src and saves to dest only the true content + Reads file src_path and saves to dest_path only the true content (without the document header and final) - so it is suitable to be used as part of the longer document. 
@@ -127,22 +147,27 @@ counter_no = counter_no + 1 counter_text = "listcnt%d" % counter_no - search_title = re.compile(r'\\title{([^}]*)}').search - skipping = re.compile(r'(\\end{document}|\\tableofcontents)').search + search_title = re.compile(r'\\title{([^{}]*(?:{[^}]*})*)}').search + skipping = re.compile(r'(\\end{document}|\\tableofcontents|^%)').search + + src = file(src_path) + dest = file(dest_path, "w") - src = file(src) - dest = file(dest, "w") + src_text = src.read() + src.close() - iter_lines = iter(src.readlines()) + title = search_title(src_text) + if title: + # remove any commands from the title + title = re.sub(r'\\\w+({[^}]*})?', '', title.group(1)) + + iter_lines = iter(src_text.splitlines()) for l in iter_lines: l = process_line(l) if not l: continue if want_header: add_header_line(replace_rst_macros('', l)) - m = search_title(l) - if m: - title = m.group(0) if l.startswith("\\maketitle"): break @@ -155,10 +180,10 @@ break elif "listcnt0" in l: l = l.replace("listcnt0", counter_text) - dest.write(l) + dest.write(l + '\n') if not title: - raise Exception("Bueee, no title") + raise Exception("Bueee, no title in %s" % src_path) return title, header def publish(dirname, lxml_path, release): @@ -205,7 +230,6 @@ return replace_docinternal_hyperrefs(r'\hyperref[\1]', line) # Building pages - have_epydoc_macros = False for section, text_files in SITE_STRUCTURE: for filename in text_files: if filename.startswith('@'): @@ -270,9 +294,7 @@ for hln in header: if hln.startswith(r"\documentclass"): #hln = hln.replace('article', 'book') - hln = DOCUMENT_CLASS - elif hln.startswith("%% generator ") or hln.startswith("% generated "): - master.write(EPYDOC_IMPORT) + hln = DOCUMENT_CLASS + EPYDOC_IMPORT elif hln.startswith(r"\begin{document}"): # pygments and epydoc support master.write(PYGMENTS_IMPORT) @@ -285,7 +307,7 @@ elif hln.startswith("pdftitle"): hln = replace_content( r'{%s}' % book_title, hln) - master.write(hln) + master.write(hln + '\n') 
master.write("\\setcounter{page}{2}\n") master.write("\\tableofcontents\n") Modified: lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi Sat Jan 30 21:02:33 2010 @@ -1250,11 +1250,11 @@ is_non_ascii = 0 while s < c_end: if s[0] & 0x80: - # skip the entire multi byte sequence - while s[0] & 0x80: + # skip over multi byte sequences + while s < c_end and s[0] & 0x80: s += 1 is_non_ascii = 1 - elif not tree.xmlIsChar_ch(s[0]): + if s < c_end and not tree.xmlIsChar_ch(s[0]): return -1 # invalid! s += 1 return is_non_ascii Modified: lxml/branch/lxml-2.2/src/lxml/cleanup.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/cleanup.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/cleanup.pxi Sat Jan 30 21:02:33 2010 @@ -57,7 +57,7 @@ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) if c_node.type == tree.XML_ELEMENT_NODE: if c_node.properties is not NULL: - for i in xrange(c_tag_count): + for i in range(c_tag_count): c_href = c_ns_tags[2*i] c_name = c_ns_tags[2*i+1] # must compare attributes manually to make sure we @@ -152,7 +152,7 @@ while c_child is not NULL: c_next = _nextElement(c_child) if c_child.type == tree.XML_ELEMENT_NODE: - for i in xrange(c_tag_count): + for i in range(c_tag_count): if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]): if not with_tail: tree.xmlUnlinkNode(c_child) @@ -241,7 +241,7 @@ c_child = _findChildForwards(c_node, 0) while c_child is not NULL: if c_child.type == tree.XML_ELEMENT_NODE: - for i in xrange(c_tag_count): + for i in range(c_tag_count): if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]): c_next = _findChildForwards(c_child, 0) or _nextElement(c_child) _replaceNodeByChildren(doc, c_child) Modified: lxml/branch/lxml-2.2/src/lxml/extensions.pxi 
============================================================================== --- lxml/branch/lxml-2.2/src/lxml/extensions.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/extensions.pxi Sat Jan 30 21:02:33 2010 @@ -448,10 +448,10 @@ return rexpc.sub(replacement, s, count) cdef _register_in_context(self, _BaseContext context): - ns = "http://exslt.org/regular-expressions" - context._addLocalExtensionFunction(ns, "test", self.test) - context._addLocalExtensionFunction(ns, "match", self.match) - context._addLocalExtensionFunction(ns, "replace", self.replace) + ns = b"http://exslt.org/regular-expressions" + context._addLocalExtensionFunction(ns, b"test", self.test) + context._addLocalExtensionFunction(ns, b"match", self.match) + context._addLocalExtensionFunction(ns, b"replace", self.replace) ################################################################################ Modified: lxml/branch/lxml-2.2/src/lxml/html/html5parser.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/html/html5parser.py (original) +++ lxml/branch/lxml-2.2/src/lxml/html/html5parser.py Sat Jan 30 21:02:33 2010 @@ -3,7 +3,7 @@ """ import urllib -from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser +from html5lib import HTMLParser as _HTMLParser from lxml import etree from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE from lxml.html._html5builder import TreeBuilder @@ -22,11 +22,18 @@ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder) -class XHTMLParser(_XHTMLParser): - """An html5lib XHTML Parser with lxml as tree.""" +try: + from html5lib import XHTMLParser as _XHTMLParser +except ImportError: + pass +else: + class XHTMLParser(_XHTMLParser): + """An html5lib XHTML Parser with lxml as tree.""" - def __init__(self, strict=False): - _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder) + def __init__(self, strict=False): + _XHTMLParser.__init__(self, strict=strict, 
tree=TreeBuilder) + + xhtml_parser = XHTMLParser() def _find_tag(tree, tag): @@ -161,4 +168,3 @@ html_parser = HTMLParser() -xhtml_parser = XHTMLParser() Modified: lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx (original) +++ lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx Sat Jan 30 21:02:33 2010 @@ -2582,7 +2582,7 @@ compatible encoding will enable a declaration by default. You can also serialise to a Unicode string without declaration by - passing the ``unicode`` function as encoding. + passing the ``unicode`` function as encoding (or ``str`` in Py3). The keyword argument 'pretty_print' (bool) enables formatted XML. @@ -2651,6 +2651,8 @@ Serialize an element to the Python unicode representation of its XML tree. + :deprecated: use ``tostring(el, encoding=unicode)`` instead. + Note that the result does not carry an XML encoding declaration and is therefore not necessarily suited for serialization to byte streams without further treatment. @@ -2663,8 +2665,6 @@ You can prevent the tail text of the element from being serialised by passing the boolean ``with_tail`` option. This has no impact on the tail text of children, which will always be serialised. - - :deprecated: use ``tostring(el, encoding=unicode)`` instead. """ if isinstance(element_or_tree, _Element): return _tostring(<_Element>element_or_tree, _unicode, method, @@ -2683,6 +2683,20 @@ Return an ElementTree object loaded with source elements. If no parser is provided as second argument, the default parser is used. + The ``source`` can be any of the following: + + - a file name/path + - a file object + - a file-like object + - a URL using the HTTP or FTP protocol + + To parse from a string, use the ``fromstring()`` function instead. + + Note that it is generally faster to parse from a file path or URL + than from an open file object or file-like object. 
Transparent + decompression from gzip compressed sources is supported (unless + explicitly disabled in libxml2). + The ``base_url`` keyword allows setting a URL for the document when parsing from a file-like object. This is needed when looking up external entities (DTD, XInclude, ...) with relative paths. Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py Sat Jan 30 21:02:33 2010 @@ -2495,6 +2495,25 @@ self.assertRaises(ValueError, Element, _str('ha\x02ho')) + def test_unicode_byte_invalid_sequence(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(ValueError, setattr, a, "text", + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, setattr, a, "text", + _str('ha\u1234\x02ho')) + + self.assertRaises(ValueError, setattr, a, "tail", + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, setattr, a, "tail", + _str('ha\u1234\x02ho')) + + self.assertRaises(ValueError, Element, + _str('ha\u1234\x07ho')) + self.assertRaises(ValueError, Element, + _str('ha\u1234\x02ho')) + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring From scoder at codespeak.net Sat Jan 30 21:06:56 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:06:56 +0100 (CET) Subject: [Lxml-checkins] r70999 - lxml/branch/lxml-2.2/src/lxml Message-ID: <20100130200656.21C97168078@codespeak.net> Author: scoder Date: Sat Jan 30 21:06:52 2010 New Revision: 70999 Modified: lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx Log: build fix Modified: lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx (original) +++ lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx Sat Jan 30 
21:06:52 2010 @@ -1595,11 +1595,11 @@ _copyNonElementSiblings(self._context_node._c_node, root._c_node) doc = root._doc c_doc = self._context_node._doc._c_doc - if c_doc.intSubset and not doc._c_doc.intSubset: + if c_doc.intSubset is not NULL and doc._c_doc.intSubset is NULL: doc._c_doc.intSubset = tree.xmlCopyDtd(c_doc.intSubset) if doc._c_doc.intSubset is NULL: python.PyErr_NoMemory() - if c_doc.extSubset and not doc._c_doc.extSubset: + if c_doc.extSubset is not NULL and doc._c_doc.extSubset is NULL: doc._c_doc.extSubset = tree.xmlCopyDtd(c_doc.extSubset) if doc._c_doc.extSubset is NULL: python.PyErr_NoMemory() From scoder at codespeak.net Sat Jan 30 21:39:49 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:39:49 +0100 (CET) Subject: [Lxml-checkins] r71000 - in lxml/trunk: . src/lxml Message-ID: <20100130203949.142C71680AB@codespeak.net> Author: scoder Date: Sat Jan 30 21:39:48 2010 New Revision: 71000 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/parser.pxi Log: r5439 at lenny: sbehnel | 2010-01-30 21:10:56 +0100 free the GIL while parsing chunks Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat Jan 30 21:39:48 2010 @@ -1082,17 +1082,20 @@ py_buffer_len -= buffer_len c_data += buffer_len - while (recover or error == 0) and py_buffer_len > 0: - if py_buffer_len > python.INT_MAX: - buffer_len = python.INT_MAX - else: - buffer_len = py_buffer_len - if self._for_html: - error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) - else: - error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) - py_buffer_len -= buffer_len - c_data += buffer_len + #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding + + while py_buffer_len > 0 and (error == 0 or recover): + with nogil: + if py_buffer_len > python.INT_MAX: + buffer_len = python.INT_MAX + 
else: + buffer_len = py_buffer_len + if self._for_html: + error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) + else: + error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) + py_buffer_len -= buffer_len + c_data += buffer_len if error and not pctxt.replaceEntities and not pctxt.validate: # in this mode, we ignore errors about undefined entities From scoder at codespeak.net Sat Jan 30 21:39:55 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:39:55 +0100 (CET) Subject: [Lxml-checkins] r71001 - in lxml/trunk: . src/lxml Message-ID: <20100130203955.DA5A21680AB@codespeak.net> Author: scoder Date: Sat Jan 30 21:39:55 2010 New Revision: 71001 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/iterparse.pxi Log: r5440 at lenny: sbehnel | 2010-01-30 21:12:44 +0100 streamlined __next__ method in iterparse() Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Sat Jan 30 21:39:55 2010 @@ -457,24 +457,26 @@ def __next__(self): cdef _IterparseContext context - cdef xmlparser.xmlParserCtxt* pctxt - cdef cstd.FILE* c_stream - cdef char* c_data - cdef Py_ssize_t c_data_len - cdef int error, done if self._source is None: raise StopIteration context = <_IterparseContext>self._push_parser_context - if python.PyList_GET_SIZE(context._events) > context._event_index: - item = python.PyList_GET_ITEM(context._events, context._event_index) - python.Py_INCREF(item) # 'borrowed reference' from PyList_GET_ITEM - context._event_index += 1 - return item + if python.PyList_GET_SIZE(context._events) <= context._event_index: + self._read_more_events(context) + item = python.PyList_GET_ITEM(context._events, context._event_index) + python.Py_INCREF(item) # 'borrowed reference' from PyList_GET_ITEM + context._event_index += 1 + return item + + cdef _read_more_events(self, 
_IterparseContext context): + cdef cstd.FILE* c_stream + cdef char* c_data + cdef Py_ssize_t c_data_len + cdef xmlparser.xmlParserCtxt* pctxt = context._c_ctxt + cdef int error = 0, done = 0 del context._events[:] - pctxt = context._c_ctxt - error = done = 0 + context._event_index = 0 c_stream = python.PyFile_AsFile(self._source) while python.PyList_GET_SIZE(context._events) == 0: if c_stream is NULL: @@ -518,11 +520,6 @@ self._source = None raise StopIteration - context._event_index = 1 - element = python.PyList_GET_ITEM(context._events, 0) - python.Py_INCREF(element) # 'borrowed reference' from PyList_GET_ITEM - return element - cdef class iterwalk: u"""iterwalk(self, element_or_tree, events=("end",), tag=None) From scoder at codespeak.net Sat Jan 30 21:40:01 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:40:01 +0100 (CET) Subject: [Lxml-checkins] r71003 - in lxml/trunk: . src/lxml Message-ID: <20100130204001.C1F1F1680AF@codespeak.net> Author: scoder Date: Sat Jan 30 21:39:59 2010 New Revision: 71003 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/xmlerror.pxi Log: r5441 at lenny: sbehnel | 2010-01-30 21:18:09 +0100 support passing a logger into PyErrorLog instead of just its name Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Jan 30 21:39:59 2010 @@ -8,6 +8,9 @@ Features added -------------- +* Support passing a readily configured logger instance into + ``PyErrorLog``, instead of a logger name. + * On serialisation, the new ``doctype`` parameter can be used to override the DOCTYPE (internal subset) of the document. 
Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sat Jan 30 21:39:59 2010 @@ -360,10 +360,11 @@ self._entries.append(entry) cdef class PyErrorLog(_BaseErrorLog): - u"""PyErrorLog(self, logger_name=None) + u"""PyErrorLog(self, logger_name=None, logger=None) A global error log that connects to the Python stdlib logging package. - The constructor accepts an optional logger name. + The constructor accepts an optional logger name or a readily + instantiated logger instance. If you want to change the mapping between libxml2's ErrorLevels and Python logging levels, you can modify the level_map dictionary from a subclass. @@ -378,10 +379,10 @@ object and calls ``self.log(log_entry, format_string, arg1, arg2, ...)`` with appropriate data. """ - cdef readonly object level_map + cdef readonly dict level_map cdef object _map_level cdef object _log - def __init__(self, logger_name=None): + def __init__(self, logger_name=None, logger=None): _BaseErrorLog.__init__(self, None, None) import logging self.level_map = { @@ -390,10 +391,11 @@ ErrorLevels.FATAL : logging.CRITICAL } self._map_level = self.level_map.get - if logger_name: - logger = logging.getLogger(logger_name) - else: - logger = logging.getLogger() + if logger is None: + if logger_name: + logger = logging.getLogger(logger_name) + else: + logger = logging.getLogger() self._log = logger.log def copy(self): From scoder at codespeak.net Sat Jan 30 21:40:05 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:40:05 +0100 (CET) Subject: [Lxml-checkins] r71004 - in lxml/trunk: . 
doc Message-ID: <20100130204005.6B36D1680C1@codespeak.net> Author: scoder Date: Sat Jan 30 21:40:04 2010 New Revision: 71004 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r5442 at lenny: sbehnel | 2010-01-30 21:39:43 +0100 test fix: looks like BS can't handle broken 'HTML' after all Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Sat Jan 30 21:40:04 2010 @@ -47,7 +47,7 @@ .. sourcecode:: pycon - >>> tag_soup = 'Hello</head<body onload=crash()>Hi all<p>' + >>> tag_soup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>' all you need to do is pass it to the ``fromstring()`` function: From scoder at codespeak.net Sat Jan 30 21:41:42 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:41:42 +0100 (CET) Subject: [Lxml-checkins] r71005 - lxml/branch/lxml-2.2/doc Message-ID: <20100130204142.2C01F1680AB@codespeak.net> Author: scoder Date: Sat Jan 30 21:41:41 2010 New Revision: 71005 Modified: lxml/branch/lxml-2.2/doc/elementsoup.txt Log: test fix Modified: lxml/branch/lxml-2.2/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.2/doc/elementsoup.txt Sat Jan 30 21:41:41 2010 @@ -47,7 +47,7 @@ .. 
sourcecode:: pycon - >>> tag_soup = '<meta><head><title>Hello</head<body onload=crash()>Hi all<p>' + >>> tag_soup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>' all you need to do is pass it to the ``fromstring()`` function: From scoder at codespeak.net Sat Jan 30 21:42:18 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:42:18 +0100 (CET) Subject: [Lxml-checkins] r71006 - lxml/branch/lxml-2.2/src/lxml Message-ID: <20100130204218.430A21680AB@codespeak.net> Author: scoder Date: Sat Jan 30 21:42:17 2010 New Revision: 71006 Modified: lxml/branch/lxml-2.2/src/lxml/cleanup.pxi Log: build fix for Cython 0.11.3 Modified: lxml/branch/lxml-2.2/src/lxml/cleanup.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/cleanup.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/cleanup.pxi Sat Jan 30 21:42:17 2010 @@ -243,7 +243,9 @@ if c_child.type == tree.XML_ELEMENT_NODE: for i in range(c_tag_count): if _tagMatchesExactly(c_child, c_ns_tags[2*i], c_ns_tags[2*i+1]): - c_next = _findChildForwards(c_child, 0) or _nextElement(c_child) + c_next = _findChildForwards(c_child, 0) + if c_next is NULL: + c_next = _nextElement(c_child) _replaceNodeByChildren(doc, c_child) if not attemptDeallocation(c_child): if c_child.nsDef is not NULL: From scoder at codespeak.net Sat Jan 30 21:52:48 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:52:48 +0100 (CET) Subject: [Lxml-checkins] r71008 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20100130205248.976C81680AB@codespeak.net> Author: scoder Date: Sat Jan 30 21:52:48 2010 New Revision: 71008 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_objectify.py Log: r5450 at lenny: sbehnel | 2010-01-30 21:52:34 +0100 test case for lxml.objectify bug Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Sat Jan 30 21:52:48 2010 @@ -2384,6 +2384,18 @@ self.assert_(isinstance(root.value[0], objectify.IntElement)) self.assert_(isinstance(root.value[1], objectify.FloatElement)) + def test_efactory_subtype(self): + class Attribute(objectify.ObjectifiedDataElement): + def __init__(self): + objectify.ObjectifiedDataElement.__init__(self) + self.set("datatype", "TYPE") + self.set("range", "0.,1.") + + attr = Attribute() + self.assertEquals(attr.text, None) + self.assertEquals(attr.get("datatype"), "TYPE") + self.assertEquals(attr.get("range"), "0.,1.") + def test_XML_base_url_docinfo(self): root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url") docinfo = root.getroottree().docinfo From scoder at codespeak.net Sat Jan 30 21:53:23 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 21:53:23 +0100 (CET) Subject: [Lxml-checkins] r71009 - in lxml/branch/lxml-2.2: . 
src/lxml/tests Message-ID: <20100130205323.76A35168016@codespeak.net> Author: scoder Date: Sat Jan 30 21:53:22 2010 New Revision: 71009 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/INSTALL.txt (props changed) lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py Log: merged test case from trunk Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py Sat Jan 30 21:53:22 2010 @@ -2384,6 +2384,18 @@ self.assert_(isinstance(root.value[0], objectify.IntElement)) self.assert_(isinstance(root.value[1], objectify.FloatElement)) + def test_efactory_subtype(self): + class Attribute(objectify.ObjectifiedDataElement): + def __init__(self): + objectify.ObjectifiedDataElement.__init__(self) + self.set("datatype", "TYPE") + self.set("range", "0.,1.") + + attr = Attribute() + self.assertEquals(attr.text, None) + self.assertEquals(attr.get("datatype"), "TYPE") + self.assertEquals(attr.get("range"), "0.,1.") + def test_XML_base_url_docinfo(self): root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url") docinfo = root.getroottree().docinfo From scoder at codespeak.net Sat Jan 30 22:37:52 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 22:37:52 +0100 (CET) Subject: [Lxml-checkins] r71010 - in lxml/trunk: . 
src/lxml/html Message-ID: <20100130213752.9890C1680C1@codespeak.net> Author: scoder Date: Sat Jan 30 22:37:51 2010 New Revision: 71010 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/html5parser.py Log: r5454 at lenny: sbehnel | 2010-01-30 22:10:29 +0100 bug #511252: fix fragment parsing in html5parser.py Modified: lxml/trunk/src/lxml/html/html5parser.py ============================================================================== --- lxml/trunk/src/lxml/html/html5parser.py (original) +++ lxml/trunk/src/lxml/html/html5parser.py Sat Jan 30 22:37:51 2010 @@ -94,14 +94,19 @@ if not isinstance(html, _strings): raise TypeError('string required') - if create_parent: - container = create_parent or 'div' - html = '<%s>%s</%s>' % (container, html, container) - children = fragments_fromstring(html, True, guess_charset, parser) if not children: raise etree.ParserError('No elements found') - if len(children) > 1: + if create_parent: + if not isinstance(create_parent, _strings): + create_parent = 'div' + new_root = Element(create_parent) + if isinstance(children[0], _strings): + new_root.text = children[0] + del children[0] + new_root.extend(children) + children = new_root + elif len(children) > 1: raise etree.ParserError('Multiple elements found') result = children[0] From scoder at codespeak.net Sat Jan 30 22:37:57 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 22:37:57 +0100 (CET) Subject: [Lxml-checkins] r71011 - in lxml/trunk: . 
src/lxml/html Message-ID: <20100130213757.9EF3B1680F2@codespeak.net> Author: scoder Date: Sat Jan 30 22:37:56 2010 New Revision: 71011 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/html5parser.py Log: r5455 at lenny: sbehnel | 2010-01-30 22:37:46 +0100 bug #511252: fix fragment parsing in lxml.html Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Jan 30 22:37:56 2010 @@ -55,6 +55,9 @@ Bugs fixed ---------- +* Parsing broken fragments in lxml.html could fail if the fragment + contained an orphaned closing '</div>' tag. + * Manually instantiating the custom element classes in ``lxml.objectify`` could crash. Modified: lxml/trunk/src/lxml/html/html5parser.py ============================================================================== --- lxml/trunk/src/lxml/html/html5parser.py (original) +++ lxml/trunk/src/lxml/html/html5parser.py Sat Jan 30 22:37:56 2010 @@ -89,27 +89,34 @@ element. If create_parent is true (or is a tag name) then a parent node - will be created to encapsulate the HTML in a single element. + will be created to encapsulate the HTML in a single element. In + this case, leading or trailing text is allowed. 
""" if not isinstance(html, _strings): raise TypeError('string required') - children = fragments_fromstring(html, True, guess_charset, parser) - if not children: - raise etree.ParserError('No elements found') + accept_leading_text = bool(create_parent) + + elements = fragments_fromstring( + html, guess_charset=guess_charset, parser=parser, + no_leading_text=not accept_leading_text, **kw) + if create_parent: - if not isinstance(create_parent, _strings): + if not isinstance(create_parent, basestring): create_parent = 'div' new_root = Element(create_parent) - if isinstance(children[0], _strings): - new_root.text = children[0] - del children[0] - new_root.extend(children) - children = new_root - elif len(children) > 1: - raise etree.ParserError('Multiple elements found') + if elements: + if isinstance(elements[0], basestring): + new_root.text = elements[0] + del elements[0] + new_root.extend(elements) + return new_root - result = children[0] + if not elements: + raise etree.ParserError('No elements found') + if len(elements) > 1: + raise etree.ParserError('Multiple elements found') + result = elements[0] if result.tail and result.tail.strip(): raise etree.ParserError('Element followed by text: %r' % result.tail) result.tail = None From scoder at codespeak.net Sat Jan 30 22:52:14 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 22:52:14 +0100 (CET) Subject: [Lxml-checkins] r71012 - in lxml/trunk: . 
src/lxml/html Message-ID: <20100130215214.15CD91680C1@codespeak.net> Author: scoder Date: Sat Jan 30 22:52:13 2010 New Revision: 71012 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/__init__.py Log: r5459 at lenny: sbehnel | 2010-01-30 22:52:08 +0100 bug #511252: fix fragment parsing in lxml.html Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Sat Jan 30 22:52:13 2010 @@ -577,23 +577,33 @@ element. If create_parent is true (or is a tag name) then a parent node - will be created to encapsulate the HTML in a single element. + will be created to encapsulate the HTML in a single element. In + this case, leading or trailing text is allowed. base_url will set the document's base_url attribute (and the tree's docinfo.URL) """ if parser is None: parser = html_parser + + accept_leading_text = bool(create_parent) + + elements = fragments_fromstring( + html, parser=parser, no_leading_text=not accept_leading_text, + base_url=base_url, **kw) + if create_parent: if not isinstance(create_parent, basestring): create_parent = 'div' - return fragment_fromstring('<%s>%s</%s>' % ( - create_parent, html, create_parent), - parser=parser, base_url=base_url, **kw) - elements = fragments_fromstring(html, parser=parser, no_leading_text=True, - base_url=base_url, **kw) + new_root = Element(create_parent) + if elements: + if isinstance(elements[0], basestring): + new_root.text = elements[0] + del elements[0] + new_root.extend(elements) + return new_root + if not elements: - raise etree.ParserError( - "No elements found") + raise etree.ParserError('No elements found') if len(elements) > 1: raise etree.ParserError( "Multiple elements found (%s)" From scoder at codespeak.net Sat Jan 30 23:02:48 2010 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 30 Jan 2010 23:02:48 +0100 (CET) Subject: 
[Lxml-checkins] r71013 - lxml/trunk Message-ID: <20100130220248.557391680AB@codespeak.net> Author: scoder Date: Sat Jan 30 23:02:47 2010 New Revision: 71013 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/buildlibxml.py lxml/trunk/setupinfo.py Log: r5461 at lenny: sbehnel | 2010-01-30 23:02:39 +0100 bug #506558: applied patch by Sridhar Ratnakumar to download Windows dependency binaries during build Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Jan 30 23:02:47 2010 @@ -94,6 +94,9 @@ Other changes ------------- +* Static MS Windows builds can now download their dependencies + themselves. + * ``Element.attrib`` no longer uses a cyclic reference back to its Element object. It therefore no longer requires the garbage collector to clean up. Modified: lxml/trunk/buildlibxml.py ============================================================================== --- lxml/trunk/buildlibxml.py (original) +++ lxml/trunk/buildlibxml.py Sat Jan 30 23:02:47 2010 @@ -7,8 +7,63 @@ except ImportError: from urllib.parse import urlsplit from urllib.request import urlretrieve + + -## Routines to download and build libxml2/xslt: +# use pre-built libraries on Windows + +def download_and_extract_zlatkovic_binaries(destdir): + url = 'ftp://ftp.zlatkovic.com/pub/libxml/' + libs = dict( + libxml2 = None, + libxslt = None, + zlib = None, + iconv = None, + ) + for fn in ftp_listdir(url): + for libname in libs: + if fn.startswith(libname): + assert libs[libname] is None, 'duplicate listings?' 
+ assert fn.endswith('.win32.zip') + libs[libname] = fn + + if not os.path.exists(destdir): os.makedirs(destdir) + for libname, libfn in libs.items(): + srcfile = urljoin(url, libfn) + destfile = os.path.join(destdir, libfn) + print('Retrieving "%s" to "%s"' % (srcfile, destfile)) + urlretrieve(srcfile, destfile) + d = unpack_zipfile(destfile, destdir) + libs[libname] = d + + return libs + +def unpack_zipfile(zipfn, destdir): + assert zipfn.endswith('.zip') + import zipfile + print('Unpacking %s into %s' % (os.path.basename(zipfn), destdir)) + f = zipfile.ZipFile(zipfn) + try: + f.extractall(path=destdir) + finally: + f.close() + edir = os.path.join(destdir, os.path.basename(zipfn)[:-len('.zip')]) + assert os.path.exists(edir), 'missing: %s' % edir + return edir + +def get_prebuilt_libxml2xslt(download_dir, static_include_dirs, static_library_dirs): + assert sys.platform.startswith('win') + libs = download_and_extract_zlatkovic_binaries(download_dir) + for libname, path in libs.items(): + i = os.path.join(path, 'include') + l = os.path.join(path, 'lib') + assert os.path.exists(i), 'does not exist: %s' % i + assert os.path.exists(l), 'does not exist: %s' % l + static_include_dirs.append(i) + static_library_dirs.append(l) + + +## Routines to download and build libxml2/xslt from sources: LIBXML2_LOCATION = 'ftp://xmlsoft.org/libxml2/' LIBICONV_LOCATION = 'ftp://ftp.gnu.org/pub/gnu/libiconv/' Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Sat Jan 30 23:02:47 2010 @@ -41,14 +41,19 @@ static_cflags, static_binaries): global XML2_CONFIG, XSLT_CONFIG if OPTION_BUILD_LIBXML2XSLT: - from buildlibxml import build_libxml2xslt - XML2_CONFIG, XSLT_CONFIG = build_libxml2xslt( - 'libs', 'build/tmp', - static_include_dirs, static_library_dirs, - static_cflags, static_binaries, - libiconv_version=OPTION_LIBICONV_VERSION, - 
libxml2_version=OPTION_LIBXML2_VERSION, - libxslt_version=OPTION_LIBXSLT_VERSION) + from buildlibxml import build_libxml2xslt, get_prebuilt_libxml2xslt + if sys.platform.startswith('win'): + get_prebuilt_libxml2xslt( + 'libs', static_include_dirs, static_library_dirs) + else: + XML2_CONFIG, XSLT_CONFIG = build_libxml2xslt( + 'libs', 'build/tmp', + static_include_dirs, static_library_dirs, + static_cflags, static_binaries, + libiconv_version=OPTION_LIBICONV_VERSION, + libxml2_version=OPTION_LIBXML2_VERSION, + libxslt_version=OPTION_LIBXSLT_VERSION) + if CYTHON_INSTALLED: source_extension = ".pyx" print("Building with Cython %s." % Cython.Compiler.Version.version) @@ -321,6 +326,7 @@ env_val = os.getenv(name.upper().replace('-', '_')) return env_val +staticbuild = bool(os.environ.get('STATICBUILD', '')) # pick up any commandline options and/or env variables OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify') OPTION_WITHOUT_ASSERT = has_option('without-assert') @@ -329,11 +335,11 @@ OPTION_WITH_REFNANNY = has_option('with-refnanny') if OPTION_WITHOUT_CYTHON: CYTHON_INSTALLED = False -OPTION_STATIC = has_option('static') +OPTION_STATIC = staticbuild or has_option('static') OPTION_DEBUG_GCC = has_option('debug-gcc') OPTION_SHOW_WARNINGS = has_option('warnings') OPTION_AUTO_RPATH = has_option('auto-rpath') -OPTION_BUILD_LIBXML2XSLT = has_option('static-deps') +OPTION_BUILD_LIBXML2XSLT = staticbuild or has_option('static-deps') if OPTION_BUILD_LIBXML2XSLT: OPTION_STATIC = True OPTION_LIBXML2_VERSION = option_value('libxml2-version')