From lxml-checkins at codespeak.net Wed Feb 4 05:51:44 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 4 Feb 2009 05:51:44 +0100 (CET) Subject: [Lxml-checkins] Order Shipped -- Order #59723 Message-ID: <20090204045144.7A44A169E84@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090204/3fd2fcd5/attachment.htm From scoder at codespeak.net Fri Feb 6 22:18:33 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 6 Feb 2009 22:18:33 +0100 (CET) Subject: [Lxml-checkins] r61592 - in lxml/trunk: . src/lxml/tests Message-ID: <20090206211833.7038216A04F@codespeak.net> Author: scoder Date: Fri Feb 6 22:18:30 2009 New Revision: 61592 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_io.py Log: r4971 at delle: sbehnel | 2009-01-28 20:45:09 +0100 new test case for writing to an invalid file path Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Fri Feb 6 22:18:30 2009 @@ -105,6 +105,18 @@ finally: os.close(handle) os.remove(filename) + + def test_write_invalid_filename(self): + filename = os.path.join( + os.path.join('hopefullynonexistingpathname'), + 'invalid_file.xml') + try: + self.tree.write(filename) + except IOError, e: + pass + else: + self.assertTrue( + False, "writing to an invalid file path should fail") def test_module_parse_gzipobject(self): # (c)ElementTree supports gzip instance as parse argument From scoder at codespeak.net Fri Feb 6 22:18:35 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 6 Feb 2009 22:18:35 +0100 (CET) Subject: [Lxml-checkins] r61593 - lxml/trunk Message-ID: <20090206211835.4749116A04F@codespeak.net> Author: scoder Date: Fri Feb 6 22:18:35 2009 New Revision: 61593 Modified: lxml/trunk/ (props changed) 
lxml/trunk/Makefile Log: r4972 at delle: sbehnel | 2009-02-06 17:46:15 +0100 honour external PYTHONPATH in Makefile Modified: lxml/trunk/Makefile ============================================================================== --- lxml/trunk/Makefile (original) +++ lxml/trunk/Makefile Fri Feb 6 22:18:35 2009 @@ -19,15 +19,15 @@ test_inplace: inplace $(PYTHON) test.py $(TESTFLAGS) $(TESTOPTS) - PYTHONPATH=src $(PYTHON) selftest.py - PYTHONPATH=src $(PYTHON) selftest2.py + PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) selftest.py + PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) selftest2.py test_inplace3: inplace $(MAKE) clean $(PYTHON3) setup.py $(SETUPFLAGS) build_ext -i $(PYTHON3) test.py $(TESTFLAGS) $(TESTOPTS) - PYTHONPATH=src $(PYTHON3) selftest.py - PYTHONPATH=src $(PYTHON3) selftest2.py + PYTHONPATH=src:$(PYTHONPATH) $(PYTHON3) selftest.py + PYTHONPATH=src:$(PYTHONPATH) $(PYTHON3) selftest2.py valgrind_test_inplace: inplace valgrind --tool=memcheck --leak-check=full --num-callers=30 --suppressions=valgrind-python.supp \ @@ -60,7 +60,7 @@ || (echo "not generating epydoc API documentation") website: inplace - PYTHONPATH=src $(PYTHON) doc/mkhtml.py doc/html . ${LXMLVERSION} + PYTHONPATH=src:$(PYTHONPATH) $(PYTHON) doc/mkhtml.py doc/html . 
${LXMLVERSION} html: inplace website apihtml s5 From scoder at codespeak.net Fri Feb 6 22:18:41 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 6 Feb 2009 22:18:41 +0100 (CET) Subject: [Lxml-checkins] r61594 - lxml/trunk Message-ID: <20090206211841.A00BA16A053@codespeak.net> Author: scoder Date: Fri Feb 6 22:18:40 2009 New Revision: 61594 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r4973 at delle: sbehnel | 2009-02-06 17:51:15 +0100 new setup options: enable refnanny, disable Cython Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Fri Feb 6 22:18:40 2009 @@ -211,6 +211,8 @@ macros.append(('PYREX_WITHOUT_ASSERTIONS', None)) if OPTION_WITHOUT_THREADING: macros.append(('WITHOUT_THREADING', None)) + if OPTION_WITH_REFNANNY: + macros.append(('CYTHON_REFNANNY', None)) return macros _ERROR_PRINTED = False @@ -319,6 +321,10 @@ OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify') OPTION_WITHOUT_ASSERT = has_option('without-assert') OPTION_WITHOUT_THREADING = has_option('without-threading') +OPTION_WITHOUT_CYTHON = has_option('without-cython') +OPTION_WITH_REFNANNY = has_option('with-refnanny') +if OPTION_WITHOUT_CYTHON: + CYTHON_INSTALLED = False OPTION_STATIC = has_option('static') OPTION_DEBUG_GCC = has_option('debug-gcc') OPTION_AUTO_RPATH = has_option('auto-rpath') From scoder at codespeak.net Fri Feb 6 22:18:47 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 6 Feb 2009 22:18:47 +0100 (CET) Subject: [Lxml-checkins] r61595 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090206211847.3D0E516A053@codespeak.net> Author: scoder Date: Fri Feb 6 22:18:46 2009 New Revision: 61595 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_xslt.py lxml/trunk/src/lxml/xslt.pxd lxml/trunk/src/lxml/xslt.pxi Log: r4974 at delle: sbehnel | 2009-02-06 21:38:11 +0100 support for quotes in XSLT string parameters, adapted from patch by Alexander Shigin Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 6 22:18:46 2009 @@ -2,6 +2,16 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +* ``XSLT.strparam()`` class method to wrap quoted string parameters + that require escaping. + + 2.2beta2 (2009-01-25) ===================== Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Fri Feb 6 22:18:46 2009 @@ -329,6 +329,24 @@ ''', str(res)) + def test_xslt_string_parameters(self): + tree = self.parse('BC') + style = self.parse('''\ + + + + +''') + + st = etree.XSLT(style) + res = st(tree, bar=etree.XSLT.strparam('''it's me, "Bar"''')) + self.assertEquals('''\ + +it's me, "Bar" +''', + str(res)) + def test_xslt_parameter_invalid(self): tree = self.parse('BC') style = self.parse('''\ Modified: lxml/trunk/src/lxml/xslt.pxd ============================================================================== --- lxml/trunk/src/lxml/xslt.pxd (original) +++ lxml/trunk/src/lxml/xslt.pxd Fri Feb 6 22:18:46 2009 @@ -133,6 +133,16 @@ xsltTransformContext* ctxt) nogil cdef xmlDoc* xsltGetProfileInformation(xsltTransformContext* ctxt) nogil +cdef extern from "libxslt/variables.h": + cdef int xsltQuoteUserParams(xsltTransformContext* ctxt, + char** 
params) + cdef int xsltQuoteOneUserParam(xsltTransformContext* ctxt, + char* name, + char* value) + cdef int xsltEvalOneUserParam(xsltTransformContext* ctxt, + char* name, + char* value) + cdef extern from "libxslt/extra.h": cdef char* XSLT_LIBXSLT_NAMESPACE cdef char* XSLT_XALAN_NAMESPACE Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri Feb 6 22:18:46 2009 @@ -315,6 +315,15 @@ self._release_temp_refs() +cdef class _XSLTQuotedStringParam: + u"""A wrapper class for literal XSLT string parameters that require + quote escaping. + """ + cdef str strval + def __init__(self, strval): + self.strval = _utf8(strval) + + cdef class XSLT: u"""XSLT(self, xslt_input, extensions=None, regexp=True, access_control=None) @@ -411,6 +420,20 @@ def __get__(self): return self._error_log.copy() + @classmethod + def strparam(_, strval): + u"""strparam(strval) + + Mark an XSLT string parameter that requires quote escaping + before passing it into the transformation. Use it like this:: + + result = transform(doc, some_strval = XSLT.strparam( + '''it's \"Monty Python's\" ...''')) + + Escaped string parameters can be reused without restriction. 
+ """ + return _XSLTQuotedStringParam(strval) + def apply(self, _input, *, profile_run=False, **kw): u"""apply(self, _input, profile_run=False, **kw) @@ -548,7 +571,7 @@ return _xsltResultTreeFactory(result_doc, self, profile_doc) cdef xmlDoc* _run_transform(self, xmlDoc* c_input_doc, - parameters, _XSLTContext context, + dict parameters, _XSLTContext context, xslt.xsltTransformContext* transform_ctxt): cdef xmlDoc* c_result cdef char** params @@ -561,7 +584,7 @@ if self._access_control is not None: self._access_control._register_in_context(transform_ctxt) - parameter_count = python.PyDict_Size(parameters) + parameter_count = len(parameters) if parameter_count > 0: # allocate space for parameters # * 2 as we want an entry for both key and value, @@ -571,15 +594,20 @@ try: i = 0 keep_ref = [] - for key, value in parameters.items(): + for key, value in parameters.iteritems(): k = _utf8(key) + if isinstance(value, _XSLTQuotedStringParam): + v = (<_XSLTQuotedStringParam>value).strval + xslt.xsltQuoteOneUserParam( + transform_ctxt, _cstr(k), _cstr(v)) + else: + v = _utf8(value) + params[i] = _cstr(k) + i += 1 + params[i] = _cstr(v) + i += 1 keep_ref.append(k) - v = _utf8(value) keep_ref.append(v) - params[i] = _cstr(k) - i += 1 - params[i] = _cstr(v) - i += 1 except: python.PyMem_Free(params) raise From lxml-checkins at codespeak.net Fri Feb 6 23:38:49 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Fri, 6 Feb 2009 23:38:49 +0100 (CET) Subject: [Lxml-checkins] Customer Receipt/Purchase Confirmation Message-ID: <20090206223849.DF24416A03F@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090206/03ad465c/attachment.htm From scoder at codespeak.net Sat Feb 7 21:58:57 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:58:57 +0100 (CET) Subject: [Lxml-checkins] r61623 - in lxml/trunk: . 
src/lxml Message-ID: <20090207205857.16C5B169F69@codespeak.net> Author: scoder Date: Sat Feb 7 21:58:55 2009 New Revision: 61623 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/proxy.pxi Log: r4979 at delle: sbehnel | 2009-02-07 15:44:18 +0100 cleanup Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Sat Feb 7 21:58:55 2009 @@ -277,10 +277,11 @@ xmlNode* c_element) except -1: u"""Fix the xmlNs pointers of a node and its subtree that were moved. - Mainly copied from libxml2's xmlReconciliateNs(). Expects libxml2 doc - pointers of node to be correct already, but fixes _Document references. + Originally copied from libxml2's xmlReconciliateNs(). Expects + libxml2 doc pointers of node to be correct already, but fixes + _Document references. - For each node in the subtree, we do three things here: + For each node in the subtree, we do this: 1) Remove redundant declarations of namespace that are already defined in its parents. 
@@ -406,12 +407,15 @@ fixThreadDictNsForNode(c_element, c_src_dict, c_dict) c_element = c_element.children while c_element is not NULL: - fixThreadDictNames(c_element, c_src_dict, c_dict) + if tree._isElementOrXInclude(c_element): + fixThreadDictNamesForNode(c_element, c_src_dict, c_dict) c_element = c_element.next - return - elif not tree._isElementOrXInclude(c_element): - return + elif tree._isElementOrXInclude(c_element): + fixThreadDictNamesForNode(c_element, c_src_dict, c_dict) +cdef void fixThreadDictNamesForNode(xmlNode* c_element, + tree.xmlDict* c_src_dict, + tree.xmlDict* c_dict) nogil: tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1) if c_element.name is not NULL: fixThreadDictNameForNode(c_element, c_dict) From scoder at codespeak.net Sat Feb 7 21:59:01 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:01 +0100 (CET) Subject: [Lxml-checkins] r61624 - in lxml/trunk: . src/lxml Message-ID: <20090207205901.BA825169F6F@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:00 2009 New Revision: 61624 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/cstd.pxd Log: r4980 at delle: sbehnel | 2009-02-07 17:04:51 +0100 fix printf() signature Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Sat Feb 7 21:59:00 2009 @@ -17,7 +17,7 @@ cdef int feof(FILE *stream) nogil cdef int ferror(FILE *stream) nogil cdef int sprintf(char* str, char* format, ...) nogil - cdef int printf(char* str) nogil + cdef int printf(char* str, ...) nogil cdef extern from "stdlib.h": cdef void* malloc(size_t size) nogil From scoder at codespeak.net Sat Feb 7 21:59:05 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:05 +0100 (CET) Subject: [Lxml-checkins] r61625 - in lxml/trunk: . 
src/lxml Message-ID: <20090207205905.B7E0F169F7F@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:05 2009 New Revision: 61625 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/proxy.pxi Log: r4981 at delle: sbehnel | 2009-02-07 17:05:41 +0100 cleanup, copy some special casing conditions from libxml2 code Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Sat Feb 7 21:59:05 2009 @@ -428,8 +428,7 @@ tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: cdef xmlNode* c_child - cdef xmlNode* c_node - c_node = c_attr + cdef xmlNode* c_node = c_attr while c_node is not NULL: fixThreadDictNameForNode(c_node, c_dict) fixThreadDictContentForNode(c_node, c_src_dict, c_dict) @@ -443,16 +442,20 @@ cdef inline void fixThreadDictNameForNode(xmlNode* c_node, tree.xmlDict* c_dict) nogil: - cdef char* c_name - # c_name can be NULL on memory error, but we don't handle that here - c_name = tree.xmlDictLookup(c_dict, c_node.name, -1) - if c_name is not NULL: - c_node.name = c_name + cdef char* c_name = c_node.name + if c_name is not NULL and \ + c_node.type != tree.XML_TEXT_NODE and \ + c_node.type != tree.XML_COMMENT_NODE: + # c_name can be NULL on memory error, but we don't handle that here + c_name = tree.xmlDictLookup(c_dict, c_name, -1) + if c_name is not NULL: + c_node.name = c_name cdef inline void fixThreadDictContentForNode(xmlNode* c_node, tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: - if c_node.content is not NULL: + if c_node.content is not NULL and \ + c_node.content is not c_node.properties: if tree.xmlDictOwns(c_src_dict, c_node.content): # result can be NULL on memory error, but we don't handle that here c_node.content = tree.xmlDictLookup(c_dict, c_node.content, -1) From scoder at codespeak.net Sat Feb 7 21:59:09 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 
21:59:09 +0100 (CET) Subject: [Lxml-checkins] r61626 - in lxml/trunk: . src/lxml Message-ID: <20090207205909.CD30F169FA3@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:09 2009 New Revision: 61626 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/proxy.pxi Log: r4982 at delle: sbehnel | 2009-02-07 20:21:48 +0100 build fix Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Sat Feb 7 21:59:09 2009 @@ -455,7 +455,7 @@ tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: if c_node.content is not NULL and \ - c_node.content is not c_node.properties: + c_node.content is not c_node.properties: if tree.xmlDictOwns(c_src_dict, c_node.content): # result can be NULL on memory error, but we don't handle that here c_node.content = tree.xmlDictLookup(c_dict, c_node.content, -1) From scoder at codespeak.net Sat Feb 7 21:59:14 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:14 +0100 (CET) Subject: [Lxml-checkins] r61627 - in lxml/trunk: . src/lxml Message-ID: <20090207205914.90D40169F69@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:14 2009 New Revision: 61627 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/serializer.pxi Log: r4983 at delle: sbehnel | 2009-02-07 20:52:44 +0100 better error handling during serialisation Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Feb 7 21:59:14 2009 @@ -11,6 +11,11 @@ * ``XSLT.strparam()`` class method to wrap quoted string parameters that require escaping. +Other changes +------------- + +* More robust error handling on serialisation. 
+ 2.2beta2 (2009-01-25) ===================== Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Sat Feb 7 21:59:14 2009 @@ -1,5 +1,9 @@ # XML serialization and output functions +class SerialisationError(LxmlError): + u"""A libxml2 error that occurred during serialisation. + """ + cdef enum _OutputMethods: OUTPUT_METHOD_XML OUTPUT_METHOD_HTML @@ -22,13 +26,14 @@ cdef char* c_text cdef xmlNode* c_text_node cdef tree.xmlBuffer* c_buffer + cdef int error_result c_buffer = tree.xmlBufferCreate() if c_buffer is NULL: return python.PyErr_NoMemory() with nogil: - tree.xmlNodeBufGetContent(c_buffer, c_node) + error_result = tree.xmlNodeBufGetContent(c_buffer, c_node) if with_tail: c_text_node = _textNodeOrSkip(c_node.next) while c_text_node is not NULL: @@ -36,6 +41,10 @@ c_text_node = _textNodeOrSkip(c_text_node.next) c_text = tree.xmlBufferContent(c_buffer) + if error_result < 0 or c_text is NULL: + tree.xmlBufferFree(c_buffer) + raise SerialisationError, u"Error during serialisation (out of memory?)" + try: needs_conversion = 0 if encoding is _unicode: @@ -60,7 +69,7 @@ else: text = c_text finally: - tree.xmlBufferFree(c_buffer); + tree.xmlBufferFree(c_buffer) return text @@ -76,6 +85,7 @@ cdef char* c_enc cdef char* c_version cdef int c_method + cdef int error_result if element is None: return None c_method = _findOutputMethod(method) @@ -108,6 +118,11 @@ else: c_result_buffer = c_buffer.buffer + error_result = c_buffer.error + if error_result != xmlerror.XML_ERR_OK: + tree.xmlOutputBufferClose(c_buffer) + _raiseSerialisationError(error_result) + try: if encoding is _unicode: result = python.PyUnicode_DecodeUTF8( @@ -122,6 +137,15 @@ tree.xmlOutputBufferClose(c_buffer) return result +cdef _raiseSerialisationError(int error_result): + if error_result == xmlerror.XML_ERR_NO_MEMORY: + return 
python.PyErr_NoMemory() + else: + message = ErrorTypes._getName(error_result) + if message is None: + message = u"unknown error %d" % error_result + raise SerialisationError, message + ############################################################ # low-level serialisation functions @@ -146,6 +170,9 @@ # copy the node and add namespaces from parents # this is required to make libxml write them c_nsdecl_node = tree.xmlCopyNode(c_node, 2) + if c_nsdecl_node is NULL: + c_buffer.error = xmlerror.XML_ERR_NO_MEMORY + return _copyParentNamespaces(c_node, c_nsdecl_node) c_nsdecl_node.parent = c_node.parent @@ -171,17 +198,17 @@ if write_complete_document: _writeNextSiblings(c_buffer, c_node, encoding, pretty_print) if pretty_print: - tree.xmlOutputBufferWriteString(c_buffer, "\n") + tree.xmlOutputBufferWrite(c_buffer, 1, "\n") cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, char* version, char* encoding) nogil: if version is NULL: version = "1.0" - tree.xmlOutputBufferWriteString(c_buffer, "\n") + tree.xmlOutputBufferWrite(c_buffer, 4, "'?>\n") cdef void _writeDtdToBuffer(tree.xmlOutputBuffer* c_buffer, xmlDoc* c_doc, char* c_root_name, @@ -317,6 +344,7 @@ cdef tree.xmlOutputBuffer* c_buffer cdef tree.xmlCharEncodingHandler* enchandler cdef char* c_enc + cdef int error_result if encoding is None: c_enc = NULL else: @@ -358,11 +386,14 @@ _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method, write_xml_declaration, write_doctype, pretty_print, with_tail) + error_result = c_buffer.error tree.xmlOutputBufferClose(c_buffer) if writer is None: python.PyEval_RestoreThread(state) else: writer._exc_context._raise_if_stored() + if error_result != xmlerror.XML_ERR_OK: + _raiseSerialisationError(error_result) cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments): cdef _FilelikeWriter writer From scoder at codespeak.net Sat Feb 7 21:59:19 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:19 +0100 
(CET) Subject: [Lxml-checkins] r61628 - in lxml/trunk: . doc Message-ID: <20090207205919.3DE26169F6F@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:18 2009 New Revision: 61628 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/xpathxslt.txt Log: r4984 at delle: sbehnel | 2009-02-07 21:18:24 +0100 docs on XSLT parameters Modified: lxml/trunk/doc/xpathxslt.txt ============================================================================== --- lxml/trunk/doc/xpathxslt.txt (original) +++ lxml/trunk/doc/xpathxslt.txt Sat Feb 7 21:59:18 2009 @@ -523,16 +523,16 @@ >>> f = StringIO('Text') >>> doc = etree.parse(f) -The parameters are passed as keyword parameters to the transform call. First -let's try passing in a simple string expression: +The parameters are passed as keyword parameters to the transform call. +First, let's try passing in a simple integer expression: .. sourcecode:: pycon - >>> result = transform(doc, a="'A'") + >>> result = transform(doc, a="5") >>> str(result) - '\nA\n' + '\n5\n' -Let's try a non-string XPath expression now: +You can use any valid XPath expression as parameter value: .. sourcecode:: pycon @@ -540,6 +540,27 @@ >>> str(result) '\nText\n' +Passing a string expression looks like this: + +.. sourcecode:: pycon + + >>> result = transform(doc, a="'A'") + >>> str(result) + '\nA\n' + +To pass a string that (potentially) contains quotes, you can use the +``.strparam()`` class method. Note that it does not escape the +string. Instead, it returns an opaque object that keeps the string +value. + +.. sourcecode:: pycon + + >>> plain_string_value = etree.XSLT.strparam( + ... 
""" It's "Monty Python" """) + >>> result = transform(doc, a=plain_string_value) + >>> str(result) + '\n It\'s "Monty Python" \n' + The ``xslt()`` tree method -------------------------- From scoder at codespeak.net Sat Feb 7 21:59:23 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:23 +0100 (CET) Subject: [Lxml-checkins] r61629 - in lxml/trunk: . src/lxml Message-ID: <20090207205923.2D45916A01E@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:22 2009 New Revision: 61629 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tree.pxd Log: r4985 at delle: sbehnel | 2009-02-07 21:19:36 +0100 forgotten commit for serialisation error handling Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat Feb 7 21:59:22 2009 @@ -170,6 +170,7 @@ ctypedef struct xmlOutputBuffer: xmlBuffer* buffer xmlBuffer* conv + int error cdef void xmlFreeDoc(xmlDoc* cur) nogil cdef void xmlFreeDtd(xmlDtd* cur) nogil @@ -216,7 +217,7 @@ cdef xmlAttr* xmlHasProp(xmlNode* node, char* name) nogil cdef xmlAttr* xmlHasNsProp(xmlNode* node, char* name, char* nameSpace) nogil cdef char* xmlNodeGetContent(xmlNode* cur) nogil - cdef char* xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil + cdef int xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, char* prefix) nogil cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, char* href) nogil cdef int xmlIsBlankNode(xmlNode* node) nogil From scoder at codespeak.net Sat Feb 7 21:59:27 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 7 Feb 2009 21:59:27 +0100 (CET) Subject: [Lxml-checkins] r61630 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090207205927.D4C6C169F7F@codespeak.net> Author: scoder Date: Sat Feb 7 21:59:27 2009 New Revision: 61630 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_threading.py lxml/trunk/src/lxml/xmlerror.pxi Log: r4986 at delle: sbehnel | 2009-02-07 21:56:59 +0100 make global error log thread-local Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Feb 7 21:59:27 2009 @@ -14,6 +14,9 @@ Other changes ------------- +* The global error log (which is copied into the exception log) is now + local to a thread. + * More robust error handling on serialisation. Modified: lxml/trunk/src/lxml/tests/test_threading.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_threading.py (original) +++ lxml/trunk/src/lxml/tests/test_threading.py Sat Feb 7 21:59:27 2009 @@ -108,6 +108,42 @@ self.assertEquals(_bytes('
BC
'), result) + def test_thread_error_log(self): + XML = self.etree.XML + ParseError = self.etree.ParseError + expected_error = [self.etree.ErrorTypes.ERR_TAG_NAME_MISMATCH] + children = "test" * 100 + + def parse_error_test(thread_no): + tag = "tag%d" % thread_no + xml = "<%s>%s" % (tag, children, tag.upper()) + parser = self.etree.XMLParser() + for _ in range(10): + errors = None + try: + XML(xml, parser) + except self.etree.ParseError, e: + errors = e.error_log.filter_types(expected_error) + self.assertTrue(errors, "Expected error not found") + for error in errors: + self.assertTrue( + tag in error.message and tag.upper() in error.message, + "%s and %s not found in '%s'" % ( + tag, tag.upper(), error.message)) + + self.etree.clear_error_log() + threads = [] + for thread_no in range(1, 10): + t = threading.Thread(target=parse_error_test, + args=(thread_no,)) + threads.append(t) + t.start() + + parse_error_test(0) + + for t in threads: + t.join() + def test_thread_mix(self): XML = self.etree.XML Element = self.etree.Element Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sat Feb 7 21:59:27 2009 @@ -9,8 +9,12 @@ Clear the global error log. Note that this log is already bound to a fixed size. + + Note: since lxml 2.2, the global error log is local to a thread + and this function will only clear the global error log of the + current thread. """ - __GLOBAL_ERROR_LOG.clear() + _getGlobalErrorLog().clear() # dummy function: no debug output at all cdef void _nullGenericErrorFunc(void* ctxt, char* msg, ...) 
nogil: @@ -114,14 +118,16 @@ cdef void _receive(self, xmlerror.xmlError* error): cdef bint is_error cdef _LogEntry entry + cdef _BaseErrorLog global_log entry = _LogEntry() entry._setError(error) is_error = error.level == xmlerror.XML_ERR_ERROR or \ error.level == xmlerror.XML_ERR_FATAL - if __GLOBAL_ERROR_LOG is not self: - __GLOBAL_ERROR_LOG.receive(entry) + global_log = _getGlobalErrorLog() + if global_log is not self: + global_log.receive(entry) if is_error: - __GLOBAL_ERROR_LOG.last_error = entry + global_log.last_error = entry self.receive(entry) if is_error: self.last_error = entry @@ -130,14 +136,16 @@ message, filename): cdef bint is_error cdef _LogEntry entry + cdef _BaseErrorLog global_log entry = _LogEntry() entry._setGeneric(domain, type, level, line, message, filename) is_error = level == xmlerror.XML_ERR_ERROR or \ level == xmlerror.XML_ERR_FATAL - if __GLOBAL_ERROR_LOG is not self: - __GLOBAL_ERROR_LOG.receive(entry) + global_log = _getGlobalErrorLog() + if global_log is not self: + global_log.receive(entry) if is_error: - __GLOBAL_ERROR_LOG.last_error = entry + global_log.last_error = entry self.receive(entry) if is_error: self.last_error = entry @@ -402,13 +410,37 @@ def receive(self, entry): self.log(entry, entry) -# global list log to collect error output messages from libxml2/libxslt +# thread-local, global list log to collect error output messages from +# libxml2/libxslt + cdef _BaseErrorLog __GLOBAL_ERROR_LOG __GLOBAL_ERROR_LOG = _RotatingErrorLog(__MAX_LOG_SIZE) +cdef _ErrorLog _getGlobalErrorLog(): + u"""Retrieve the global error log of this thread.""" + cdef python.PyObject* thread_dict + thread_dict = python.PyThreadState_GetDict() + if thread_dict is NULL: + return __GLOBAL_ERROR_LOG + try: + return (thread_dict)[u"_GlobalErrorLog"] + except KeyError: + log = (thread_dict)[u"_GlobalErrorLog"] = \ + _RotatingErrorLog(__MAX_LOG_SIZE) + return log + +cdef _ErrorLog _setGlobalErrorLog(_BaseErrorLog log): + u"""Set the global error log of 
this thread.""" + cdef python.PyObject* thread_dict + thread_dict = python.PyThreadState_GetDict() + if thread_dict is NULL: + global __GLOBAL_ERROR_LOG + __GLOBAL_ERROR_LOG = log + (thread_dict)[u"_GlobalErrorLog"] = log + cdef __copyGlobalErrorLog(): u"Helper function for properties in exceptions." - return __GLOBAL_ERROR_LOG.copy() + return _getGlobalErrorLog().copy() def use_global_python_log(PyErrorLog log not None): u"""use_global_python_log(log) @@ -418,9 +450,12 @@ Note that this disables access to the global error log from exceptions. Parsers, XSLT etc. will continue to provide their normal local error log. + + Note: prior to lxml 2.2, this changed the error log globally. + Since lxml 2.2, the global error log is local to a thread and this + function will only set the global error log of the current thread. """ - global __GLOBAL_ERROR_LOG - __GLOBAL_ERROR_LOG = log + _setGlobalErrorLog(log) # local log functions: forward error to logger object @@ -429,7 +464,7 @@ if c_log_handler is not NULL: log_handler = <_BaseErrorLog>c_log_handler else: - log_handler = __GLOBAL_ERROR_LOG + log_handler = _getGlobalErrorLog() log_handler._receive(error) cdef void _receiveError(void* c_log_handler, xmlerror.xmlError* error) nogil: From scoder at codespeak.net Sun Feb 8 11:31:36 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 8 Feb 2009 11:31:36 +0100 (CET) Subject: [Lxml-checkins] r61635 - in lxml/trunk: . 
doc Message-ID: <20090208103136.A16BF16A0AC@codespeak.net> Author: scoder Date: Sun Feb 8 11:31:35 2009 New Revision: 61635 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r4995 at delle: sbehnel | 2009-02-08 11:29:41 +0100 doc link to ElementLib by Fredrik Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Feb 8 11:31:35 2009 @@ -70,7 +70,9 @@ There is also a `tutorial for ElementTree`_ which works for ``lxml.etree``. The documentation of the `extended etree API`_ also -contains many examples for ``lxml.etree``. To learn using +contains many examples for ``lxml.etree``. Fredrik Lundh's `element +library`_ contains a lot of nice recipes that show how to solve common +tasks in ElementTree and lxml.etree. To learn using ``lxml.objectify``, read the `objectify documentation`_. John Shipman has written another tutorial called `Python XML @@ -81,7 +83,7 @@ .. _`extended etree API`: api.html .. _`objectify documentation`: objectify.html .. _`Python XML processing with lxml`: http://www.nmt.edu/tcc/help/pubs/pylxml/ - +.. _`element library`: http://effbot.org/zone/element-lib.htm Where can I find more documentation about lxml? ----------------------------------------------- @@ -89,9 +91,10 @@ There is a lot of documentation on the web and also in the Python standard library documentation, as lxml implements the well-known `ElementTree API`_ and tries to follow its documentation as closely as -possible. There are a couple of issues where lxml cannot keep up -compatibility. They are described in the compatibility_ -documentation. +possible. The recipes in Fredrik Lundh's `element library`_ are +generally worth taking a look at. There are a couple of issues where +lxml cannot keep up compatibility. They are described in the +compatibility_ documentation. 
The lxml specific extensions to the API are described by individual files in the ``doc`` directory of the source distribution and on `the @@ -769,6 +772,10 @@ tree. If you now call a serialization function to pretty print this tree, lxml can add fresh whitespace to the XML tree to indent it. +Fredrik Lundh also has a Python-level function for indenting XML by +appending whitespace to tags. It can be found on his `element +library`_ recipe page. + Why can't lxml parse my XML from unicode strings? ------------------------------------------------- From lxml-checkins at codespeak.net Tue Feb 10 08:34:39 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Tue, 10 Feb 2009 08:34:39 +0100 (CET) Subject: [Lxml-checkins] Great Finds Message-ID: <20090210073439.0028F169F8A@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090210/5bc7c4b4/attachment.htm From lxml-checkins at codespeak.net Wed Feb 11 09:05:51 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 11 Feb 2009 09:05:51 +0100 (CET) Subject: [Lxml-checkins] Invoice from itunes.com Message-ID: <20090211080551.763D9169E92@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090211/01193658/attachment.htm From lxml-checkins at codespeak.net Sat Feb 14 16:47:02 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Sat, 14 Feb 2009 16:47:02 +0100 (CET) Subject: [Lxml-checkins] Jessica Alba leaked home video Message-ID: <20090214154702.C542916850B@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090214/882a736e/attachment.htm From scoder at codespeak.net Sat Feb 14 23:16:03 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 14 Feb 2009 23:16:03 +0100 (CET) Subject: [Lxml-checkins] r61892 - in lxml/trunk: . 
src/lxml Message-ID: <20090214221603.230531684EE@codespeak.net> Author: scoder Date: Sat Feb 14 23:16:01 2009 New Revision: 61892 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xmlid.pxi Log: r4997 at delle: sbehnel | 2009-02-14 15:43:39 +0100 removed non-working special method __cmp__ from _IDDict class Modified: lxml/trunk/src/lxml/xmlid.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlid.pxi (original) +++ lxml/trunk/src/lxml/xmlid.pxi Sat Feb 14 23:16:01 2009 @@ -105,12 +105,6 @@ def has_key(self, id_name): return id_name in self - def __cmp__(self, other): - if other is None: - return 1 - else: - return cmp(dict(self), other) - def __repr__(self): return repr(dict(self)) From scoder at codespeak.net Sat Feb 14 23:16:09 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 14 Feb 2009 23:16:09 +0100 (CET) Subject: [Lxml-checkins] r61893 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090214221609.332AC1684F0@codespeak.net> Author: scoder Date: Sat Feb 14 23:16:08 2009 New Revision: 61893 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/objectpath.pxi lxml/trunk/src/lxml/tests/test_io.py Log: r4998 at delle: sbehnel | 2009-02-14 23:13:57 +0100 fixes for Py3 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Feb 14 23:16:08 2009 @@ -11,6 +11,11 @@ * ``XSLT.strparam()`` class method to wrap quoted string parameters that require escaping. +Bugs fixed +---------- + +* Fixes following changes in Python 3.0.1. 
+ Other changes ------------- Modified: lxml/trunk/src/lxml/objectpath.pxi ============================================================================== --- lxml/trunk/src/lxml/objectpath.pxi (original) +++ lxml/trunk/src/lxml/objectpath.pxi Sat Feb 14 23:16:08 2009 @@ -105,10 +105,10 @@ break dot, ns, name, index = match.groups() - if index is None or python.PyUnicode_GET_SIZE(index) == 0: + if index is None or not index: index = 0 else: - index = python.PyNumber_Int(index) + index = int(index) has_dot = dot == u'.' if python.PyList_GET_SIZE(new_path) == 0: if has_dot: @@ -152,7 +152,7 @@ index_end = cstd.strchr(index_pos + 1, c']') if index_end is NULL: raise ValueError, u"index must be enclosed in []" - index = python.PyNumber_Int( + index = int( python.PyString_FromStringAndSize( index_pos + 1, (index_end - index_pos - 1))) if python.PyList_GET_SIZE(new_path) == 0 and index != 0: Modified: lxml/trunk/src/lxml/tests/test_io.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_io.py (original) +++ lxml/trunk/src/lxml/tests/test_io.py Sat Feb 14 23:16:08 2009 @@ -112,7 +112,7 @@ 'invalid_file.xml') try: self.tree.write(filename) - except IOError, e: + except IOError: pass else: self.assertTrue( From scoder at codespeak.net Sun Feb 15 13:08:40 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:08:40 +0100 (CET) Subject: [Lxml-checkins] r61927 - in lxml/trunk: . 
doc src/lxml src/lxml/html src/lxml/html/tests Message-ID: <20090215120840.7B6861684C1@codespeak.net> Author: scoder Date: Sun Feb 15 13:08:39 2009 New Revision: 61927 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt lxml/trunk/doc/tutorial.txt lxml/trunk/src/lxml/extensions.pxi lxml/trunk/src/lxml/html/__init__.py lxml/trunk/src/lxml/html/tests/test_autolink.txt lxml/trunk/src/lxml/html/tests/test_clean_embed.txt lxml/trunk/src/lxml/lxml.objectify.pyx Log: r5001 at delle: sbehnel | 2009-02-15 11:56:39 +0100 Py3 fixes Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Feb 15 13:08:39 2009 @@ -56,6 +56,26 @@ 7.3 How can I find out which namespace prefixes are used in a document? 7.4 How can I specify a default namespace for XPath expressions? +.. + >>> import sys + >>> from lxml import etree as _etree + >>> if sys.version_info[0] >= 3: + ... class etree_mock(object): + ... def __getattr__(self, name): return getattr(_etree, name) + ... def tostring(self, *args, **kwargs): + ... s = _etree.tostring(*args, **kwargs) + ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR + ... if s[-1] == '\n': s = s[:-1] + ... return s + ... else: + ... class etree_mock(object): + ... def __getattr__(self, name): return getattr(_etree, name) + ... def tostring(self, *args, **kwargs): + ... s = _etree.tostring(*args, **kwargs) + ... if s[-1] == '\n': s = s[:-1] + ... return s + >>> etree = etree_mock() + General Questions ================= @@ -248,7 +268,6 @@ .. sourcecode:: pycon - >>> from lxml import etree >>> root = etree.XML("texttail") >>> print(etree.tostring(root[0])) texttail @@ -292,7 +311,6 @@ .. 
sourcecode:: pycon - >>> from lxml import etree >>> root = etree.XML("") >>> root.tag Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Sun Feb 15 13:08:39 2009 @@ -932,7 +932,7 @@ >>> parser = etree.XMLParser(target=parser_target) >>> events = etree.fromstring('', parser) - >>> print parser_target.close_count + >>> print(parser_target.close_count) 1 >>> for event in events: @@ -949,13 +949,13 @@ .. sourcecode:: pycon >>> events = etree.fromstring('', parser) - >>> print parser_target.close_count + >>> print(parser_target.close_count) 2 >>> events = etree.fromstring('', parser) - >>> print parser_target.close_count + >>> print(parser_target.close_count) 3 >>> events = etree.fromstring('', parser) - >>> print parser_target.close_count + >>> print(parser_target.close_count) 4 >>> for event in events: Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Sun Feb 15 13:08:39 2009 @@ -59,7 +59,7 @@ # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function} new_extensions = {} for extension in extensions: - for (ns_uri, name), function in extension.iteritems(): + for (ns_uri, name), function in extension.items(): if name is None: raise ValueError, u"extensions must have non empty names" ns_utf = self._to_utf(ns_uri) Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Sun Feb 15 13:08:39 2009 @@ -1379,8 +1379,10 @@ # This isn't a general match, but it's a match for what libxml2 # specifically serialises: -__replace_meta_content_type = re.compile( +__str_replace_meta_content_type = re.compile( 
r']*>').sub +__bytes_replace_meta_content_type = re.compile( + r']*>'.encode('ASCII')).sub def tostring(doc, pretty_print=False, include_meta_content_type=False, encoding=None, method="html"): @@ -1423,7 +1425,10 @@ html = etree.tostring(doc, method=method, pretty_print=pretty_print, encoding=encoding) if not include_meta_content_type: - html = __replace_meta_content_type('', html) + if isinstance(html, str): + html = __str_replace_meta_content_type('', html) + else: + html = __bytes_replace_meta_content_type(bytes(), html) return html tostring.__doc__ = __fix_docstring(tostring.__doc__) Modified: lxml/trunk/src/lxml/html/tests/test_autolink.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_autolink.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_autolink.txt Sun Feb 15 13:08:39 2009 @@ -25,13 +25,13 @@ Parenthesis are tricky, we'll do our best:: - >>> print autolink_html(''' + >>> print(autolink_html(''' ...
(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))
- ... ''') + ... ''')) - >>> print autolink_html(''' + >>> print(autolink_html(''' ...
... a link: http://foo.com)
- ... ''') + ... '''))
... a link: http://foo.com)
Some cases that won't be caught (on purpose):: Modified: lxml/trunk/src/lxml/html/tests/test_clean_embed.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_clean_embed.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_clean_embed.txt Sun Feb 15 13:08:39 2009 @@ -11,21 +11,21 @@ ... ... ... ''' ->>> print tostring(fromstring(doc_embed)) +>>> print(tostring(fromstring(doc_embed)))
->>> print Cleaner().clean_html(doc_embed) +>>> print(Cleaner().clean_html(doc_embed))
->>> print Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed) +>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
->>> print Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed) +>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Sun Feb 15 13:08:39 2009 @@ -524,7 +524,7 @@ return else: cetree.delAttributeFromNsName( - element._c_node, _XML_SCHEMA_INSTANCE_NS, u"nil") + element._c_node, _XML_SCHEMA_INSTANCE_NS, "nil") if python._isString(value): pytype_name = u"str" _pytype = python.PyDict_GetItem(_PYTYPE_DICT, pytype_name) From scoder at codespeak.net Sun Feb 15 13:08:45 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:08:45 +0100 (CET) Subject: [Lxml-checkins] r61928 - in lxml/trunk: . src/lxml/tests Message-ID: <20090215120845.787241684CB@codespeak.net> Author: scoder Date: Sun Feb 15 13:08:44 2009 New Revision: 61928 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r5002 at delle: sbehnel | 2009-02-15 11:59:29 +0100 Py3 fixes Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Sun Feb 15 13:08:44 2009 @@ -3518,7 +3518,11 @@ def assertEncodingDeclaration(self, result, encoding): "Checks if the result XML byte string specifies the encoding." 
- has_encoding = re.compile(r"<\?xml[^>]+ encoding=[\"']([^\"']+)[\"']").match + enc_re = r"<\?xml[^>]+ encoding=[\"']([^\"']+)[\"']" + if isinstance(result, str): + has_encoding = re.compile(enc_re).match + else: + has_encoding = re.compile(_bytes(enc_re)).match self.assert_(has_encoding(result)) result_encoding = has_encoding(result).group(1) self.assertEquals(result_encoding.upper(), encoding.upper()) From scoder at codespeak.net Sun Feb 15 13:08:50 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:08:50 +0100 (CET) Subject: [Lxml-checkins] r61929 - in lxml/trunk: . doc Message-ID: <20090215120850.179BF1684CC@codespeak.net> Author: scoder Date: Sun Feb 15 13:08:49 2009 New Revision: 61929 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5003 at delle: sbehnel | 2009-02-15 12:01:56 +0100 Py3 fixes Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Feb 15 13:08:49 2009 @@ -64,7 +64,7 @@ ... def __getattr__(self, name): return getattr(_etree, name) ... def tostring(self, *args, **kwargs): ... s = _etree.tostring(*args, **kwargs) - ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR + ... if isinstance(s, bytes): s = s.decode("utf-8") # CR ... if s[-1] == '\n': s = s[:-1] ... return s ... else: From scoder at codespeak.net Sun Feb 15 13:08:56 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:08:56 +0100 (CET) Subject: [Lxml-checkins] r61930 - in lxml/trunk: . 
src/lxml Message-ID: <20090215120856.5E1901684C1@codespeak.net> Author: scoder Date: Sun Feb 15 13:08:54 2009 New Revision: 61930 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/xslt.pxi Log: r5004 at delle: sbehnel | 2009-02-15 12:51:16 +0100 Py3 fixes Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Sun Feb 15 13:08:54 2009 @@ -1927,7 +1927,7 @@ _xsi = u'xsd:' + _xsi else: name = _xsi - for prefix, ns in nsmap.iteritems(): + for prefix, ns in nsmap.items(): if ns == XML_SCHEMA_NS: if prefix is not None and prefix: _xsi = prefix + u':' + _xsi Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Sun Feb 15 13:08:54 2009 @@ -822,10 +822,10 @@ cdef xmlAttr* c_attr if self._c_node.content is NULL: raise ValueError, u"PI lacks content" - hrefs_utf = _FIND_PI_HREF(' ' + self._c_node.content) - if len(hrefs_utf) != 1: + hrefs = _FIND_PI_HREF(u' ' + funicode(self._c_node.content)) + if len(hrefs) != 1: raise ValueError, u"malformed PI attributes" - href_utf = hrefs_utf[0] + href_utf = utf8(hrefs[0]) c_href = _cstr(href_utf) if c_href[0] != c'#': From scoder at codespeak.net Sun Feb 15 13:09:01 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:09:01 +0100 (CET) Subject: [Lxml-checkins] r61931 - in lxml/trunk: . 
src/lxml Message-ID: <20090215120901.0F0801684CA@codespeak.net> Author: scoder Date: Sun Feb 15 13:09:00 2009 New Revision: 61931 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx Log: r5005 at delle: sbehnel | 2009-02-15 13:03:00 +0100 unicode fix Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Sun Feb 15 13:09:00 2009 @@ -228,8 +228,7 @@ ElementBase.tag.__set__(self, value) return elif tag == u'base': - c_base = _cstr(value) - tree.xmlNodeSetBase(self._c_node, c_base) + ElementBase.base.__set__(self, value) return tag = _buildChildTag(self, tag) element = _lookupChild(self, tag) From scoder at codespeak.net Sun Feb 15 13:09:05 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:09:05 +0100 (CET) Subject: [Lxml-checkins] r61932 - in lxml/trunk: . src/lxml/tests Message-ID: <20090215120905.CD6991684CB@codespeak.net> Author: scoder Date: Sun Feb 15 13:09:05 2009 New Revision: 61932 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_threading.py Log: r5006 at delle: sbehnel | 2009-02-15 13:03:34 +0100 Py3 fix Modified: lxml/trunk/src/lxml/tests/test_threading.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_threading.py (original) +++ lxml/trunk/src/lxml/tests/test_threading.py Sun Feb 15 13:09:05 2009 @@ -122,7 +122,8 @@ errors = None try: XML(xml, parser) - except self.etree.ParseError, e: + except self.etree.ParseError: + e = sys.exc_info()[1] errors = e.error_log.filter_types(expected_error) self.assertTrue(errors, "Expected error not found") for error in errors: From scoder at codespeak.net Sun Feb 15 13:11:51 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:11:51 +0100 (CET) Subject: [Lxml-checkins] 
r61933 - lxml/trunk Message-ID: <20090215121151.7B1281684C1@codespeak.net> Author: scoder Date: Sun Feb 15 13:11:50 2009 New Revision: 61933 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5013 at delle: sbehnel | 2009-02-15 13:09:42 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Feb 15 13:11:50 2009 @@ -14,8 +14,13 @@ Bugs fixed ---------- +* Setting the ``base`` attribute in ``lxml.objectify`` from a unicode + string failed. + * Fixes following changes in Python 3.0.1. +* Minor fixes for Python 3. + Other changes ------------- From scoder at codespeak.net Sun Feb 15 13:14:11 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 15 Feb 2009 13:14:11 +0100 (CET) Subject: [Lxml-checkins] r61934 - lxml/trunk Message-ID: <20090215121411.C5B1D1684C1@codespeak.net> Author: scoder Date: Sun Feb 15 13:14:11 2009 New Revision: 61934 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5015 at delle: sbehnel | 2009-02-15 13:12:12 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Feb 15 13:14:11 2009 @@ -25,7 +25,7 @@ ------------- * The global error log (which is copied into the exception log) is now - local to a thread. + local to a thread, which fixes some race conditions. * More robust error handling on serialisation. From scoder at codespeak.net Tue Feb 17 22:23:13 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:23:13 +0100 (CET) Subject: [Lxml-checkins] r61982 - in lxml/trunk: . 
src/lxml Message-ID: <20090217212313.B1C6E168550@codespeak.net> Author: scoder Date: Tue Feb 17 22:23:13 2009 New Revision: 61982 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/proxy.pxi Log: r5017 at delle: sbehnel | 2009-02-17 22:08:37 +0100 fix multi-threading crash bug Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Tue Feb 17 22:23:13 2009 @@ -192,7 +192,7 @@ */ #define _LX__ELEMENT_MATCH(c_node, only_elements) \ - ((only_elements) ? (_isElement(c_node)) : ((c_node)->type != XML_TEXT_NODE)) + ((only_elements) ? (_isElement(c_node)) : 1) #define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \ while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \ Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Tue Feb 17 22:23:13 2009 @@ -386,13 +386,14 @@ cdef void fixElementDocument(xmlNode* c_element, _Document doc, cstd.size_t proxy_count): - tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1) - if c_element._private is not NULL: - _updateProxyDocument(c_element, doc) + cdef xmlNode* c_node = c_element + tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1) + if c_node._private is not NULL: + _updateProxyDocument(c_node, doc) proxy_count -= 1 if proxy_count == 0: return - tree.END_FOR_EACH_FROM(c_element) + tree.END_FOR_EACH_FROM(c_node) cdef void fixThreadDictNames(xmlNode* c_element, tree.xmlDict* c_src_dict, @@ -416,13 +417,17 @@ cdef void fixThreadDictNamesForNode(xmlNode* c_element, tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: - tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1) - if c_element.name is not NULL: - fixThreadDictNameForNode(c_element, c_dict) - if c_element.type == tree.XML_ELEMENT_NODE: + cdef 
xmlNode* c_node = c_element + tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1) + if c_node.name is not NULL: + fixThreadDictNameForNode(c_node, c_src_dict, c_dict) + if c_node.type == tree.XML_ELEMENT_NODE: fixThreadDictNamesForAttributes( - c_element.properties, c_src_dict, c_dict) - tree.END_FOR_EACH_FROM(c_element) + c_node.properties, c_src_dict, c_dict) + elif c_node.type == tree.XML_TEXT_NODE: + # libxml2's SAX2 parser interns some indentation space + fixThreadDictContentForNode(c_node, c_src_dict, c_dict) + tree.END_FOR_EACH_FROM(c_node) cdef inline void fixThreadDictNamesForAttributes(tree.xmlAttr* c_attr, tree.xmlDict* c_src_dict, @@ -430,32 +435,34 @@ cdef xmlNode* c_child cdef xmlNode* c_node = c_attr while c_node is not NULL: - fixThreadDictNameForNode(c_node, c_dict) + fixThreadDictNameForNode(c_node, c_src_dict, c_dict) fixThreadDictContentForNode(c_node, c_src_dict, c_dict) # libxml2 keeps some (!) attribute values in the dict c_child = c_node.children while c_child is not NULL: - fixThreadDictNameForNode(c_child, c_dict) + fixThreadDictNameForNode(c_child, c_src_dict, c_dict) fixThreadDictContentForNode(c_child, c_src_dict, c_dict) c_child = c_child.next c_node = c_node.next cdef inline void fixThreadDictNameForNode(xmlNode* c_node, + tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: cdef char* c_name = c_node.name if c_name is not NULL and \ c_node.type != tree.XML_TEXT_NODE and \ c_node.type != tree.XML_COMMENT_NODE: - # c_name can be NULL on memory error, but we don't handle that here - c_name = tree.xmlDictLookup(c_dict, c_name, -1) - if c_name is not NULL: - c_node.name = c_name + if tree.xmlDictOwns(c_src_dict, c_node.name): + # c_name can be NULL on memory error, but we don't handle that here + c_name = tree.xmlDictLookup(c_dict, c_name, -1) + if c_name is not NULL: + c_node.name = c_name cdef inline void fixThreadDictContentForNode(xmlNode* c_node, tree.xmlDict* c_src_dict, tree.xmlDict* c_dict) nogil: if c_node.content is not NULL and \ 
- c_node.content is not c_node.properties: + c_node.content is not &c_node.properties: if tree.xmlDictOwns(c_src_dict, c_node.content): # result can be NULL on memory error, but we don't handle that here c_node.content = tree.xmlDictLookup(c_dict, c_node.content, -1) From scoder at codespeak.net Tue Feb 17 22:23:18 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:23:18 +0100 (CET) Subject: [Lxml-checkins] r61983 - in lxml/trunk: . src/lxml/tests Message-ID: <20090217212318.CE459168553@codespeak.net> Author: scoder Date: Tue Feb 17 22:23:18 2009 New Revision: 61983 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_threading.py Log: r5018 at delle: sbehnel | 2009-02-17 22:09:11 +0100 test cases based on thread pipelines Modified: lxml/trunk/src/lxml/tests/test_threading.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_threading.py (original) +++ lxml/trunk/src/lxml/tests/test_threading.py Tue Feb 17 22:23:18 2009 @@ -12,6 +12,11 @@ from common_imports import etree, HelperTestCase, BytesIO, _bytes +try: + from Queue import Queue +except ImportError: + from queue import Queue # Py3 + class ThreadingTestCase(HelperTestCase): """Threading tests""" etree = etree @@ -252,9 +257,136 @@ for thread in threads: thread.join() + +class ThreadPipelineTestCase(HelperTestCase): + """Threading tests based on a thread worker pipeline. 
+ """ + etree = etree + item_count = 20 + + class Worker(threading.Thread): + def __init__(self, in_queue, in_count, **kwargs): + threading.Thread.__init__(self) + self.in_queue = in_queue + self.in_count = in_count + self.out_queue = Queue(in_count) + self.__dict__.update(kwargs) + def run(self): + get, put = self.in_queue.get, self.out_queue.put + handle = self.handle + for _ in range(self.in_count): + put(handle(get())) + + class ParseWorker(Worker): + XML = etree.XML + def handle(self, xml): + return self.XML(xml) + class RotateWorker(Worker): + def handle(self, element): + first = element[0] + element[:] = element[1:] + element.append(first) + return element + class ReverseWorker(Worker): + def handle(self, element): + element[:] = element[::-1] + return element + class ParseAndExtendWorker(Worker): + XML = etree.XML + def handle(self, element): + element.extend(self.XML(self.xml)) + return element + class SerialiseWorker(Worker): + def handle(self, element): + return etree.tostring(element) + + xml = _bytes('''\ + + + +
+ +
+
+
''') + + def _build_pipeline(self, item_count, *classes, **kwargs): + in_queue = Queue(item_count) + start = last = classes[0](in_queue, item_count, **kwargs) + start.setDaemon(True) + for worker_class in classes[1:]: + last = worker_class(last.out_queue, item_count, **kwargs) + last.setDaemon(True) + last.start() + return (in_queue, start, last) + + def test_thread_pipeline_thread_parse(self): + item_count = self.item_count + # build and start the pipeline + in_queue, start, last = self._build_pipeline( + item_count, + self.ParseWorker, + self.RotateWorker, + self.ReverseWorker, + self.ParseAndExtendWorker, + self.SerialiseWorker, + xml = self.xml) + + # fill the queue + put = start.in_queue.put + for _ in range(item_count): + put(self.xml) + + # start the first thread and thus everything + start.start() + # make sure the last thread has terminated + last.join(60) # time out after 60 seconds + self.assertEquals(item_count, last.out_queue.qsize()) + # read the results + get = last.out_queue.get + results = [ get() for _ in range(item_count) ] + + comparison = results[0] + for i, result in enumerate(results[1:]): + self.assertEquals(comparison, result) + + def test_thread_pipeline_global_parse(self): + item_count = self.item_count + XML = self.etree.XML + # build and start the pipeline + in_queue, start, last = self._build_pipeline( + item_count, + self.RotateWorker, + self.ReverseWorker, + self.ParseAndExtendWorker, + self.SerialiseWorker, + xml = self.xml) + + # fill the queue + put = start.in_queue.put + for _ in range(item_count): + put(XML(self.xml)) + + # start the first thread and thus everything + start.start() + # make sure the last thread has terminated + last.join(60) # time out after 90 seconds + self.assertEquals(item_count, last.out_queue.qsize()) + # read the results + get = last.out_queue.get + results = [ get() for _ in range(item_count) ] + + comparison = results[0] + for i, result in enumerate(results[1:]): + self.assertEquals(comparison, result) 
+ + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ThreadingTestCase)]) + suite.addTests([unittest.makeSuite(ThreadPipelineTestCase)]) return suite if __name__ == '__main__': From scoder at codespeak.net Tue Feb 17 22:23:23 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:23:23 +0100 (CET) Subject: [Lxml-checkins] r61984 - in lxml/trunk: . src/lxml Message-ID: <20090217212323.4164416855B@codespeak.net> Author: scoder Date: Tue Feb 17 22:23:22 2009 New Revision: 61984 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/xpath.pxi Log: r5019 at delle: sbehnel | 2009-02-17 22:20:40 +0100 fix memory leak in XPath evaluators Modified: lxml/trunk/src/lxml/xpath.pxi ============================================================================== --- lxml/trunk/src/lxml/xpath.pxi (original) +++ lxml/trunk/src/lxml/xpath.pxi Tue Feb 17 22:23:22 2009 @@ -128,6 +128,9 @@ def __dealloc__(self): if self._xpathCtxt is not NULL: xpath.xmlXPathFreeContext(self._xpathCtxt) + if config.ENABLE_THREADING: + if self._eval_lock is not NULL: + python.PyThread_free_lock(self._eval_lock) cdef set_context(self, xpath.xmlXPathContext* xpathCtxt): self._xpathCtxt = xpathCtxt From scoder at codespeak.net Tue Feb 17 22:23:27 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:23:27 +0100 (CET) Subject: [Lxml-checkins] r61985 - lxml/trunk Message-ID: <20090217212327.C4C45168566@codespeak.net> Author: scoder Date: Tue Feb 17 22:23:26 2009 New Revision: 61985 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5020 at delle: sbehnel | 2009-02-17 22:20:50 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Feb 17 22:23:26 2009 @@ -14,6 +14,11 @@ Bugs fixed ---------- +* Memory leak in XPath evaluators. 
+ +* Crash when parsing indented XML in one thread and merging it with + other documents parsed in another thread. + * Setting the ``base`` attribute in ``lxml.objectify`` from a unicode string failed. From scoder at codespeak.net Tue Feb 17 22:23:31 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:23:31 +0100 (CET) Subject: [Lxml-checkins] r61986 - in lxml/trunk: . doc Message-ID: <20090217212331.F02B1168566@codespeak.net> Author: scoder Date: Tue Feb 17 22:23:31 2009 New Revision: 61986 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5021 at delle: sbehnel | 2009-02-17 22:21:06 +0100 FAQ note on threading: use 2.2+ Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Tue Feb 17 22:23:31 2009 @@ -612,7 +612,7 @@ Can I use threads to concurrently access the lxml API? ------------------------------------------------------ -Short answer: yes, if you use lxml 2.1 and later. +Short answer: yes, if you use lxml 2.2 and later. Since version 1.1, lxml frees the GIL (Python's global interpreter lock) internally when parsing from disk and memory, as long as you use @@ -634,14 +634,14 @@ One way to achieve this is by caching stylesheets in thread-local storage. -Warning: Before lxml 2.1, there were issues when moving subtrees -between different threads. If you need code to run with older -versions, you should generally avoid modifying trees in other threads -than the one it was generated in. Although this should work in many -cases, there are certain scenarios where the termination of a thread -that parsed a tree can crash the application if subtrees of this tree -were moved to other documents. You should be on the safe side when -passing trees between threads if you either +Warning: Before lxml 2.2, there were various issues when moving +subtrees between different threads. 
If you need code to run with +older versions, you should generally avoid modifying trees in other +threads than the one it was generated in. Although this should work +in many cases, there are certain scenarios where the termination of a +thread that parsed a tree can crash the application if subtrees of +this tree were moved to other documents. You should be on the safe +side when passing trees between threads if you either - do not modify these trees and do not move their elements to other trees, or From scoder at codespeak.net Tue Feb 17 22:33:59 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:33:59 +0100 (CET) Subject: [Lxml-checkins] r61987 - in lxml/trunk: . doc Message-ID: <20090217213359.61CC9168513@codespeak.net> Author: scoder Date: Tue Feb 17 22:33:58 2009 New Revision: 61987 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt lxml/trunk/version.txt Log: prepare release of lxml 2.2beta3 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Feb 17 22:33:58 2009 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.2beta3 (2009-02-17) +===================== Features added -------------- Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Tue Feb 17 22:33:58 2009 @@ -147,8 +147,8 @@ source release. If you can't wait, consider trying a less recent release version first. -The latest version is `lxml 2.2beta2`_, released 2009-01-25 -(`changes for 2.2beta2`_). `Older versions`_ are listed below. +The latest version is `lxml 2.2beta3`_, released 2009-02-17 +(`changes for 2.2beta3`_). `Older versions`_ are listed below. Please take a look at the `installation instructions`_! 
@@ -220,7 +220,9 @@ `2.0 `_ and the `current in-development version `_. -.. _`PDF documentation`: lxmldoc-2.2beta2.pdf +.. _`PDF documentation`: lxmldoc-2.2beta3.pdf + +* `lxml 2.2beta2`_, released 2009-01-25 (`changes for 2.2beta2`_) * `lxml 2.2beta1`_, released 2008-12-12 (`changes for 2.2beta1`_) @@ -312,6 +314,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.2beta3`: lxml-2.2beta3.tgz .. _`lxml 2.2beta2`: lxml-2.2beta2.tgz .. _`lxml 2.2beta1`: lxml-2.2beta1.tgz .. _`lxml 2.2alpha1`: lxml-2.2alpha1.tgz @@ -358,6 +361,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.2beta3`: changes-2.2beta3.html .. _`changes for 2.2beta2`: changes-2.2beta2.html .. _`changes for 2.2beta1`: changes-2.2beta1.html .. _`changes for 2.2alpha1`: changes-2.2alpha1.html Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Feb 17 22:33:58 2009 @@ -1 +1 @@ -2.2beta2 +2.2beta3 From scoder at codespeak.net Tue Feb 17 22:34:35 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 17 Feb 2009 22:34:35 +0100 (CET) Subject: [Lxml-checkins] r61988 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20090217213435.EA693168514@codespeak.net> Author: scoder Date: Tue Feb 17 22:34:35 2009 New Revision: 61988 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_threading.py Log: r5029 at delle: sbehnel | 2009-02-17 22:32:32 +0100 test fix Modified: lxml/trunk/src/lxml/tests/test_threading.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_threading.py (original) +++ lxml/trunk/src/lxml/tests/test_threading.py Tue Feb 17 22:34:35 2009 @@ -130,9 +130,9 @@ except self.etree.ParseError: e = sys.exc_info()[1] errors = e.error_log.filter_types(expected_error) - self.assertTrue(errors, "Expected error not found") + self.assert_(errors, "Expected error not found") for error in errors: - self.assertTrue( + self.assert_( tag in error.message and tag.upper() in error.message, "%s and %s not found in '%s'" % ( tag, tag.upper(), error.message)) From scoder at codespeak.net Thu Feb 19 22:49:47 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 19 Feb 2009 22:49:47 +0100 (CET) Subject: [Lxml-checkins] r62032 - in lxml/trunk: . src/lxml Message-ID: <20090219214947.1F342169E9D@codespeak.net> Author: scoder Date: Thu Feb 19 22:49:45 2009 New Revision: 62032 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/proxy.pxi Log: r5031 at delle: sbehnel | 2009-02-19 22:46:16 +0100 tiny optimisation Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Thu Feb 19 22:49:45 2009 @@ -436,11 +436,9 @@ cdef xmlNode* c_node = c_attr while c_node is not NULL: fixThreadDictNameForNode(c_node, c_src_dict, c_dict) - fixThreadDictContentForNode(c_node, c_src_dict, c_dict) # libxml2 keeps some (!) 
attribute values in the dict c_child = c_node.children while c_child is not NULL: - fixThreadDictNameForNode(c_child, c_src_dict, c_dict) fixThreadDictContentForNode(c_child, c_src_dict, c_dict) c_child = c_child.next c_node = c_node.next From scoder at codespeak.net Thu Feb 19 22:49:52 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 19 Feb 2009 22:49:52 +0100 (CET) Subject: [Lxml-checkins] r62033 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090219214952.82ED2169E9E@codespeak.net> Author: scoder Date: Thu Feb 19 22:49:52 2009 New Revision: 62033 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/serializer.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5032 at delle: sbehnel | 2009-02-19 22:47:33 +0100 gzip compatible output compression when writing to files and file-like objects Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Feb 19 22:49:52 2009 @@ -2,6 +2,16 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +* Zlib compression support for serialisation operations to file(-like) + objects. 
+ + 2.2beta3 (2009-02-17) ===================== Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Thu Feb 19 22:49:52 2009 @@ -44,6 +44,9 @@ cdef object re import re +cdef object gzip +import gzip + cdef object ITER_EMPTY ITER_EMPTY = iter(()) @@ -1596,16 +1599,20 @@ return None def write(self, file, *, encoding=None, method=u"xml", - pretty_print=False, xml_declaration=None, with_tail=True): + pretty_print=False, xml_declaration=None, with_tail=True, + compression=0): u"""write(self, file, encoding=None, method="xml", - pretty_print=False, xml_declaration=None, with_tail=True) + pretty_print=False, xml_declaration=None, with_tail=True, + compression=0) - Write the tree to a file or file-like object. + Write the tree to a filename, file or file-like object. Defaults to ASCII encoding and writing a declaration as needed. The keyword argument 'method' selects the output method: 'xml' or 'html'. + + The ``compression`` option enables GZip compression level 1-9. 
""" cdef bint write_declaration self._assertHasRoot() @@ -1621,8 +1628,10 @@ encoding = encoding.upper() write_declaration = encoding not in \ (u'US-ASCII', u'ASCII', u'UTF8', u'UTF-8') + if compression is None or compression < 0: + compression = 0 _tofilelike(file, self._context_node, encoding, method, - write_declaration, 1, pretty_print, with_tail) + write_declaration, 1, pretty_print, with_tail, compression) def getpath(self, _Element element not None): u"""getpath(self, element) @@ -1841,13 +1850,20 @@ self._assertHasRoot() XInclude()(self._context_node) - def write_c14n(self, file, *, exclusive=False, with_comments=True): - u"""write_c14n(self, file, exclusive=False, with_comments=True) + def write_c14n(self, file, *, exclusive=False, with_comments=True, + compression=0): + u"""write_c14n(self, file, exclusive=False, with_comments=True, + compression=0) C14N write of document. Always writes UTF-8. + + The ``compression`` option enables GZip compression level 1-9. """ self._assertHasRoot() - _tofilelikeC14N(file, self._context_node, exclusive, with_comments) + if compression is None or compression < 0: + compression = 0 + _tofilelikeC14N(file, self._context_node, exclusive, with_comments, + compression) cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node): return _newElementTree(doc, context_node, _ElementTree) Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Thu Feb 19 22:49:52 2009 @@ -134,7 +134,9 @@ tree.xmlBufferContent(c_result_buffer), tree.xmlBufferLength(c_result_buffer)) finally: - tree.xmlOutputBufferClose(c_buffer) + error_result = tree.xmlOutputBufferClose(c_buffer) + if error_result < 0: + _raiseSerialisationError(error_result) return result cdef _raiseSerialisationError(int error_result): @@ -294,9 +296,14 @@ cdef class _FilelikeWriter: cdef object _filelike + 
cdef object _close_filelike cdef _ExceptionContext _exc_context cdef _ErrorLog error_log - def __init__(self, filelike, exc_context=None): + def __init__(self, filelike, exc_context=None, compression=None): + if compression is not None and compression > 0: + filelike = gzip.GzipFile( + fileobj=filelike, mode='wb', compresslevel=compression) + self._close_filelike = filelike.close self._filelike = filelike if exc_context is None: self._exc_context = _ExceptionContext() @@ -326,9 +333,15 @@ return -1 cdef int close(self): - # we should not close the file here as we didn't open it - self._filelike = None - return 0 + try: + if self._close_filelike is not None: + self._close_filelike() + # we should not close the file here as we didn't open it + self._filelike = None + return 0 + except: + self._exc_context._store_raised() + return -1 cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int len): return (<_FilelikeWriter>ctxt).write(c_buffer, len) @@ -338,7 +351,7 @@ cdef _tofilelike(f, _Element element, encoding, method, bint write_xml_declaration, bint write_doctype, - bint pretty_print, bint with_tail): + bint pretty_print, bint with_tail, int compression): cdef python.PyThreadState* state = NULL cdef _FilelikeWriter writer cdef tree.xmlOutputBuffer* c_buffer @@ -352,15 +365,25 @@ c_enc = _cstr(encoding) c_method = _findOutputMethod(method) if c_method == OUTPUT_METHOD_TEXT: + data = _textToString(element._c_node, encoding, with_tail) + if compression: + bytes_out = BytesIO() + gzip_file = gzip.GzipFile( + fileobj=bytes_out, mode='wb', compresslevel=compression) + try: + gzip_file.write(data) + finally: + gzip_file.close() + data = bytes_out if _isString(f): filename8 = _encodeFilename(f) f = open(filename8, u'wb') try: - f.write(_textToString(element._c_node, encoding, with_tail)) + f.write(data) finally: f.close() else: - f.write(_textToString(element._c_node, encoding, with_tail)) + f.write(data) return enchandler = tree.xmlFindCharEncodingHandler(c_enc) if 
enchandler is NULL: @@ -371,12 +394,12 @@ if _isString(f): filename8 = _encodeFilename(f) c_buffer = tree.xmlOutputBufferCreateFilename( - _cstr(filename8), enchandler, 0) + _cstr(filename8), enchandler, compression) if c_buffer is NULL: return python.PyErr_SetFromErrno(IOError) state = python.PyEval_SaveThread() elif hasattr(f, u'write'): - writer = _FilelikeWriter(f) + writer = _FilelikeWriter(f, compression=compression) c_buffer = writer._createOutputBuffer(enchandler) else: tree.xmlCharEncCloseFunc(enchandler) @@ -387,7 +410,12 @@ write_xml_declaration, write_doctype, pretty_print, with_tail) error_result = c_buffer.error - tree.xmlOutputBufferClose(c_buffer) + if error_result == xmlerror.XML_ERR_OK: + error_result = tree.xmlOutputBufferClose(c_buffer) + if error_result > 0: + error_result = xmlerror.XML_ERR_OK + else: + tree.xmlOutputBufferClose(c_buffer) if writer is None: python.PyEval_RestoreThread(state) else: @@ -395,7 +423,8 @@ if error_result != xmlerror.XML_ERR_OK: _raiseSerialisationError(error_result) -cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments): +cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments, + int compression): cdef _FilelikeWriter writer cdef tree.xmlOutputBuffer* c_buffer cdef char* c_filename @@ -411,15 +440,18 @@ c_filename = _cstr(filename8) with nogil: bytes = c14n.xmlC14NDocSave(c_doc, NULL, exclusive, NULL, - with_comments, c_filename, 0) + with_comments, c_filename, compression) elif hasattr(f, u'write'): - writer = _FilelikeWriter(f) + writer = _FilelikeWriter(f, compression=compression) c_buffer = writer._createOutputBuffer(NULL) writer.error_log.connect() bytes = c14n.xmlC14NDocSaveTo(c_doc, NULL, exclusive, NULL, with_comments, c_buffer) writer.error_log.disconnect() - tree.xmlOutputBufferClose(c_buffer) + if bytes >= 0: + bytes = tree.xmlOutputBufferClose(c_buffer) + else: + tree.xmlOutputBufferClose(c_buffer) else: raise TypeError, \ u"File or filename expected, got 
'%s'" % funicode(python._fqtypename(f)) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Feb 19 22:49:52 2009 @@ -7,7 +7,7 @@ test_elementtree """ -import os.path, unittest, copy, sys, operator, tempfile +import os.path, unittest, copy, sys, operator, tempfile, gzip this_dir = os.path.dirname(__file__) if this_dir not in sys.path: @@ -2457,14 +2457,16 @@ # helper methods - def _writeElement(self, element, encoding='us-ascii'): + def _writeElement(self, element, encoding='us-ascii', compression=0): """Write out element for comparison. """ ElementTree = self.etree.ElementTree f = BytesIO() tree = ElementTree(element=element) - tree.write(f, encoding=encoding) + tree.write(f, encoding=encoding, compression=compression) data = f.getvalue() + if compression: + data = zlib.decompress(data) return canonicalize(data) @@ -2547,6 +2549,14 @@ self.assertEquals(_bytes(''), s) + def test_c14n_gzip(self): + tree = self.parse(_bytes(''+''*200+'')) + f = BytesIO() + tree.write_c14n(f, compression=9) + s = gzip.GzipFile(fileobj=BytesIO(f.getvalue())).read() + self.assertEquals(_bytes(''+''*200+''), + s) + def test_c14n_file(self): tree = self.parse(_bytes('')) handle, filename = tempfile.mkstemp() @@ -2561,6 +2571,20 @@ self.assertEquals(_bytes(''), data) + def test_c14n_file_gzip(self): + tree = self.parse(_bytes(''+''*200+'')) + handle, filename = tempfile.mkstemp() + try: + tree.write_c14n(filename, compression=9) + f = gzip.open(filename, 'rb') + data = f.read() + f.close() + finally: + os.close(handle) + os.remove(filename) + self.assertEquals(_bytes(''+''*200+''), + data) + def test_c14n_with_comments(self): tree = self.parse(_bytes('')) f = BytesIO() @@ -2598,12 +2622,114 @@ self.assertEquals(_bytes(''), s) + +class ETreeWriteTestCase(HelperTestCase): + def test_write(self): + tree = 
self.parse(_bytes('')) + f = BytesIO() + tree.write(f) + s = f.getvalue() + self.assertEquals(_bytes(''), + s) + + def test_write_gzip(self): + tree = self.parse(_bytes(''+''*200+'')) + f = BytesIO() + tree.write(f, compression=9) + s = gzip.GzipFile(fileobj=BytesIO(f.getvalue())).read() + self.assertEquals(_bytes(''+''*200+''), + s) + + def test_write_gzip_level(self): + tree = self.parse(_bytes(''+''*200+'')) + f = BytesIO() + tree.write(f, compression=0) + s0 = f.getvalue() + + f = BytesIO() + tree.write(f) + self.assertEquals(f.getvalue(), s0) + + f = BytesIO() + tree.write(f, compression=1) + s = f.getvalue() + self.assert_(len(s) <= len(s0)) + s1 = gzip.GzipFile(fileobj=BytesIO(s)).read() + + f = BytesIO() + tree.write(f, compression=9) + s = f.getvalue() + self.assert_(len(s) <= len(s0)) + s9 = gzip.GzipFile(fileobj=BytesIO(s)).read() + + self.assertEquals(_bytes(''+''*200+''), + s0) + self.assertEquals(_bytes(''+''*200+''), + s1) + self.assertEquals(_bytes(''+''*200+''), + s9) + + def test_write_file(self): + tree = self.parse(_bytes('')) + handle, filename = tempfile.mkstemp() + try: + tree.write(filename) + f = open(filename, 'rb') + data = f.read() + f.close() + finally: + os.close(handle) + os.remove(filename) + self.assertEquals(_bytes(''), + data) + + def test_write_file_gzip(self): + tree = self.parse(_bytes(''+''*200+'')) + handle, filename = tempfile.mkstemp() + try: + tree.write(filename, compression=9) + f = gzip.open(filename, 'rb') + data = f.read() + f.close() + finally: + os.close(handle) + os.remove(filename) + self.assertEquals(_bytes(''+''*200+''), + data) + + def test_write_file_gzip_parse(self): + tree = self.parse(_bytes(''+''*200+'')) + handle, filename = tempfile.mkstemp() + try: + tree.write(filename, compression=9) + data = etree.tostring(etree.parse(filename)) + finally: + os.close(handle) + os.remove(filename) + self.assertEquals(_bytes(''+''*200+''), + data) + + def test_write_file_gzipfile_parse(self): + tree = 
self.parse(_bytes(''+''*200+'')) + handle, filename = tempfile.mkstemp() + try: + tree.write(filename, compression=9) + data = etree.tostring(etree.parse( + gzip.GzipFile(filename))) + finally: + os.close(handle) + os.remove(filename) + self.assertEquals(_bytes(''+''*200+''), + data) + + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ETreeOnlyTestCase)]) suite.addTests([unittest.makeSuite(ETreeXIncludeTestCase)]) suite.addTests([unittest.makeSuite(ElementIncludeTestCase)]) suite.addTests([unittest.makeSuite(ETreeC14NTestCase)]) + suite.addTests([unittest.makeSuite(ETreeWriteTestCase)]) suite.addTests( [make_doctest('../../../doc/tutorial.txt')]) suite.addTests( From scoder at codespeak.net Fri Feb 20 19:30:47 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 19:30:47 +0100 (CET) Subject: [Lxml-checkins] r62055 - lxml/trunk Message-ID: <20090220183047.8FCE0169E39@codespeak.net> Author: scoder Date: Fri Feb 20 19:30:45 2009 New Revision: 62055 Modified: lxml/trunk/ (props changed) lxml/trunk/setupinfo.py Log: r5035 at delle: sbehnel | 2009-02-20 19:19:36 +0100 allow showing compiler warning during build Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Fri Feb 20 19:30:45 2009 @@ -91,6 +91,9 @@ else: runtime_library_dirs = [] + if not OPTION_SHOW_WARNINGS: + _cflags = ['-w'] + _cflags + result = [] for module in modules: main_module_source = PACKAGE_PATH + module + source_extension @@ -99,7 +102,7 @@ Extension( module, sources = [main_module_source] + dependencies, - extra_compile_args = ['-w'] + _cflags, + extra_compile_args = _cflags, extra_objects = static_binaries, define_macros = _define_macros, include_dirs = _include_dirs, @@ -327,6 +330,7 @@ CYTHON_INSTALLED = False OPTION_STATIC = has_option('static') OPTION_DEBUG_GCC = has_option('debug-gcc') 
+OPTION_SHOW_WARNINGS = has_option('warnings') OPTION_AUTO_RPATH = has_option('auto-rpath') OPTION_BUILD_LIBXML2XSLT = has_option('static-deps') if OPTION_BUILD_LIBXML2XSLT: From scoder at codespeak.net Fri Feb 20 19:30:51 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 19:30:51 +0100 (CET) Subject: [Lxml-checkins] r62056 - in lxml/trunk: . src/lxml Message-ID: <20090220183051.E85AB169E4A@codespeak.net> Author: scoder Date: Fri Feb 20 19:30:50 2009 New Revision: 62056 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/_elementpath.py Log: r5036 at delle: sbehnel | 2009-02-20 19:21:16 +0100 return empty string instead of None for empty element text content from el.findtext() Modified: lxml/trunk/src/lxml/_elementpath.py ============================================================================== --- lxml/trunk/src/lxml/_elementpath.py (original) +++ lxml/trunk/src/lxml/_elementpath.py Fri Feb 20 19:30:50 2009 @@ -232,4 +232,4 @@ if el is None: return default else: - return el.text + return el.text or '' From scoder at codespeak.net Fri Feb 20 19:30:56 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 19:30:56 +0100 (CET) Subject: [Lxml-checkins] r62057 - in lxml/trunk: . 
doc src/lxml src/lxml/tests Message-ID: <20090220183056.56711169E4B@codespeak.net> Author: scoder Date: Fri Feb 20 19:30:55 2009 New Revision: 62057 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/element_classes.txt lxml/trunk/src/lxml/classlookup.pxi lxml/trunk/src/lxml/tests/test_nsclasses.py Log: r5037 at delle: sbehnel | 2009-02-20 19:27:56 +0100 extended ElementBase constructor: support text content, Element classes and instances Modified: lxml/trunk/doc/element_classes.txt ============================================================================== --- lxml/trunk/doc/element_classes.txt (original) +++ lxml/trunk/doc/element_classes.txt Fri Feb 20 19:30:55 2009 @@ -38,6 +38,26 @@ 4 Generating XML with custom classes 5 Implementing namespaces +.. + >>> import sys + >>> from lxml import etree as _etree + >>> if sys.version_info[0] >= 3: + ... class etree_mock(object): + ... def __getattr__(self, name): return getattr(_etree, name) + ... def tostring(self, *args, **kwargs): + ... s = _etree.tostring(*args, **kwargs) + ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR + ... if s[-1] == '\n': s = s[:-1] + ... return s + ... else: + ... class etree_mock(object): + ... def __getattr__(self, name): return getattr(_etree, name) + ... def tostring(self, *args, **kwargs): + ... s = _etree.tostring(*args, **kwargs) + ... if s[-1] == '\n': s = s[:-1] + ... return s + >>> etree = etree_mock() + Background on Element proxies ============================= @@ -371,6 +391,18 @@ your Element proxy classes for the elements that they create. The ``ElementNamespaceClassLookup`` is generally a good match. +You can use custom Element classes to quickly create XML fragments: + +.. 
sourcecode:: pycon + + >>> class hale(etree.ElementBase): pass + >>> class bopp(etree.ElementBase): pass + + >>> el = hale( "some ", honk(honking = 'true'), bopp, " text" ) + + >>> print(etree.tostring(el)) + some text + Implementing namespaces ======================= Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Fri Feb 20 19:30:55 2009 @@ -44,12 +44,13 @@ """ cdef bint is_html = 0 cdef _BaseParser parser + cdef _Element last_child try: namespace = _utf8(self.NAMESPACE) except AttributeError: namespace = None try: - tag, ns = _getNsTag(self.TAG) + ns, tag = _getNsTag(self.TAG) if ns is not None: namespace = ns except AttributeError: @@ -59,10 +60,11 @@ try: parser = self.PARSER except AttributeError: - if python.PyTuple_GET_SIZE(children): - parser = (<_Element>children[0])._doc._parser - else: - parser = None + parser = None + for child in children: + if isinstance(child, _Element): + parser = (<_Element>child)._doc._parser + break if isinstance(parser, HTMLParser): is_html = 1 if namespace is None: @@ -72,8 +74,27 @@ pass _initNewElement(self, is_html, tag, namespace, parser, attrib, nsmap, _extra) + last_child = None for child in children: - _appendChild(self, child) + if _isString(child): + if last_child is None: + if _hasText(self._c_node): + self.text += child + else: + self.text = child + else: + if _hasTail(last_child._c_node): + last_child.tail += child + else: + last_child.tail = child + elif isinstance(child, _Element): + last_child = child + _appendChild(self, last_child) + elif isinstance(child, type) and issubclass(child, ElementBase): + last_child = child() + _appendChild(self, last_child) + else: + raise TypeError, "Invalid child type: %r" % type(child) cdef class CommentBase(_Comment): u"""All custom Comment classes must inherit from this one. 
Modified: lxml/trunk/src/lxml/tests/test_nsclasses.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_nsclasses.py (original) +++ lxml/trunk/src/lxml/tests/test_nsclasses.py Fri Feb 20 19:30:55 2009 @@ -161,6 +161,49 @@ self.Namespace(None).clear() self.Namespace('ns30').clear() + def test_element_creation(self): + default, bluff, maeh = ( + self.default_class, self.bluff_class, self.maeh_class) + + class honk(etree.ElementBase): + TAG = 'HONK' + NAMESPACE = 'http://a.b/c' + + el = default( + "test", + "text", + bluff(honk, "TaIL", maeh), + maeh("TeXT", bluff, honk(), "TAiL"), + "Tail") + + self.assertEquals('default_class', el.tag) + self.assertEquals('testtext', el.text) + self.assertEquals(None, el.tail) + self.assertEquals(2, len(el)) + self.assertEquals(7, len(list(el.iter()))) + + self.assertEquals('bluff_class', el[0].tag) + self.assertEquals('TaIL', el[0][0].tail) + self.assertEquals('TaIL', ''.join(el[0].itertext())) + self.assertEquals('{http://a.b/c}HONK', + el[0][0].tag) + self.assertEquals('maeh_class', + el[0][1].tag) + + self.assertEquals('maeh_class', el[1].tag) + self.assertEquals('TeXT', el[1].text) + self.assertEquals('bluff_class', el[1][0].tag) + self.assertEquals('{http://a.b/c}HONK', el[1][1].tag) + self.assertEquals('TAiL', el[1][1].tail) + + self.assertEquals('TeXTTAiL', + ''.join(el[1].itertext())) + self.assertEquals('Tail', el[1].tail) + self.assertEquals('TAiL', el[1][1].tail) + self.assertEquals('bluff_class', el[1][0].tag) + self.assertEquals('{http://a.b/c}HONK', el[1][1].tag) + + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ETreeNamespaceClassesTestCase)]) From scoder at codespeak.net Fri Feb 20 19:31:00 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 19:31:00 +0100 (CET) Subject: [Lxml-checkins] r62058 - lxml/trunk Message-ID: <20090220183100.C20DF169E4B@codespeak.net> Author: scoder Date: Fri 
Feb 20 19:31:00 2009 New Revision: 62058 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5038 at delle: sbehnel | 2009-02-20 19:28:41 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 20 19:31:00 2009 @@ -8,9 +8,24 @@ Features added -------------- -* Zlib compression support for serialisation operations to file(-like) +* Support strings and instantiable Element classes as child arguments + to the constructor of custom Element classes. + +* GZip compression support for serialisation to files and file-like objects. +Bugs fixed +---------- + +* ``TAG`` special attribute in constructor of custom Element classes + was evaluated incorrectly. + +Other changes +------------- + +* ``Element.findtext()`` now returns an empty string instead of None + for Elements without text content. + 2.2beta3 (2009-02-17) ===================== From scoder at codespeak.net Fri Feb 20 19:43:28 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 19:43:28 +0100 (CET) Subject: [Lxml-checkins] r62061 - in lxml/trunk: . src/lxml/html Message-ID: <20090220184328.D0634169E40@codespeak.net> Author: scoder Date: Fri Feb 20 19:43:26 2009 New Revision: 62061 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/__init__.py Log: r5043 at delle: sbehnel | 2009-02-20 19:39:08 +0100 fix name lookup in lxml.html Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 20 19:43:26 2009 @@ -17,6 +17,8 @@ Bugs fixed ---------- +* ``FormElement._name()`` failed for non top-level forms. + * ``TAG`` special attribute in constructor of custom Element classes was evaluated incorrectly. 
Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Fri Feb 20 19:43:26 2009 @@ -721,9 +721,9 @@ return self.get('name') elif self.get('id'): return '#' + self.get('id') - forms = self.body.findall('form') + forms = list(self.body.iter('form')) if not forms: - forms = self.body.findall('{%s}form' % XHTML_NAMESPACE) + forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE)) return str(forms.index(self)) def form_values(self): From scoder at codespeak.net Fri Feb 20 21:26:50 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 21:26:50 +0100 (CET) Subject: [Lxml-checkins] r62062 - lxml/trunk Message-ID: <20090220202650.858C5169E46@codespeak.net> Author: scoder Date: Fri Feb 20 21:26:48 2009 New Revision: 62062 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5045 at delle: sbehnel | 2009-02-20 20:07:34 +0100 changelog Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 20 21:26:48 2009 @@ -17,7 +17,7 @@ Bugs fixed ---------- -* ``FormElement._name()`` failed for non top-level forms. +* ``lxml.html.FormElement._name()`` failed for non top-level forms. * ``TAG`` special attribute in constructor of custom Element classes was evaluated incorrectly. From scoder at codespeak.net Fri Feb 20 21:26:54 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 21:26:54 +0100 (CET) Subject: [Lxml-checkins] r62063 - in lxml/trunk: . 
doc src/lxml Message-ID: <20090220202654.8219B169E4A@codespeak.net> Author: scoder Date: Fri Feb 20 21:26:53 2009 New Revision: 62063 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/element_classes.txt lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/serializer.pxi Log: r5046 at delle: sbehnel | 2009-02-20 21:24:44 +0100 Py3 fixes Modified: lxml/trunk/doc/element_classes.txt ============================================================================== --- lxml/trunk/doc/element_classes.txt (original) +++ lxml/trunk/doc/element_classes.txt Fri Feb 20 21:26:53 2009 @@ -39,24 +39,8 @@ 5 Implementing namespaces .. - >>> import sys - >>> from lxml import etree as _etree - >>> if sys.version_info[0] >= 3: - ... class etree_mock(object): - ... def __getattr__(self, name): return getattr(_etree, name) - ... def tostring(self, *args, **kwargs): - ... s = _etree.tostring(*args, **kwargs) - ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR - ... if s[-1] == '\n': s = s[:-1] - ... return s - ... else: - ... class etree_mock(object): - ... def __getattr__(self, name): return getattr(_etree, name) - ... def tostring(self, *args, **kwargs): - ... s = _etree.tostring(*args, **kwargs) - ... if s[-1] == '\n': s = s[:-1] - ... return s - >>> etree = etree_mock() + >>> try: unicode + ... 
except NameError: unicode = str Background on Element proxies @@ -400,7 +384,7 @@ >>> el = hale( "some ", honk(honking = 'true'), bopp, " text" ) - >>> print(etree.tostring(el)) + >>> print(etree.tostring(el, encoding=unicode)) some text Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Feb 20 21:26:53 2009 @@ -272,11 +272,12 @@ self._exc_context = exc_context self._filelike = filelike self._encoding = encoding - self._url = url if url is None: self._c_url = NULL else: + url = _encodeFilename(url) self._c_url = _cstr(url) + self._url = url self._bytes = '' self._bytes_read = 0 Modified: lxml/trunk/src/lxml/serializer.pxi ============================================================================== --- lxml/trunk/src/lxml/serializer.pxi (original) +++ lxml/trunk/src/lxml/serializer.pxi Fri Feb 20 21:26:53 2009 @@ -302,7 +302,7 @@ def __init__(self, filelike, exc_context=None, compression=None): if compression is not None and compression > 0: filelike = gzip.GzipFile( - fileobj=filelike, mode='wb', compresslevel=compression) + fileobj=filelike, mode=u'wb', compresslevel=compression) self._close_filelike = filelike.close self._filelike = filelike if exc_context is None: @@ -369,7 +369,7 @@ if compression: bytes_out = BytesIO() gzip_file = gzip.GzipFile( - fileobj=bytes_out, mode='wb', compresslevel=compression) + fileobj=bytes_out, mode=u'wb', compresslevel=compression) try: gzip_file.write(data) finally: From scoder at codespeak.net Fri Feb 20 23:32:30 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 20 Feb 2009 23:32:30 +0100 (CET) Subject: [Lxml-checkins] r62064 - in lxml/trunk: . 
src/lxml Message-ID: <20090220223230.85040169E0E@codespeak.net> Author: scoder Date: Fri Feb 20 23:32:29 2009 New Revision: 62064 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/tree.pxd Log: r5049 at delle: sbehnel | 2009-02-20 23:30:24 +0100 compile fixes Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Fri Feb 20 23:32:29 2009 @@ -16,7 +16,6 @@ #if PY_VERSION_HEX >= 0x03000000 # define PyFile_AsFile(o) (NULL) # define PyString_Check(o) PyBytes_Check(o) -# define PyString_CheckExact(o) PyBytes_CheckExact(o) # define PyString_FromStringAndSize(s, len) PyBytes_FromStringAndSize(s, len) # define PyString_FromFormat PyBytes_FromFormat # define PyString_GET_SIZE(s) PyBytes_GET_SIZE(s) Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri Feb 20 23:32:29 2009 @@ -242,9 +242,11 @@ cdef int xmlKeepBlanksDefault(int val) nogil cdef char* xmlNodeGetBase(xmlDoc* doc, xmlNode* node) nogil cdef void xmlNodeSetBase(xmlNode* node, char* uri) nogil - cdef char* xmlBuildURI(char* href, char* base) nogil cdef int xmlValidateNCName(char* value, int space) nogil +cdef extern from "libxml/uri.h": + cdef char* xmlBuildURI(char* href, char* base) nogil + cdef extern from "libxml/HTMLtree.h": cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf, xmlDoc* doc, xmlNode* cur, From lxml-checkins at codespeak.net Sun Feb 22 22:32:21 2009 From: lxml-checkins at codespeak.net (maganutxzdq) Date: Sun, 22 Feb 2009 22:32:21 +0100 (CET) Subject: [Lxml-checkins] Learn all the dirty tricks Message-ID: <20090222213221.A890016842D@codespeak.net> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090222/3a7cd7ef/attachment.htm From lxml-checkins at codespeak.net Wed Feb 25 14:37:49 2009 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Wed, 25 Feb 2009 14:37:49 +0100 (CET) Subject: [Lxml-checkins] Halloween sales Message-ID: <20090225133749.475B4168445@codespeak.net> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20090225/5f217f11/attachment.htm From scoder at codespeak.net Fri Feb 27 14:46:54 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:46:54 +0100 (CET) Subject: [Lxml-checkins] r62225 - in lxml/trunk: . src/lxml Message-ID: <20090227134654.7146816846D@codespeak.net> Author: scoder Date: Fri Feb 27 14:46:53 2009 New Revision: 62225 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/cstd.pxd lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/python.pxd lxml/trunk/src/lxml/tree.pxd Log: r5051 at delle: sbehnel | 2009-02-20 23:52:54 +0100 use Cython's builtin size_t type support Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Feb 27 14:46:53 2009 @@ -295,9 +295,9 @@ cdef _ns_node_ref* c_nsref_ptr cdef xmlNs* c_nsdef cdef xmlNode* c_node - cdef cstd.size_t c_ns_list_size - cdef cstd.size_t c_ns_list_len - cdef cstd.size_t i + cdef size_t c_ns_list_size + cdef size_t c_ns_list_len + cdef size_t i c_ns_list = NULL c_ns_list_size = 0 Modified: lxml/trunk/src/lxml/cstd.pxd ============================================================================== --- lxml/trunk/src/lxml/cstd.pxd (original) +++ lxml/trunk/src/lxml/cstd.pxd Fri Feb 27 14:46:53 2009 @@ -1,6 +1,5 @@ cdef extern from "string.h": - ctypedef int size_t cdef int strlen(char* s) nogil cdef char* strstr(char* haystack, 
char* needle) nogil cdef char* strchr(char* haystack, int needle) nogil Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri Feb 27 14:46:53 2009 @@ -210,8 +210,8 @@ ctypedef struct _nscache: xmlNs** new xmlNs** old - cstd.size_t size - cstd.size_t last + size_t size + size_t last cdef int _growNsCache(_nscache* c_ns_cache) except -1: cdef xmlNs** c_ns_ptr @@ -312,7 +312,7 @@ cdef xmlNs* c_ns_next cdef xmlNs* c_nsdef cdef xmlNs* c_del_ns_list - cdef cstd.size_t i, proxy_count = 0 + cdef size_t i, proxy_count = 0 if not tree._isElementOrXInclude(c_element): return 0 @@ -385,7 +385,7 @@ cdef void fixElementDocument(xmlNode* c_element, _Document doc, - cstd.size_t proxy_count): + size_t proxy_count): cdef xmlNode* c_node = c_element tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1) if c_node._private is not NULL: Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Feb 27 14:46:53 2009 @@ -3,7 +3,6 @@ cdef extern from "Python.h": ctypedef struct PyObject ctypedef struct PyThreadState - ctypedef int size_t cdef int INT_MAX cdef int PY_SSIZE_T_MAX cdef int PY_VERSION_HEX Modified: lxml/trunk/src/lxml/tree.pxd ============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Fri Feb 27 14:46:53 2009 @@ -1,4 +1,4 @@ -from cstd cimport FILE, size_t +from cstd cimport FILE cdef extern from "lxml-version.h": cdef char* LXML_VERSION_STRING From scoder at codespeak.net Fri Feb 27 14:47:02 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:02 +0100 (CET) Subject: [Lxml-checkins] r62226 - in lxml/trunk: . 
doc Message-ID: <20090227134702.3556A168487@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:01 2009 New Revision: 62226 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/element_classes.txt Log: r5052 at delle: sbehnel | 2009-02-22 00:23:54 +0100 test fix Modified: lxml/trunk/doc/element_classes.txt ============================================================================== --- lxml/trunk/doc/element_classes.txt (original) +++ lxml/trunk/doc/element_classes.txt Fri Feb 27 14:47:01 2009 @@ -39,7 +39,7 @@ 5 Implementing namespaces .. - >>> try: unicode + >>> try: _ = unicode ... except NameError: unicode = str From scoder at codespeak.net Fri Feb 27 14:47:07 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:07 +0100 (CET) Subject: [Lxml-checkins] r62227 - in lxml/trunk: . src/lxml/html/tests Message-ID: <20090227134707.1BA8F168480@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:06 2009 New Revision: 62227 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/tests/test_clean_embed.txt Log: r5053 at delle: sbehnel | 2009-02-22 15:41:12 +0100 Py3 test fix Modified: lxml/trunk/src/lxml/html/tests/test_clean_embed.txt ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_clean_embed.txt (original) +++ lxml/trunk/src/lxml/html/tests/test_clean_embed.txt Fri Feb 27 14:47:06 2009 @@ -5,6 +5,13 @@ >>> from lxml.html.clean import clean, clean_html, Cleaner >>> from lxml.html import usedoctest +>>> def tostring(el): # work-around for Py3 'bytes' type +... from lxml.html import tostring +... s = tostring(el) +... if not isinstance(s, str): +... s = s.decode('UTF-8') +... return s + >>> doc_embed = '''
... ... From scoder at codespeak.net Fri Feb 27 14:47:12 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:12 +0100 (CET) Subject: [Lxml-checkins] r62228 - lxml/trunk Message-ID: <20090227134712.5EC5516846D@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:11 2009 New Revision: 62228 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/setup.py Log: r5054 at delle: sbehnel | 2009-02-22 15:44:15 +0100 mark Py3 exception crash bug fixed in Cython, official support for Python 3.0.1 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 14:47:11 2009 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Crash bug in exception handling code under Python 3. This was due + to a problem in Cython, not lxml itself. + * ``lxml.html.FormElement._name()`` failed for non top-level forms. * ``TAG`` special attribute in constructor of custom Element classes @@ -25,6 +28,8 @@ Other changes ------------- +* Official support for Python 3.0.1. + * ``Element.findtext()`` now returns an empty string instead of None for Elements without text content. 
Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Fri Feb 27 14:47:11 2009 @@ -98,8 +98,8 @@ 'Programming Language :: Python :: 2.4', 'Programming Language :: Python :: 2.5', 'Programming Language :: Python :: 2.6', -# 'Programming Language :: Python :: 3', -# 'Programming Language :: Python :: 3.0', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.0', 'Programming Language :: C', 'Operating System :: OS Independent', 'Topic :: Text Processing :: Markup :: HTML', From scoder at codespeak.net Fri Feb 27 14:47:17 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:17 +0100 (CET) Subject: [Lxml-checkins] r62229 - in lxml/trunk: . doc Message-ID: <20090227134717.9817E168487@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:17 2009 New Revision: 62229 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5055 at delle: sbehnel | 2009-02-27 12:04:42 +0100 FAQ update: clean up threading sections, reference dev-works article Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Feb 27 14:47:17 2009 @@ -96,7 +96,9 @@ ``lxml.objectify``, read the `objectify documentation`_. John Shipman has written another tutorial called `Python XML -processing with lxml`_ that contains lots of examples. +processing with lxml`_ that contains lots of examples. Liza Daly +wrote a nice article about high-performance aspects when `parsing +large files with lxml`_. .. _`lxml.etree Tutorial`: tutorial.html .. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm @@ -104,6 +106,8 @@ .. _`objectify documentation`: objectify.html .. _`Python XML processing with lxml`: http://www.nmt.edu/tcc/help/pubs/pylxml/ .. 
_`element library`: http://effbot.org/zone/element-lib.htm +.. _`parsing large files with lxml`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + Where can I find more documentation about lxml? ----------------------------------------------- @@ -194,7 +198,10 @@ * zif.sedna_, an XQuery based interface to the Sedna OpenSource XML database And don't miss the quotes by our generally happy_ users_, and other -`sites that link to lxml`_. +`sites that link to lxml`_. As `Liza Daly`_ puts it: "Many software +products come with the pick-two caveat, meaning that you must choose +only two: speed, flexibility, or readability. When used carefully, +lxml can provide all three." .. _Zope: http://www.zope.org/ .. _Plone: http://www.plone.org/ @@ -215,6 +222,7 @@ .. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244 .. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246 .. _`sites that link to lxml`: http://www.google.com/search?as_lq=http:%2F%2Fcodespeak.net%2Flxml +.. _`Liza Daly`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ What is the difference between lxml.etree and lxml.objectify? @@ -619,8 +627,8 @@ either the default parser (which is replicated for each thread) or create a parser for each thread yourself. lxml also allows concurrency during validation (RelaxNG and XMLSchema) and XSL -transformation. You can share RelaxNG, XMLSchema and (with -restrictions) XSLT objects between threads. +transformation. You can share RelaxNG, XMLSchema and XSLT objects +between threads. While you can also share parsers between threads, this will serialize the access to each of them, so it is better to ``.copy()`` parsers or @@ -629,19 +637,16 @@ internal lock to protect their prepared evaluation contexts. It is therefore best to use separate evaluator instances in threads. -Due to the way libxslt handles threading, applying a stylesheets is -most efficient if it was parsed in the same thread that executes it. 
-One way to achieve this is by caching stylesheets in thread-local -storage. - -Warning: Before lxml 2.2, there were various issues when moving -subtrees between different threads. If you need code to run with -older versions, you should generally avoid modifying trees in other -threads than the one it was generated in. Although this should work -in many cases, there are certain scenarios where the termination of a -thread that parsed a tree can crash the application if subtrees of -this tree were moved to other documents. You should be on the safe -side when passing trees between threads if you either +Warning: Before lxml 2.2, and especially before 2.1, there were +various issues when moving subtrees between different threads, or when +applying XSLT objects from one thread to trees parsed or modified in +another. If you need code to run with older versions, you should +generally avoid modifying trees in other threads than the one it was +generated in. Although this should work in many cases, there are +certain scenarios where the termination of a thread that parsed a tree +can crash the application if subtrees of this tree were moved to other +documents. You should be on the safe side when passing trees between +threads if you either - do not modify these trees and do not move their elements to other trees, or @@ -650,6 +655,13 @@ use (e.g. by using a fixed size thread-pool or long-running threads in processing chains) +Since lxml 2.2, even multi-thread pipelines are supported. However, +note that it is more efficient to do all tree work inside one thread, +than to let multiple threads work on a tree one after the other. This +is because trees inherit state from the thread that created them, +which must be maintained when the tree is modified inside another +thread. + Does my program run faster if I use threads? -------------------------------------------- @@ -657,11 +669,13 @@ Depends. The best way to answer this is timing and profiling. 
The global interpreter lock (GIL) in Python serializes access to the -interpreter, so if the majority of your processing is done in Python code -(walking trees, modifying elements, etc.), your gain will be close to 0. The -more of your XML processing moves into lxml, however, the higher your gain. -If your application is bound by XML parsing and serialisation, or by complex -XSLTs, your speedup on multi-processor machines can be substantial. +interpreter, so if the majority of your processing is done in Python +code (walking trees, modifying elements, etc.), your gain will be +close to zero. The more of your XML processing moves into lxml, +however, the higher your gain. If your application is bound by XML +parsing and serialisation, or by very selective XPath expressions and +complex XSLTs, your speedup on multi-processor machines can be +substantial. See the question above to learn which operations free the GIL to support multi-threading. @@ -670,30 +684,28 @@ Would my single-threaded program run faster if I turned off threading? ---------------------------------------------------------------------- -Quite likely, yes. You can see for yourself by compiling lxml -entirely without threading support. Pass the ``--without-threading`` -option to setup.py when building lxml from source. You can also build -libxml2 without pthread support (``--without-pthreads`` option), which -may add another bit of performance. Note that this will leave -internal data structures entirely without thread protection, so make -sure you really do not use lxml outside of the main application thread -in this case. +Possibly, yes. You can see for yourself by compiling lxml entirely +without threading support. Pass the ``--without-threading`` option to +setup.py when building lxml from source. You can also build libxml2 +without pthread support (``--without-pthreads`` option), which may add +another bit of performance. 
Note that this will leave internal data +structures entirely without thread protection, so make sure you really +do not use lxml outside of the main application thread in this case. Why can't I reuse XSLT stylesheets in other threads? ---------------------------------------------------- -Since lxml 2.0, you can. However, it is a lot more efficient to use -stylesheets in the thread that created them. This is due to some -interfering optimisations in libxslt and lxml.etree. It is therefore -a good idea to cache them in thread local storage (see Python's -threading module). lxml cannot easily do this for you, as it cannot -know when to discard them from such a cache. - -If you use very complex stylesheets or create stylesheets -programmatically, you should do so in the main thread, and then copy -them into the thread cache using the ``copy`` module from the standard -library. +Since later lxml 2.0 versions, you can do this. There is some +overhead involved as the result document needs an additional cleanup +traversal when the input document and/or the stylesheet were created +in other threads. However, on a multi-processor machine, the gain of +freeing the GIL easily covers this drawback. + +If you need even the last bit of performance, consider keeping (a copy +of) the stylesheet in thread-local storage, and try creating the input +document(s) in the same thread. And do not forget to benchmark your +code to see if the increased code complexity is really worth it. My program crashes when run with mod_python/Pyro/Zope/Plone/... @@ -709,10 +721,11 @@ code runs perfectly when started by hand, the following gives you a few hints for possible approaches to solve your specific problem: -* make sure you use recent versions of libxml2, libxslt and lxml. The libxml2 - developers keep fixing bugs in each release, and lxml also tries to become - more robust against possible pitfalls. So newer versions might already fix - your problem in a reliable way. 
+* make sure you use recent versions of libxml2, libxslt and lxml. The + libxml2 developers keep fixing bugs in each release, and lxml also + tries to become more robust against possible pitfalls. So newer + versions might already fix your problem in a reliable way. Version + 2.2 of lxml contains many improvements. * make sure the library versions you installed are really used. Do not rely on what your operating system tells you! Print the version @@ -736,14 +749,15 @@ from crashing, which should be worth more to you than peek performance. Remember that lxml is fast anyway, so concurrency may not even be worth it. -* avoid doing fancy XSLT stuff like foreign document access or passing in - subtrees trough XSLT variables. This might or might not work, depending on - your specific usage. +* look out for fancy XSLT stuff like foreign document access or + passing in subtrees trough XSLT variables. This might or might not + work, depending on your specific usage. Again, later versions of + lxml and libxslt provide safer support here. * try copying trees at suspicious places in your code and working with - those instead of a tree shared between threads. A good candidate - might be the result of an XSLT or the stylesheet itself, if it - traverses thread boundaries. + those instead of a tree shared between threads. Note that the + copying must happen inside the target thread to be effective, not in + the thread that created the tree. * try keeping thread-local copies of XSLT stylesheets, i.e. one per thread, instead of sharing one. Also see the question above. @@ -756,6 +770,10 @@ of lxml, libxml2 and libxslt you are using (see the question on reporting a bug). +Note that most of these options will degrade performance and/or your +code quality. If you are unsure what to do, please ask on the mailing +list. 
+ Parsing and Serialisation ========================= From scoder at codespeak.net Fri Feb 27 14:47:22 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:22 +0100 (CET) Subject: [Lxml-checkins] r62230 - in lxml/trunk: . doc Message-ID: <20090227134722.5F44C168480@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:21 2009 New Revision: 62230 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/FAQ.txt Log: r5056 at delle: sbehnel | 2009-02-27 12:09:09 +0100 minor doc update Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Feb 27 14:47:21 2009 @@ -757,7 +757,9 @@ * try copying trees at suspicious places in your code and working with those instead of a tree shared between threads. Note that the copying must happen inside the target thread to be effective, not in - the thread that created the tree. + the thread that created the tree. Serialising in one thread and + parsing in another is also a simple (and fast) way of separating + thread contexts. * try keeping thread-local copies of XSLT stylesheets, i.e. one per thread, instead of sharing one. Also see the question above. From scoder at codespeak.net Fri Feb 27 14:47:27 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:27 +0100 (CET) Subject: [Lxml-checkins] r62231 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090227134727.AE76116846D@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:27 2009 New Revision: 62231 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/tests/test_threading.py lxml/trunk/src/lxml/xslt.pxi Log: r5057 at delle: sbehnel | 2009-02-27 13:32:16 +0100 fix crash when overwriting attributes in XSLT Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 14:47:27 2009 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Crash in XSLT when overwriting an already defined attribute using + ``xsl:attribute``. + * Crash bug in exception handling code under Python 3. This was due to a problem in Cython, not lxml itself. Modified: lxml/trunk/src/lxml/tests/test_threading.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_threading.py (original) +++ lxml/trunk/src/lxml/tests/test_threading.py Fri Feb 27 14:47:27 2009 @@ -84,6 +84,31 @@ self.assertEquals(_bytes('BCB'), tostring(root)) + def test_thread_xslt_attr_replace(self): + # this is the only case in XSLT where the result tree can be + # modified in-place + XML = self.etree.XML + tostring = self.etree.tostring + style = self.etree.XSLT(XML(_bytes('''\ + + + + + xyz + + + '''))) + + result = [] + def run_thread(): + root = XML(_bytes('')) + result.append( style(root).getroot() ) + + self._run_thread(run_thread) + self.assertEquals(_bytes(''), + tostring(result[0])) + def test_thread_create_xslt(self): XML = self.etree.XML tostring = self.etree.tostring Modified: lxml/trunk/src/lxml/xslt.pxi ============================================================================== --- lxml/trunk/src/lxml/xslt.pxi (original) +++ lxml/trunk/src/lxml/xslt.pxi Fri Feb 27 14:47:27 2009 @@ -486,7 +486,15 @@ _destroyFakeDoc(input_doc._c_doc, c_doc) python.PyErr_NoMemory() - 
initTransformDict(transform_ctxt) + # using the stylesheet dict is safer than using a possibly + # unrelated dict from the current thread. Almost all + # non-input tag/attr names will come from the stylesheet + # anyway. + if transform_ctxt.dict is not NULL: + xmlparser.xmlDictFree(transform_ctxt.dict) + transform_ctxt.dict = self._c_style.doc.dict + xmlparser.xmlDictReference(transform_ctxt.dict) + xslt.xsltSetCtxtParseOptions( transform_ctxt, input_doc._parser._parse_options) @@ -776,9 +784,6 @@ # enable EXSLT support for XSLT xslt.exsltRegisterAll() -cdef void initTransformDict(xslt.xsltTransformContext* transform_ctxt): - __GLOBAL_PARSER_CONTEXT.initThreadDictRef(&transform_ctxt.dict) - ################################################################################ # XSLT PI support From scoder at codespeak.net Fri Feb 27 14:47:32 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:32 +0100 (CET) Subject: [Lxml-checkins] r62232 - in lxml/trunk: . src/lxml/html src/lxml/html/tests Message-ID: <20090227134732.5EDF11684C8@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:31 2009 New Revision: 62232 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/soupparser.py lxml/trunk/src/lxml/html/tests/test_elementsoup.py Log: r5058 at delle: sbehnel | 2009-02-27 14:06:21 +0100 fix bug #334718: soupparser fails on attribute without value Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 14:47:31 2009 @@ -17,6 +17,8 @@ Bugs fixed ---------- +* Soupparser failed on broken attributes without values. + * Crash in XSLT when overwriting an already defined attribute using ``xsl:attribute``. 
Modified: lxml/trunk/src/lxml/html/soupparser.py ============================================================================== --- lxml/trunk/src/lxml/html/soupparser.py (original) +++ lxml/trunk/src/lxml/html/soupparser.py Fri Feb 27 14:47:31 2009 @@ -109,8 +109,14 @@ import re handle_entities = re.compile("&(\w+);").sub +try: + empty_string = unicode() +except NameError: + empty_string = str() def unescape(string): + if not string: + return empty_string # work around oddities in BeautifulSoup's entity handling def unescape_entity(m): try: Modified: lxml/trunk/src/lxml/html/tests/test_elementsoup.py ============================================================================== --- lxml/trunk/src/lxml/html/tests/test_elementsoup.py (original) +++ lxml/trunk/src/lxml/html/tests/test_elementsoup.py Fri Feb 27 14:47:31 2009 @@ -1,5 +1,5 @@ import unittest, sys -from lxml.tests.common_imports import make_doctest +from lxml.tests.common_imports import make_doctest, HelperTestCase try: import BeautifulSoup @@ -7,11 +7,25 @@ except ImportError: BS_INSTALLED = False +if BS_INSTALLED: + class SoupParserTestCase(HelperTestCase): + from lxml.html import soupparser + + def test_broken_attribute(self): + html = """\ + +
+ + """ + root = self.soupparser.fromstring(html) + self.assert_(root.find('.//input').get('disabled') is not None) + def test_suite(): suite = unittest.TestSuite() if sys.version_info >= (2,4): if BS_INSTALLED: + suite.addTests([unittest.makeSuite(SoupParserTestCase)]) suite.addTests([make_doctest('../../../../doc/elementsoup.txt')]) return suite From scoder at codespeak.net Fri Feb 27 14:47:37 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 14:47:37 +0100 (CET) Subject: [Lxml-checkins] r62233 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090227134737.BBADC168480@codespeak.net> Author: scoder Date: Fri Feb 27 14:47:36 2009 New Revision: 62233 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: r5059 at delle: sbehnel | 2009-02-27 14:43:30 +0100 make deep-copying an ElementTree copy PI/comment siblings, too Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 14:47:36 2009 @@ -17,6 +17,9 @@ Bugs fixed ---------- +* Deep-copying an ElementTree did not copy its sibling PIs and + comments. + * Soupparser failed on broken attributes without values. 
* Crash in XSLT when overwriting an already defined attribute using Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Feb 27 14:47:36 2009 @@ -871,7 +871,7 @@ c_target = c_tail c_tail = c_next -cdef void _copyTail(xmlNode* c_tail, xmlNode* c_target): +cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1: cdef xmlNode* c_new_tail # tail copying support: look for any text nodes trailing this node and # copy it to the target node @@ -881,9 +881,34 @@ c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0) else: c_new_tail = tree.xmlCopyNode(c_tail, 0) + if c_new_tail is NULL: + python.PyErr_NoMemory() tree.xmlAddNextSibling(c_target, c_new_tail) c_target = c_new_tail c_tail = _textNodeOrSkip(c_tail.next) + return 0 + +cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1: + cdef xmlNode* c_copy + cdef xmlNode* c_sibling = c_node + while c_sibling.prev != NULL and \ + (c_sibling.prev.type == tree.XML_PI_NODE or \ + c_sibling.prev.type == tree.XML_COMMENT_NODE): + c_sibling = c_sibling.prev + while c_sibling != c_node: + c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) + if c_copy is NULL: + python.PyErr_NoMemory() + tree.xmlAddPrevSibling(c_target, c_copy) + c_sibling = c_sibling.next + while c_sibling.next != NULL and \ + (c_sibling.next.type == tree.XML_PI_NODE or \ + c_sibling.next.type == tree.XML_COMMENT_NODE): + c_sibling = c_sibling.next + c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) + if c_copy is NULL: + python.PyErr_NoMemory() + tree.xmlAddNextSibling(c_target, c_copy) cdef int _deleteSlice(_Document doc, xmlNode* c_node, Py_ssize_t count, Py_ssize_t step) except -1: Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) 
+++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Feb 27 14:47:36 2009 @@ -1571,9 +1571,21 @@ def __deepcopy__(self, memo): cdef _Element root + cdef _Document doc + cdef xmlDoc* c_doc if self._context_node is not None: root = self._context_node.__copy__() - return _elementTreeFactory(None, root) + _copyNonElementSiblings(self._context_node._c_node, root._c_node) + return _elementTreeFactory(None, root) + elif self._doc is not None: + c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1) + if c_doc is NULL: + python.PyErr_NoMemory() + doc = _documentFactory(c_doc, self._doc._parser) + return _elementTreeFactory(doc, None) + else: + # so what ... + return self # not in ElementTree, read-only property docinfo: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Feb 27 14:47:36 2009 @@ -233,6 +233,22 @@ self.assertEquals('ONE', a.text) self.assertEquals('ANOTHER', b.text) + def test_deepcopy_elementtree_pi(self): + XML = self.etree.XML + tostring = self.etree.tostring + root = XML(_bytes("")) + tree1 = self.etree.ElementTree(root) + self.assertEquals(_bytes(""), + tostring(tree1)) + + tree2 = copy.deepcopy(tree1) + self.assertEquals(_bytes(""), + tostring(tree2)) + + root2 = copy.deepcopy(tree1.getroot()) + self.assertEquals(_bytes(""), + tostring(root2)) + def test_attribute_set(self): # ElementTree accepts arbitrary attribute values # lxml.etree allows only strings From scoder at codespeak.net Fri Feb 27 15:08:29 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 15:08:29 +0100 (CET) Subject: [Lxml-checkins] r62234 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090227140829.26DE91684A6@codespeak.net> Author: scoder Date: Fri Feb 27 15:08:27 2009 New Revision: 62234 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/tests/test_etree.py Log: r5069 at delle: sbehnel | 2009-02-27 15:06:11 +0100 copy int/ext DTD subsets when deep-copying an ElementTree Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 15:08:27 2009 @@ -17,8 +17,8 @@ Bugs fixed ---------- -* Deep-copying an ElementTree did not copy its sibling PIs and - comments. +* Deep-copying an ElementTree copied neither its sibling PIs and + comments nor its internal/external DTD subsets. * Soupparser failed on broken attributes without values. Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Feb 27 15:08:27 2009 @@ -1576,6 +1576,16 @@ if self._context_node is not None: root = self._context_node.__copy__() _copyNonElementSiblings(self._context_node._c_node, root._c_node) + doc = root._doc + c_doc = self._context_node._doc._c_doc + if c_doc.intSubset and not doc._c_doc.intSubset: + doc._c_doc.intSubset = tree.xmlCopyDtd(c_doc.intSubset) + if doc._c_doc.intSubset is NULL: + python.PyErr_NoMemory() + if c_doc.extSubset and not doc._c_doc.extSubset: + doc._c_doc.extSubset = tree.xmlCopyDtd(c_doc.extSubset) + if doc._c_doc.extSubset is NULL: + python.PyErr_NoMemory() return _elementTreeFactory(None, root) elif self._doc is not None: c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1) Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ 
lxml/trunk/src/lxml/tests/test_etree.py Fri Feb 27 15:08:27 2009 @@ -249,6 +249,21 @@ self.assertEquals(_bytes(""), tostring(root2)) + def test_deepcopy_elementtree_dtd(self): + XML = self.etree.XML + tostring = self.etree.tostring + xml = _bytes('\n]>\n') + root = XML(xml) + tree1 = self.etree.ElementTree(root) + self.assertEquals(xml, tostring(tree1)) + + tree2 = copy.deepcopy(tree1) + self.assertEquals(xml, tostring(tree2)) + + root2 = copy.deepcopy(tree1.getroot()) + self.assertEquals(_bytes(""), + tostring(root2)) + def test_attribute_set(self): # ElementTree accepts arbitrary attribute values # lxml.etree allows only strings From scoder at codespeak.net Fri Feb 27 15:41:02 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 15:41:02 +0100 (CET) Subject: [Lxml-checkins] r62235 - in lxml/trunk: . src/lxml/html Message-ID: <20090227144102.8E5BD168519@codespeak.net> Author: scoder Date: Fri Feb 27 15:41:02 2009 New Revision: 62235 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/soupparser.py Log: r5071 at delle: sbehnel | 2009-02-27 15:29:21 +0100 simplification Modified: lxml/trunk/src/lxml/html/soupparser.py ============================================================================== --- lxml/trunk/src/lxml/html/soupparser.py (original) +++ lxml/trunk/src/lxml/html/soupparser.py Fri Feb 27 15:41:02 2009 @@ -109,14 +109,10 @@ import re handle_entities = re.compile("&(\w+);").sub -try: - empty_string = unicode() -except NameError: - empty_string = str() def unescape(string): if not string: - return empty_string + return '' # work around oddities in BeautifulSoup's entity handling def unescape_entity(m): try: From scoder at codespeak.net Fri Feb 27 16:22:49 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 16:22:49 +0100 (CET) Subject: [Lxml-checkins] r62239 - in lxml/trunk: . 
doc Message-ID: <20090227152249.4CDC316852D@codespeak.net> Author: scoder Date: Fri Feb 27 16:22:48 2009 New Revision: 62239 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt lxml/trunk/version.txt Log: r5073 at delle: sbehnel | 2009-02-27 15:44:43 +0100 prepare release of 2.2beta4 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Feb 27 16:22:48 2009 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +2.2beta4 (2009-02-27) +===================== Features added -------------- Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Feb 27 16:22:48 2009 @@ -147,8 +147,8 @@ source release. If you can't wait, consider trying a less recent release version first. -The latest version is `lxml 2.2beta3`_, released 2009-02-17 -(`changes for 2.2beta3`_). `Older versions`_ are listed below. +The latest version is `lxml 2.2beta4`_, released 2009-02-17 +(`changes for 2.2beta4`_). `Older versions`_ are listed below. Please take a look at the `installation instructions`_! @@ -220,7 +220,9 @@ `2.0 `_ and the `current in-development version `_. -.. _`PDF documentation`: lxmldoc-2.2beta3.pdf +.. _`PDF documentation`: lxmldoc-2.2beta4.pdf + +* `lxml 2.2beta3`_, released 2009-02-17 (`changes for 2.2beta3`_) * `lxml 2.2beta2`_, released 2009-01-25 (`changes for 2.2beta2`_) @@ -314,6 +316,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 2.2beta4`: lxml-2.2beta4.tgz .. _`lxml 2.2beta3`: lxml-2.2beta3.tgz .. _`lxml 2.2beta2`: lxml-2.2beta2.tgz .. _`lxml 2.2beta1`: lxml-2.2beta1.tgz @@ -361,6 +364,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`changes for 2.2beta4`: changes-2.2beta4.html .. _`changes for 2.2beta3`: changes-2.2beta3.html .. 
_`changes for 2.2beta2`: changes-2.2beta2.html .. _`changes for 2.2beta1`: changes-2.2beta1.html Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Fri Feb 27 16:22:48 2009 @@ -1 +1 @@ -2.2beta3 +2.2beta4 From scoder at codespeak.net Fri Feb 27 16:22:53 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 16:22:53 +0100 (CET) Subject: [Lxml-checkins] r62240 - in lxml/trunk: . doc Message-ID: <20090227152253.9600616852E@codespeak.net> Author: scoder Date: Fri Feb 27 16:22:53 2009 New Revision: 62240 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/mklatex.py Log: r5074 at delle: sbehnel | 2009-02-27 16:19:53 +0100 fix PDF docs Modified: lxml/trunk/doc/mklatex.py ============================================================================== --- lxml/trunk/doc/mklatex.py (original) +++ lxml/trunk/doc/mklatex.py Fri Feb 27 16:22:53 2009 @@ -90,7 +90,8 @@ break if line.startswith('%') or \ r'\documentclass' in line or \ - r'\makeindex' in line: + r'\makeindex' in line or \ + r'{inputenc}' in line: continue if line.startswith(r'\usepackage'): if line in existing_header_lines: @@ -270,7 +271,7 @@ if hln.startswith(r"\documentclass"): #hln = hln.replace('article', 'book') hln = DOCUMENT_CLASS - elif hln.startswith("%% generator "): + elif hln.startswith("%% generator ") or hln.startswith("% generated "): master.write(EPYDOC_IMPORT) elif hln.startswith(r"\begin{document}"): # pygments and epydoc support From scoder at codespeak.net Fri Feb 27 16:22:58 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 27 Feb 2009 16:22:58 +0100 (CET) Subject: [Lxml-checkins] r62241 - in lxml/trunk: . 
doc Message-ID: <20090227152258.221CA16852F@codespeak.net> Author: scoder Date: Fri Feb 27 16:22:57 2009 New Revision: 62241 Modified: lxml/trunk/ (props changed) lxml/trunk/INSTALL.txt lxml/trunk/doc/build.txt Log: r5075 at delle: sbehnel | 2009-02-27 16:20:41 +0100 fix dependency versions Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Fri Feb 27 16:22:57 2009 @@ -12,7 +12,7 @@ http://xmlsoft.org/downloads.html If you want to use XPath, do not use libxml2 2.6.27. We recommend - libxml2 2.6.28 or later. + libxml2 2.7.2 or later. * libxslt 1.1.15 or later. It can be found here: http://xmlsoft.org/XSLT/downloads.html Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Fri Feb 27 16:22:57 2009 @@ -45,9 +45,9 @@ want to be an lxml developer, then you do need a working Cython installation. You can use EasyInstall_ to install it:: - easy_install Cython==0.10.3 + easy_install Cython==0.11 -lxml currently requires Cython 0.10.3, later release versions should +lxml currently requires Cython 0.11, later release versions should work as well. 
From scoder at codespeak.net Sat Feb 28 23:04:00 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 28 Feb 2009 23:04:00 +0100 (CET) Subject: [Lxml-checkins] r62278 - lxml/trunk Message-ID: <20090228220400.06062168458@codespeak.net> Author: scoder Date: Sat Feb 28 23:03:59 2009 New Revision: 62278 Modified: lxml/trunk/ (props changed) lxml/trunk/ez_setup.py Log: r5079 at delle: sbehnel | 2009-02-28 23:01:11 +0100 updated setuptools install script Modified: lxml/trunk/ez_setup.py ============================================================================== --- lxml/trunk/ez_setup.py (original) +++ lxml/trunk/ez_setup.py Sat Feb 28 23:03:59 2009 @@ -14,8 +14,8 @@ This file can also be run as a script to install or upgrade setuptools. """ import sys -DEFAULT_VERSION = "0.6c3" -DEFAULT_URL = "http://cheeseshop.python.org/packages/%s/s/setuptools/" % sys.version[:3] +DEFAULT_VERSION = "0.6c9" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] md5_data = { 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', @@ -33,13 +33,33 @@ 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 
'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', } import sys, os +try: from hashlib import md5 +except ImportError: from md5 import md5 def _validate_md5(egg_name, data): if egg_name in md5_data: - from md5 import md5 digest = md5(data).hexdigest() if digest != md5_data[egg_name]: print >>sys.stderr, ( @@ -49,7 +69,6 @@ sys.exit(2) return data - def use_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, download_delay=15 @@ -65,31 +84,31 @@ this routine will print a message to ``sys.stderr`` and raise SystemExit in an attempt to abort the calling script. """ - try: - import setuptools - if setuptools.__version__ == '0.0.1': - print >>sys.stderr, ( - "You have an obsolete version of setuptools installed. Please\n" - "remove it from your system entirely before rerunning this script." - ) - sys.exit(2) - except ImportError: + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + def do_download(): egg = download_setuptools(version, download_base, to_dir, download_delay) sys.path.insert(0, egg) import setuptools; setuptools.bootstrap_install_from = egg - - import pkg_resources try: - pkg_resources.require("setuptools>="+version) - + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>="+version); return except pkg_resources.VersionConflict, e: - # XXX could we install in a subprocess here? 
- print >>sys.stderr, ( + if was_imported: + print >>sys.stderr, ( "The required version of setuptools (>=%s) is not available, and\n" "can't be installed while this script is running. Please install\n" - " a more recent version first.\n\n(Currently using %r)" - ) % (version, e.args[0]) - sys.exit(2) + " a more recent version first, using 'easy_install -U setuptools'." + "\n\n(Currently using %r)" + ) % (version, e.args[0]) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() def download_setuptools( version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, @@ -138,9 +157,43 @@ if dst: dst.close() return os.path.realpath(saveto) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + def main(argv, version=DEFAULT_VERSION): """Install or upgrade setuptools and EasyInstall""" - try: import setuptools except ImportError: @@ -155,8 +208,11 @@ os.unlink(egg) else: if setuptools.__version__ == '0.0.1': - # tell the user to uninstall obsolete version - use_setuptools(version) + print >>sys.stderr, ( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." + ) + sys.exit(2) req = "setuptools>="+version import pkg_resources @@ -177,13 +233,10 @@ print "Setuptools version",version,"or greater has been installed." print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' - - def update_md5(filenames): """Update our built-in md5 registry""" import re - from md5 import md5 for name in filenames: base = os.path.basename(name) @@ -220,3 +273,4 @@ + From scoder at codespeak.net Sat Feb 28 23:04:05 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 28 Feb 2009 23:04:05 +0100 (CET) Subject: [Lxml-checkins] r62279 - in lxml/trunk: . 
doc Message-ID: <20090228220405.76FB5168469@codespeak.net> Author: scoder Date: Sat Feb 28 23:04:04 2009 New Revision: 62279 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r5080 at delle: sbehnel | 2009-02-28 23:01:43 +0100 updated performance docs to lxml 2.2 and Python 2.6 Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Sat Feb 28 23:04:04 2009 @@ -86,12 +86,15 @@ a specific part of the API yourself, please consider sending it to the lxml mailing list. -The timings cited below compare lxml 2.1 (with libxml2 2.6.33) to the -April 2008 SVN trunk versions of ElementTree (1.3alpha) and -cElementTree (1.2.7). They were run single-threaded on a 1.8GHz Intel -Core Duo machine under Ubuntu Linux 7.10 (Gutsy). The C libraries -were compiled with the same platform specific optimisation flags. The -Python interpreter (2.5.1) was used as provided by the distribution. +The timings cited below compare lxml 2.2 (with libxml2 2.7.3) to the +February 2009 SVN versions of ElementTree (1.3alpha2) and cElementTree +(1.0.6). They were run single-threaded on a 1.8GHz Intel Core Duo +machine under Ubuntu Linux 8.10 (Intrepid). The C libraries were +compiled with the same platform specific optimisation flags. The +Python interpreter (2.6.1) was manually compiled for the platform. +Note that many of the following ElementTree timings are therefore +better than what a normal Python installation with the standard +library (c)ElementTree modules would yield. .. _`bench_etree.py`: http://codespeak.net/svn/lxml/trunk/benchmark/bench_etree.py .. _`bench_xpath.py`: http://codespeak.net/svn/lxml/trunk/benchmark/bench_xpath.py @@ -129,107 +132,108 @@ executes entirely at the C level, without any interaction with Python code. The results are rather impressive, especially for UTF-8, which is native to libxml2. 
While 20 to 40 times faster than (c)ElementTree -1.2 (which is part of the standard library in Python 2.5), lxml is +1.2 (which is part of the standard library since Python 2.5), lxml is still more than 7 times as fast as the much improved ElementTree 1.3:: - lxe: tostring_utf16 (SATR T1) 25.7590 msec/pass - cET: tostring_utf16 (SATR T1) 179.6291 msec/pass - ET : tostring_utf16 (SATR T1) 188.5638 msec/pass - - lxe: tostring_utf16 (UATR T1) 26.0060 msec/pass - cET: tostring_utf16 (UATR T1) 176.9981 msec/pass - ET : tostring_utf16 (UATR T1) 188.2110 msec/pass - - lxe: tostring_utf16 (S-TR T2) 26.9201 msec/pass - cET: tostring_utf16 (S-TR T2) 182.5061 msec/pass - ET : tostring_utf16 (S-TR T2) 190.2061 msec/pass - - lxe: tostring_utf8 (S-TR T2) 19.5830 msec/pass - cET: tostring_utf8 (S-TR T2) 183.0020 msec/pass - ET : tostring_utf8 (S-TR T2) 187.7251 msec/pass - - lxe: tostring_utf8 (U-TR T3) 5.5292 msec/pass - cET: tostring_utf8 (U-TR T3) 56.1349 msec/pass - ET : tostring_utf8 (U-TR T3) 56.6628 msec/pass + lxe: tostring_utf16 (SATR T1) 22.4042 msec/pass + cET: tostring_utf16 (SATR T1) 184.5090 msec/pass + ET : tostring_utf16 (SATR T1) 182.4350 msec/pass + + lxe: tostring_utf16 (UATR T1) 23.1769 msec/pass + cET: tostring_utf16 (UATR T1) 188.6780 msec/pass + ET : tostring_utf16 (UATR T1) 186.7781 msec/pass + + lxe: tostring_utf16 (S-TR T2) 21.8501 msec/pass + cET: tostring_utf16 (S-TR T2) 200.0139 msec/pass + ET : tostring_utf16 (S-TR T2) 190.8720 msec/pass + + lxe: tostring_utf8 (S-TR T2) 17.1690 msec/pass + cET: tostring_utf8 (S-TR T2) 192.3709 msec/pass + ET : tostring_utf8 (S-TR T2) 189.7140 msec/pass + + lxe: tostring_utf8 (U-TR T3) 4.9832 msec/pass + cET: tostring_utf8 (U-TR T3) 60.2911 msec/pass + ET : tostring_utf8 (U-TR T3) 57.8101 msec/pass The same applies to plain text serialisation. 
Note that cElementTree -does not currently support this, as it is new in ET 1.3:: +does not currently support this, as it is a new feature in ET 1.3 and +lxml.etree 2.0:: - lxe: tostring_text_ascii (S-TR T1) 3.8729 msec/pass - ET : tostring_text_ascii (S-TR T1) 90.7841 msec/pass + lxe: tostring_text_ascii (S-TR T1) 4.3709 msec/pass + ET : tostring_text_ascii (S-TR T1) 83.9939 msec/pass - lxe: tostring_text_ascii (S-TR T3) 1.1508 msec/pass - ET : tostring_text_ascii (S-TR T3) 28.0581 msec/pass + lxe: tostring_text_ascii (S-TR T3) 1.3590 msec/pass + ET : tostring_text_ascii (S-TR T3) 26.6340 msec/pass - lxe: tostring_text_utf16 (S-TR T1) 5.6219 msec/pass - ET : tostring_text_utf16 (S-TR T1) 87.4891 msec/pass + lxe: tostring_text_utf16 (S-TR T1) 6.2978 msec/pass + ET : tostring_text_utf16 (S-TR T1) 84.7399 msec/pass - lxe: tostring_text_utf16 (U-TR T1) 7.0660 msec/pass - ET : tostring_text_utf16 (U-TR T1) 82.1049 msec/pass + lxe: tostring_text_utf16 (U-TR T1) 7.7510 msec/pass + ET : tostring_text_utf16 (U-TR T1) 79.9279 msec/pass Unlike ElementTree, the ``tostring()`` function in lxml also supports serialisation to a Python unicode string object:: - lxe: tostring_text_unicode (S-TR T1) 4.2419 msec/pass - lxe: tostring_text_unicode (U-TR T1) 5.2760 msec/pass - lxe: tostring_text_unicode (S-TR T3) 1.3049 msec/pass - lxe: tostring_text_unicode (U-TR T3) 1.4210 msec/pass + lxe: tostring_text_unicode (S-TR T1) 4.6940 msec/pass + lxe: tostring_text_unicode (U-TR T1) 6.3069 msec/pass + lxe: tostring_text_unicode (S-TR T3) 1.3652 msec/pass + lxe: tostring_text_unicode (U-TR T3) 2.0702 msec/pass For parsing, on the other hand, the advantage is clearly with cElementTree. 
The (c)ET libraries use a very thin layer on top of the expat parser, which is known to be extremely fast:: - lxe: parse_stringIO (SAXR T1) 40.6771 msec/pass - cET: parse_stringIO (SAXR T1) 19.3741 msec/pass - ET : parse_stringIO (SAXR T1) 355.7711 msec/pass - - lxe: parse_stringIO (S-XR T3) 5.9960 msec/pass - cET: parse_stringIO (S-XR T3) 5.8751 msec/pass - ET : parse_stringIO (S-XR T3) 93.7259 msec/pass - - lxe: parse_stringIO (UAXR T3) 26.2671 msec/pass - cET: parse_stringIO (UAXR T3) 30.6449 msec/pass - ET : parse_stringIO (UAXR T3) 178.8890 msec/pass + lxe: parse_stringIO (SAXR T1) 50.0100 msec/pass + cET: parse_stringIO (SAXR T1) 19.3238 msec/pass + ET : parse_stringIO (SAXR T1) 318.2330 msec/pass + + lxe: parse_stringIO (S-XR T3) 6.1851 msec/pass + cET: parse_stringIO (S-XR T3) 5.7080 msec/pass + ET : parse_stringIO (S-XR T3) 83.5931 msec/pass + + lxe: parse_stringIO (UAXR T3) 34.4319 msec/pass + cET: parse_stringIO (UAXR T3) 28.8520 msec/pass + ET : parse_stringIO (UAXR T3) 164.5968 msec/pass While about as fast for smaller documents, the expat parser allows cET to be up to 2 times faster than lxml on plain parser performance for large input documents. 
Similar timings can be observed for the ``iterparse()`` function:: - lxe: iterparse_stringIO (SAXR T1) 50.8120 msec/pass - cET: iterparse_stringIO (SAXR T1) 24.9379 msec/pass - ET : iterparse_stringIO (SAXR T1) 388.9420 msec/pass - - lxe: iterparse_stringIO (UAXR T3) 29.0790 msec/pass - cET: iterparse_stringIO (UAXR T3) 32.1240 msec/pass - ET : iterparse_stringIO (UAXR T3) 189.1720 msec/pass + lxe: iterparse_stringIO (SAXR T1) 57.8308 msec/pass + cET: iterparse_stringIO (SAXR T1) 23.8140 msec/pass + ET : iterparse_stringIO (SAXR T1) 349.5209 msec/pass + + lxe: iterparse_stringIO (UAXR T3) 37.2162 msec/pass + cET: iterparse_stringIO (UAXR T3) 30.2329 msec/pass + ET : iterparse_stringIO (UAXR T3) 171.4060 msec/pass However, if you benchmark the complete round-trip of a serialise-parse cycle, the numbers will look similar to these:: - lxe: write_utf8_parse_stringIO (S-TR T1) 63.7550 msec/pass - cET: write_utf8_parse_stringIO (S-TR T1) 292.0721 msec/pass - ET : write_utf8_parse_stringIO (S-TR T1) 635.2799 msec/pass - - lxe: write_utf8_parse_stringIO (UATR T2) 75.0258 msec/pass - cET: write_utf8_parse_stringIO (UATR T2) 341.7251 msec/pass - ET : write_utf8_parse_stringIO (UATR T2) 713.1951 msec/pass - - lxe: write_utf8_parse_stringIO (S-TR T3) 11.4899 msec/pass - cET: write_utf8_parse_stringIO (S-TR T3) 96.8502 msec/pass - ET : write_utf8_parse_stringIO (S-TR T3) 185.6079 msec/pass - - lxe: write_utf8_parse_stringIO (SATR T4) 1.2081 msec/pass - cET: write_utf8_parse_stringIO (SATR T4) 6.8581 msec/pass - ET : write_utf8_parse_stringIO (SATR T4) 10.6261 msec/pass + lxe: write_utf8_parse_stringIO (S-TR T1) 60.2388 msec/pass + cET: write_utf8_parse_stringIO (S-TR T1) 314.9750 msec/pass + ET : write_utf8_parse_stringIO (S-TR T1) 616.4260 msec/pass + + lxe: write_utf8_parse_stringIO (UATR T2) 71.7540 msec/pass + cET: write_utf8_parse_stringIO (UATR T2) 364.4099 msec/pass + ET : write_utf8_parse_stringIO (UATR T2) 684.5109 msec/pass + + lxe: write_utf8_parse_stringIO (S-TR T3) 
10.7441 msec/pass + cET: write_utf8_parse_stringIO (S-TR T3) 103.3869 msec/pass + ET : write_utf8_parse_stringIO (S-TR T3) 179.5921 msec/pass + + lxe: write_utf8_parse_stringIO (SATR T4) 1.1981 msec/pass + cET: write_utf8_parse_stringIO (SATR T4) 7.0901 msec/pass + ET : write_utf8_parse_stringIO (SATR T4) 10.4899 msec/pass For applications that require a high parser throughput of large files, and that do little to no serialization, cET is the best choice. Also -for iterparse applications that extract small amounts of data from -large XML data sets that do not fit into the memory. If it comes to -round-trip performance, however, lxml tends to be multiple times -faster in total. So, whenever the input documents are not -considerably larger than the output, lxml is the clear winner. +for iterparse applications that extract small amounts of data or +aggregate information from large XML data sets that do not fit into +memory. If it comes to round-trip performance, however, lxml tends to +be multiple times faster in total. So, whenever the input documents +are not considerably larger than the output, lxml is the clear winner. Regarding HTML parsing, Ian Bicking has done some `benchmarking on lxml's HTML parser`_, comparing it to a number of other famous HTML @@ -241,6 +245,13 @@ .. _`benchmarking on lxml's HTML parser`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/ +Liza Daly has written an article that presents a couple of tweaks to +get the most out of lxml's parser for very large XML documents. She +quite favourably positions ``lxml.etree`` as a tool for +`high-performance XML parsing`_. + +.. 
_`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + The ElementTree API =================== @@ -253,27 +264,27 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.0437 0.0498 0.0516 0.0430 0.0498 0.0519 - T2: 0.0550 0.0643 0.0677 0.0612 0.0685 0.0721 - T3: 0.0168 0.0142 0.0159 0.0338 0.0350 0.0359 - T4: 0.0003 0.0002 0.0003 0.0007 0.0007 0.0007 + T1: 0.0502 0.0572 0.0613 0.0494 0.0575 0.0615 + T2: 0.0602 0.0691 0.0747 0.0651 0.0745 0.0796 + T3: 0.0145 0.0157 0.0176 0.0392 0.0411 0.0415 + T4: 0.0003 0.0003 0.0003 0.0008 0.0008 0.0008 cET: -- S- U- -A SA UA - T1: 0.0093 0.0093 0.0093 0.0097 0.0094 0.0094 - T2: 0.0153 0.0155 0.0152 0.0157 0.0154 0.0154 - T3: 0.0076 0.0076 0.0076 0.0099 0.0122 0.0100 + T1: 0.0092 0.0094 0.0094 0.0094 0.0096 0.0093 + T2: 0.0152 0.0151 0.0152 0.0156 0.0154 0.0154 + T3: 0.0079 0.0080 0.0079 0.0106 0.0107 0.0134 T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 ET : -- S- U- -A SA UA - T1: 0.1074 0.1669 0.1050 0.2054 0.2401 0.1047 - T2: 0.2920 0.1172 0.3393 0.3830 0.1184 0.4215 - T3: 0.0347 0.0331 0.0316 0.0368 0.3944 0.0377 - T4: 0.0006 0.0005 0.0007 0.0006 0.0007 0.0006 - - -While lxml is still faster than ET in most cases (10-70%), cET can be up to -five times faster than lxml here. One of the reasons is that lxml must -additionally discard the created Python elements after their use, when they -are no longer referenced. ET and cET represent the tree itself through these -objects, which reduces the overhead in creating them. + T1: 0.1017 0.1715 0.1962 0.1080 0.2470 0.1049 + T2: 0.3130 0.3324 0.1130 0.3897 0.1158 0.4246 + T3: 0.0341 0.0323 0.0338 0.0358 0.3965 0.0359 + T4: 0.0006 0.0005 0.0006 0.0006 0.0007 0.0006 + +While lxml is still a lot faster than ET in most cases, cET can be up +to five times faster than lxml here. One of the reasons is that lxml +must additionally discard the created Python elements after their use, +when they are no longer referenced. 
ET and cET represent the tree +itself through these objects, which reduces the overhead in creating +them. Child access @@ -284,36 +295,36 @@ create a shallow copy of their list of children, lxml has to create a Python object for each child and collect them in a list:: - lxe: root_list_children (--TR T1) 0.0160 msec/pass - cET: root_list_children (--TR T1) 0.0081 msec/pass - ET : root_list_children (--TR T1) 0.0541 msec/pass - - lxe: root_list_children (--TR T2) 0.2100 msec/pass - cET: root_list_children (--TR T2) 0.0319 msec/pass - ET : root_list_children (--TR T2) 0.4420 msec/pass + lxe: root_list_children (--TR T1) 0.0148 msec/pass + cET: root_list_children (--TR T1) 0.0050 msec/pass + ET : root_list_children (--TR T1) 0.0219 msec/pass + + lxe: root_list_children (--TR T2) 0.1719 msec/pass + cET: root_list_children (--TR T2) 0.0260 msec/pass + ET : root_list_children (--TR T2) 0.3390 msec/pass This handicap is also visible when accessing single children:: - lxe: first_child (--TR T2) 0.2341 msec/pass - cET: first_child (--TR T2) 0.2198 msec/pass - ET : first_child (--TR T2) 0.8960 msec/pass - - lxe: last_child (--TR T1 ) 0.2549 msec/pass - cET: last_child (--TR T1 ) 0.2251 msec/pass - ET : last_child (--TR T1 ) 0.8969 msec/pass + lxe: first_child (--TR T2) 0.1879 msec/pass + cET: first_child (--TR T2) 0.1760 msec/pass + ET : first_child (--TR T2) 0.8099 msec/pass + + lxe: last_child (--TR T1) 0.1910 msec/pass + cET: last_child (--TR T1) 0.1872 msec/pass + ET : last_child (--TR T1) 0.8099 msec/pass ... unless you also add the time to find a child index in a bigger list. ET and cET use Python lists here, which are based on arrays. 
The data structure used by libxml2 is a linked tree, and thus, a linked list of children:: - lxe: middle_child (--TR T1) 0.2699 msec/pass - cET: middle_child (--TR T1) 0.2089 msec/pass - ET : middle_child (--TR T1) 0.8910 msec/pass - - lxe: middle_child (--TR T2) 1.9410 msec/pass - cET: middle_child (--TR T2) 0.2151 msec/pass - ET : middle_child (--TR T2) 0.8960 msec/pass + lxe: middle_child (--TR T1) 0.2189 msec/pass + cET: middle_child (--TR T1) 0.1779 msec/pass + ET : middle_child (--TR T1) 0.8030 msec/pass + + lxe: middle_child (--TR T2) 2.4071 msec/pass + cET: middle_child (--TR T2) 0.1781 msec/pass + ET : middle_child (--TR T2) 0.8039 msec/pass Element creation @@ -323,21 +334,21 @@ in. This results in a major performance difference for creating independent Elements that end up in independently created documents:: - lxe: create_elements (--TC T2) 1.7340 msec/pass - cET: create_elements (--TC T2) 0.1929 msec/pass - ET : create_elements (--TC T2) 1.3809 msec/pass + lxe: create_elements (--TC T2) 2.1949 msec/pass + cET: create_elements (--TC T2) 0.1941 msec/pass + ET : create_elements (--TC T2) 1.2760 msec/pass Therefore, it is always preferable to create Elements for the document they are supposed to end up in, either as SubElements of an Element or using the explicit ``Element.makeelement()`` call:: - lxe: makeelement (--TC T2) 1.6100 msec/pass - cET: makeelement (--TC T2) 0.3171 msec/pass - ET : makeelement (--TC T2) 1.6270 msec/pass + lxe: makeelement (--TC T2) 1.8370 msec/pass + cET: makeelement (--TC T2) 0.3200 msec/pass + ET : makeelement (--TC T2) 1.5380 msec/pass - lxe: create_subelements (--TC T2) 1.3542 msec/pass + lxe: create_subelements (--TC T2) 1.6761 msec/pass cET: create_subelements (--TC T2) 0.2329 msec/pass - ET : create_subelements (--TC T2) 3.3019 msec/pass + ET : create_subelements (--TC T2) 3.0999 msec/pass So, if the main performance bottleneck of an application is creating large XML trees in memory through calls to Element and 
SubElement, cET is the best @@ -354,13 +365,13 @@ The following benchmark appends all root children of the second tree to the root of the first tree:: - lxe: append_from_document (--TR T1,T2) 3.0038 msec/pass + lxe: append_from_document (--TR T1,T2) 3.4299 msec/pass cET: append_from_document (--TR T1,T2) 0.2639 msec/pass - ET : append_from_document (--TR T1,T2) 1.2522 msec/pass + ET : append_from_document (--TR T1,T2) 1.1489 msec/pass - lxe: append_from_document (--TR T3,T4) 0.0398 msec/pass - cET: append_from_document (--TR T3,T4) 0.0160 msec/pass - ET : append_from_document (--TR T3,T4) 0.0811 msec/pass + lxe: append_from_document (--TR T3,T4) 0.0429 msec/pass + cET: append_from_document (--TR T3,T4) 0.0169 msec/pass + ET : append_from_document (--TR T3,T4) 0.0780 msec/pass Although these are fairly small numbers compared to parsing, this easily shows the different performance classes for lxml and (c)ET. Where the latter do not @@ -371,22 +382,22 @@ This difference is not always as visible, but applies to most parts of the API, like inserting newly created elements:: - lxe: insert_from_document (--TR T1,T2) 4.9140 msec/pass - cET: insert_from_document (--TR T1,T2) 0.4108 msec/pass - ET : insert_from_document (--TR T1,T2) 1.4670 msec/pass + lxe: insert_from_document (--TR T1,T2) 6.1119 msec/pass + cET: insert_from_document (--TR T1,T2) 0.4129 msec/pass + ET : insert_from_document (--TR T1,T2) 1.4160 msec/pass or replacing the child slice by a newly created element:: - lxe: replace_children_element (--TC T1) 0.1500 msec/pass - cET: replace_children_element (--TC T1) 0.0238 msec/pass - ET : replace_children_element (--TC T1) 0.1600 msec/pass + lxe: replace_children_element (--TC T1) 0.1769 msec/pass + cET: replace_children_element (--TC T1) 0.0250 msec/pass + ET : replace_children_element (--TC T1) 0.1538 msec/pass as opposed to replacing the slice with an existing element from the same document:: - lxe: replace_children (--TC T1) 0.0160 msec/pass + lxe: 
replace_children (--TC T1) 0.0169 msec/pass cET: replace_children (--TC T1) 0.0119 msec/pass - ET : replace_children (--TC T1) 0.0741 msec/pass + ET : replace_children (--TC T1) 0.0758 msec/pass While these numbers are too small to provide a major performance impact in practice, you should keep this difference in mind when you @@ -398,17 +409,17 @@ Deep copying a tree is fast in lxml:: - lxe: deepcopy_all (--TR T1) 9.4090 msec/pass - cET: deepcopy_all (--TR T1) 120.1589 msec/pass - ET : deepcopy_all (--TR T1) 901.3789 msec/pass - - lxe: deepcopy_all (-ATR T2) 12.4569 msec/pass - cET: deepcopy_all (-ATR T2) 135.8809 msec/pass - ET : deepcopy_all (-ATR T2) 940.7840 msec/pass - - lxe: deepcopy_all (S-TR T3) 2.7640 msec/pass - cET: deepcopy_all (S-TR T3) 30.1108 msec/pass - ET : deepcopy_all (S-TR T3) 228.4350 msec/pass + lxe: deepcopy_all (--TR T1) 10.0670 msec/pass + cET: deepcopy_all (--TR T1) 115.8700 msec/pass + ET : deepcopy_all (--TR T1) 866.8201 msec/pass + + lxe: deepcopy_all (-ATR T2) 12.4321 msec/pass + cET: deepcopy_all (-ATR T2) 130.1000 msec/pass + ET : deepcopy_all (-ATR T2) 901.1638 msec/pass + + lxe: deepcopy_all (S-TR T3) 2.6951 msec/pass + cET: deepcopy_all (S-TR T3) 28.9950 msec/pass + ET : deepcopy_all (S-TR T3) 218.7109 msec/pass So, for example, if you have a database-like scenario where you parse in a large tree and then search and copy independent subtrees from it for further @@ -423,42 +434,43 @@ especially if few elements are of interest or the target element tag name is known, lxml is a good choice:: - lxe: getiterator_all (--TR T1) 5.0449 msec/pass - cET: getiterator_all (--TR T1) 42.0539 msec/pass - ET : getiterator_all (--TR T1) 22.9158 msec/pass - - lxe: getiterator_islice (--TR T2) 0.0789 msec/pass - cET: getiterator_islice (--TR T2) 0.3579 msec/pass - ET : getiterator_islice (--TR T2) 0.2351 msec/pass - - lxe: getiterator_tag (--TR T2) 0.0651 msec/pass - cET: getiterator_tag (--TR T2) 0.7648 msec/pass - ET : getiterator_tag (--TR T2) 
0.4380 msec/pass - - lxe: getiterator_tag_all (--TR T2) 0.8650 msec/pass - cET: getiterator_tag_all (--TR T2) 42.7120 msec/pass - ET : getiterator_tag_all (--TR T2) 21.5559 msec/pass + lxe: getiterator_all (--TR T1) 4.7209 msec/pass + cET: getiterator_all (--TR T1) 45.8400 msec/pass + ET : getiterator_all (--TR T1) 22.9480 msec/pass + + lxe: getiterator_islice (--TR T2) 0.0398 msec/pass + cET: getiterator_islice (--TR T2) 0.3798 msec/pass + ET : getiterator_islice (--TR T2) 0.1900 msec/pass + + lxe: getiterator_tag (--TR T2) 0.0160 msec/pass + cET: getiterator_tag (--TR T2) 0.8149 msec/pass + ET : getiterator_tag (--TR T2) 0.3560 msec/pass + + lxe: getiterator_tag_all (--TR T2) 0.6580 msec/pass + cET: getiterator_tag_all (--TR T2) 46.3769 msec/pass + ET : getiterator_tag_all (--TR T2) 20.3989 msec/pass This translates directly into similar timings for ``Element.findall()``:: - lxe: findall (--TR T2) 6.8750 msec/pass - cET: findall (--TR T2) 46.8600 msec/pass - ET : findall (--TR T2) 27.0121 msec/pass - - lxe: findall (--TR T3) 1.5690 msec/pass - cET: findall (--TR T3) 13.6340 msec/pass - ET : findall (--TR T3) 8.8100 msec/pass - - lxe: findall_tag (--TR T2) 1.0221 msec/pass - cET: findall_tag (--TR T2) 42.8400 msec/pass - ET : findall_tag (--TR T2) 21.4801 msec/pass - - lxe: findall_tag (--TR T3) 0.4241 msec/pass - cET: findall_tag (--TR T3) 10.7069 msec/pass - ET : findall_tag (--TR T3) 5.8560 msec/pass - -Note that all three libraries currently use the same Python implementation for -``findall()``, except for their native tree iterator (``element.iter()``). 
+ lxe: findall (--TR T2) 6.7198 msec/pass + cET: findall (--TR T2) 51.2750 msec/pass + ET : findall (--TR T2) 26.9110 msec/pass + + lxe: findall (--TR T3) 1.4520 msec/pass + cET: findall (--TR T3) 14.2760 msec/pass + ET : findall (--TR T3) 8.4310 msec/pass + + lxe: findall_tag (--TR T2) 0.7401 msec/pass + cET: findall_tag (--TR T2) 46.5961 msec/pass + ET : findall_tag (--TR T2) 20.3760 msec/pass + + lxe: findall_tag (--TR T3) 0.3331 msec/pass + cET: findall_tag (--TR T3) 11.5960 msec/pass + ET : findall_tag (--TR T3) 5.4510 msec/pass + +Note that all three libraries currently use the same Python +implementation for ``.findall()``, except for their native tree +iterator (``element.iter()``). XPath @@ -471,38 +483,38 @@ of the lxml API you use. The most straight forward way is to call the ``xpath()`` method on an Element or ElementTree:: - lxe: xpath_method (--TC T1) 1.5969 msec/pass - lxe: xpath_method (--TC T2) 21.3680 msec/pass - lxe: xpath_method (--TC T3) 0.1218 msec/pass - lxe: xpath_method (--TC T4) 1.0300 msec/pass + lxe: xpath_method (--TC T1) 1.5750 msec/pass + lxe: xpath_method (--TC T2) 20.9570 msec/pass + lxe: xpath_method (--TC T3) 0.1199 msec/pass + lxe: xpath_method (--TC T4) 1.0121 msec/pass This is well suited for testing and when the XPath expressions are as diverse as the trees they are called on. 
However, if you have a single XPath expression that you want to apply to a larger number of different elements, the ``XPath`` class is the most efficient way to do it:: - lxe: xpath_class (--TC T1) 0.6590 msec/pass - lxe: xpath_class (--TC T2) 2.9969 msec/pass - lxe: xpath_class (--TC T3) 0.0520 msec/pass - lxe: xpath_class (--TC T4) 0.1619 msec/pass + lxe: xpath_class (--TC T1) 0.6301 msec/pass + lxe: xpath_class (--TC T2) 2.6128 msec/pass + lxe: xpath_class (--TC T3) 0.0498 msec/pass + lxe: xpath_class (--TC T4) 0.1400 msec/pass Note that this still allows you to use variables in the expression, so you can parse it once and then adapt it through variables at call time. In other cases, where you have a fixed Element or ElementTree and want to run different expressions on it, you should consider the ``XPathEvaluator``:: - lxe: xpath_element (--TR T1) 0.4120 msec/pass - lxe: xpath_element (--TR T2) 11.5321 msec/pass - lxe: xpath_element (--TR T3) 0.1152 msec/pass - lxe: xpath_element (--TR T4) 0.3202 msec/pass + lxe: xpath_element (--TR T1) 0.2739 msec/pass + lxe: xpath_element (--TR T2) 10.8800 msec/pass + lxe: xpath_element (--TR T3) 0.0660 msec/pass + lxe: xpath_element (--TR T4) 0.2739 msec/pass While it looks slightly slower, creating an XPath object for each of the expressions generates a much higher overhead here:: - lxe: xpath_class_repeat (--TC T1) 1.5409 msec/pass - lxe: xpath_class_repeat (--TC T2) 20.2711 msec/pass - lxe: xpath_class_repeat (--TC T3) 0.1161 msec/pass - lxe: xpath_class_repeat (--TC T4) 0.9799 msec/pass + lxe: xpath_class_repeat (--TC T1) 1.5399 msec/pass + lxe: xpath_class_repeat (--TC T2) 20.5159 msec/pass + lxe: xpath_class_repeat (--TC T3) 0.1178 msec/pass + lxe: xpath_class_repeat (--TC T4) 0.9880 msec/pass A longer example @@ -640,8 +652,8 @@ ``iterparse()``: 0.07 versus 0.10 seconds. 
However, tree iteration in lxml is incredibly fast, so it can be better to parse the whole tree and then iterate over it rather than using ``iterparse()`` to do both in one step. - Or, you can just wait for the lxml authors to optimise iterparse in one of - the next releases... + Or, you can just wait for the lxml developers to optimise iterparse in one + of the next releases... lxml.objectify @@ -669,21 +681,21 @@ tree. It avoids step-by-step Python element instantiations along the path, which can substantially improve the access time:: - lxe: attribute (--TR T1) 8.4081 msec/pass - lxe: attribute (--TR T2) 51.3301 msec/pass - lxe: attribute (--TR T4) 8.2269 msec/pass - - lxe: objectpath (--TR T1) 4.6120 msec/pass - lxe: objectpath (--TR T2) 47.0440 msec/pass - lxe: objectpath (--TR T4) 4.4930 msec/pass - - lxe: attributes_deep (--TR T1) 12.6550 msec/pass - lxe: attributes_deep (--TR T2) 56.0241 msec/pass - lxe: attributes_deep (--TR T4) 12.5690 msec/pass - - lxe: objectpath_deep (--TR T1) 5.9190 msec/pass - lxe: objectpath_deep (--TR T2) 49.6972 msec/pass - lxe: objectpath_deep (--TR T4) 5.7530 msec/pass + lxe: attribute (--TR T1) 6.9990 msec/pass + lxe: attribute (--TR T2) 29.2060 msec/pass + lxe: attribute (--TR T4) 6.9048 msec/pass + + lxe: objectpath (--TR T1) 3.5410 msec/pass + lxe: objectpath (--TR T2) 24.9801 msec/pass + lxe: objectpath (--TR T4) 3.5069 msec/pass + + lxe: attributes_deep (--TR T1) 16.9580 msec/pass + lxe: attributes_deep (--TR T2) 39.8140 msec/pass + lxe: attributes_deep (--TR T4) 16.9699 msec/pass + + lxe: objectpath_deep (--TR T1) 9.4180 msec/pass + lxe: objectpath_deep (--TR T2) 31.7512 msec/pass + lxe: objectpath_deep (--TR T4) 9.4421 msec/pass Note, however, that parsing ObjectPath expressions is not for free either, so this is most effective for frequently accessing the same element. 
@@ -713,17 +725,17 @@ subtrees and elements) to cache, you can trade memory usage against access speed:: - lxe: attribute_cached (--TR T1) 6.4209 msec/pass - lxe: attribute_cached (--TR T2) 48.0378 msec/pass - lxe: attribute_cached (--TR T4) 6.3779 msec/pass - - lxe: attributes_deep_cached (--TR T1) 7.8559 msec/pass - lxe: attributes_deep_cached (--TR T2) 51.0719 msec/pass - lxe: attributes_deep_cached (--TR T4) 7.7350 msec/pass - - lxe: objectpath_deep_cached (--TR T1) 3.2761 msec/pass - lxe: objectpath_deep_cached (--TR T2) 45.7590 msec/pass - lxe: objectpath_deep_cached (--TR T4) 3.1459 msec/pass + lxe: attribute_cached (--TR T1) 5.1420 msec/pass + lxe: attribute_cached (--TR T2) 27.0739 msec/pass + lxe: attribute_cached (--TR T4) 5.1429 msec/pass + + lxe: attributes_deep_cached (--TR T1) 7.0908 msec/pass + lxe: attributes_deep_cached (--TR T2) 29.5591 msec/pass + lxe: attributes_deep_cached (--TR T4) 7.1721 msec/pass + + lxe: objectpath_deep_cached (--TR T1) 2.2731 msec/pass + lxe: objectpath_deep_cached (--TR T2) 23.1631 msec/pass + lxe: objectpath_deep_cached (--TR T4) 2.3179 msec/pass Things to note: you cannot currently use ``weakref.WeakKeyDictionary`` objects for this as lxml's element objects do not support weak references (which are