[Lxml-checkins] r52195 - in lxml/trunk: . benchmark doc src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Wed Mar 5 22:17:57 CET 2008


Author: scoder
Date: Wed Mar  5 22:17:55 2008
New Revision: 52195

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/benchmark/bench_etree.py
   lxml/trunk/benchmark/benchbase.py
   lxml/trunk/doc/tutorial.txt
   lxml/trunk/src/lxml/serializer.pxi
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/tree.pxd
Log:
 r3730 at delle:  sbehnel | 2008-03-04 22:44:06 +0100
 rewrite of 'text' serialiser: fix default encoding, faster .tail adding and 'unicode' encoding


Modified: lxml/trunk/benchmark/bench_etree.py
==============================================================================
--- lxml/trunk/benchmark/bench_etree.py	(original)
+++ lxml/trunk/benchmark/bench_etree.py	Wed Mar  5 22:17:55 2008
@@ -34,6 +34,35 @@
         for i in range(1000):
             child = root[pos]
 
+    @with_attributes(False)
+    @with_text(text=True)
+    @onlylib('lxe', 'ET')
+    def bench_tostring_text_ascii(self, root):
+        self.etree.tostring(root, method="text")
+
+    @with_attributes(False)
+    @with_text(text=True, utext=True)
+    @onlylib('lxe')
+    def bench_tostring_text_utf16(self, root):
+        self.etree.tostring(root, method="text", encoding='UTF-16')
+
+    @with_attributes(False)
+    @with_text(text=True, utext=True)
+    @onlylib('lxe', 'ET')
+    @children
+    def bench_tostring_text_utf8_with_tail(self, children):
+        for child in children:
+            self.etree.tostring(child, method="text",
+                                encoding='UTF-8', with_tail=True)
+
+    @with_attributes(False)
+    @with_text(text=True, utext=True)
+    @onlylib('lxe')
+    @children
+    def bench_tostring_text_unicode(self, children):
+        for child in children:
+            self.etree.tostring(child, method="text", encoding=unicode)
+
     @with_attributes(True, False)
     @with_text(text=True, utext=True)
     def bench_tostring_utf8(self, root):

Modified: lxml/trunk/benchmark/benchbase.py
==============================================================================
--- lxml/trunk/benchmark/benchbase.py	(original)
+++ lxml/trunk/benchmark/benchbase.py	Wed Mar  5 22:17:55 2008
@@ -200,7 +200,7 @@
             el.text = text
             for ch2 in atoz:
                 for i in range(20 * TREE_FACTOR):
-                    SubElement(el, "{cdefg}%s%05d" % (ch2, i))
+                    SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text
         t = current_time() - t
         return (root, t)
 
@@ -216,7 +216,7 @@
                 el = SubElement(root, "{abc}"+ch1*5, attributes)
                 el.text = text
                 for ch2 in atoz:
-                    SubElement(el, "{cdefg}%s%05d" % (ch2, i))
+                    SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text
         t = current_time() - t
         return (root, t)
 
@@ -231,8 +231,9 @@
             tag_no = count().next
             children = [ SubElement(c, "{cdefg}a%05d" % i, attributes)
                          for i,c in enumerate(chain(children, children, children)) ]
-        for child in root:
+        for child in children:
             child.text = text
+            child.tail = text
         t = current_time() - t
         return (root, t)
 
@@ -246,8 +247,8 @@
         for ch1 in self.atoz:
             el = SubElement(root, "{abc}"+ch1*5, attributes)
             el.text = text
-            SubElement(el, "{cdefg}a00001", attributes)
-            SubElement(el, "{cdefg}z00000", attributes)
+            SubElement(el, "{cdefg}a00001", attributes).tail = text
+            SubElement(el, "{cdefg}z00000", attributes).tail = text
         t = current_time() - t
         return (root, t)
 

Modified: lxml/trunk/doc/tutorial.txt
==============================================================================
--- lxml/trunk/doc/tutorial.txt	(original)
+++ lxml/trunk/doc/tutorial.txt	Wed Mar  5 22:17:55 2008
@@ -557,14 +557,20 @@
    >>> print etree.tostring(root, method='text')
    HelloWorld
 
-Note that the default encoding for plain text serialisation is UTF-8:
+As with XML serialisation, the default encoding for plain text
+serialisation is ASCII:
 
 .. sourcecode:: pycon
 
    >>> br = root.find('.//br')
    >>> br.tail = u'W\xf6rld'
 
-   >>> etree.tostring(root, method='text')
+   >>> etree.tostring(root, method='text')  # doctest: +ELLIPSIS
+   Traceback (most recent call last):
+     ...
+   UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' ...
+
+   >>> etree.tostring(root, method='text', encoding="UTF-8")
    'HelloW\xc3\xb6rld'
 
 Here, serialising to a Python unicode string instead of a byte string

Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi	(original)
+++ lxml/trunk/src/lxml/serializer.pxi	Wed Mar  5 22:17:55 2008
@@ -18,28 +18,46 @@
     raise ValueError("unknown output method %r" % method)
 
 cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
+    cdef bint needs_conversion
     cdef char* c_text
+    cdef xmlNode* c_text_node
+    cdef tree.xmlBuffer* c_buffer
+
+    c_buffer = tree.xmlBufferCreate()
+    if c_buffer is NULL:
+        return python.PyErr_NoMemory()
+
     with nogil:
-        c_text = tree.xmlNodeGetContent(c_node)
-    if c_text is NULL:
-        python.PyErr_NoMemory()
-
-    text = c_text
-    tree.xmlFree(c_text)
-
-    if with_tail and _hasTail(c_node):
-        tail = _collectText(c_node.next)
-        if tail:
-            text = text + tail
+        tree.xmlNodeBufGetContent(c_buffer, c_node)
+        if with_tail:
+            c_text_node = _textNodeOrSkip(c_node.next)
+            while c_text_node is not NULL:
+                tree.xmlBufferWriteChar(c_buffer, c_text_node.content)
+                c_text_node = _textNodeOrSkip(c_text_node.next)
+        c_text = tree.xmlBufferContent(c_buffer)
 
-    if encoding is None:
-        return text
-    encoding = encoding.upper()
-    if encoding == 'UTF-8' or encoding == 'ASCII':
-        return text
+    try:
+        needs_conversion = 0
+        if encoding is not None:
+            encoding = encoding.upper()
+            if encoding != 'UTF-8':
+                if encoding == 'ASCII':
+                    if isutf8(c_text):
+                        # non-ASCII content: the encode step below will raise
+                        needs_conversion = 1
+                else:
+                    needs_conversion = 1
+
+        if needs_conversion:
+            text = python.PyUnicode_DecodeUTF8(
+                c_text, tree.xmlBufferLength(c_buffer), 'strict')
+            text = python.PyUnicode_AsEncodedString(text, encoding, 'strict')
+        else:
+            text = c_text
+    finally:
+        tree.xmlBufferFree(c_buffer);
+    return text
 
-    text = python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
-    return python.PyUnicode_AsEncodedString(text, encoding, 'strict')
 
 cdef _tostring(_Element element, encoding, method,
                bint write_xml_declaration, bint write_complete_document,

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Wed Mar  5 22:17:55 2008
@@ -1849,6 +1849,27 @@
         self.assertEquals(u'ABSøk på nettetCtail'.encode("UTF-16"),
                           result)
 
+    def test_tostring_method_text_unicode(self):
+        tostring = self.etree.tostring
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        a = Element('a')
+        a.text = u'Søk på nettetA'
+        a.tail = "tail"
+        b = SubElement(a, 'b')
+        b.text = "B"
+        b.tail = u'Søk på nettetB'
+        c = SubElement(a, 'c')
+        c.text = "C"
+        
+        self.assertRaises(UnicodeEncodeError,
+                          tostring, a, method="text")
+        
+        self.assertEquals(
+            u'Søk på nettetABSøk på nettetBCtail'.encode('utf-8'),
+            tostring(a, encoding="UTF-8", method="text"))
+
     def test_tounicode(self):
         tounicode = self.etree.tounicode
         Element = self.etree.Element

Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd	(original)
+++ lxml/trunk/src/lxml/tree.pxd	Wed Mar  5 22:17:55 2008
@@ -212,6 +212,7 @@
     cdef xmlAttr* xmlHasProp(xmlNode* node, char* name) nogil
     cdef xmlAttr* xmlHasNsProp(xmlNode* node, char* name, char* nameSpace) nogil
     cdef char* xmlNodeGetContent(xmlNode* cur) nogil
+    cdef char* xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil
     cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, char* prefix) nogil
     cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, char* href) nogil
     cdef int xmlIsBlankNode(xmlNode* node) nogil
@@ -229,6 +230,7 @@
     cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) nogil
     cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns) nogil
     cdef xmlBuffer* xmlBufferCreate() nogil
+    cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string) nogil
     cdef void xmlBufferFree(xmlBuffer* buf) nogil
     cdef char* xmlBufferContent(xmlBuffer* buf) nogil
     cdef int xmlBufferLength(xmlBuffer* buf) nogil
@@ -249,7 +251,6 @@
                                    xmlNotationTable* table) nogil
 
 cdef extern from "libxml/xmlIO.h":
-    cdef void xmlBufferWriteQuotedString(xmlOutputBuffer* out, char* str) nogil
     cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, char* str) nogil
     cdef int xmlOutputBufferWrite(xmlOutputBuffer* out,
                                   int len, char* str) nogil


More information about the lxml-checkins mailing list