[Lxml-checkins] r52195 - in lxml/trunk: . benchmark doc src/lxml src/lxml/tests
scoder at codespeak.net
Wed Mar 5 22:17:57 CET 2008
Author: scoder
Date: Wed Mar 5 22:17:55 2008
New Revision: 52195
Modified:
lxml/trunk/ (props changed)
lxml/trunk/benchmark/bench_etree.py
lxml/trunk/benchmark/benchbase.py
lxml/trunk/doc/tutorial.txt
lxml/trunk/src/lxml/serializer.pxi
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/tree.pxd
Log:
r3730 at delle: sbehnel | 2008-03-04 22:44:06 +0100
rewrite of 'text' serialiser: fix default encoding, faster .tail adding and 'unicode' encoding
Modified: lxml/trunk/benchmark/bench_etree.py
==============================================================================
--- lxml/trunk/benchmark/bench_etree.py (original)
+++ lxml/trunk/benchmark/bench_etree.py Wed Mar 5 22:17:55 2008
@@ -34,6 +34,35 @@
for i in range(1000):
child = root[pos]
+ @with_attributes(False)
+ @with_text(text=True)
+ @onlylib('lxe', 'ET')
+ def bench_tostring_text_ascii(self, root):
+ self.etree.tostring(root, method="text")
+
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ @onlylib('lxe')
+ def bench_tostring_text_utf16(self, root):
+ self.etree.tostring(root, method="text", encoding='UTF-16')
+
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ @onlylib('lxe', 'ET')
+ @children
+ def bench_tostring_text_utf8_with_tail(self, children):
+ for child in children:
+ self.etree.tostring(child, method="text",
+ encoding='UTF-8', with_tail=True)
+
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ @onlylib('lxe')
+ @children
+ def bench_tostring_text_unicode(self, children):
+ for child in children:
+ self.etree.tostring(child, method="text", encoding=unicode)
+
@with_attributes(True, False)
@with_text(text=True, utext=True)
def bench_tostring_utf8(self, root):
Modified: lxml/trunk/benchmark/benchbase.py
==============================================================================
--- lxml/trunk/benchmark/benchbase.py (original)
+++ lxml/trunk/benchmark/benchbase.py Wed Mar 5 22:17:55 2008
@@ -200,7 +200,7 @@
el.text = text
for ch2 in atoz:
for i in range(20 * TREE_FACTOR):
- SubElement(el, "{cdefg}%s%05d" % (ch2, i))
+ SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text
t = current_time() - t
return (root, t)
@@ -216,7 +216,7 @@
el = SubElement(root, "{abc}"+ch1*5, attributes)
el.text = text
for ch2 in atoz:
- SubElement(el, "{cdefg}%s%05d" % (ch2, i))
+ SubElement(el, "{cdefg}%s%05d" % (ch2, i)).tail = text
t = current_time() - t
return (root, t)
@@ -231,8 +231,9 @@
tag_no = count().next
children = [ SubElement(c, "{cdefg}a%05d" % i, attributes)
for i,c in enumerate(chain(children, children, children)) ]
- for child in root:
+ for child in children:
child.text = text
+ child.tail = text
t = current_time() - t
return (root, t)
@@ -246,8 +247,8 @@
for ch1 in self.atoz:
el = SubElement(root, "{abc}"+ch1*5, attributes)
el.text = text
- SubElement(el, "{cdefg}a00001", attributes)
- SubElement(el, "{cdefg}z00000", attributes)
+ SubElement(el, "{cdefg}a00001", attributes).tail = text
+ SubElement(el, "{cdefg}z00000", attributes).tail = text
t = current_time() - t
return (root, t)
Modified: lxml/trunk/doc/tutorial.txt
==============================================================================
--- lxml/trunk/doc/tutorial.txt (original)
+++ lxml/trunk/doc/tutorial.txt Wed Mar 5 22:17:55 2008
@@ -557,14 +557,20 @@
>>> print etree.tostring(root, method='text')
HelloWorld
-Note that the default encoding for plain text serialisation is UTF-8:
+As for XML serialisation, the default encoding for plain text
+serialisation is ASCII:
.. sourcecode:: pycon
>>> br = root.find('.//br')
>>> br.tail = u'W\xf6rld'
- >>> etree.tostring(root, method='text')
+ >>> etree.tostring(root, method='text') # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' ...
+
+ >>> etree.tostring(root, method='text', encoding="UTF-8")
'HelloW\xc3\xb6rld'
Here, serialising to a Python unicode string instead of a byte string
Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi (original)
+++ lxml/trunk/src/lxml/serializer.pxi Wed Mar 5 22:17:55 2008
@@ -18,28 +18,46 @@
raise ValueError("unknown output method %r" % method)
cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
+ cdef bint needs_conversion
cdef char* c_text
+ cdef xmlNode* c_text_node
+ cdef tree.xmlBuffer* c_buffer
+
+ c_buffer = tree.xmlBufferCreate()
+ if c_buffer is NULL:
+ return python.PyErr_NoMemory()
+
with nogil:
- c_text = tree.xmlNodeGetContent(c_node)
- if c_text is NULL:
- python.PyErr_NoMemory()
-
- text = c_text
- tree.xmlFree(c_text)
-
- if with_tail and _hasTail(c_node):
- tail = _collectText(c_node.next)
- if tail:
- text = text + tail
+ tree.xmlNodeBufGetContent(c_buffer, c_node)
+ if with_tail:
+ c_text_node = _textNodeOrSkip(c_node.next)
+ while c_text_node is not NULL:
+ tree.xmlBufferWriteChar(c_buffer, c_text_node.content)
+ c_text_node = _textNodeOrSkip(c_text_node.next)
+ c_text = tree.xmlBufferContent(c_buffer)
- if encoding is None:
- return text
- encoding = encoding.upper()
- if encoding == 'UTF-8' or encoding == 'ASCII':
- return text
+ try:
+ needs_conversion = 0
+ if encoding is not None:
+ encoding = encoding.upper()
+ if encoding != 'UTF-8':
+ if encoding == 'ASCII':
+ if isutf8(c_text):
+ # will raise a decode error below
+ needs_conversion = 1
+ else:
+ needs_conversion = 1
+
+ if needs_conversion:
+ text = python.PyUnicode_DecodeUTF8(
+ c_text, tree.xmlBufferLength(c_buffer), 'strict')
+ text = python.PyUnicode_AsEncodedString(text, encoding, 'strict')
+ else:
+ text = c_text
+ finally:
+ tree.xmlBufferFree(c_buffer);
+ return text
- text = python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
- return python.PyUnicode_AsEncodedString(text, encoding, 'strict')
cdef _tostring(_Element element, encoding, method,
bint write_xml_declaration, bint write_complete_document,
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Wed Mar 5 22:17:55 2008
@@ -1849,6 +1849,27 @@
self.assertEquals(u'ABSøk på nettetCtail'.encode("UTF-16"),
result)
+ def test_tostring_method_text_unicode(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = u'Søk på nettetA'
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = u'Søk på nettetB'
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ self.assertRaises(UnicodeEncodeError,
+ tostring, a, method="text")
+
+ self.assertEquals(
+ u'Søk på nettetABSøk på nettetBCtail'.encode('utf-8'),
+ tostring(a, encoding="UTF-8", method="text"))
+
def test_tounicode(self):
tounicode = self.etree.tounicode
Element = self.etree.Element
Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd (original)
+++ lxml/trunk/src/lxml/tree.pxd Wed Mar 5 22:17:55 2008
@@ -212,6 +212,7 @@
cdef xmlAttr* xmlHasProp(xmlNode* node, char* name) nogil
cdef xmlAttr* xmlHasNsProp(xmlNode* node, char* name, char* nameSpace) nogil
cdef char* xmlNodeGetContent(xmlNode* cur) nogil
+ cdef char* xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil
cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, char* prefix) nogil
cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, char* href) nogil
cdef int xmlIsBlankNode(xmlNode* node) nogil
@@ -229,6 +230,7 @@
cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) nogil
cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns) nogil
cdef xmlBuffer* xmlBufferCreate() nogil
+ cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string) nogil
cdef void xmlBufferFree(xmlBuffer* buf) nogil
cdef char* xmlBufferContent(xmlBuffer* buf) nogil
cdef int xmlBufferLength(xmlBuffer* buf) nogil
@@ -249,7 +251,6 @@
xmlNotationTable* table) nogil
cdef extern from "libxml/xmlIO.h":
- cdef void xmlBufferWriteQuotedString(xmlOutputBuffer* out, char* str) nogil
cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, char* str) nogil
cdef int xmlOutputBufferWrite(xmlOutputBuffer* out,
int len, char* str) nogil
More information about the lxml-checkins mailing list