[Lxml-checkins] r46666 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Sat Sep 15 23:44:55 CEST 2007
Author: scoder
Date: Sat Sep 15 23:44:54 2007
New Revision: 46666
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/extensions.pxi
lxml/trunk/src/lxml/serializer.pxi
lxml/trunk/src/lxml/tests/test_elementtree.py
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/tree.pxd
Log:
support for selecting the output method on serialisation
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Sat Sep 15 23:44:54 2007
@@ -8,6 +8,10 @@
Features added
--------------
+* ``ET.write()``, ``tostring()`` and ``tounicode()`` now accept a keyword
+ argument ``method`` that can be one of 'xml' (or None), 'html' or 'text' to
+ serialise as XML, HTML or plain text content.
+
* ``iterfind()`` method on Elements returns an iterator equivalent to
``findall()``
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Sat Sep 15 23:44:54 2007
@@ -1374,11 +1374,14 @@
return self._doc._parser
return None
- def write(self, file, encoding=None,
+ def write(self, file, encoding=None, method="xml",
pretty_print=False, xml_declaration=None):
"""Write the tree to a file or file-like object.
-
+
Defaults to ASCII encoding and writing a declaration as needed.
+
+ The keyword argument 'method' selects the output method: 'xml',
+ 'html' or 'text'.
"""
cdef int c_write_declaration
self._assertHasRoot()
@@ -1394,7 +1397,7 @@
encoding = encoding.upper()
c_write_declaration = encoding not in \
('US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
- _tofilelike(file, self._context_node, encoding,
+ _tofilelike(file, self._context_node, encoding, method,
c_write_declaration, 1, bool(pretty_print))
def getpath(self, _Element element not None):
@@ -2148,7 +2151,7 @@
"""
_dumpToFile(sys.stdout, elem._c_node, bool(pretty_print))
-def tostring(element_or_tree, encoding=None,
+def tostring(element_or_tree, encoding=None, method="xml",
xml_declaration=None, pretty_print=False):
"""Serialize an element to an encoded string representation of its XML
tree.
@@ -2159,6 +2162,8 @@
compatible encoding will enable a declaration by default.
The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+ The keyword argument 'method' selects the output method: 'xml', 'html' or 'text'.
"""
cdef int write_declaration
cdef int c_pretty_print
@@ -2173,15 +2178,15 @@
encoding = 'ASCII'
if isinstance(element_or_tree, _Element):
- return _tostring(<_Element>element_or_tree,
- encoding, write_declaration, 0, c_pretty_print)
+ return _tostring(<_Element>element_or_tree, encoding, method,
+ write_declaration, 0, c_pretty_print)
elif isinstance(element_or_tree, _ElementTree):
return _tostring((<_ElementTree>element_or_tree)._context_node,
- encoding, write_declaration, 1, c_pretty_print)
+ encoding, method, write_declaration, 1, c_pretty_print)
else:
raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree)
-def tounicode(element_or_tree, pretty_print=False):
+def tounicode(element_or_tree, method="xml", pretty_print=False):
"""Serialize an element to the Python unicode representation of its XML
tree.
@@ -2190,14 +2195,16 @@
further treatment.
The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+ The keyword argument 'method' selects the output method: 'xml', 'html' or 'text'.
"""
cdef int c_pretty_print
c_pretty_print = bool(pretty_print)
if isinstance(element_or_tree, _Element):
- return _tounicode(<_Element>element_or_tree, 0, c_pretty_print)
+ return _tounicode(<_Element>element_or_tree, method, 0, c_pretty_print)
elif isinstance(element_or_tree, _ElementTree):
return _tounicode((<_ElementTree>element_or_tree)._context_node,
- 1, c_pretty_print)
+ method, 1, c_pretty_print)
else:
raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree)
Modified: lxml/trunk/src/lxml/extensions.pxi
==============================================================================
--- lxml/trunk/src/lxml/extensions.pxi (original)
+++ lxml/trunk/src/lxml/extensions.pxi Sat Sep 15 23:44:54 2007
@@ -363,8 +363,12 @@
return firstnode
elif isinstance(firstnode, _Element):
c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
- s = funicode(c_text)
- tree.xmlFree(c_text)
+ if c_text is NULL:
+ python.PyErr_NoMemory()
+ try:
+ s = funicode(c_text)
+ finally:
+ tree.xmlFree(c_text)
return s
else:
return str(firstnode)
Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi (original)
+++ lxml/trunk/src/lxml/serializer.pxi Sat Sep 15 23:44:54 2007
@@ -1,21 +1,75 @@
# XML serialization and output functions
-cdef _tostring(_Element element, encoding,
+cdef enum _OutputMethods:
+ OUTPUT_METHOD_XML
+ OUTPUT_METHOD_HTML
+ OUTPUT_METHOD_TEXT
+
+cdef int _findOutputMethod(method) except -1:
+ if method is None:
+ return OUTPUT_METHOD_XML
+ method = method.lower()
+ if method == "xml":
+ return OUTPUT_METHOD_XML
+ if method == "html":
+ return OUTPUT_METHOD_HTML
+ if method == "text":
+ return OUTPUT_METHOD_TEXT
+ raise ValueError, "unknown output method %r" % method
+
+cdef _textToString(xmlNode* c_node, encoding):
+ cdef python.PyThreadState* state
+ cdef char* c_text
+ state = python.PyEval_SaveThread()
+ c_text = tree.xmlNodeGetContent(c_node)
+ python.PyEval_RestoreThread(state)
+ if c_text is NULL:
+ python.PyErr_NoMemory()
+
+ try:
+ if _hasTail(c_node):
+ tail = _collectText(c_node.next)
+ if tail:
+ text = c_text + tail
+ else:
+ text = c_text
+ else:
+ text = c_text
+ finally:
+ tree.xmlFree(c_text)
+
+ if encoding is None:
+ return text
+ encoding = encoding.upper()
+ if encoding == 'UTF-8' or encoding == 'ASCII':
+ return text
+
+ text = python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
+ return python.PyUnicode_AsEncodedString(text, encoding, 'strict')
+
+cdef _tostring(_Element element, encoding, method,
int write_xml_declaration, int write_complete_document,
int pretty_print):
- "Serialize an element to an encoded string representation of its XML tree."
+ """Serialize an element to an encoded string representation of its XML
+ tree.
+ """
cdef python.PyThreadState* state
cdef tree.xmlOutputBuffer* c_buffer
cdef tree.xmlBuffer* c_result_buffer
cdef tree.xmlCharEncodingHandler* enchandler
cdef char* c_enc
cdef char* c_version
+ cdef int c_method
if element is None:
return None
if encoding is None:
c_enc = NULL
else:
- c_enc = encoding
+ encoding = _utf8(encoding)
+ c_enc = _cstr(encoding)
+ c_method = _findOutputMethod(method)
+ if c_method == OUTPUT_METHOD_TEXT:
+ return _textToString(element._c_node, encoding)
# it is necessary to *and* find the encoding handler *and* use
# encoding during output
enchandler = tree.xmlFindCharEncodingHandler(c_enc)
@@ -29,7 +83,7 @@
try:
state = python.PyEval_SaveThread()
- _writeNodeToBuffer(c_buffer, element._c_node, c_enc,
+ _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method,
write_xml_declaration, write_complete_document,
pretty_print)
tree.xmlOutputBufferFlush(c_buffer)
@@ -45,19 +99,27 @@
tree.xmlOutputBufferClose(c_buffer)
return result
-cdef _tounicode(_Element element, int write_complete_document, int pretty_print):
- "Serialize an element to the Python unicode representation of its XML tree."
+cdef _tounicode(_Element element, method,
+ int write_complete_document, int pretty_print):
+ """Serialize an element to the Python unicode representation of its XML
+ tree.
+ """
cdef python.PyThreadState* state
cdef tree.xmlOutputBuffer* c_buffer
cdef tree.xmlBuffer* c_result_buffer
+ cdef int c_method
if element is None:
return None
+ c_method = _findOutputMethod(method)
+ if c_method == OUTPUT_METHOD_TEXT:
+ text = _textToString(element._c_node, None)
+ return python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
c_buffer = tree.xmlAllocOutputBuffer(NULL)
if c_buffer is NULL:
raise LxmlError, "Failed to create output buffer"
try:
state = python.PyEval_SaveThread()
- _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0,
+ _writeNodeToBuffer(c_buffer, element._c_node, NULL, c_method, 0,
write_complete_document, pretty_print)
tree.xmlOutputBufferFlush(c_buffer)
python.PyEval_RestoreThread(state)
@@ -74,14 +136,14 @@
return result
cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer,
- xmlNode* c_node, char* encoding,
+ xmlNode* c_node, char* encoding, int c_method,
int write_xml_declaration,
int write_complete_document,
int pretty_print):
cdef xmlDoc* c_doc
cdef xmlNode* c_nsdecl_node
c_doc = c_node.doc
- if write_xml_declaration:
+ if write_xml_declaration and c_method == OUTPUT_METHOD_XML:
_writeDeclarationToBuffer(c_buffer, c_doc.version, encoding)
# write internal DTD subset, preceding PIs/comments, etc.
@@ -101,8 +163,12 @@
c_nsdecl_node.last = c_node.last
# write node
- tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0,
- pretty_print, encoding)
+ if c_method == OUTPUT_METHOD_XML:
+ tree.xmlNodeDumpOutput(
+ c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding)
+ else:
+ tree.htmlNodeDumpFormatOutput(
+ c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print)
if c_nsdecl_node is not c_node:
# clean up
@@ -244,7 +310,7 @@
cdef int _closeFilelikeWriter(void* ctxt):
return (<_FilelikeWriter>ctxt).close()
-cdef _tofilelike(f, _Element element, encoding,
+cdef _tofilelike(f, _Element element, encoding, method,
int write_xml_declaration, int write_doctype,
int pretty_print):
cdef python.PyThreadState* state
@@ -255,7 +321,17 @@
if encoding is None:
c_enc = NULL
else:
- c_enc = encoding
+ encoding = _utf8(encoding)
+ c_enc = _cstr(encoding)
+ c_method = _findOutputMethod(method)
+ if c_method == OUTPUT_METHOD_TEXT:
+ if _isString(f):
+ f = open(f, 'wb')
+ f.write(_textToString(element._c_node, encoding))
+ f.close()
+ else:
+ f.write(_textToString(element._c_node, encoding))
+ return
enchandler = tree.xmlFindCharEncodingHandler(c_enc)
if enchandler is NULL:
raise LookupError, python.PyString_FromFormat(
@@ -275,7 +351,7 @@
tree.xmlCharEncCloseFunc(enchandler)
raise TypeError, "File or filename expected, got '%s'" % type(f)
- _writeNodeToBuffer(c_buffer, element._c_node, c_enc,
+ _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method,
write_xml_declaration, write_doctype, pretty_print)
tree.xmlOutputBufferClose(c_buffer)
tree.xmlCharEncCloseFunc(enchandler)
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Sat Sep 15 23:44:54 2007
@@ -716,6 +716,47 @@
self.assertEquals(
'<doc%s>This is a test.</doc%s>' % (i, i),
canonicalize(data))
+
+ def test_write_method_html(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ html = Element('html')
+ body = SubElement(html, 'body')
+ p = SubElement(body, 'p')
+ p.text = "html"
+ SubElement(p, 'br').tail = "test"
+
+ tree = ElementTree(element=html)
+ f = StringIO()
+ tree.write(f, method="html")
+ data = f.getvalue()
+
+ self.assertEquals('<html><body><p>html<br>test</p></body></html>',
+ data)
+
+ def test_write_method_text(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = "TAIL"
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ tree = ElementTree(element=a)
+ f = StringIO()
+ tree.write(f, method="text")
+ data = f.getvalue()
+
+ self.assertEquals('ABTAILCtail',
+ data)
def test_write_fail(self):
ElementTree = self.etree.ElementTree
@@ -2099,6 +2140,37 @@
self.assert_(tostring(b) == '<b/>Foo' or
tostring(b) == '<b />Foo')
+ def test_tostring_method_html(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ html = Element('html')
+ body = SubElement(html, 'body')
+ p = SubElement(body, 'p')
+ p.text = "html"
+ SubElement(p, 'br').tail = "test"
+
+ self.assertEquals('<html><body><p>html<br>test</p></body></html>',
+ tostring(html, method="html"))
+
+ def test_tostring_method_text(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = "TAIL"
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ self.assertEquals('ABTAILCtail',
+ tostring(a, method="text"))
+
def test_iterparse(self):
iterparse = self.etree.iterparse
f = StringIO('<a><b></b><c/></a>')
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Sat Sep 15 23:44:54 2007
@@ -1761,6 +1761,25 @@
result = tostring(a, pretty_print=True)
self.assertEquals(result, "<a>\n <b/>\n <c/>\n</a>")
+ def test_tostring_method_text_encoding(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = u"Søk på nettet"
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ result = tostring(a, method="text", encoding="UTF-16")
+
+ self.assertEquals(u'ABSøk på nettetCtail'.encode("UTF-16"),
+ result)
+
def test_tounicode(self):
tounicode = self.etree.tounicode
Element = self.etree.Element
Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd (original)
+++ lxml/trunk/src/lxml/tree.pxd Sat Sep 15 23:44:54 2007
@@ -234,6 +234,11 @@
cdef char* xmlBuildURI(char* href, char* base)
cdef int xmlValidateNCName(char* value, int space)
+cdef extern from "libxml/HTMLtree.h":
+ cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf,
+ xmlDoc* doc, xmlNode* cur,
+ char* encoding, int format)
+
cdef extern from "libxml/valid.h":
cdef xmlAttr* xmlGetID(xmlDoc* doc, char* ID)
cdef void xmlDumpNotationTable(xmlBuffer* buffer, xmlNotationTable* table)
More information about the lxml-checkins
mailing list