[Lxml-checkins] r46666 - in lxml/trunk: . src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Sat Sep 15 23:44:55 CEST 2007


Author: scoder
Date: Sat Sep 15 23:44:54 2007
New Revision: 46666

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/etree.pyx
   lxml/trunk/src/lxml/extensions.pxi
   lxml/trunk/src/lxml/serializer.pxi
   lxml/trunk/src/lxml/tests/test_elementtree.py
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/tree.pxd
Log:
support for selecting the output method on serialisation

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Sat Sep 15 23:44:54 2007
@@ -8,6 +8,10 @@
 Features added
 --------------
 
+* ``ET.write()``, ``tostring()`` and ``tounicode()`` now accept a keyword
+  argument ``method`` that can be one of 'xml' (or None), 'html' or 'text' to
+  serialise as XML, HTML or plain text content.
+
 * ``iterfind()`` method on Elements returns an iterator equivalent to
   ``findall()``
 

Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx	(original)
+++ lxml/trunk/src/lxml/etree.pyx	Sat Sep 15 23:44:54 2007
@@ -1374,11 +1374,14 @@
                 return self._doc._parser
             return None
 
-    def write(self, file, encoding=None,
+    def write(self, file, encoding=None, method="xml",
               pretty_print=False, xml_declaration=None):
         """Write the tree to a file or file-like object.
-        
+
         Defaults to ASCII encoding and writing a declaration as needed.
+
+        The keyword argument 'method' selects the output method: 'xml',
+        'html' or 'text'.
         """
         cdef int c_write_declaration
         self._assertHasRoot()
@@ -1394,7 +1397,7 @@
             encoding = encoding.upper()
             c_write_declaration = encoding not in \
                                   ('US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
-        _tofilelike(file, self._context_node, encoding,
+        _tofilelike(file, self._context_node, encoding, method,
                     c_write_declaration, 1, bool(pretty_print))
 
     def getpath(self, _Element element not None):
@@ -2148,7 +2151,7 @@
     """
     _dumpToFile(sys.stdout, elem._c_node, bool(pretty_print))
 
-def tostring(element_or_tree, encoding=None,
+def tostring(element_or_tree, encoding=None, method="xml",
              xml_declaration=None, pretty_print=False):
     """Serialize an element to an encoded string representation of its XML
     tree.
@@ -2159,6 +2162,8 @@
     compatible encoding will enable a declaration by default.
 
     The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+    The keyword argument 'method' selects 'xml', 'html' or 'text' output.
     """
     cdef int write_declaration
     cdef int c_pretty_print
@@ -2173,15 +2178,15 @@
         encoding = 'ASCII'
 
     if isinstance(element_or_tree, _Element):
-        return _tostring(<_Element>element_or_tree,
-                         encoding, write_declaration, 0, c_pretty_print)
+        return _tostring(<_Element>element_or_tree, encoding, method,
+                         write_declaration, 0, c_pretty_print)
     elif isinstance(element_or_tree, _ElementTree):
         return _tostring((<_ElementTree>element_or_tree)._context_node,
-                         encoding, write_declaration, 1, c_pretty_print)
+                         encoding, method, write_declaration, 1, c_pretty_print)
     else:
         raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree)
 
-def tounicode(element_or_tree, pretty_print=False):
+def tounicode(element_or_tree, method="xml", pretty_print=False):
     """Serialize an element to the Python unicode representation of its XML
     tree.
 
@@ -2190,14 +2195,16 @@
     further treatment.
 
     The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+    The keyword argument 'method' selects 'xml', 'html' or 'text' output.
     """
     cdef int c_pretty_print
     c_pretty_print = bool(pretty_print)
     if isinstance(element_or_tree, _Element):
-        return _tounicode(<_Element>element_or_tree, 0, c_pretty_print)
+        return _tounicode(<_Element>element_or_tree, method, 0, c_pretty_print)
     elif isinstance(element_or_tree, _ElementTree):
         return _tounicode((<_ElementTree>element_or_tree)._context_node,
-                          1, c_pretty_print)
+                          method, 1, c_pretty_print)
     else:
         raise TypeError, "Type '%s' cannot be serialized." % type(element_or_tree)
 

Modified: lxml/trunk/src/lxml/extensions.pxi
==============================================================================
--- lxml/trunk/src/lxml/extensions.pxi	(original)
+++ lxml/trunk/src/lxml/extensions.pxi	Sat Sep 15 23:44:54 2007
@@ -363,8 +363,12 @@
                 return firstnode
             elif isinstance(firstnode, _Element):
                 c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
-                s = funicode(c_text)
-                tree.xmlFree(c_text)
+                if c_text is NULL:
+                    python.PyErr_NoMemory()
+                try:
+                    s = funicode(c_text)
+                finally:
+                    tree.xmlFree(c_text)
                 return s
             else:
                 return str(firstnode)

Modified: lxml/trunk/src/lxml/serializer.pxi
==============================================================================
--- lxml/trunk/src/lxml/serializer.pxi	(original)
+++ lxml/trunk/src/lxml/serializer.pxi	Sat Sep 15 23:44:54 2007
@@ -1,21 +1,75 @@
 # XML serialization and output functions
 
-cdef _tostring(_Element element, encoding,
+cdef enum _OutputMethods:
+    OUTPUT_METHOD_XML
+    OUTPUT_METHOD_HTML
+    OUTPUT_METHOD_TEXT
+
+cdef int _findOutputMethod(method) except -1:
+    if method is None:
+        return OUTPUT_METHOD_XML
+    method = method.lower()
+    if method == "xml":
+        return OUTPUT_METHOD_XML
+    if method == "html":
+        return OUTPUT_METHOD_HTML
+    if method == "text":
+        return OUTPUT_METHOD_TEXT
+    raise ValueError, "unknown output method %r" % method
+
+cdef _textToString(xmlNode* c_node, encoding):
+    cdef python.PyThreadState* state
+    cdef char* c_text
+    state = python.PyEval_SaveThread()
+    c_text = tree.xmlNodeGetContent(c_node)
+    python.PyEval_RestoreThread(state)
+    if c_text is NULL:
+        python.PyErr_NoMemory()
+
+    try:
+        if _hasTail(c_node):
+            tail = _collectText(c_node.next)
+            if tail:
+                text = c_text + tail
+            else:
+                text = c_text
+        else:
+            text = c_text
+    finally:
+        tree.xmlFree(c_text)
+
+    if encoding is None:
+        return text
+    encoding = encoding.upper()
+    if encoding == 'UTF-8' or encoding == 'ASCII':
+        return text
+
+    text = python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
+    return python.PyUnicode_AsEncodedString(text, encoding, 'strict')
+
+cdef _tostring(_Element element, encoding, method,
                int write_xml_declaration, int write_complete_document,
                int pretty_print):
-    "Serialize an element to an encoded string representation of its XML tree."
+    """Serialize an element to an encoded string representation of its XML
+    tree.
+    """
     cdef python.PyThreadState* state
     cdef tree.xmlOutputBuffer* c_buffer
     cdef tree.xmlBuffer* c_result_buffer
     cdef tree.xmlCharEncodingHandler* enchandler
     cdef char* c_enc
     cdef char* c_version
+    cdef int c_method
     if element is None:
         return None
     if encoding is None:
         c_enc = NULL
     else:
-        c_enc = encoding
+        encoding = _utf8(encoding)
+        c_enc = _cstr(encoding)
+    c_method = _findOutputMethod(method)
+    if c_method == OUTPUT_METHOD_TEXT:
+        return _textToString(element._c_node, encoding)
     # it is necessary to *and* find the encoding handler *and* use
     # encoding during output
     enchandler = tree.xmlFindCharEncodingHandler(c_enc)
@@ -29,7 +83,7 @@
 
     try:
         state = python.PyEval_SaveThread()
-        _writeNodeToBuffer(c_buffer, element._c_node, c_enc,
+        _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method,
                            write_xml_declaration, write_complete_document,
                            pretty_print)
         tree.xmlOutputBufferFlush(c_buffer)
@@ -45,19 +99,27 @@
         tree.xmlOutputBufferClose(c_buffer)
     return result
 
-cdef _tounicode(_Element element, int write_complete_document, int pretty_print):
-    "Serialize an element to the Python unicode representation of its XML tree."
+cdef _tounicode(_Element element, method,
+                int write_complete_document, int pretty_print):
+    """Serialize an element to the Python unicode representation of its XML
+    tree.
+    """
     cdef python.PyThreadState* state
     cdef tree.xmlOutputBuffer* c_buffer
     cdef tree.xmlBuffer* c_result_buffer
+    cdef int c_method
     if element is None:
         return None
+    c_method = _findOutputMethod(method)
+    if c_method == OUTPUT_METHOD_TEXT:
+        text = _textToString(element._c_node, None)
+        return python.PyUnicode_FromEncodedObject(text, 'utf-8', 'strict')
     c_buffer = tree.xmlAllocOutputBuffer(NULL)
     if c_buffer is NULL:
         raise LxmlError, "Failed to create output buffer"
     try:
         state = python.PyEval_SaveThread()
-        _writeNodeToBuffer(c_buffer, element._c_node, NULL, 0,
+        _writeNodeToBuffer(c_buffer, element._c_node, NULL, c_method, 0,
                            write_complete_document, pretty_print)
         tree.xmlOutputBufferFlush(c_buffer)
         python.PyEval_RestoreThread(state)
@@ -74,14 +136,14 @@
     return result
 
 cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer,
-                             xmlNode* c_node, char* encoding,
+                             xmlNode* c_node, char* encoding, int c_method,
                              int write_xml_declaration,
                              int write_complete_document,
                              int pretty_print):
     cdef xmlDoc* c_doc
     cdef xmlNode* c_nsdecl_node
     c_doc = c_node.doc
-    if write_xml_declaration:
+    if write_xml_declaration and c_method == OUTPUT_METHOD_XML:
         _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding)
 
     # write internal DTD subset, preceding PIs/comments, etc.
@@ -101,8 +163,12 @@
         c_nsdecl_node.last = c_node.last
 
     # write node
-    tree.xmlNodeDumpOutput(c_buffer, c_doc, c_nsdecl_node, 0,
-                           pretty_print, encoding)
+    if c_method == OUTPUT_METHOD_XML:
+        tree.xmlNodeDumpOutput(
+            c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding)
+    else:
+        tree.htmlNodeDumpFormatOutput(
+            c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print)
 
     if c_nsdecl_node is not c_node:
         # clean up
@@ -244,7 +310,7 @@
 cdef int _closeFilelikeWriter(void* ctxt):
     return (<_FilelikeWriter>ctxt).close()
 
-cdef _tofilelike(f, _Element element, encoding,
+cdef _tofilelike(f, _Element element, encoding, method,
                  int write_xml_declaration, int write_doctype,
                  int pretty_print):
     cdef python.PyThreadState* state
@@ -255,7 +321,17 @@
     if encoding is None:
         c_enc = NULL
     else:
-        c_enc = encoding
+        encoding = _utf8(encoding)
+        c_enc = _cstr(encoding)
+    c_method = _findOutputMethod(method)
+    if c_method == OUTPUT_METHOD_TEXT:
+        if _isString(f):
+            f = open(f, 'wb')
+            f.write(_textToString(element._c_node, encoding))
+            f.close()
+        else:
+            f.write(_textToString(element._c_node, encoding))
+        return
     enchandler = tree.xmlFindCharEncodingHandler(c_enc)
     if enchandler is NULL:
         raise LookupError, python.PyString_FromFormat(
@@ -275,7 +351,7 @@
         tree.xmlCharEncCloseFunc(enchandler)
         raise TypeError, "File or filename expected, got '%s'" % type(f)
 
-    _writeNodeToBuffer(c_buffer, element._c_node, c_enc,
+    _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_method,
                        write_xml_declaration, write_doctype, pretty_print)
     tree.xmlOutputBufferClose(c_buffer)
     tree.xmlCharEncCloseFunc(enchandler)

Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py	Sat Sep 15 23:44:54 2007
@@ -716,6 +716,47 @@
             self.assertEquals(
                 '<doc%s>This is a test.</doc%s>' % (i, i),
                 canonicalize(data))
+
+    def test_write_method_html(self):
+        ElementTree = self.etree.ElementTree
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        html = Element('html')
+        body = SubElement(html, 'body')
+        p = SubElement(body, 'p')
+        p.text = "html"
+        SubElement(p, 'br').tail = "test"
+
+        tree = ElementTree(element=html)
+        f = StringIO() 
+        tree.write(f, method="html")
+        data = f.getvalue()
+
+        self.assertEquals('<html><body><p>html<br>test</p></body></html>',
+                          data)
+
+    def test_write_method_text(self):
+        ElementTree = self.etree.ElementTree
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        a = Element('a')
+        a.text = "A"
+        a.tail = "tail"
+        b = SubElement(a, 'b')
+        b.text = "B"
+        b.tail = "TAIL"
+        c = SubElement(a, 'c')
+        c.text = "C"
+        
+        tree = ElementTree(element=a)
+        f = StringIO() 
+        tree.write(f, method="text")
+        data = f.getvalue()
+
+        self.assertEquals('ABTAILCtail',
+                          data)
         
     def test_write_fail(self):
         ElementTree = self.etree.ElementTree
@@ -2099,6 +2140,37 @@
         self.assert_(tostring(b) == '<b/>Foo' or
                      tostring(b) == '<b />Foo')
 
+    def test_tostring_method_html(self):
+        tostring = self.etree.tostring
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        html = Element('html')
+        body = SubElement(html, 'body')
+        p = SubElement(body, 'p')
+        p.text = "html"
+        SubElement(p, 'br').tail = "test"
+
+        self.assertEquals('<html><body><p>html<br>test</p></body></html>',
+                          tostring(html, method="html"))
+
+    def test_tostring_method_text(self):
+        tostring = self.etree.tostring
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        a = Element('a')
+        a.text = "A"
+        a.tail = "tail"
+        b = SubElement(a, 'b')
+        b.text = "B"
+        b.tail = "TAIL"
+        c = SubElement(a, 'c')
+        c.text = "C"
+        
+        self.assertEquals('ABTAILCtail',
+                          tostring(a, method="text"))
+
     def test_iterparse(self):
         iterparse = self.etree.iterparse
         f = StringIO('<a><b></b><c/></a>')

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Sat Sep 15 23:44:54 2007
@@ -1761,6 +1761,25 @@
         result = tostring(a, pretty_print=True)
         self.assertEquals(result, "<a>\n  <b/>\n  <c/>\n</a>")
 
+    def test_tostring_method_text_encoding(self):
+        tostring = self.etree.tostring
+        Element = self.etree.Element
+        SubElement = self.etree.SubElement
+        
+        a = Element('a')
+        a.text = "A"
+        a.tail = "tail"
+        b = SubElement(a, 'b')
+        b.text = "B"
+        b.tail = u"Søk på nettet"
+        c = SubElement(a, 'c')
+        c.text = "C"
+
+        result = tostring(a, method="text", encoding="UTF-16")
+
+        self.assertEquals(u'ABSøk på nettetCtail'.encode("UTF-16"),
+                          result)
+
     def test_tounicode(self):
         tounicode = self.etree.tounicode
         Element = self.etree.Element

Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd	(original)
+++ lxml/trunk/src/lxml/tree.pxd	Sat Sep 15 23:44:54 2007
@@ -234,6 +234,11 @@
     cdef char* xmlBuildURI(char* href, char* base)
     cdef int xmlValidateNCName(char* value, int space)
 
+cdef extern from "libxml/HTMLtree.h":
+    cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf,
+                                       xmlDoc* doc, xmlNode* cur,
+                                       char* encoding, int format)
+
 cdef extern from "libxml/valid.h":
     cdef xmlAttr* xmlGetID(xmlDoc* doc, char* ID)
     cdef void xmlDumpNotationTable(xmlBuffer* buffer, xmlNotationTable* table)


More information about the lxml-checkins mailing list