[Lxml-checkins] r53699 - in lxml/trunk: . doc src/lxml src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Fri Apr 11 19:33:18 CEST 2008


Author: scoder
Date: Fri Apr 11 19:33:18 2008
New Revision: 53699

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/doc/api.txt
   lxml/trunk/src/lxml/apihelpers.pxi
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/lxml.etree.pyx
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/saxparser.pxi
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/tree.pxd
Log:
 r3936 at delle:  sbehnel | 2008-04-11 19:31:10 +0200
 support for CDATA blocks: parser option strip_cdata and CDATA() text factory


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Fri Apr 11 19:33:18 2008
@@ -8,6 +8,12 @@
 Features added
 --------------
 
+* Parser option ``strip_cdata`` for normalising or keeping CDATA
+  sections.  Defaults to ``True`` as before, thus replacing CDATA
+  sections with their text content.
+
+* ``CDATA()`` factory to wrap string content as CDATA section.
+
 Bugs fixed
 ----------
 

Modified: lxml/trunk/doc/api.txt
==============================================================================
--- lxml/trunk/doc/api.txt	(original)
+++ lxml/trunk/doc/api.txt	Fri Apr 11 19:33:18 2008
@@ -30,15 +30,16 @@
 
 .. contents::
 .. 
-   1  lxml.etree
-   2  Other Element APIs
-   3  Trees and Documents
-   4  Iteration
-   5  Error handling on exceptions
-   6  Error logging
-   7  Serialisation
-   8  XInclude and ElementInclude
-   9  write_c14n on ElementTree
+   1   lxml.etree
+   2   Other Element APIs
+   3   Trees and Documents
+   4   Iteration
+   5   Error handling on exceptions
+   6   Error logging
+   7   Serialisation
+   8   CDATA
+   9   XInclude and ElementInclude
+   10  write_c14n on ElementTree
 
 
 lxml.etree
@@ -352,6 +353,50 @@
   XMLSyntaxError: ...
 
 
+CDATA
+-----
+
+By default, lxml's parser will strip CDATA sections from the tree and
+replace them with their plain text content.  As real applications for
+CDATA are rare, this is the best way to deal with this issue.
+
+However, in some cases, keeping CDATA sections or creating them in a
+document is required to adhere to existing XML language definitions.
+For these special cases, you can instruct the parser to leave CDATA
+sections in the document:
+
+.. sourcecode:: pycon
+
+  >>> parser = etree.XMLParser(strip_cdata=False)
+  >>> root = etree.XML('<root><![CDATA[test]]></root>', parser)
+  >>> root.text
+  'test'
+
+  >>> etree.tostring(root)
+  '<root><![CDATA[test]]></root>'
+
+Note how the ``.text`` property does not give any indication that the
+text content is wrapped by a CDATA section.  If you want to make sure
+your data is wrapped by a CDATA block, you can use the ``CDATA()``
+text wrapper:
+
+.. sourcecode:: pycon
+
+  >>> root.text = 'test'
+
+  >>> root.text
+  'test'
+  >>> etree.tostring(root)
+  '<root>test</root>'
+
+  >>> root.text = etree.CDATA(root.text)
+
+  >>> root.text
+  'test'
+  >>> etree.tostring(root)
+  '<root><![CDATA[test]]></root>'
+
+
 XInclude and ElementInclude
 ---------------------------
 

Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi	(original)
+++ lxml/trunk/src/lxml/apihelpers.pxi	Fri Apr 11 19:33:18 2008
@@ -449,8 +449,17 @@
     if value is None:
         return 0
     # now add new text node with value at start
-    text = _utf8(value)
-    c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
+    if python._isString(value):
+        text = _utf8(value)
+        c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
+    elif isinstance(value, CDATA):
+        c_text_node = tree.xmlNewCDataBlock(
+            c_node.doc, _cstr((<CDATA>value)._utf8_data),
+            python.PyString_GET_SIZE((<CDATA>value)._utf8_data))
+    else:
+        # this will raise the right error
+       _utf8(value)
+       return -1
     if c_node.children is NULL:
         tree.xmlAddChild(c_node, c_text_node)
     else:
@@ -593,6 +602,8 @@
     while c_node is not NULL:
         if c_node.type == tree.XML_TEXT_NODE:
             return c_node
+        if c_node.type == tree.XML_CDATA_SECTION_NODE:
+            return c_node
         elif c_node.type == tree.XML_XINCLUDE_START or \
                  c_node.type == tree.XML_XINCLUDE_END:
             c_node = c_node.next

Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Fri Apr 11 19:33:18 2008
@@ -327,6 +327,7 @@
       - remove_blank_text  - discard blank text nodes
       - remove_comments    - discard comments
       - remove_pis         - discard processing instructions
+      - strip_cdata        - replace CDATA sections by normal text content (default: True)
       - compact            - safe memory for short text content (default: True)
       - resolve_entities   - replace entities by their text value (default: True)
 
@@ -342,7 +343,7 @@
                  attribute_defaults=False, dtd_validation=False,
                  load_dtd=False, no_network=True, remove_blank_text=False,
                  compact=True, resolve_entities=True, remove_comments=False,
-                 remove_pis=False, encoding=None,
+                 remove_pis=False, strip_cdata=True, encoding=None,
                  html=False, XMLSchema schema=None):
         cdef _IterparseContext context
         cdef char* c_encoding
@@ -381,7 +382,7 @@
             parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
 
         _BaseParser.__init__(self, parse_options, html, schema,
-                             remove_comments, remove_pis,
+                             remove_comments, remove_pis, strip_cdata,
                              None, filename, encoding)
 
         context = <_IterparseContext>self._getPushParserContext()

Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx	(original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx	Fri Apr 11 19:33:18 2008
@@ -2264,6 +2264,20 @@
 
 PI = ProcessingInstruction
 
+cdef class CDATA:
+    """CDATA(data)
+
+    CDATA factory.  This factory creates an opaque data object that
+    can be used to set Element text.  The usual way to use it is::
+
+        >>> from lxml import etree
+        >>> el = etree.Element('content')
+        >>> el.text = etree.CDATA('a string')
+    """
+    cdef object _utf8_data
+    def __init__(self, data):
+        self._utf8_data = _utf8(data)
+
 def Entity(name):
     """Entity(name)
 

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Fri Apr 11 19:33:18 2008
@@ -550,6 +550,7 @@
     cdef bint _for_html
     cdef bint _remove_comments
     cdef bint _remove_pis
+    cdef bint _strip_cdata
     cdef XMLSchema _schema
     cdef object _filename
     cdef object _target
@@ -557,7 +558,8 @@
     cdef int _default_encoding_int
 
     def __init__(self, int parse_options, bint for_html, XMLSchema schema,
-                 remove_comments, remove_pis, target, filename, encoding):
+                 remove_comments, remove_pis, strip_cdata, target,
+                 filename, encoding):
         cdef int c_encoding
         if not isinstance(self, HTMLParser) and \
                 not isinstance(self, XMLParser) and \
@@ -570,6 +572,7 @@
         self._for_html = for_html
         self._remove_comments = remove_comments
         self._remove_pis = remove_pis
+        self._strip_cdata = strip_cdata
         self._schema = schema
 
         self._resolvers = _ResolverRegistry()
@@ -601,8 +604,9 @@
                 pctxt.sax.comment = NULL
             if self._remove_pis:
                 pctxt.sax.processingInstruction = NULL
-            # hard switch-off for CDATA nodes => makes them plain text
-            pctxt.sax.cdataBlock = NULL
+            if self._strip_cdata:
+                # hard switch-off for CDATA nodes => makes them plain text
+                pctxt.sax.cdataBlock = NULL
         return self._parser_context
 
     cdef _ParserContext _getPushParserContext(self):
@@ -621,8 +625,9 @@
                 pctxt.sax.comment = NULL
             if self._remove_pis:
                 pctxt.sax.processingInstruction = NULL
-            # hard switch-off for CDATA nodes => makes them plain text
-            pctxt.sax.cdataBlock = NULL
+            if self._strip_cdata:
+                # hard switch-off for CDATA nodes => makes them plain text
+                pctxt.sax.cdataBlock = NULL
         return self._push_parser_context
 
     cdef _ParserContext _createContext(self, target):
@@ -700,6 +705,7 @@
         parser._for_html = self._for_html
         parser._remove_comments = self._remove_comments
         parser._remove_pis = self._remove_pis
+        parser._strip_cdata = self._strip_cdata
         parser._filename = self._filename
         parser._resolvers = self._resolvers
         parser._target = self._target
@@ -1051,6 +1057,7 @@
     - remove_blank_text  - discard blank text nodes
     - remove_comments    - discard comments
     - remove_pis         - discard processing instructions
+    - strip_cdata        - replace CDATA sections by normal text content (default: True)
     - compact            - safe memory for short text content (default: True)
     - resolve_entities   - replace entities by their text value (default: True)
 
@@ -1068,8 +1075,8 @@
                  load_dtd=False, no_network=True, ns_clean=False,
                  recover=False, remove_blank_text=False, compact=True,
                  resolve_entities=True, remove_comments=False,
-                 remove_pis=False, target=None, encoding=None,
-                 XMLSchema schema=None):
+                 remove_pis=False, strip_cdata=True, target=None,
+                 encoding=None, XMLSchema schema=None):
         cdef int parse_options
         parse_options = _XML_DEFAULT_PARSE_OPTIONS
         if load_dtd:
@@ -1092,9 +1099,11 @@
             parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
         if not resolve_entities:
             parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
+        if not strip_cdata:
+            parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
 
         _BaseParser.__init__(self, parse_options, 0, schema,
-                             remove_comments, remove_pis,
+                             remove_comments, remove_pis, strip_cdata,
                              target, None, encoding)
 
 cdef class ETCompatXMLParser(XMLParser):
@@ -1110,7 +1119,8 @@
                  load_dtd=False, no_network=True, ns_clean=False,
                  recover=False, remove_blank_text=False, compact=True,
                  resolve_entities=True, remove_comments=True,
-                 remove_pis=True, target=None, encoding=None, schema=None):
+                 remove_pis=True, strip_cdata=True, target=None,
+                 encoding=None, schema=None):
         XMLParser.__init__(self,
                            attribute_defaults=attribute_defaults,
                            dtd_validation=dtd_validation,
@@ -1123,6 +1133,7 @@
                            resolve_entities=resolve_entities,
                            remove_comments=remove_comments,
                            remove_pis=remove_pis,
+                           strip_cdata=strip_cdata,
                            target=target,
                            encoding=encoding,
                            schema=schema)
@@ -1180,6 +1191,7 @@
     - remove_blank_text  - discard empty text nodes
     - remove_comments    - discard comments
     - remove_pis         - discard processing instructions
+    - strip_cdata        - replace CDATA sections by normal text content (default: True)
     - compact            - safe memory for short text content (default: True)
 
     Other keyword arguments:
@@ -1193,7 +1205,7 @@
     """
     def __init__(self, *, recover=True, no_network=True,
                  remove_blank_text=False, compact=True, remove_comments=False,
-                 remove_pis=False, target=None, encoding=None,
+                 remove_pis=False, strip_cdata=True, target=None, encoding=None,
                  XMLSchema schema=None):
         cdef int parse_options
         parse_options = _HTML_DEFAULT_PARSE_OPTIONS
@@ -1207,7 +1219,7 @@
             parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
 
         _BaseParser.__init__(self, parse_options, 1, schema,
-                             remove_comments, remove_pis,
+                             remove_comments, remove_pis, strip_cdata,
                              target, None, encoding)
 
 cdef HTMLParser __DEFAULT_HTML_PARSER

Modified: lxml/trunk/src/lxml/saxparser.pxi
==============================================================================
--- lxml/trunk/src/lxml/saxparser.pxi	(original)
+++ lxml/trunk/src/lxml/saxparser.pxi	Fri Apr 11 19:33:18 2008
@@ -37,6 +37,7 @@
     cdef xmlparser.startElementSAXFunc    _origSaxStartNoNs
     cdef xmlparser.endElementSAXFunc      _origSaxEndNoNs
     cdef xmlparser.charactersSAXFunc      _origSaxData
+    cdef xmlparser.cdataBlockSAXFunc      _origSaxCData
     cdef xmlparser.internalSubsetSAXFunc  _origSaxDoctype
     cdef xmlparser.commentSAXFunc         _origSaxComment
     cdef xmlparser.processingInstructionSAXFunc    _origSaxPi
@@ -76,10 +77,12 @@
 
         if self._target._sax_event_propagate & SAX_EVENT_DATA:
             self._origSaxData = sax.characters
+            self._origSaxCData = sax.cdataBlock
         else:
-            self._origSaxData = sax.characters = NULL
+            self._origSaxData = sax.characters = sax.cdataBlock = NULL
         if self._target._sax_event_filter & SAX_EVENT_DATA:
             sax.characters = _handleSaxData
+            sax.cdataBlock = _handleSaxCData
 
         # doctype propagation is always required for entity replacement
         self._origSaxDoctype = sax.internalSubset
@@ -249,6 +252,21 @@
     except:
         context._handleSaxException(c_ctxt)
 
+cdef void _handleSaxCData(void* ctxt, char* c_data, int data_len) with gil:
+    cdef _SaxParserContext context
+    cdef xmlparser.xmlParserCtxt* c_ctxt
+    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+    if c_ctxt._private is NULL:
+        return
+    context = <_SaxParserContext>c_ctxt._private
+    if context._origSaxCData is not NULL:
+        context._origSaxCData(c_ctxt, c_data, data_len)
+    try:
+        context._target._handleSaxData(
+            python.PyUnicode_DecodeUTF8(c_data, data_len, NULL))
+    except:
+        context._handleSaxException(c_ctxt)
+
 cdef void _handleSaxDoctype(void* ctxt, char* c_name, char* c_public,
                             char* c_system) with gil:
     cdef _SaxParserContext context

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Fri Apr 11 19:33:18 2008
@@ -462,6 +462,29 @@
                            "data-B", "end-root", "pi-test-c"],
                           events)
 
+    def test_parser_target_cdata(self):
+        events = []
+        class Target(object):
+            def start(self, tag, attrib):
+                events.append("start-" + tag)
+            def end(self, tag):
+                events.append("end-" + tag)
+            def data(self, data):
+                events.append("data-" + data)
+            def close(self):
+                return "DONE"
+
+        parser = self.etree.XMLParser(target=Target(),
+                                      strip_cdata=False)
+
+        parser.feed('<root>A<a><![CDATA[ca]]></a>B</root>')
+        done = parser.close()
+
+        self.assertEquals("DONE", done)
+        self.assertEquals(["start-root", "data-A", "start-a",
+                           "data-ca", "end-a", "data-B", "end-root"],
+                          events)
+
     def test_iterwalk_tag(self):
         iterwalk = self.etree.iterwalk
         root = self.etree.XML('<a><b><d/></b><c/></a>')
@@ -666,6 +689,55 @@
         self.assertRaises(ValueError, Entity, '#abc')
         self.assertRaises(ValueError, Entity, '#xxyz')
 
+    def test_cdata(self):
+        CDATA = self.etree.CDATA
+        Element = self.etree.Element
+        tostring = self.etree.tostring
+
+        root = Element("root")
+        root.text = CDATA('test')
+
+        self.assertEquals('test',
+                          root.text)
+        self.assertEquals('<root><![CDATA[test]]></root>',
+                          tostring(root))
+
+    def test_cdata_type(self):
+        CDATA = self.etree.CDATA
+        Element = self.etree.Element
+        root = Element("root")
+
+        root.text = CDATA("test")
+        self.assertEquals('test', root.text)
+
+        root.text = CDATA(u"test")
+        self.assertEquals('test', root.text)
+
+        self.assertRaises(TypeError, CDATA, 1)
+
+    def test_cdata_errors(self):
+        CDATA = self.etree.CDATA
+        Element = self.etree.Element
+
+        root = Element("root")
+        cdata = CDATA('test')
+        
+        self.assertRaises(TypeError,
+                          setattr, root, 'tail', cdata)
+        self.assertRaises(TypeError,
+                          root.set, 'attr', cdata)
+        self.assertRaises(TypeError,
+                          operator.setitem, root.attrib, 'attr', cdata)
+
+    def test_cdata_parser(self):
+        tostring = self.etree.tostring
+        parser = self.etree.XMLParser(strip_cdata=False)
+        root = self.etree.XML('<root><![CDATA[test]]></root>', parser)
+
+        self.assertEquals('test', root.text)
+        self.assertEquals('<root><![CDATA[test]]></root>',
+                          tostring(root))
+
     # TypeError in etree, AssertionError in ElementTree;
     def test_setitem_assert(self):
         Element = self.etree.Element

Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd	(original)
+++ lxml/trunk/src/lxml/tree.pxd	Fri Apr 11 19:33:18 2008
@@ -181,6 +181,7 @@
     cdef xmlNode* xmlNewDocComment(xmlDoc* doc, char* content) nogil
     cdef xmlNode* xmlNewDocPI(xmlDoc* doc, char* name, char* content) nogil
     cdef xmlNode* xmlNewReference(xmlDoc* doc, char* name) nogil
+    cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, char* text, int len) nogil
     cdef xmlNs* xmlNewNs(xmlNode* node, char* href, char* prefix) nogil
     cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil
     cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil


More information about the lxml-checkins mailing list