[Lxml-checkins] r53699 - in lxml/trunk: . doc src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Fri Apr 11 19:33:18 CEST 2008
Author: scoder
Date: Fri Apr 11 19:33:18 2008
New Revision: 53699
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/doc/api.txt
lxml/trunk/src/lxml/apihelpers.pxi
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/lxml.etree.pyx
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/saxparser.pxi
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/tree.pxd
Log:
r3936 at delle: sbehnel | 2008-04-11 19:31:10 +0200
support for CDATA blocks: parser option and CDATA() text factory
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Fri Apr 11 19:33:18 2008
@@ -8,6 +8,12 @@
Features added
--------------
+* Parser option ``strip_cdata`` for normalising or keeping CDATA
+ sections. Defaults to ``True`` as before, thus replacing CDATA
+ sections with their text content.
+
+* ``CDATA()`` factory to wrap string content as CDATA section.
+
Bugs fixed
----------
Modified: lxml/trunk/doc/api.txt
==============================================================================
--- lxml/trunk/doc/api.txt (original)
+++ lxml/trunk/doc/api.txt Fri Apr 11 19:33:18 2008
@@ -30,15 +30,16 @@
.. contents::
..
- 1 lxml.etree
- 2 Other Element APIs
- 3 Trees and Documents
- 4 Iteration
- 5 Error handling on exceptions
- 6 Error logging
- 7 Serialisation
- 8 XInclude and ElementInclude
- 9 write_c14n on ElementTree
+ 1 lxml.etree
+ 2 Other Element APIs
+ 3 Trees and Documents
+ 4 Iteration
+ 5 Error handling on exceptions
+ 6 Error logging
+ 7 Serialisation
+ 8 CDATA
+ 9 XInclude and ElementInclude
+ 10 write_c14n on ElementTree
lxml.etree
@@ -352,6 +353,50 @@
XMLSyntaxError: ...
+CDATA
+-----
+
+By default, lxml's parser will strip CDATA sections from the tree and
+replace them with their plain text content. As real applications for
+CDATA are rare, this is the best way to deal with this issue.
+
+However, in some cases, keeping CDATA sections or creating them in a
+document is required to adhere to existing XML language definitions.
+For these special cases, you can instruct the parser to leave CDATA
+sections in the document:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(strip_cdata=False)
+ >>> root = etree.XML('<root><![CDATA[test]]></root>', parser)
+ >>> root.text
+ 'test'
+
+ >>> etree.tostring(root)
+ '<root><![CDATA[test]]></root>'
+
+Note how the ``.text`` property does not give any indication that the
+text content is wrapped by a CDATA section. If you want to make sure
+your data is wrapped by a CDATA block, you can use the ``CDATA()``
+text wrapper:
+
+.. sourcecode:: pycon
+
+ >>> root.text = 'test'
+
+ >>> root.text
+ 'test'
+ >>> etree.tostring(root)
+ '<root>test</root>'
+
+ >>> root.text = etree.CDATA(root.text)
+
+ >>> root.text
+ 'test'
+ >>> etree.tostring(root)
+ '<root><![CDATA[test]]></root>'
+
+
XInclude and ElementInclude
---------------------------
Modified: lxml/trunk/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/trunk/src/lxml/apihelpers.pxi (original)
+++ lxml/trunk/src/lxml/apihelpers.pxi Fri Apr 11 19:33:18 2008
@@ -449,8 +449,17 @@
if value is None:
return 0
# now add new text node with value at start
- text = _utf8(value)
- c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
+ if python._isString(value):
+ text = _utf8(value)
+ c_text_node = tree.xmlNewDocText(c_node.doc, _cstr(text))
+ elif isinstance(value, CDATA):
+ c_text_node = tree.xmlNewCDataBlock(
+ c_node.doc, _cstr((<CDATA>value)._utf8_data),
+ python.PyString_GET_SIZE((<CDATA>value)._utf8_data))
+ else:
+ # this will raise the right error
+ _utf8(value)
+ return -1
if c_node.children is NULL:
tree.xmlAddChild(c_node, c_text_node)
else:
@@ -593,6 +602,8 @@
while c_node is not NULL:
if c_node.type == tree.XML_TEXT_NODE:
return c_node
+ if c_node.type == tree.XML_CDATA_SECTION_NODE:
+ return c_node
elif c_node.type == tree.XML_XINCLUDE_START or \
c_node.type == tree.XML_XINCLUDE_END:
c_node = c_node.next
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Fri Apr 11 19:33:18 2008
@@ -327,6 +327,7 @@
- remove_blank_text - discard blank text nodes
- remove_comments - discard comments
- remove_pis - discard processing instructions
+ - strip_cdata - replace CDATA sections by normal text content (default: True)
- compact - save memory for short text content (default: True)
- resolve_entities - replace entities by their text value (default: True)
@@ -342,7 +343,7 @@
attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=True, remove_blank_text=False,
compact=True, resolve_entities=True, remove_comments=False,
- remove_pis=False, encoding=None,
+ remove_pis=False, strip_cdata=True, encoding=None,
html=False, XMLSchema schema=None):
cdef _IterparseContext context
cdef char* c_encoding
@@ -381,7 +382,7 @@
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
_BaseParser.__init__(self, parse_options, html, schema,
- remove_comments, remove_pis,
+ remove_comments, remove_pis, strip_cdata,
None, filename, encoding)
context = <_IterparseContext>self._getPushParserContext()
Modified: lxml/trunk/src/lxml/lxml.etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.etree.pyx (original)
+++ lxml/trunk/src/lxml/lxml.etree.pyx Fri Apr 11 19:33:18 2008
@@ -2264,6 +2264,20 @@
PI = ProcessingInstruction
+cdef class CDATA:
+ """CDATA(data)
+
+ CDATA factory. This factory creates an opaque data object that
+ can be used to set Element text. The usual way to use it is::
+
+ >>> from lxml import etree
+ >>> el = etree.Element('content')
+ >>> el.text = etree.CDATA('a string')
+ """
+ cdef object _utf8_data
+ def __init__(self, data):
+ self._utf8_data = _utf8(data)
+
def Entity(name):
"""Entity(name)
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Fri Apr 11 19:33:18 2008
@@ -550,6 +550,7 @@
cdef bint _for_html
cdef bint _remove_comments
cdef bint _remove_pis
+ cdef bint _strip_cdata
cdef XMLSchema _schema
cdef object _filename
cdef object _target
@@ -557,7 +558,8 @@
cdef int _default_encoding_int
def __init__(self, int parse_options, bint for_html, XMLSchema schema,
- remove_comments, remove_pis, target, filename, encoding):
+ remove_comments, remove_pis, strip_cdata, target,
+ filename, encoding):
cdef int c_encoding
if not isinstance(self, HTMLParser) and \
not isinstance(self, XMLParser) and \
@@ -570,6 +572,7 @@
self._for_html = for_html
self._remove_comments = remove_comments
self._remove_pis = remove_pis
+ self._strip_cdata = strip_cdata
self._schema = schema
self._resolvers = _ResolverRegistry()
@@ -601,8 +604,9 @@
pctxt.sax.comment = NULL
if self._remove_pis:
pctxt.sax.processingInstruction = NULL
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+ if self._strip_cdata:
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
return self._parser_context
cdef _ParserContext _getPushParserContext(self):
@@ -621,8 +625,9 @@
pctxt.sax.comment = NULL
if self._remove_pis:
pctxt.sax.processingInstruction = NULL
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+ if self._strip_cdata:
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
return self._push_parser_context
cdef _ParserContext _createContext(self, target):
@@ -700,6 +705,7 @@
parser._for_html = self._for_html
parser._remove_comments = self._remove_comments
parser._remove_pis = self._remove_pis
+ parser._strip_cdata = self._strip_cdata
parser._filename = self._filename
parser._resolvers = self._resolvers
parser._target = self._target
@@ -1051,6 +1057,7 @@
- remove_blank_text - discard blank text nodes
- remove_comments - discard comments
- remove_pis - discard processing instructions
+ - strip_cdata - replace CDATA sections by normal text content (default: True)
- compact - save memory for short text content (default: True)
- resolve_entities - replace entities by their text value (default: True)
@@ -1068,8 +1075,8 @@
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=False,
- remove_pis=False, target=None, encoding=None,
- XMLSchema schema=None):
+ remove_pis=False, strip_cdata=True, target=None,
+ encoding=None, XMLSchema schema=None):
cdef int parse_options
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -1092,9 +1099,11 @@
parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
if not resolve_entities:
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
+ if not strip_cdata:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
_BaseParser.__init__(self, parse_options, 0, schema,
- remove_comments, remove_pis,
+ remove_comments, remove_pis, strip_cdata,
target, None, encoding)
cdef class ETCompatXMLParser(XMLParser):
@@ -1110,7 +1119,8 @@
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=True,
- remove_pis=True, target=None, encoding=None, schema=None):
+ remove_pis=True, strip_cdata=True, target=None,
+ encoding=None, schema=None):
XMLParser.__init__(self,
attribute_defaults=attribute_defaults,
dtd_validation=dtd_validation,
@@ -1123,6 +1133,7 @@
resolve_entities=resolve_entities,
remove_comments=remove_comments,
remove_pis=remove_pis,
+ strip_cdata=strip_cdata,
target=target,
encoding=encoding,
schema=schema)
@@ -1180,6 +1191,7 @@
- remove_blank_text - discard empty text nodes
- remove_comments - discard comments
- remove_pis - discard processing instructions
+ - strip_cdata - replace CDATA sections by normal text content (default: True)
- compact - save memory for short text content (default: True)
Other keyword arguments:
@@ -1193,7 +1205,7 @@
"""
def __init__(self, *, recover=True, no_network=True,
remove_blank_text=False, compact=True, remove_comments=False,
- remove_pis=False, target=None, encoding=None,
+ remove_pis=False, strip_cdata=True, target=None, encoding=None,
XMLSchema schema=None):
cdef int parse_options
parse_options = _HTML_DEFAULT_PARSE_OPTIONS
@@ -1207,7 +1219,7 @@
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
_BaseParser.__init__(self, parse_options, 1, schema,
- remove_comments, remove_pis,
+ remove_comments, remove_pis, strip_cdata,
target, None, encoding)
cdef HTMLParser __DEFAULT_HTML_PARSER
Modified: lxml/trunk/src/lxml/saxparser.pxi
==============================================================================
--- lxml/trunk/src/lxml/saxparser.pxi (original)
+++ lxml/trunk/src/lxml/saxparser.pxi Fri Apr 11 19:33:18 2008
@@ -37,6 +37,7 @@
cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
cdef xmlparser.charactersSAXFunc _origSaxData
+ cdef xmlparser.cdataBlockSAXFunc _origSaxCData
cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
cdef xmlparser.commentSAXFunc _origSaxComment
cdef xmlparser.processingInstructionSAXFunc _origSaxPi
@@ -76,10 +77,12 @@
if self._target._sax_event_propagate & SAX_EVENT_DATA:
self._origSaxData = sax.characters
+ self._origSaxCData = sax.cdataBlock
else:
- self._origSaxData = sax.characters = NULL
+ self._origSaxData = sax.characters = sax.cdataBlock = NULL
if self._target._sax_event_filter & SAX_EVENT_DATA:
sax.characters = _handleSaxData
+ sax.cdataBlock = _handleSaxCData
# doctype propagation is always required for entity replacement
self._origSaxDoctype = sax.internalSubset
@@ -249,6 +252,21 @@
except:
context._handleSaxException(c_ctxt)
+cdef void _handleSaxCData(void* ctxt, char* c_data, int data_len) with gil:
+ cdef _SaxParserContext context
+ cdef xmlparser.xmlParserCtxt* c_ctxt
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ if context._origSaxCData is not NULL:
+ context._origSaxCData(c_ctxt, c_data, data_len)
+ try:
+ context._target._handleSaxData(
+ python.PyUnicode_DecodeUTF8(c_data, data_len, NULL))
+ except:
+ context._handleSaxException(c_ctxt)
+
cdef void _handleSaxDoctype(void* ctxt, char* c_name, char* c_public,
char* c_system) with gil:
cdef _SaxParserContext context
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Fri Apr 11 19:33:18 2008
@@ -462,6 +462,29 @@
"data-B", "end-root", "pi-test-c"],
events)
+ def test_parser_target_cdata(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target(),
+ strip_cdata=False)
+
+ parser.feed('<root>A<a><![CDATA[ca]]></a>B</root>')
+ done = parser.close()
+
+ self.assertEquals("DONE", done)
+ self.assertEquals(["start-root", "data-A", "start-a",
+ "data-ca", "end-a", "data-B", "end-root"],
+ events)
+
def test_iterwalk_tag(self):
iterwalk = self.etree.iterwalk
root = self.etree.XML('<a><b><d/></b><c/></a>')
@@ -666,6 +689,55 @@
self.assertRaises(ValueError, Entity, '#abc')
self.assertRaises(ValueError, Entity, '#xxyz')
+ def test_cdata(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ root = Element("root")
+ root.text = CDATA('test')
+
+ self.assertEquals('test',
+ root.text)
+ self.assertEquals('<root><![CDATA[test]]></root>',
+ tostring(root))
+
+ def test_cdata_type(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+ root = Element("root")
+
+ root.text = CDATA("test")
+ self.assertEquals('test', root.text)
+
+ root.text = CDATA(u"test")
+ self.assertEquals('test', root.text)
+
+ self.assertRaises(TypeError, CDATA, 1)
+
+ def test_cdata_errors(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+
+ root = Element("root")
+ cdata = CDATA('test')
+
+ self.assertRaises(TypeError,
+ setattr, root, 'tail', cdata)
+ self.assertRaises(TypeError,
+ root.set, 'attr', cdata)
+ self.assertRaises(TypeError,
+ operator.setitem, root.attrib, 'attr', cdata)
+
+ def test_cdata_parser(self):
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(strip_cdata=False)
+ root = self.etree.XML('<root><![CDATA[test]]></root>', parser)
+
+ self.assertEquals('test', root.text)
+ self.assertEquals('<root><![CDATA[test]]></root>',
+ tostring(root))
+
# TypeError in etree, AssertionError in ElementTree;
def test_setitem_assert(self):
Element = self.etree.Element
Modified: lxml/trunk/src/lxml/tree.pxd
==============================================================================
--- lxml/trunk/src/lxml/tree.pxd (original)
+++ lxml/trunk/src/lxml/tree.pxd Fri Apr 11 19:33:18 2008
@@ -181,6 +181,7 @@
cdef xmlNode* xmlNewDocComment(xmlDoc* doc, char* content) nogil
cdef xmlNode* xmlNewDocPI(xmlDoc* doc, char* name, char* content) nogil
cdef xmlNode* xmlNewReference(xmlDoc* doc, char* name) nogil
+ cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, char* text, int len) nogil
cdef xmlNs* xmlNewNs(xmlNode* node, char* href, char* prefix) nogil
cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil
cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil
More information about the lxml-checkins mailing list