[Lxml-checkins] r32662 - in lxml/trunk/src/lxml: . tests
scoder at codespeak.net
scoder at codespeak.net
Wed Sep 27 08:46:55 CEST 2006
Author: scoder
Date: Wed Sep 27 08:46:53 2006
New Revision: 32662
Modified:
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/tests/test_htmlparser.py
lxml/trunk/src/lxml/xmlparser.pxd
Log:
prevent CDATA sections from appearing in HTML tree
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Wed Sep 27 08:46:53 2006
@@ -368,6 +368,9 @@
self._parser_ctxt = pctxt
if pctxt is NULL:
raise ParserError, "Failed to create parser context"
+ if pctxt.sax != NULL:
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
if thread is None or self._parser_type == LXML_ITERPARSE_PARSER:
# no threading
self._lockParser = self.__dummy
Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py (original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py Wed Sep 27 08:46:53 2006
@@ -51,6 +51,12 @@
self.assertEqual(self.etree.tostring(element),
self.html_str)
+ def test_module_HTML_cdata(self):
+ # by default, libxml2 generates CDATA nodes for <script> and <style> content
+ html = '<html><head><style>foo</style></head></html>'
+ element = self.etree.HTML(html)
+ self.assertEquals(element[0][0].text, "foo")
+
def test_module_HTML_access(self):
element = self.etree.HTML(self.html_str)
self.assertEqual(element[0][0].tag, 'title')
Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd (original)
+++ lxml/trunk/src/lxml/xmlparser.pxd Wed Sep 27 08:46:53 2006
@@ -19,6 +19,10 @@
char* prefix,
char* URI)
+ ctypedef void (*cdataBlockSAXFunc)(void* ctx,
+ char* value,
+ int len)
+
cdef extern from "libxml/tree.h":
ctypedef struct xmlParserInput
ctypedef struct xmlParserInputBuffer:
@@ -29,6 +33,7 @@
ctypedef struct xmlSAXHandler:
startElementNsSAX2Func startElementNs
endElementNsSAX2Func endElementNs
+ cdataBlockSAXFunc cdataBlock
cdef extern from "libxml/xmlIO.h":
cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
More information about the lxml-checkins
mailing list