[Lxml-checkins] r32662 - in lxml/trunk/src/lxml: . tests

scoder at codespeak.net scoder at codespeak.net
Wed Sep 27 08:46:55 CEST 2006


Author: scoder
Date: Wed Sep 27 08:46:53 2006
New Revision: 32662

Modified:
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/tests/test_htmlparser.py
   lxml/trunk/src/lxml/xmlparser.pxd
Log:
prevent CDATA sections from appearing in HTML tree

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Wed Sep 27 08:46:53 2006
@@ -368,6 +368,9 @@
         self._parser_ctxt = pctxt
         if pctxt is NULL:
             raise ParserError, "Failed to create parser context"
+        if pctxt.sax != NULL:
+            # hard switch-off for CDATA nodes => makes them plain text
+            pctxt.sax.cdataBlock = NULL
         if thread is None or self._parser_type == LXML_ITERPARSE_PARSER:
             # no threading
             self._lockParser   = self.__dummy

Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_htmlparser.py	(original)
+++ lxml/trunk/src/lxml/tests/test_htmlparser.py	Wed Sep 27 08:46:53 2006
@@ -51,6 +51,12 @@
         self.assertEqual(self.etree.tostring(element),
                          self.html_str)
 
+    def test_module_HTML_cdata(self):
+        # by default, libxml2 generates CDATA nodes for <script> and <style> content
+        html = '<html><head><style>foo</style></head></html>'
+        element = self.etree.HTML(html)
+        self.assertEquals(element[0][0].text, "foo")
+
     def test_module_HTML_access(self):
         element = self.etree.HTML(self.html_str)
         self.assertEqual(element[0][0].tag, 'title')

Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd	(original)
+++ lxml/trunk/src/lxml/xmlparser.pxd	Wed Sep 27 08:46:53 2006
@@ -19,6 +19,10 @@
                                           char* prefix,
                                           char* URI)
 
+    ctypedef void (*cdataBlockSAXFunc)(void* ctx,
+                                       char* value,
+                                       int len)
+
 cdef extern from "libxml/tree.h":
     ctypedef struct xmlParserInput
     ctypedef struct xmlParserInputBuffer:
@@ -29,6 +33,7 @@
     ctypedef struct xmlSAXHandler:
         startElementNsSAX2Func startElementNs
         endElementNsSAX2Func   endElementNs
+        cdataBlockSAXFunc      cdataBlock
 
 cdef extern from "libxml/xmlIO.h":
     cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)


More information about the lxml-checkins mailing list