[Lxml-checkins] r45113 - in lxml/trunk/src/lxml: . tests

scoder at codespeak.net scoder at codespeak.net
Sun Jul 15 23:27:50 CEST 2007


Author: scoder
Date: Sun Jul 15 23:27:50 2007
New Revision: 45113

Modified:
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/parser.pxi
   lxml/trunk/src/lxml/tests/test_etree.py
   lxml/trunk/src/lxml/xmlparser.pxd
Log:
new parser kw arg 'remove_pis' to discard PIs

Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Sun Jul 15 23:27:50 2007
@@ -235,6 +235,7 @@
     * no_network         - prevent network access
     * remove_blank_text  - discard blank text nodes
     * remove_comments    - discard comments
+    * remove_pis         - discard processing instructions
     """
     cdef object _source
     cdef object _filename
@@ -242,7 +243,7 @@
     def __init__(self, source, events=("end",), tag=None,
                  attribute_defaults=False, dtd_validation=False,
                  load_dtd=False, no_network=False, remove_blank_text=False,
-                 remove_comments=False):
+                 remove_comments=False, remove_pis=False):
         cdef _IterparseContext context
         cdef char* c_filename
         cdef int parse_options
@@ -259,7 +260,8 @@
             c_filename = NULL
 
         self._source = source
-        _BaseParser.__init__(self, remove_comments, _IterparseContext)
+        _BaseParser.__init__(self, remove_comments, remove_pis,
+                             _IterparseContext)
 
         parse_options = _XML_DEFAULT_PARSE_OPTIONS
         if load_dtd:

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Sun Jul 15 23:27:50 2007
@@ -372,7 +372,8 @@
     cdef ElementClassLookup _class_lookup
     cdef python.PyThread_type_lock _parser_lock
 
-    def __init__(self, remove_comments, context_class=_ResolverContext):
+    def __init__(self, remove_comments, remove_pis,
+                 context_class=_ResolverContext):
         cdef xmlParserCtxt* pctxt
         if isinstance(self, HTMLParser):
             self._parser_type = LXML_HTML_PARSER
@@ -391,6 +392,8 @@
         if pctxt.sax != NULL:
             if remove_comments:
                 pctxt.sax.comment = NULL
+            if remove_pis:
+                pctxt.sax.processingInstruction = NULL
             # hard switch-off for CDATA nodes => makes them plain text
             pctxt.sax.cdataBlock = NULL
 
@@ -699,6 +702,7 @@
     * recover            - try hard to parse through broken XML
     * remove_blank_text  - discard blank text nodes
     * remove_comments    - discard comments
+    * remove_pis         - discard processing instructions
     * compact            - safe memory for short text content (default: True)
     * resolve_entities   - replace entities by their text value (default: True)
 
@@ -709,9 +713,10 @@
     def __init__(self, attribute_defaults=False, dtd_validation=False,
                  load_dtd=False, no_network=True, ns_clean=False,
                  recover=False, remove_blank_text=False, compact=True,
-                 resolve_entities=True, remove_comments=False):
+                 resolve_entities=True, remove_comments=False,
+                 remove_pis=False):
         cdef int parse_options
-        _BaseParser.__init__(self, remove_comments)
+        _BaseParser.__init__(self, remove_comments, remove_pis)
 
         parse_options = _XML_DEFAULT_PARSE_OPTIONS
         if load_dtd:
@@ -833,15 +838,16 @@
     * no_network         - prevent network access (default: True)
     * remove_blank_text  - discard empty text nodes
     * remove_comments    - discard comments
+    * remove_pis         - discard processing instructions
     * compact            - safe memory for short text content (default: True)
 
     Note that you should avoid sharing parsers between threads for performance
     reasons.
     """
     def __init__(self, recover=True, no_network=True, remove_blank_text=False,
-                 compact=True, remove_comments=False):
+                 compact=True, remove_comments=False, remove_pis=False):
         cdef int parse_options
-        _BaseParser.__init__(self, remove_comments)
+        _BaseParser.__init__(self, remove_comments, remove_pis)
 
         parse_options = _HTML_DEFAULT_PARSE_OPTIONS
         if remove_blank_text:

Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py	(original)
+++ lxml/trunk/src/lxml/tests/test_etree.py	Sun Jul 15 23:27:50 2007
@@ -187,6 +187,26 @@
             '<a><b><c/></b></a>',
             tostring(tree))
 
+    def test_parse_remove_pis(self):
+        parse = self.etree.parse
+        tostring = self.etree.tostring
+        XMLParser = self.etree.XMLParser
+
+        xml = '<?test?><a><?A?><b><?B?><c/></b><?C?></a><?tail?>'
+
+        f = StringIO(xml)
+        tree = parse(f)
+        self.assertEquals(
+            xml,
+            tostring(tree))
+
+        f = StringIO(xml)
+        parser = XMLParser(remove_pis=True)
+        tree = parse(f, parser)
+        self.assertEquals(
+            '<a><b><c/></b></a>',
+            tostring(tree))
+
     def test_parse_parser_type_error(self):
         # ET raises IOError only
         parse = self.etree.parse

Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd	(original)
+++ lxml/trunk/src/lxml/xmlparser.pxd	Sun Jul 15 23:27:50 2007
@@ -26,6 +26,10 @@
     ctypedef void (*commentSAXFunc)(void* ctx,
                                     char* value)
 
+    ctypedef void (*processingInstructionSAXFunc)(void * ctx, 
+                                                  char* target, 
+                                                  char* data)
+
 cdef extern from "libxml/tree.h":
     ctypedef struct xmlParserInput
     ctypedef struct xmlParserInputBuffer:
@@ -34,10 +38,11 @@
         xmlInputCloseCallback closecallback
 
     ctypedef struct xmlSAXHandler:
-        startElementNsSAX2Func startElementNs
-        endElementNsSAX2Func   endElementNs
-        cdataBlockSAXFunc      cdataBlock
-        commentSAXFunc         comment
+        startElementNsSAX2Func          startElementNs
+        endElementNsSAX2Func            endElementNs
+        cdataBlockSAXFunc               cdataBlock
+        commentSAXFunc                  comment
+        processingInstructionSAXFunc	processingInstruction
 
 cdef extern from "libxml/xmlIO.h":
     cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)


More information about the lxml-checkins mailing list