[Lxml-checkins] r45113 - in lxml/trunk/src/lxml: . tests
scoder at codespeak.net
scoder at codespeak.net
Sun Jul 15 23:27:50 CEST 2007
Author: scoder
Date: Sun Jul 15 23:27:50 2007
New Revision: 45113
Modified:
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/tests/test_etree.py
lxml/trunk/src/lxml/xmlparser.pxd
Log:
New parser keyword argument 'remove_pis' to discard processing instructions (PIs)
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Sun Jul 15 23:27:50 2007
@@ -235,6 +235,7 @@
* no_network - prevent network access
* remove_blank_text - discard blank text nodes
* remove_comments - discard comments
+ * remove_pis - discard processing instructions
"""
cdef object _source
cdef object _filename
@@ -242,7 +243,7 @@
def __init__(self, source, events=("end",), tag=None,
attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=False, remove_blank_text=False,
- remove_comments=False):
+ remove_comments=False, remove_pis=False):
cdef _IterparseContext context
cdef char* c_filename
cdef int parse_options
@@ -259,7 +260,8 @@
c_filename = NULL
self._source = source
- _BaseParser.__init__(self, remove_comments, _IterparseContext)
+ _BaseParser.__init__(self, remove_comments, remove_pis,
+ _IterparseContext)
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Sun Jul 15 23:27:50 2007
@@ -372,7 +372,8 @@
cdef ElementClassLookup _class_lookup
cdef python.PyThread_type_lock _parser_lock
- def __init__(self, remove_comments, context_class=_ResolverContext):
+ def __init__(self, remove_comments, remove_pis,
+ context_class=_ResolverContext):
cdef xmlParserCtxt* pctxt
if isinstance(self, HTMLParser):
self._parser_type = LXML_HTML_PARSER
@@ -391,6 +392,8 @@
if pctxt.sax != NULL:
if remove_comments:
pctxt.sax.comment = NULL
+ if remove_pis:
+ pctxt.sax.processingInstruction = NULL
# hard switch-off for CDATA nodes => makes them plain text
pctxt.sax.cdataBlock = NULL
@@ -699,6 +702,7 @@
* recover - try hard to parse through broken XML
* remove_blank_text - discard blank text nodes
* remove_comments - discard comments
+ * remove_pis - discard processing instructions
* compact - safe memory for short text content (default: True)
* resolve_entities - replace entities by their text value (default: True)
@@ -709,9 +713,10 @@
def __init__(self, attribute_defaults=False, dtd_validation=False,
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
- resolve_entities=True, remove_comments=False):
+ resolve_entities=True, remove_comments=False,
+ remove_pis=False):
cdef int parse_options
- _BaseParser.__init__(self, remove_comments)
+ _BaseParser.__init__(self, remove_comments, remove_pis)
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -833,15 +838,16 @@
* no_network - prevent network access (default: True)
* remove_blank_text - discard empty text nodes
* remove_comments - discard comments
+ * remove_pis - discard processing instructions
* compact - safe memory for short text content (default: True)
Note that you should avoid sharing parsers between threads for performance
reasons.
"""
def __init__(self, recover=True, no_network=True, remove_blank_text=False,
- compact=True, remove_comments=False):
+ compact=True, remove_comments=False, remove_pis=False):
cdef int parse_options
- _BaseParser.__init__(self, remove_comments)
+ _BaseParser.__init__(self, remove_comments, remove_pis)
parse_options = _HTML_DEFAULT_PARSE_OPTIONS
if remove_blank_text:
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Sun Jul 15 23:27:50 2007
@@ -187,6 +187,26 @@
'<a><b><c/></b></a>',
tostring(tree))
+ def test_parse_remove_pis(self):
+ parse = self.etree.parse
+ tostring = self.etree.tostring
+ XMLParser = self.etree.XMLParser
+
+ xml = '<?test?><a><?A?><b><?B?><c/></b><?C?></a><?tail?>'
+
+ f = StringIO(xml)
+ tree = parse(f)
+ self.assertEquals(
+ xml,
+ tostring(tree))
+
+ f = StringIO(xml)
+ parser = XMLParser(remove_pis=True)
+ tree = parse(f, parser)
+ self.assertEquals(
+ '<a><b><c/></b></a>',
+ tostring(tree))
+
def test_parse_parser_type_error(self):
# ET raises IOError only
parse = self.etree.parse
Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd (original)
+++ lxml/trunk/src/lxml/xmlparser.pxd Sun Jul 15 23:27:50 2007
@@ -26,6 +26,10 @@
ctypedef void (*commentSAXFunc)(void* ctx,
char* value)
+ ctypedef void (*processingInstructionSAXFunc)(void * ctx,
+ char* target,
+ char* data)
+
cdef extern from "libxml/tree.h":
ctypedef struct xmlParserInput
ctypedef struct xmlParserInputBuffer:
@@ -34,10 +38,11 @@
xmlInputCloseCallback closecallback
ctypedef struct xmlSAXHandler:
- startElementNsSAX2Func startElementNs
- endElementNsSAX2Func endElementNs
- cdataBlockSAXFunc cdataBlock
- commentSAXFunc comment
+ startElementNsSAX2Func startElementNs
+ endElementNsSAX2Func endElementNs
+ cdataBlockSAXFunc cdataBlock
+ commentSAXFunc comment
+ processingInstructionSAXFunc processingInstruction
cdef extern from "libxml/xmlIO.h":
cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
More information about the lxml-checkins mailing list