[Lxml-checkins] r46441 - in lxml/trunk: . src/lxml src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Mon Sep 10 16:23:24 CEST 2007
Author: scoder
Date: Mon Sep 10 16:23:22 2007
New Revision: 46441
Modified:
lxml/trunk/selftest.py
lxml/trunk/src/lxml/cstd.pxd
lxml/trunk/src/lxml/docloader.pxi
lxml/trunk/src/lxml/dtd.pxi
lxml/trunk/src/lxml/etree.pyx
lxml/trunk/src/lxml/htmlparser.pxd
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
lxml/trunk/src/lxml/tests/test_elementtree.py
lxml/trunk/src/lxml/xmlparser.pxd
Log:
major restructuring of the parser code to better interate feed parser and (the new) target parser
Modified: lxml/trunk/selftest.py
==============================================================================
--- lxml/trunk/selftest.py (original)
+++ lxml/trunk/selftest.py Mon Sep 10 16:23:22 2007
@@ -266,7 +266,8 @@
## """
## Test HTML parsing.
-## >>> p = HTMLTreeBuilder.TreeBuilder()
+## >>> # p = HTMLTreeBuilder.TreeBuilder()
+## >>> p = ElementTree.HTMLParser()
## >>> p.feed("<p><p>spam<b>egg</b></p>")
## >>> serialize(p.close())
## '<p>spam<b>egg</b></p>'
Modified: lxml/trunk/src/lxml/cstd.pxd
==============================================================================
--- lxml/trunk/src/lxml/cstd.pxd (original)
+++ lxml/trunk/src/lxml/cstd.pxd Mon Sep 10 16:23:22 2007
@@ -13,6 +13,7 @@
cdef int strcmp(char* s1, char* s2)
cdef int strncmp(char* s1, char* s2, size_t len)
cdef void* memcpy(void* dest, void* src, size_t len)
+ cdef void* memset(void* s, int c, size_t len)
cdef extern from "stdarg.h":
ctypedef void *va_list
Modified: lxml/trunk/src/lxml/docloader.pxi
==============================================================================
--- lxml/trunk/src/lxml/docloader.pxi (original)
+++ lxml/trunk/src/lxml/docloader.pxi Mon Sep 10 16:23:22 2007
@@ -94,9 +94,12 @@
cdef class _ResolverContext(_ExceptionContext):
cdef _ResolverRegistry _resolvers
cdef _TempStore _storage
- def __init__(self, _ResolverRegistry resolvers not None):
+ def __init__(self, _ResolverRegistry resolvers):
_ExceptionContext.__init__(self)
- self._resolvers = resolvers
+ if resolvers is None:
+ self._resolvers = _ResolverRegistry()
+ else:
+ self._resolvers = resolvers
self._storage = _TempStore()
cdef void clear(self):
Modified: lxml/trunk/src/lxml/dtd.pxi
==============================================================================
--- lxml/trunk/src/lxml/dtd.pxi (original)
+++ lxml/trunk/src/lxml/dtd.pxi Mon Sep 10 16:23:22 2007
@@ -88,10 +88,10 @@
cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL:
cdef _ExceptionContext exc_context
- cdef _FileParserContext dtd_parser
+ cdef _FileReaderContext dtd_parser
cdef tree.xmlDtd* c_dtd
exc_context = _ExceptionContext()
- dtd_parser = _FileParserContext(file, exc_context)
+ dtd_parser = _FileReaderContext(file, exc_context)
c_dtd = dtd_parser._readDtd()
Modified: lxml/trunk/src/lxml/etree.pyx
==============================================================================
--- lxml/trunk/src/lxml/etree.pyx (original)
+++ lxml/trunk/src/lxml/etree.pyx Mon Sep 10 16:23:22 2007
@@ -2131,19 +2131,20 @@
################################################################################
# Include submodules
-include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
-include "apihelpers.pxi" # Private helper functions
-include "xmlerror.pxi" # Error and log handling
-include "classlookup.pxi"# Element class lookup mechanisms
-include "nsclasses.pxi" # Namespace implementation and registry
-include "docloader.pxi" # Support for custom document loaders
-include "parser.pxi" # XML Parser
-include "serializer.pxi" # XML output functions
-include "iterparse.pxi" # incremental XML parsing
-include "xmlid.pxi" # XMLID and IDDict
-include "extensions.pxi" # XPath/XSLT extension functions
-include "xpath.pxi" # XPath evaluation
-include "xslt.pxi" # XSL transformations
+include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
+include "apihelpers.pxi" # Private helper functions
+include "xmlerror.pxi" # Error and log handling
+include "classlookup.pxi" # Element class lookup mechanisms
+include "nsclasses.pxi" # Namespace implementation and registry
+include "docloader.pxi" # Support for custom document loaders
+include "parser.pxi" # XML Parser
+include "parsertarget.pxi" # ET Parser target
+include "serializer.pxi" # XML output functions
+include "iterparse.pxi" # incremental XML parsing
+include "xmlid.pxi" # XMLID and IDDict
+include "extensions.pxi" # XPath/XSLT extension functions
+include "xpath.pxi" # XPath evaluation
+include "xslt.pxi" # XSL transformations
################################################################################
Modified: lxml/trunk/src/lxml/htmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/htmlparser.pxd (original)
+++ lxml/trunk/src/lxml/htmlparser.pxd Mon Sep 10 16:23:22 2007
@@ -17,7 +17,11 @@
cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(char* buffer, int size)
cdef xmlParserCtxt* htmlCreateFileParserCtxt(char* filename, char* encoding)
cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt)
+ cdef void htmlCtxtReset(xmlParserCtxt* ctxt)
+ cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options)
cdef int htmlParseDocument(xmlParserCtxt* ctxt)
+ cdef int htmlParseChunk(xmlParserCtxt* ctxt,
+ char* chunk, int size, int terminate)
cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt,
char* filename, char* encoding,
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Mon Sep 10 16:23:22 2007
@@ -48,7 +48,7 @@
c_ns = c_ns.next
return count
-cdef class _IterparseContext(_ResolverContext):
+cdef class _IterparseContext(_ParserContext):
cdef xmlparser.startElementNsSAX2Func _origSaxStart
cdef xmlparser.endElementNsSAX2Func _origSaxEnd
cdef _Element _root
@@ -64,8 +64,8 @@
cdef char* _tag_href
cdef char* _tag_name
- def __init__(self, _ResolverRegistry resolvers):
- _ResolverContext.__init__(self, resolvers)
+ def __init__(self):
+ _ParserContext.__init__(self)
self._ns_stack = []
self._pop_ns = self._ns_stack.pop
self._node_stack = []
@@ -73,22 +73,25 @@
self._events = []
self._event_index = 0
- cdef void _wrapCallbacks(self, xmlparser.xmlSAXHandler* sax):
+ cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
"wrap original SAX2 callbacks"
+ cdef xmlparser.xmlSAXHandler* sax
+ _ParserContext._initParserContext(self, c_ctxt)
+ sax = c_ctxt.sax
self._origSaxStart = sax.startElementNs
# only override start event handler if needed
if self._event_filter == 0 or \
self._event_filter & (ITERPARSE_FILTER_START | \
ITERPARSE_FILTER_START_NS | \
ITERPARSE_FILTER_END_NS):
- sax.startElementNs = _saxStart
+ sax.startElementNs = _iterparseSaxStart
self._origSaxEnd = sax.endElementNs
# only override end event handler if needed
if self._event_filter == 0 or \
self._event_filter & (ITERPARSE_FILTER_END | \
ITERPARSE_FILTER_END_NS):
- sax.endElementNs = _saxEnd
+ sax.endElementNs = _iterparseSaxEnd
cdef _setEventFilter(self, events, tag):
self._event_filter = _buildIterparseEventFilter(events)
@@ -184,9 +187,10 @@
cdef xmlparser.endElementNsSAX2Func _getOrigEnd(xmlparser.xmlParserCtxt* c_ctxt):
return (<_IterparseContext>c_ctxt._private)._origSaxEnd
-cdef void _saxStart(void* ctxt, char* localname, char* prefix, char* URI,
- int nb_namespaces, char** namespaces,
- int nb_attributes, int nb_defaulted, char** attributes):
+cdef void _iterparseSaxStart(void* ctxt, char* localname, char* prefix,
+ char* URI, int nb_namespaces, char** namespaces,
+ int nb_attributes, int nb_defaulted,
+ char** attributes):
# no Python in here!
cdef xmlparser.xmlParserCtxt* c_ctxt
cdef xmlparser.startElementNsSAX2Func origStart
@@ -196,7 +200,7 @@
nb_attributes, nb_defaulted, attributes)
_pushSaxStartEvent(c_ctxt, c_ctxt.node)
-cdef void _saxEnd(void* ctxt, char* localname, char* prefix, char* URI):
+cdef void _iterparseSaxEnd(void* ctxt, char* localname, char* prefix, char* URI):
# no Python in here!
cdef xmlparser.xmlParserCtxt* c_ctxt
cdef xmlparser.endElementNsSAX2Func origEnd
@@ -276,15 +280,17 @@
parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
_BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
- _IterparseContext)
+ None)
context = <_IterparseContext>self._context
context._setEventFilter(events, tag)
- context._wrapCallbacks(self._parser_ctxt.sax)
xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options)
xmlparser.xmlCtxtResetPush(self._parser_ctxt, NULL, 0, c_filename, NULL)
self._lockParser() # will not be unlocked - no other methods supported
+ cdef _ParserContext _createContext(self, target):
+ return _IterparseContext()
+
def __iter__(self):
return self
@@ -318,7 +324,8 @@
break
if error != 0:
self._source = None
- _raiseParseError(self._parser_ctxt, self._filename, None)
+ _raiseParseError(self._parser_ctxt, self._filename,
+ self._context._error_log)
if python.PyList_GET_SIZE(context._events) == 0:
self.root = context._root
self._source = None
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Mon Sep 10 16:23:22 2007
@@ -2,7 +2,6 @@
cimport xmlparser
cimport htmlparser
-from xmlparser cimport xmlParserCtxt, xmlDict
class ParseError(LxmlSyntaxError):
"""Syntax error while parsing an XML document.
@@ -26,17 +25,17 @@
LXML_HTML_PARSER
LXML_ITERPARSE_PARSER
-cdef class _ParserContext:
+cdef class _ParserDictionaryContext:
# Global parser context to share the string dictionary.
#
- # This class is a singleton!
+ # This class is a delegate singleton!
#
- # It creates _ParserContext objects for each thread to keep thread state,
+ # It creates _ParserDictionaryContext objects for each thread to keep thread state,
# but those must never be used directly. Always stick to using the static
# __GLOBAL_PARSER_CONTEXT as defined below the class.
#
- cdef xmlDict* _c_dict
+ cdef tree.xmlDict* _c_dict
cdef _BaseParser _default_parser
def __dealloc__(self):
if self._c_dict is not NULL:
@@ -49,33 +48,33 @@
cdef python.PyObject* result
thread_dict = python.PyThreadState_GetDict()
if thread_dict is not NULL:
- python.PyDict_SetItem(<object>thread_dict, "_ParserContext", self)
+ python.PyDict_SetItem(<object>thread_dict, "_ParserDictionaryContext", self)
- cdef _ParserContext _findThreadParserContext(self):
- "Find (or create) the _ParserContext object for the current thread"
+ cdef _ParserDictionaryContext _findThreadParserContext(self):
+ "Find (or create) the _ParserDictionaryContext object for the current thread"
cdef python.PyObject* thread_dict
cdef python.PyObject* result
- cdef _ParserContext context
+ cdef _ParserDictionaryContext context
thread_dict = python.PyThreadState_GetDict()
if thread_dict is NULL:
return self
d = <object>thread_dict
- result = python.PyDict_GetItem(d, "_ParserContext")
+ result = python.PyDict_GetItem(d, "_ParserDictionaryContext")
if result is not NULL:
return <object>result
- context = _ParserContext()
- python.PyDict_SetItem(d, "_ParserContext", context)
+ context = _ParserDictionaryContext()
+ python.PyDict_SetItem(d, "_ParserDictionaryContext", context)
return context
cdef void setDefaultParser(self, _BaseParser parser):
"Set the default parser for the current thread"
- cdef _ParserContext context
+ cdef _ParserDictionaryContext context
context = self._findThreadParserContext()
context._default_parser = parser
cdef _BaseParser getDefaultParser(self):
"Return (or create) the default parser of the current thread"
- cdef _ParserContext context
+ cdef _ParserDictionaryContext context
context = self._findThreadParserContext()
if context._default_parser is None:
if self._default_parser is None:
@@ -84,9 +83,9 @@
context._default_parser = self._default_parser._copy()
return context._default_parser
- cdef xmlDict* _getThreadDict(self, xmlDict* default):
+ cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
"Return the thread-local dict or create a new one if necessary."
- cdef _ParserContext context
+ cdef _ParserDictionaryContext context
context = self._findThreadParserContext()
if context._c_dict is NULL:
# thread dict not yet set up => use default or create a new one
@@ -100,9 +99,9 @@
context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
return context._c_dict
- cdef void initThreadDictRef(self, xmlDict** c_dict_ref):
- cdef xmlDict* c_dict
- cdef xmlDict* c_thread_dict
+ cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
+ cdef tree.xmlDict* c_dict
+ cdef tree.xmlDict* c_thread_dict
c_dict = c_dict_ref[0]
c_thread_dict = self._getThreadDict(c_dict)
if c_dict is c_thread_dict:
@@ -112,7 +111,7 @@
c_dict_ref[0] = c_thread_dict
xmlparser.xmlDictReference(c_thread_dict)
- cdef void initParserDict(self, xmlParserCtxt* pctxt):
+ cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
"Assure we always use the same string dictionary."
self.initThreadDictRef(&pctxt.dict)
@@ -127,11 +126,11 @@
# otherwise we'd free data that's in use => segfault
self.initThreadDictRef(&result.dict)
-cdef _ParserContext __GLOBAL_PARSER_CONTEXT
-__GLOBAL_PARSER_CONTEXT = _ParserContext()
+cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT
+__GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
-cdef int _checkThreadDict(xmlDict* c_dict):
+cdef int _checkThreadDict(tree.xmlDict* c_dict):
"""Check that c_dict is either the local thread dictionary or the global
parent dictionary.
"""
@@ -205,7 +204,7 @@
## support for file-like objects
############################################################
-cdef class _FileParserContext:
+cdef class _FileReaderContext:
cdef object _filelike
cdef object _url
cdef object _bytes
@@ -223,14 +222,15 @@
self._bytes = ''
self._bytes_read = 0
- cdef xmlparser.xmlParserInput* _createParserInput(self, xmlParserCtxt* ctxt):
+ cdef xmlparser.xmlParserInput* _createParserInput(
+ self, xmlparser.xmlParserCtxt* ctxt):
cdef xmlparser.xmlParserInputBuffer* c_buffer
c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
c_buffer.context = <python.PyObject*>self
c_buffer.readcallback = _readFilelikeParser
return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
- cdef xmlDoc* _readDoc(self, xmlParserCtxt* ctxt, int options,
+ cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options,
LxmlParserType parser_type):
cdef python.PyThreadState* state
cdef xmlDoc* result
@@ -291,19 +291,19 @@
return -1
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size):
- return (<_FileParserContext>ctxt).copyToBuffer(c_buffer, c_size)
+ return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
############################################################
## support for custom document loaders
############################################################
cdef xmlparser.xmlParserInput* _parser_resolve_from_python(
- char* c_url, char* c_pubid, xmlParserCtxt* c_context, int* error):
+ char* c_url, char* c_pubid, xmlparser.xmlParserCtxt* c_context, int* error):
# call the Python document loaders
cdef xmlparser.xmlParserInput* c_input
cdef _ResolverContext context
cdef _InputDocument doc_ref
- cdef _FileParserContext file_context
+ cdef _FileReaderContext file_context
error[0] = 0
context = <_ResolverContext>c_context._private
try:
@@ -338,7 +338,7 @@
c_input = xmlparser.xmlNewInputFromFile(
c_context, _cstr(doc_ref._data_bytes))
elif doc_ref._type == PARSER_DATA_FILE:
- file_context = _FileParserContext(doc_ref._file, context, url)
+ file_context = _FileReaderContext(doc_ref._file, context, url)
c_input = file_context._createParserInput(c_context)
data = file_context
@@ -347,7 +347,7 @@
return c_input
cdef xmlparser.xmlParserInput* _local_resolver(char* c_url, char* c_pubid,
- xmlParserCtxt* c_context):
+ xmlparser.xmlParserCtxt* c_context):
# no Python objects here, may be called without thread context !
# when we declare a Python object, Pyrex will INCREF(None) !
cdef xmlparser.xmlParserInput* c_input
@@ -379,42 +379,145 @@
## Parsers
############################################################
+cdef class _ParserContext(_ResolverContext):
+ cdef _ErrorLog _error_log
+ cdef xmlparser.xmlParserCtxt* _c_ctxt
+ def __init__(self):
+ _ResolverContext.__init__(self, _ResolverRegistry())
+ self._error_log = _ErrorLog()
+
+ cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
+ self._c_ctxt = c_ctxt
+
+ cdef object _handleParseResult(self, _BaseParser parser,
+ xmlDoc* result, filename):
+ cdef xmlDoc* c_doc
+ cdef int recover
+ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+ c_doc = _handleParseResult(self, self._c_ctxt, result,
+ filename, recover)
+ return _documentFactory(c_doc, parser)
+
+ cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
+ xmlDoc* result, filename) except NULL:
+ cdef int recover
+ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+ return _handleParseResult(self, self._c_ctxt, result,
+ filename, recover)
+
+cdef class _InternalParserContext(_ParserContext):
+ """Parser context for internal single-shot parsing
+ """
+
+cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
+ _ErrorLog error_log) except 0:
+ if filename is not None and \
+ ctxt.lastError.domain == xmlerror.XML_FROM_IO:
+ if ctxt.lastError.message is not NULL:
+ message = "Error reading file '%s': %s" % (
+ filename, (ctxt.lastError.message).strip())
+ else:
+ message = "Error reading file '%s'" % filename
+ raise IOError, message
+ elif error_log:
+ raise XMLSyntaxError, error_log._buildExceptionMessage(
+ "Document is not well formed")
+ elif ctxt.lastError.message is not NULL:
+ message = (ctxt.lastError.message).strip()
+ if ctxt.lastError.line > 0:
+ message = "line %d: %s" % (ctxt.lastError.line, message)
+ raise XMLSyntaxError, message
+ else:
+ raise XMLSyntaxError
+
+cdef xmlDoc* _handleParseResult(_ParserContext context,
+ xmlparser.xmlParserCtxt* c_ctxt,
+ xmlDoc* result, filename,
+ int recover) except NULL:
+ cdef int well_formed
+ if c_ctxt.myDoc is not NULL:
+ if c_ctxt.myDoc != result:
+ tree.xmlFreeDoc(c_ctxt.myDoc)
+ c_ctxt.myDoc = NULL
+
+ if result is not NULL:
+ if recover or (c_ctxt.wellFormed and \
+ c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
+ well_formed = 1
+ elif not c_ctxt.replaceEntities and not c_ctxt.validate \
+ and context is not None:
+ # in this mode, we ignore errors about undefined entities
+ for error in context._error_log.filter_from_errors():
+ if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
+ error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
+ well_formed = 0
+ break
+ else:
+ well_formed = 1
+ else:
+ well_formed = 0
+
+ if well_formed:
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ else:
+ # free broken document
+ tree.xmlFreeDoc(result)
+ result = NULL
+
+ if context is not None and context._has_raised():
+ if result is not NULL:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ context._raise_if_stored()
+
+ if result is NULL:
+ if context is not None:
+ _raiseParseError(c_ctxt, filename, context._error_log)
+ else:
+ _raiseParseError(c_ctxt, filename, None)
+ elif result.URL is NULL and filename is not None:
+ result.URL = tree.xmlStrdup(_cstr(filename))
+ return result
+
+
cdef class _BaseParser:
cdef int _parse_options
- cdef _ErrorLog _error_log
- cdef readonly _ResolverRegistry resolvers
- cdef _ResolverContext _context
+ cdef _ParserContext _context
cdef LxmlParserType _parser_type
- cdef xmlParserCtxt* _parser_ctxt
+ cdef xmlparser.xmlParserCtxt* _parser_ctxt
cdef ElementClassLookup _class_lookup
cdef python.PyThread_type_lock _parser_lock
cdef int _feed_parser_running
def __init__(self, int parse_options, remove_comments, remove_pis,
- context_class=_ResolverContext):
- cdef xmlParserCtxt* pctxt
+ target):
+ cdef xmlparser.xmlParserCtxt* pctxt
if isinstance(self, HTMLParser):
self._parser_type = LXML_HTML_PARSER
- pctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
elif isinstance(self, XMLParser):
self._parser_type = LXML_XML_PARSER
- pctxt = xmlparser.xmlNewParserCtxt()
elif isinstance(self, iterparse):
self._parser_type = LXML_ITERPARSE_PARSER
- pctxt = xmlparser.xmlNewParserCtxt()
else:
raise TypeError, "This class cannot be instantiated"
+
self._parse_options = parse_options
+
+ pctxt = self._newParserCtxt()
self._parser_ctxt = pctxt
if pctxt is NULL:
python.PyErr_NoMemory()
- if pctxt.sax != NULL:
- if remove_comments:
- pctxt.sax.comment = NULL
- if remove_pis:
- pctxt.sax.processingInstruction = NULL
- # hard switch-off for CDATA nodes => makes them plain text
- pctxt.sax.cdataBlock = NULL
+
+ self._context = self._createContext(target)
+ self._context._initParserContext(pctxt)
+ pctxt._private = <python.PyObject*>self._context
+
+ if remove_comments:
+ pctxt.sax.comment = NULL
+ if remove_pis:
+ pctxt.sax.processingInstruction = NULL
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
if not config.ENABLE_THREADING or \
self._parser_type == LXML_ITERPARSE_PARSER:
@@ -422,10 +525,18 @@
self._parser_lock = NULL
else:
self._parser_lock = python.PyThread_allocate_lock()
- self._error_log = _ErrorLog()
- self.resolvers = _ResolverRegistry()
- self._context = context_class(self.resolvers)
- pctxt._private = <python.PyObject*>self._context
+
+ cdef _ParserContext _createContext(self, target):
+ if target is not None:
+ return _TargetParserContext(target)
+ else:
+ return _InternalParserContext()
+
+ cdef xmlparser.xmlParserCtxt* _newParserCtxt(self):
+ if self._parser_type == LXML_HTML_PARSER:
+ return htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
+ else:
+ return xmlparser.xmlNewParserCtxt()
def __dealloc__(self):
if self._parser_ctxt is not NULL:
@@ -434,7 +545,7 @@
python.PyThread_free_lock(self._parser_lock)
cdef void _cleanup(self):
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
pctxt = self._parser_ctxt
if pctxt is not NULL:
if pctxt.spaceTab is not NULL: # work around bug in libxml2
@@ -458,7 +569,11 @@
property error_log:
def __get__(self):
- return self._error_log.copy()
+ return self._context._error_log.copy()
+
+ property resolvers:
+ def __get__(self):
+ return self._context._resolvers
def setElementClassLookup(self, ElementClassLookup lookup = None):
"Deprecated, use ``parser.set_element_class_lookup(lookup)`` instead."
@@ -497,114 +612,6 @@
def __get__(self):
return "libxml2 %d.%d.%d" % LIBXML_VERSION
- # feed parser interface
-
- def feed(self, data):
- """Feeds data to the parser. The argument should be an 8-bit string
- buffer containing encoded data, although Unicode is supported as long
- as both string types are not mixed.
-
- This is the main entry point to the consumer interface of a parser.
- The parser will parse as much of the XML stream as it can on each
- call. To finish parsing, call the ``close()`` method.
-
- It is not possible to use the parser in any other way after calling
- the ``feed()`` method. The parser can only be reset by calling
- ``close()``.
- """
- cdef xmlParserCtxt* pctxt
- cdef Py_ssize_t py_buffer_len
- cdef char* c_data
- cdef char* c_encoding
- cdef int buffer_len
- cdef int error
- cdef int recover
- if python.PyString_Check(data):
- c_encoding = NULL
- c_data = _cstr(data)
- py_buffer_len = python.PyString_GET_SIZE(data)
- elif python.PyUnicode_Check(data):
- if _UNICODE_ENCODING is NULL:
- raise ParserError, \
- "Unicode parsing is not supported on this platform"
- c_encoding = _UNICODE_ENCODING
- c_data = python.PyUnicode_AS_DATA(data)
- py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
- else:
- raise TypeError, "Parsing requires string data"
-
- pctxt = self._parser_ctxt
- error = 0
- if not self._feed_parser_running:
- self._lockParser()
- self._feed_parser_running = 1
- self._error_log.connect()
- __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
- xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
-
- if py_buffer_len > python.INT_MAX:
- buffer_len = python.INT_MAX
- else:
- buffer_len = <int>py_buffer_len
-
- error = xmlparser.xmlCtxtResetPush(
- pctxt, c_data, buffer_len, NULL, c_encoding)
-
- py_buffer_len = py_buffer_len - buffer_len
- c_data = c_data + buffer_len
-
- while error == 0 and py_buffer_len > 0:
- if py_buffer_len > python.INT_MAX:
- buffer_len = python.INT_MAX
- else:
- buffer_len = <int>py_buffer_len
- py_buffer_len = py_buffer_len - buffer_len
- error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
- c_data = c_data + buffer_len
-
- if error:
- self._feed_parser_running = 0
- try:
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- _handleParseResult(pctxt, pctxt.myDoc, None,
- self._error_log, recover)
- finally:
- self._cleanup()
- self._context.clear()
- self._error_log.disconnect()
- self._unlockParser()
-
- def close(self):
- """Finishes feeding of data to this parser. This tells the parser to
- process any remaining data in the feed buffer, and then returns the
- root Element of the tree that was parsed.
-
- This method must be called after passing the last chunk of data into
- the ``feed()`` method. It should only be called when using the feed
- parser interface, all other usage is undefined.
- """
- cdef xmlParserCtxt* pctxt
- cdef xmlDoc* c_doc
- cdef _Document doc
- cdef int error
- if not self._feed_parser_running:
- raise XMLSyntaxError, "no element found"
- pctxt = self._parser_ctxt
- self._feed_parser_running = 0
- error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
- try:
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- c_doc = _handleParseResult(pctxt, pctxt.myDoc, None,
- self._error_log, recover)
- finally:
- self._cleanup()
- self._context.clear()
- self._error_log.disconnect()
- self._unlockParser()
-
- doc = _documentFactory(c_doc, self)
- return doc.getroot()
-
# internal parser methods
cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
@@ -612,7 +619,7 @@
"""
cdef python.PyThreadState* state
cdef xmlDoc* result
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef int recover
cdef Py_ssize_t py_buffer_len
cdef int buffer_len
@@ -625,7 +632,7 @@
buffer_len = py_buffer_len
self._lockParser()
- self._error_log.connect()
+ self._context._error_log.connect()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -642,13 +649,11 @@
self._parse_options)
python.PyEval_RestoreThread(state)
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, None,
- self._error_log, recover)
+ return self._context._handleParseResultDoc(self, result, None)
finally:
self._cleanup()
self._context.clear()
- self._error_log.disconnect()
+ self._context._error_log.disconnect()
self._unlockParser()
cdef xmlDoc* _parseDoc(self, char* c_text, Py_ssize_t c_len,
@@ -657,12 +662,12 @@
"""
cdef python.PyThreadState* state
cdef xmlDoc* result
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef int recover
if c_len > python.INT_MAX:
raise ParserError, "string is too long to parse it with libxml2"
self._lockParser()
- self._error_log.connect()
+ self._context._error_log.connect()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -676,24 +681,22 @@
pctxt, c_text, c_len, c_filename, NULL, self._parse_options)
python.PyEval_RestoreThread(state)
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, None,
- self._error_log, recover)
+ return self._context._handleParseResultDoc(self, result, None)
finally:
self._cleanup()
self._context.clear()
- self._error_log.disconnect()
+ self._context._error_log.disconnect()
self._unlockParser()
cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
cdef python.PyThreadState* state
cdef xmlDoc* result
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef int recover
cdef int orig_options
result = NULL
self._lockParser()
- self._error_log.connect()
+ self._context._error_log.connect()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
@@ -709,108 +712,182 @@
python.PyEval_RestoreThread(state)
pctxt.options = orig_options # work around libxml2 problem
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, c_filename,
- self._error_log, recover)
+ return self._context._handleParseResultDoc(
+ self, result, c_filename)
finally:
self._cleanup()
self._context.clear()
- self._error_log.disconnect()
+ self._context._error_log.disconnect()
self._unlockParser()
cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename) except NULL:
- cdef _FileParserContext file_context
+ cdef _FileReaderContext file_context
cdef xmlDoc* result
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef char* c_filename
cdef int recover
if not filename:
filename = None
self._lockParser()
- self._error_log.connect()
+ self._context._error_log.connect()
try:
pctxt = self._parser_ctxt
__GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
- file_context = _FileParserContext(filelike, self._context, filename)
+ file_context = _FileReaderContext(filelike, self._context, filename)
result = file_context._readDoc(
pctxt, self._parse_options, self._parser_type)
- recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
- return _handleParseResult(pctxt, result, filename,
- self._error_log, recover)
+ return self._context._handleParseResultDoc(
+ self, result, filename)
finally:
self._cleanup()
self._context.clear()
- self._error_log.disconnect()
+ self._context._error_log.disconnect()
self._unlockParser()
-cdef int _raiseParseError(xmlParserCtxt* ctxt, filename,
- _ErrorLog error_log) except 0:
- if filename is not None and \
- ctxt.lastError.domain == xmlerror.XML_FROM_IO:
- if ctxt.lastError.message is not NULL:
- message = "Error reading file '%s': %s" % (
- filename, (ctxt.lastError.message).strip())
+############################################################
+## ET feed parser
+############################################################
+
+cdef class _FeedParser(_BaseParser):
+ def feed(self, data):
+ """Feeds data to the parser. The argument should be an 8-bit string
+ buffer containing encoded data, although Unicode is supported as long
+ as both string types are not mixed.
+
+ This is the main entry point to the consumer interface of a parser.
+ The parser will parse as much of the XML stream as it can on each
+ call. To finish parsing, call the ``close()`` method.
+
+ It is not possible to use the parser in any other way after calling
+ the ``feed()`` method. The parser can only be reset by calling
+ ``close()``.
+ """
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef Py_ssize_t py_buffer_len
+ cdef char* c_data
+ cdef char* c_encoding
+ cdef int buffer_len
+ cdef int error
+ cdef int recover
+ if python.PyString_Check(data):
+ c_encoding = NULL
+ c_data = _cstr(data)
+ py_buffer_len = python.PyString_GET_SIZE(data)
+ elif python.PyUnicode_Check(data):
+ if _UNICODE_ENCODING is NULL:
+ raise ParserError, \
+ "Unicode parsing is not supported on this platform"
+ c_encoding = _UNICODE_ENCODING
+ c_data = python.PyUnicode_AS_DATA(data)
+ py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
else:
- message = "Error reading file '%s'" % filename
- raise IOError, message
- elif error_log is not None:
- raise XMLSyntaxError, error_log._buildExceptionMessage(
- "Document is not well formed")
- elif ctxt.lastError.message is not NULL:
- message = (ctxt.lastError.message).strip()
- if ctxt.lastError.line > 0:
- message = "line %d: %s" % (ctxt.lastError.line, message)
- raise XMLSyntaxError, message
- else:
- raise XMLSyntaxError
+ raise TypeError, "Parsing requires string data"
-cdef xmlDoc* _handleParseResult(xmlParserCtxt* ctxt, xmlDoc* result,
- filename, _ErrorLog error_log,
- int recover) except NULL:
- cdef _ResolverContext context
- cdef int well_formed
- if ctxt.myDoc is not NULL:
- if ctxt.myDoc != result:
- tree.xmlFreeDoc(ctxt.myDoc)
- ctxt.myDoc = NULL
+ pctxt = self._parser_ctxt
+ error = 0
+ if not self._feed_parser_running:
+ self._lockParser()
+ self._feed_parser_running = 1
+ self._context._error_log.connect()
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
- if result is not NULL:
- if recover or (ctxt.wellFormed and \
- ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
- well_formed = 1
- elif not ctxt.replaceEntities and not ctxt.validate:
- # in this mode, we ignore errors about undefined entities
- for error in error_log.filter_from_errors():
- if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
- error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
- well_formed = 0
- break
+ if py_buffer_len > python.INT_MAX:
+ buffer_len = python.INT_MAX
else:
- well_formed = 1
+ buffer_len = <int>py_buffer_len
+ if self._parser_type == LXML_HTML_PARSER:
+ error = _htmlCtxtResetPush(pctxt, c_data, buffer_len,
+ c_encoding, self._parse_options)
+ else:
+ error = xmlparser.xmlCtxtResetPush(
+ pctxt, c_data, buffer_len, NULL, c_encoding)
+ xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
+ py_buffer_len = py_buffer_len - buffer_len
+ c_data = c_data + buffer_len
+
+ while error == 0 and py_buffer_len > 0:
+ if py_buffer_len > python.INT_MAX:
+ buffer_len = python.INT_MAX
+ else:
+ buffer_len = <int>py_buffer_len
+ if self._parser_type == LXML_HTML_PARSER:
+ error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
+ else:
+ error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
+ py_buffer_len = py_buffer_len - buffer_len
+ c_data = c_data + buffer_len
+
+ if error:
+ self._feed_parser_running = 0
+ try:
+ self._context._handleParseResult(
+ self, pctxt.myDoc, None)
+ finally:
+ self._cleanup()
+ self._context.clear()
+ self._context._error_log.disconnect()
+ self._unlockParser()
+
+ def close(self):
+ """Terminates feeding data to this parser. This tells the parser to
+ process any remaining data in the feed buffer, and then returns the
+ root Element of the tree that was parsed.
+
+ This method must be called after passing the last chunk of data into
+ the ``feed()`` method. It should only be called when using the feed
+ parser interface, all other usage is undefined.
+ """
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef xmlDoc* c_doc
+ cdef _Document doc
+ cdef int is_target_parser, error
+ if not self._feed_parser_running:
+ raise XMLSyntaxError, "no element found"
+ pctxt = self._parser_ctxt
+ self._feed_parser_running = 0
+ if self._parser_type == LXML_HTML_PARSER:
+ error = htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
else:
- well_formed = 0
+ error = xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
+ is_target_parser = isinstance(self._context, _TargetParserContext)
+ try:
+ result = self._context._handleParseResult(
+ self, pctxt.myDoc, None)
+ finally:
+ self._cleanup()
+ self._context.clear()
+ self._context._error_log.disconnect()
+ self._unlockParser()
- if well_formed:
- __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ if isinstance(result, _Document):
+ return (<_Document>result).getroot()
else:
- # free broken document
- tree.xmlFreeDoc(result)
- result = NULL
+ return result
- if ctxt._private is not NULL:
- context = <_ResolverContext>ctxt._private
- if context._has_raised():
- if result is not NULL:
- tree.xmlFreeDoc(result)
- result = NULL
- context._raise_if_stored()
+cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
+ char* c_data, int buffer_len,
+ char* c_encoding, int parse_options) except -1:
+ cdef xmlparser.xmlParserInput* c_input_stream
+ # libxml2 crashes if spaceTab is not initialised
+ if _LIBXML_VERSION_INT < 20629 and c_ctxt.spaceTab is NULL:
+ c_ctxt.spaceTab = <int*>tree.xmlMalloc(10 * sizeof(int))
+ c_ctxt.spaceMax = 10
- if result is NULL:
- _raiseParseError(ctxt, filename, error_log)
- elif result.URL is NULL and filename is not None:
- result.URL = tree.xmlStrdup(_cstr(filename))
- return result
+ # libxml2 lacks an HTML push parser setup function
+ error = xmlparser.xmlCtxtResetPush(c_ctxt, NULL, 0, NULL, c_encoding)
+ if error:
+ return error
+
+ # fix libxml2 setup for HTML
+ c_ctxt.progressive = 1
+ c_ctxt.html = 1
+ htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
+
+ if c_data is not NULL and buffer_len > 0:
+ return htmlparser.htmlParseChunk(c_ctxt, c_data, buffer_len, 0)
+ return 0
+
############################################################
## XML parser
@@ -824,7 +901,7 @@
xmlparser.XML_PARSE_COMPACT
)
-cdef class XMLParser(_BaseParser):
+cdef class XMLParser(_FeedParser):
"""The XML parser. Parsers can be supplied as additional argument to
various parse functions of the lxml API. A default parser is always
available and can be replaced by a call to the global function
@@ -848,6 +925,8 @@
* compact - safe memory for short text content (default: True)
* resolve_entities - replace entities by their text value (default: True)
+ You can pass a parser target as ``target`` keyword argument.
+
Note that you should avoid sharing parsers between threads. While this is
not harmful, it is more efficient to use separate parsers. This does not
apply to the default parser.
@@ -856,7 +935,7 @@
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=False,
- remove_pis=False):
+ remove_pis=False, target=None):
cdef int parse_options
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
@@ -880,7 +959,8 @@
if not resolve_entities:
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
- _BaseParser.__init__(self, parse_options, remove_comments, remove_pis)
+ _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
+ target)
cdef class ETCompatXMLParser(XMLParser):
"""An XML parser with an ElementTree compatible default setup. See the
@@ -893,18 +973,18 @@
load_dtd=False, no_network=True, ns_clean=False,
recover=False, remove_blank_text=False, compact=True,
resolve_entities=True, remove_comments=True,
- remove_pis=True):
+ remove_pis=True, target=None):
XMLParser.__init__(self,
attribute_defaults, dtd_validation,
load_dtd, no_network, ns_clean,
recover, remove_blank_text, compact,
resolve_entities, remove_comments,
- remove_pis)
+ remove_pis, target)
cdef xmlDoc* _internalParseDoc(char* c_text, int options,
_ResolverContext context) except NULL:
# internal parser function for XSLT
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef xmlDoc* c_doc
cdef int recover
pctxt = xmlparser.xmlNewParserCtxt()
@@ -916,7 +996,7 @@
pctxt, c_text, NULL, NULL, options)
try:
recover = options & xmlparser.XML_PARSE_RECOVER
- c_doc = _handleParseResult(pctxt, c_doc, None, None, recover)
+ c_doc = _handleParseResult(None, pctxt, c_doc, None, recover)
finally:
xmlparser.xmlFreeParserCtxt(pctxt)
return c_doc
@@ -924,7 +1004,7 @@
cdef xmlDoc* _internalParseDocFromFile(char* c_filename, int options,
_ResolverContext context) except NULL:
# internal parser function for XSLT
- cdef xmlParserCtxt* pctxt
+ cdef xmlparser.xmlParserCtxt* pctxt
cdef xmlDoc* c_doc
cdef int recover
pctxt = xmlparser.xmlNewParserCtxt()
@@ -940,7 +1020,7 @@
filename = None
else:
filename = c_filename
- c_doc = _handleParseResult(pctxt, c_doc, filename, None, recover)
+ c_doc = _handleParseResult(None, pctxt, c_doc, filename, recover)
finally:
xmlparser.xmlFreeParserCtxt(pctxt)
return c_doc
@@ -987,7 +1067,7 @@
htmlparser.HTML_PARSE_COMPACT
)
-cdef class HTMLParser(_BaseParser):
+cdef class HTMLParser(_FeedParser):
"""The HTML parser. This parser allows reading HTML into a normal XML
tree. By default, it can read broken (non well-formed) HTML, depending on
the capabilities of libxml2. Use the 'recover' option to switch this off.
@@ -1000,11 +1080,14 @@
* remove_pis - discard processing instructions
* compact - safe memory for short text content (default: True)
+ You can pass a parser target as ``target`` keyword argument.
+
Note that you should avoid sharing parsers between threads for performance
reasons.
"""
def __init__(self, recover=True, no_network=True, remove_blank_text=False,
- compact=True, remove_comments=False, remove_pis=False):
+ compact=True, remove_comments=False, remove_pis=False,
+ target=None):
cdef int parse_options
parse_options = _HTML_DEFAULT_PARSE_OPTIONS
if remove_blank_text:
@@ -1016,7 +1099,8 @@
if not compact:
parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
- _BaseParser.__init__(self, parse_options, remove_comments, remove_pis)
+ _BaseParser.__init__(self, parse_options, remove_comments, remove_pis,
+ target)
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
Modified: lxml/trunk/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_elementtree.py (original)
+++ lxml/trunk/src/lxml/tests/test_elementtree.py Mon Sep 10 16:23:22 2007
@@ -2538,6 +2538,8 @@
# ElementTree 1.3+, cET
self.assert_(re.match("[^ ]+ [0-9.]+", parser.version))
+ # feed parser interface
+
def test_feed_parser(self):
parser = self.etree.XMLParser()
@@ -2579,6 +2581,81 @@
self.assertRaises(Exception, parser.close)
+ # parser target interface
+
+ def test_parser_target_tag(self):
+ assertEquals = self.assertEquals
+ assertFalse = self.assertFalse
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertFalse(attrib)
+ assertEquals("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEquals("TAG", tag)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ parser.feed("<TAG/>")
+ done = parser.close()
+
+ self.assertEquals("DONE", done)
+ self.assertEquals(["start", "end"], events)
+
+ def test_parser_target_attrib(self):
+ assertEquals = self.assertEquals
+ assertFalse = self.assertFalse
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ for name, value in attrib.iteritems():
+ assertEquals(tag + name, value)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ parser.feed('<root a="roota" b="rootb"><sub c="subc"/></root>')
+ done = parser.close()
+
+ self.assertEquals("DONE", done)
+ self.assertEquals(["start-root", "start-sub", "end-sub", "end-root"],
+ events)
+
+ def test_parser_target_data(self):
+ assertEquals = self.assertEquals
+ assertFalse = self.assertFalse
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ parser.feed('<root>A<sub/>B</root>')
+ done = parser.close()
+
+ self.assertEquals("DONE", done)
+ self.assertEquals(["start-root", "data-A", "start-sub",
+ "end-sub", "data-B", "end-root"],
+ events)
+
# helper methods
def _writeElement(self, element, encoding='us-ascii'):
Modified: lxml/trunk/src/lxml/xmlparser.pxd
==============================================================================
--- lxml/trunk/src/lxml/xmlparser.pxd (original)
+++ lxml/trunk/src/lxml/xmlparser.pxd Mon Sep 10 16:23:22 2007
@@ -19,17 +19,25 @@
char* prefix,
char* URI)
- ctypedef void (*cdataBlockSAXFunc)(void* ctx,
- char* value,
- int len)
+ ctypedef void (*charactersSAXFunc)(void* ctx, char* ch, int len)
- ctypedef void (*commentSAXFunc)(void* ctx,
- char* value)
+ ctypedef void (*cdataBlockSAXFunc)(void* ctx, char* value, int len)
- ctypedef void (*processingInstructionSAXFunc)(void * ctx,
+ ctypedef void (*commentSAXFunc)(void* ctx, char* value)
+
+ ctypedef void (*processingInstructionSAXFunc)(void* ctx,
char* target,
char* data)
+ ctypedef void (*internalSubsetSAXFunc)(void* ctx,
+ char* name,
+ char* externalID,
+ char* systemID)
+
+ ctypedef void (*endDocumentSAXFunc)(void* ctx)
+
+ cdef int XML_SAX2_MAGIC
+
cdef extern from "libxml/tree.h":
ctypedef struct xmlParserInput
ctypedef struct xmlParserInputBuffer:
@@ -38,11 +46,15 @@
xmlInputCloseCallback closecallback
ctypedef struct xmlSAXHandler:
+ internalSubsetSAXFunc internalSubset
startElementNsSAX2Func startElementNs
endElementNsSAX2Func endElementNs
+ charactersSAXFunc characters
cdataBlockSAXFunc cdataBlock
commentSAXFunc comment
processingInstructionSAXFunc processingInstruction
+ endDocumentSAXFunc endDocument
+ int initialized
cdef extern from "libxml/xmlIO.h":
cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
@@ -54,6 +66,8 @@
cdef void xmlDictFree(xmlDict* sub)
cdef int xmlDictReference(xmlDict* dict)
+ cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes
+
ctypedef struct xmlParserCtxt:
xmlDoc* myDoc
xmlDict* dict
@@ -64,11 +78,16 @@
int disableSAX
int errNo
int replaceEntities
+ int loadsubset
int validate
xmlError lastError
xmlNode* node
xmlSAXHandler* sax
int* spaceTab
+ int spaceMax
+ int html
+ int progressive
+ int charset
ctypedef enum xmlParserOption:
XML_PARSE_RECOVER = 1 # recover on errors
More information about the lxml-checkins
mailing list