[Lxml-checkins] r54503 - in lxml/trunk: . src/lxml
scoder at codespeak.net
scoder at codespeak.net
Tue May 6 22:37:45 CEST 2008
Author: scoder
Date: Tue May 6 22:37:45 2008
New Revision: 54503
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/iterparse.pxi
lxml/trunk/src/lxml/parser.pxi
Log:
r4185 at delle: sbehnel | 2008-05-06 21:21:23 +0200
fixes for HTML name dictification and a corner case in iterparse()
Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi (original)
+++ lxml/trunk/src/lxml/iterparse.pxi Tue May 6 22:37:45 2008
@@ -201,6 +201,8 @@
cdef _IterparseContext context
context = <_IterparseContext>c_ctxt._private
try:
+ if c_ctxt.html:
+ _fixHtmlDictNodeNames(c_ctxt.dict, c_node)
context.startNode(c_node)
except:
if c_ctxt.errNo == xmlerror.XML_ERR_OK:
@@ -452,7 +454,8 @@
not context._validator.isvalid()):
self._source = None
del context._events[:]
- _raiseParseError(pctxt, self._filename, context._error_log)
+ _handleParseResult(context, pctxt, NULL,
+ self._filename, self._for_html)
if python.PyList_GET_SIZE(context._events) == 0:
self.root = context._root
self._source = None
Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi (original)
+++ lxml/trunk/src/lxml/parser.pxi Tue May 6 22:37:45 2008
@@ -244,6 +244,10 @@
result = htmlparser.htmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
self._c_url, c_encoding, options)
+ if result is not NULL:
+ if _fixHtmlDictNames(ctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
else:
result = xmlparser.xmlCtxtReadIO(
ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
@@ -493,8 +497,12 @@
xmlDoc* result, filename,
bint recover) except NULL:
cdef bint well_formed
+ if result is not NULL:
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+
if c_ctxt.myDoc is not NULL:
- if c_ctxt.myDoc != result:
+ if c_ctxt.myDoc is not result:
+ __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
tree.xmlFreeDoc(c_ctxt.myDoc)
c_ctxt.myDoc = NULL
@@ -518,11 +526,7 @@
else:
well_formed = 0
- if well_formed:
- __GLOBAL_PARSER_CONTEXT.initDocDict(result)
- if c_ctxt.html:
- _fixHtmlDictNames(result)
- else:
+ if not well_formed:
# free broken document
tree.xmlFreeDoc(result)
result = NULL
@@ -542,31 +546,38 @@
result.URL = tree.xmlStrdup(_cstr(filename))
return result
-cdef int _fixHtmlDictNames(xmlDoc* c_doc) except -1:
- cdef char* c_name
- cdef xmlNode* c_attr
- cdef xmlNode* c_node = c_doc.children
+cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
+ cdef xmlNode* c_node
+ if c_doc is NULL:
+ return 0
+ c_node = c_doc.children
tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 0)
if c_node.type == tree.XML_ELEMENT_NODE:
- if not tree.xmlDictOwns(c_doc.dict, c_node.name):
- c_name = tree.xmlDictLookup(c_doc.dict, c_node.name, -1)
- if c_name is NULL:
- python.PyErr_NoMemory()
- return -1
- tree.xmlFree(c_node.name)
- c_node.name = c_name
- c_attr = <xmlNode*>c_node.properties
- while c_attr is not NULL:
- if not tree.xmlDictOwns(c_doc.dict, c_attr.name):
- c_name = tree.xmlDictLookup(c_doc.dict, c_attr.name, -1)
- if c_name is NULL:
- python.PyErr_NoMemory()
- return -1
- tree.xmlFree(c_attr.name)
- c_attr.name = c_name
- c_attr = c_attr.next
+ if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+ return -1
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return 0
+cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
+ xmlNode* c_node) nogil:
+ cdef xmlNode* c_attr
+ cdef char* c_name
+ c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
+ if c_name is NULL:
+ return -1
+ if c_name is not c_node.name:
+ tree.xmlFree(c_node.name)
+ c_node.name = c_name
+ c_attr = <xmlNode*>c_node.properties
+ while c_attr is not NULL:
+ c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
+ if c_name is NULL:
+ return -1
+ if c_name is not c_attr.name:
+ tree.xmlFree(c_attr.name)
+ c_attr.name = c_name
+ c_attr = c_attr.next
+ return 0
cdef class _BaseParser:
cdef ElementClassLookup _class_lookup
@@ -784,6 +795,10 @@
result = htmlparser.htmlCtxtReadMemory(
pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
else:
result = xmlparser.xmlCtxtReadMemory(
pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
@@ -820,6 +835,10 @@
result = htmlparser.htmlCtxtReadMemory(
pctxt, c_text, c_len, c_filename,
c_encoding, self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
else:
result = xmlparser.xmlCtxtReadMemory(
pctxt, c_text, c_len, c_filename,
@@ -853,6 +872,10 @@
if self._for_html:
result = htmlparser.htmlCtxtReadFile(
pctxt, c_filename, c_encoding, self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
else:
result = xmlparser.xmlCtxtReadFile(
pctxt, c_filename, c_encoding, self._parse_options)
More information about the lxml-checkins mailing list