[Lxml-checkins] r54503 - in lxml/trunk: . src/lxml

scoder at codespeak.net scoder at codespeak.net
Tue May 6 22:37:45 CEST 2008


Author: scoder
Date: Tue May  6 22:37:45 2008
New Revision: 54503

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/src/lxml/iterparse.pxi
   lxml/trunk/src/lxml/parser.pxi
Log:
 r4185 at delle:  sbehnel | 2008-05-06 21:21:23 +0200
 fixes for HTML name dictification and a corner case in iterparse()


Modified: lxml/trunk/src/lxml/iterparse.pxi
==============================================================================
--- lxml/trunk/src/lxml/iterparse.pxi	(original)
+++ lxml/trunk/src/lxml/iterparse.pxi	Tue May  6 22:37:45 2008
@@ -201,6 +201,8 @@
     cdef _IterparseContext context
     context = <_IterparseContext>c_ctxt._private
     try:
+        if c_ctxt.html:
+            _fixHtmlDictNodeNames(c_ctxt.dict, c_node)
         context.startNode(c_node)
     except:
         if c_ctxt.errNo == xmlerror.XML_ERR_OK:
@@ -452,7 +454,8 @@
                           not context._validator.isvalid()):
             self._source = None
             del context._events[:]
-            _raiseParseError(pctxt, self._filename, context._error_log)
+            _handleParseResult(context, pctxt, NULL,
+                               self._filename, self._for_html)
         if python.PyList_GET_SIZE(context._events) == 0:
             self.root = context._root
             self._source = None

Modified: lxml/trunk/src/lxml/parser.pxi
==============================================================================
--- lxml/trunk/src/lxml/parser.pxi	(original)
+++ lxml/trunk/src/lxml/parser.pxi	Tue May  6 22:37:45 2008
@@ -244,6 +244,10 @@
                 result = htmlparser.htmlCtxtReadIO(
                     ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
                     self._c_url, c_encoding, options)
+                if result is not NULL:
+                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
+                        tree.xmlFreeDoc(result)
+                        result = NULL
             else:
                 result = xmlparser.xmlCtxtReadIO(
                     ctxt, _readFilelikeParser, NULL, <python.PyObject*>self,
@@ -493,8 +497,12 @@
                                 xmlDoc* result, filename,
                                 bint recover) except NULL:
     cdef bint well_formed
+    if result is not NULL:
+        __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+
     if c_ctxt.myDoc is not NULL:
-        if c_ctxt.myDoc != result:
+        if c_ctxt.myDoc is not result:
+            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
             tree.xmlFreeDoc(c_ctxt.myDoc)
         c_ctxt.myDoc = NULL
 
@@ -518,11 +526,7 @@
         else:
             well_formed = 0
 
-        if well_formed:
-            __GLOBAL_PARSER_CONTEXT.initDocDict(result)
-            if c_ctxt.html:
-                _fixHtmlDictNames(result)
-        else:
+        if not well_formed:
             # free broken document
             tree.xmlFreeDoc(result)
             result = NULL
@@ -542,31 +546,38 @@
         result.URL = tree.xmlStrdup(_cstr(filename))
     return result
 
-cdef int _fixHtmlDictNames(xmlDoc* c_doc) except -1:
-    cdef char* c_name
-    cdef xmlNode* c_attr
-    cdef xmlNode* c_node = c_doc.children
+cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
+    cdef xmlNode* c_node
+    if c_doc is NULL:
+        return 0
+    c_node = c_doc.children
     tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 0)
     if c_node.type == tree.XML_ELEMENT_NODE:
-        if not tree.xmlDictOwns(c_doc.dict, c_node.name):
-            c_name = tree.xmlDictLookup(c_doc.dict, c_node.name, -1)
-            if c_name is NULL:
-                python.PyErr_NoMemory()
-                return -1
-            tree.xmlFree(c_node.name)
-            c_node.name = c_name
-        c_attr = <xmlNode*>c_node.properties
-        while c_attr is not NULL:
-            if not tree.xmlDictOwns(c_doc.dict, c_attr.name):
-                c_name = tree.xmlDictLookup(c_doc.dict, c_attr.name, -1)
-                if c_name is NULL:
-                    python.PyErr_NoMemory()
-                    return -1
-                tree.xmlFree(c_attr.name)
-                c_attr.name = c_name
-            c_attr = c_attr.next
+        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+            return -1
     tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+    return 0
 
+cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
+                                      xmlNode* c_node) nogil:
+    cdef xmlNode* c_attr
+    cdef char* c_name
+    c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
+    if c_name is NULL:
+        return -1
+    if c_name is not c_node.name:
+        tree.xmlFree(c_node.name)
+        c_node.name = c_name
+    c_attr = <xmlNode*>c_node.properties
+    while c_attr is not NULL:
+        c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
+        if c_name is NULL:
+            return -1
+        if c_name is not c_attr.name:
+            tree.xmlFree(c_attr.name)
+            c_attr.name = c_name
+        c_attr = c_attr.next
+    return 0
 
 cdef class _BaseParser:
     cdef ElementClassLookup _class_lookup
@@ -784,6 +795,10 @@
                     result = htmlparser.htmlCtxtReadMemory(
                         pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
                         self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
                 else:
                     result = xmlparser.xmlCtxtReadMemory(
                         pctxt, c_text, buffer_len, c_filename, _UNICODE_ENCODING,
@@ -820,6 +835,10 @@
                     result = htmlparser.htmlCtxtReadMemory(
                         pctxt, c_text, c_len, c_filename,
                         c_encoding, self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
                 else:
                     result = xmlparser.xmlCtxtReadMemory(
                         pctxt, c_text, c_len, c_filename,
@@ -853,6 +872,10 @@
                 if self._for_html:
                     result = htmlparser.htmlCtxtReadFile(
                         pctxt, c_filename, c_encoding, self._parse_options)
+                    if result is not NULL:
+                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
+                            tree.xmlFreeDoc(result)
+                            result = NULL
                 else:
                     result = xmlparser.xmlCtxtReadFile(
                         pctxt, c_filename, c_encoding, self._parse_options)


More information about the lxml-checkins mailing list