[Lxml-checkins] r53788 - in lxml/trunk: . src/lxml

scoder at codespeak.net scoder at codespeak.net
Tue Apr 15 17:44:13 CEST 2008


Author: scoder
Date: Tue Apr 15 17:44:10 2008
New Revision: 53788

Modified:
   lxml/trunk/   (props changed)
   lxml/trunk/CHANGES.txt
   lxml/trunk/src/lxml/proxy.pxi
Log:
 r3980 at delle:  sbehnel | 2008-04-15 17:42:57 +0200
 huge cleanup in moveNodeToDocument() function


Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Tue Apr 15 17:44:10 2008
@@ -29,6 +29,10 @@
 Other changes
 -------------
 
+* Major cleanup in internal ``moveNodeToDocument()`` function, which
+  takes care of namespace cleanup when moving elements between
+  different namespace contexts.
+
 * New Elements created through the ``makeelement()`` method of an HTML
   parser or through lxml.html now end up in a new HTML document
   (doctype HTML 4.01 Transitional) instead of a generic XML document.

Modified: lxml/trunk/src/lxml/proxy.pxi
==============================================================================
--- lxml/trunk/src/lxml/proxy.pxi	(original)
+++ lxml/trunk/src/lxml/proxy.pxi	Tue Apr 15 17:44:10 2008
@@ -46,6 +46,18 @@
     python.Py_XDECREF(proxy._gc_doc)
     proxy._gc_doc = NULL
 
+cdef inline void _updateProxyDocument(xmlNode* c_node, _Document doc):
+    """Make the proxy object stored in c_node._private reference 'doc'.
+
+    This may deallocate the original document of the proxy!
+    """
+    cdef _Element element = <_Element>c_node._private  # no NULL check here; callers test _private first
+    if element._doc is not doc:
+        python.Py_INCREF(doc)  # take the new reference before releasing the old one
+        python.Py_DECREF(element._doc)  # this release is what may deallocate the old document
+        element._doc = doc
+        element._gc_doc = <python.PyObject*>doc  # NOTE(review): presumably the manual INCREF above pays for this raw pointer - confirm
+
 ################################################################################
 # temporarily make a node the root node of its document
 
@@ -198,6 +210,72 @@
             c_new_ns = c_new_ns.next
         c_parent = c_parent.parent
 
+ctypedef struct _nscache:
+    xmlNs** new  # new[i] is the replacement for the namespace pointer old[i]
+    xmlNs** old  # old[i] is a namespace pointer that node.ns gets compared against
+    cstd.size_t size  # number of entries allocated for each of the two arrays
+    cstd.size_t last  # number of entries in use, i.e. index of the next free slot
+
+cdef int _growNsCache(_nscache* c_ns_cache) except -1:  # enlarge both cache arrays; returns 0, or -1 with MemoryError set
+    cdef xmlNs** c_ns_ptr
+    if c_ns_cache.size == 0:
+        c_ns_cache.size = 20  # capacity used for the first allocation
+    else:
+        c_ns_cache.size *= 2  # double the capacity on every later call
+    c_ns_ptr = <xmlNs**> cstd.realloc(
+        c_ns_cache.new, c_ns_cache.size * sizeof(xmlNs*))
+    if c_ns_ptr is not NULL:
+        c_ns_cache.new = c_ns_ptr
+        c_ns_ptr = <xmlNs**> cstd.realloc(
+            c_ns_cache.old, c_ns_cache.size * sizeof(xmlNs*))
+    if c_ns_ptr is not NULL:  # only reached with a non-NULL value if both realloc() calls succeeded
+        c_ns_cache.old = c_ns_ptr
+    else:  # at least one of the two realloc() calls failed
+        cstd.free(c_ns_cache.new)  # NOTE(review): the struct keeps the freed pointers and the enlarged size
+        cstd.free(c_ns_cache.old)
+        python.PyErr_NoMemory()
+        return -1
+    return 0
+
+cdef inline int _appendToNsCache(_nscache* c_ns_cache,
+                                 xmlNs* c_old_ns, xmlNs* c_new_ns) except -1:
+    if c_ns_cache.last >= c_ns_cache.size:  # arrays are full, or not allocated yet (size == 0)
+        _growNsCache(c_ns_cache)  # return value not checked explicitly; relies on its 'except -1' declaration
+    c_ns_cache.old[c_ns_cache.last] = c_old_ns  # store the (old, new) pair at the same index in both arrays
+    c_ns_cache.new[c_ns_cache.last] = c_new_ns
+    c_ns_cache.last += 1
+
+cdef int _stripRedundantNamespaceDeclarations(
+    xmlNode* c_element, _nscache* c_ns_cache, xmlNs** c_del_ns_list) except -1:
+    """Remove namespace declarations from c_element whose href is already
+    declared by its parents.  The removed xmlNs structs are not freed
+    here, only prepended to c_del_ns_list; the caller has to free them.
+    """
+    cdef xmlNs* c_ns
+    cdef xmlNs* c_ns_next
+    cdef xmlNs** c_nsdef
+    # use an xmlNs** to handle assignments to "c_element.nsDef" correctly
+    c_nsdef = &c_element.nsDef
+    while c_nsdef[0] is not NULL:
+        c_ns = tree.xmlSearchNsByHref(
+            c_element.doc, c_element.parent, c_nsdef[0].href)
+        if c_ns is NULL:
+            # new namespace href => keep and cache the ns declaration
+            _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0])  # cached as its own replacement
+            c_nsdef = &c_nsdef[0].next  # step to the link that holds the next declaration
+        else:
+            # known namespace href => strip the ns
+            if c_ns is tree.xmlSearchNs(c_element.doc, c_element.parent,
+                                        c_ns.prefix):
+                # prefix is not shadowed by parents => ns is reusable
+                _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns)
+            # cut out the current declaration (c_nsdef[0]) and prepend it to garbage chain
+            c_ns_next = c_nsdef[0].next
+            c_nsdef[0].next = c_del_ns_list[0]
+            c_del_ns_list[0] = c_nsdef[0]
+            c_nsdef[0] = c_ns_next  # unlink it; c_nsdef itself stays put and now yields the following declaration
+    return 0
+
 cdef int moveNodeToDocument(_Document doc, xmlNode* c_element) except -1:
     """Fix the xmlNs pointers of a node and its subtree that were moved.
 
@@ -223,96 +301,48 @@
     step 1), but freed only after the complete subtree was traversed
     and all occurrences were replaced by tree-internal pointers.
     """
-    cdef _Element element
-    cdef xmlDoc* c_doc
     cdef xmlNode* c_start_node
     cdef xmlNode* c_node
-    cdef xmlNs** c_ns_ptr
-    cdef xmlNs** c_ns_new_cache
-    cdef xmlNs** c_ns_old_cache
+    cdef _nscache c_ns_cache
     cdef xmlNs* c_ns
     cdef xmlNs* c_ns_next
     cdef xmlNs* c_nsdef
-    cdef xmlNs* c_new_ns
-    cdef xmlNs* c_del_ns
-    cdef cstd.size_t i, c_cache_size, c_cache_last
+    cdef xmlNs* c_del_ns_list
+    cdef cstd.size_t i
 
     if not tree._isElementOrXInclude(c_element):
         return 0
 
-    c_doc = c_element.doc
     c_start_node = c_element
-    c_ns_new_cache = NULL
-    c_ns_old_cache = NULL
-    c_cache_size = 0
-    c_cache_last = 0
-    c_del_ns = NULL
+    c_del_ns_list = NULL
+
+    c_ns_cache.new = NULL
+    c_ns_cache.old = NULL
+    c_ns_cache.size = 0
+    c_ns_cache.last = 0
 
     while c_element is not NULL:
         # 1) cut out namespaces defined here that are already known by
         #    the ancestors
-        c_nsdef = c_element.nsDef
-        if c_nsdef is not NULL:
-            # start with second nsdef to keep c_element.nsDef for now
-            while c_nsdef.next is not NULL:
-                if c_nsdef.next is c_element.ns:
-                    c_nsdef = c_nsdef.next
-                    continue
-                c_ns = tree.xmlSearchNsByHref(
-                    c_element.doc, c_element.parent, c_nsdef.next.href)
-                if c_ns is NULL:
-                    c_nsdef = c_nsdef.next
-                    continue
-                # cut out c_nsdef.next and prepend it to garbage chain
-                c_ns_next = c_nsdef.next.next
-                c_nsdef.next.next = c_del_ns
-                c_del_ns = c_nsdef.next
-                c_nsdef.next = c_ns_next
-            # now handle c_element.nsDef
-            c_ns = tree.xmlSearchNsByHref(
-                c_element.doc, c_element.parent, c_element.nsDef.href)
-            if c_ns is not NULL:
-                c_ns_next = c_element.nsDef.next
-                c_element.nsDef.next = c_del_ns
-                c_del_ns = c_element.nsDef
-                c_element.nsDef = c_ns_next
+        if c_element.nsDef is not NULL:
+            _stripRedundantNamespaceDeclarations(
+                c_element, &c_ns_cache, &c_del_ns_list)
 
-        # 2) make sure the namespace of an element and its attributes
-        #    is declared in this document (i.e. the node or its parents)
+        # 2) make sure the namespaces of an element and its attributes
+        #    are declared in this document (i.e. on the node or its parents)
         c_node = c_element
         while c_node is not NULL:
             if c_node.ns is not NULL:
-                for i from 0 <= i < c_cache_last:
-                    if c_node.ns is c_ns_old_cache[i]:
-                        c_node.ns = c_ns_new_cache[i]
+                for i from 0 <= i < c_ns_cache.last:
+                    if c_node.ns is c_ns_cache.old[i]:
+                        c_node.ns = c_ns_cache.new[i]
                         break
                 else:
                     # not in cache => find a replacement from this document
-                    c_new_ns = doc._findOrBuildNodeNs(
+                    c_ns = doc._findOrBuildNodeNs(
                         c_element, c_node.ns.href, c_node.ns.prefix)
-                    if c_cache_last >= c_cache_size:
-                        # must resize cache
-                        if c_cache_size == 0:
-                            c_cache_size = 20
-                        else:
-                            c_cache_size *= 2
-                        c_ns_ptr = <xmlNs**> cstd.realloc(
-                            c_ns_new_cache, c_cache_size * sizeof(xmlNs*))
-                        if c_ns_ptr is not NULL:
-                            c_ns_new_cache = c_ns_ptr
-                            c_ns_ptr = <xmlNs**> cstd.realloc(
-                                c_ns_old_cache, c_cache_size * sizeof(xmlNs*))
-                        if c_ns_ptr is not NULL:
-                            c_ns_old_cache = c_ns_ptr
-                        else:
-                            cstd.free(c_ns_new_cache)
-                            cstd.free(c_ns_old_cache)
-                            python.PyErr_NoMemory()
-                            return -1
-                    c_ns_new_cache[c_cache_last] = c_new_ns
-                    c_ns_old_cache[c_cache_last] = c_node.ns
-                    c_cache_last += 1
-                    c_node.ns = c_new_ns
+                    _appendToNsCache(&c_ns_cache, c_node.ns, c_ns)
+                    c_node.ns = c_ns
             if c_node is c_element:
                 # after the element, continue with its attributes
                 c_node = <xmlNode*>c_element.properties
@@ -330,12 +360,7 @@
 
             # 3) fix _Document reference (may dealloc the original document!)
             if c_element._private is not NULL:
-                element = <_Element>c_element._private
-                if element._doc is not doc:
-                    python.Py_INCREF(doc)
-                    python.Py_DECREF(element._doc)
-                    element._doc = doc
-                    element._gc_doc = <python.PyObject*>doc
+                _updateProxyDocument(c_element, doc)
 
             if c_element is c_start_node:
                 break # all done
@@ -353,12 +378,7 @@
 
                 # 3) fix _Document reference (may dealloc the original document!)
                 if c_element._private is not NULL:
-                    element = <_Element>c_element._private
-                    if element._doc is not doc:
-                        python.Py_INCREF(doc)
-                        python.Py_DECREF(element._doc)
-                        element._doc = doc
-                        element._gc_doc = <python.PyObject*>doc
+                    _updateProxyDocument(c_element, doc)
 
                 if c_element is c_start_node:
                     break
@@ -372,13 +392,13 @@
         c_element = c_node
 
     # free now unused namespace declarations
-    if c_del_ns is not NULL:
-        tree.xmlFreeNsList(c_del_ns)
+    if c_del_ns_list is not NULL:
+        tree.xmlFreeNsList(c_del_ns_list)
 
     # cleanup
-    if c_ns_new_cache is not NULL:
-        cstd.free(c_ns_new_cache)
-    if c_ns_old_cache is not NULL:
-        cstd.free(c_ns_old_cache)
+    if c_ns_cache.new is not NULL:
+        cstd.free(c_ns_cache.new)
+    if c_ns_cache.old is not NULL:
+        cstd.free(c_ns_cache.old)
 
     return 0


More information about the lxml-checkins mailing list