[Lxml-checkins] r53788 - in lxml/trunk: . src/lxml
scoder at codespeak.net
scoder at codespeak.net
Tue Apr 15 17:44:13 CEST 2008
Author: scoder
Date: Tue Apr 15 17:44:10 2008
New Revision: 53788
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/proxy.pxi
Log:
r3980 at delle: sbehnel | 2008-04-15 17:42:57 +0200
huge cleanup in moveNodeToDocument() function
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Tue Apr 15 17:44:10 2008
@@ -29,6 +29,10 @@
Other changes
-------------
+* Major cleanup in internal ``moveNodeToDocument()`` function, which
+ takes care of namespace cleanup when moving elements between
+ different namespace contexts.
+
* New Elements created through the ``makeelement()`` method of an HTML
parser or through lxml.html now end up in a new HTML document
(doctype HTML 4.01 Transitional) instead of a generic XML document.
Modified: lxml/trunk/src/lxml/proxy.pxi
==============================================================================
--- lxml/trunk/src/lxml/proxy.pxi (original)
+++ lxml/trunk/src/lxml/proxy.pxi Tue Apr 15 17:44:10 2008
@@ -46,6 +46,18 @@
python.Py_XDECREF(proxy._gc_doc)
proxy._gc_doc = NULL
+cdef inline void _updateProxyDocument(xmlNode* c_node, _Document doc):
+ """Replace the document reference of a proxy.
+
+ This may deallocate the original document of the proxy!
+ """
+ cdef _Element element = <_Element>c_node._private
+ if element._doc is not doc:
+ python.Py_INCREF(doc)
+ python.Py_DECREF(element._doc)
+ element._doc = doc
+ element._gc_doc = <python.PyObject*>doc
+
################################################################################
# temporarily make a node the root node of its document
@@ -198,6 +210,72 @@
c_new_ns = c_new_ns.next
c_parent = c_parent.parent
+ctypedef struct _nscache:
+ xmlNs** new
+ xmlNs** old
+ cstd.size_t size
+ cstd.size_t last
+
+cdef int _growNsCache(_nscache* c_ns_cache) except -1:
+ cdef xmlNs** c_ns_ptr
+ if c_ns_cache.size == 0:
+ c_ns_cache.size = 20
+ else:
+ c_ns_cache.size *= 2
+ c_ns_ptr = <xmlNs**> cstd.realloc(
+ c_ns_cache.new, c_ns_cache.size * sizeof(xmlNs*))
+ if c_ns_ptr is not NULL:
+ c_ns_cache.new = c_ns_ptr
+ c_ns_ptr = <xmlNs**> cstd.realloc(
+ c_ns_cache.old, c_ns_cache.size * sizeof(xmlNs*))
+ if c_ns_ptr is not NULL:
+ c_ns_cache.old = c_ns_ptr
+ else:
+ cstd.free(c_ns_cache.new)
+ cstd.free(c_ns_cache.old)
+ python.PyErr_NoMemory()
+ return -1
+ return 0
+
+cdef inline int _appendToNsCache(_nscache* c_ns_cache,
+ xmlNs* c_old_ns, xmlNs* c_new_ns) except -1:
+ if c_ns_cache.last >= c_ns_cache.size:
+ _growNsCache(c_ns_cache)
+ c_ns_cache.old[c_ns_cache.last] = c_old_ns
+ c_ns_cache.new[c_ns_cache.last] = c_new_ns
+ c_ns_cache.last += 1
+
+cdef int _stripRedundantNamespaceDeclarations(
+ xmlNode* c_element, _nscache* c_ns_cache, xmlNs** c_del_ns_list) except -1:
+ """Removes namespace declarations from an element that are already
+ defined in its parents. Does not free the xmlNs's, just prepends
+ them to the c_del_ns_list.
+ """
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_ns_next
+ cdef xmlNs** c_nsdef
+ # use an xmlNs** so that assignments to "c_element.nsDef" are handled correctly
+ c_nsdef = &c_element.nsDef
+ while c_nsdef[0] is not NULL:
+ c_ns = tree.xmlSearchNsByHref(
+ c_element.doc, c_element.parent, c_nsdef[0].href)
+ if c_ns is NULL:
+ # new namespace href => keep and cache the ns declaration
+ _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0])
+ c_nsdef = &c_nsdef[0].next
+ else:
+ # known namespace href => strip the ns
+ if c_ns is tree.xmlSearchNs(c_element.doc, c_element.parent,
+ c_ns.prefix):
+ # prefix is not shadowed by parents => ns is reusable
+ _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns)
+ # cut out the current declaration (c_nsdef[0]) and prepend it to the garbage chain
+ c_ns_next = c_nsdef[0].next
+ c_nsdef[0].next = c_del_ns_list[0]
+ c_del_ns_list[0] = c_nsdef[0]
+ c_nsdef[0] = c_ns_next
+ return 0
+
cdef int moveNodeToDocument(_Document doc, xmlNode* c_element) except -1:
"""Fix the xmlNs pointers of a node and its subtree that were moved.
@@ -223,96 +301,48 @@
step 1), but freed only after the complete subtree was traversed
and all occurrences were replaced by tree-internal pointers.
"""
- cdef _Element element
- cdef xmlDoc* c_doc
cdef xmlNode* c_start_node
cdef xmlNode* c_node
- cdef xmlNs** c_ns_ptr
- cdef xmlNs** c_ns_new_cache
- cdef xmlNs** c_ns_old_cache
+ cdef _nscache c_ns_cache
cdef xmlNs* c_ns
cdef xmlNs* c_ns_next
cdef xmlNs* c_nsdef
- cdef xmlNs* c_new_ns
- cdef xmlNs* c_del_ns
- cdef cstd.size_t i, c_cache_size, c_cache_last
+ cdef xmlNs* c_del_ns_list
+ cdef cstd.size_t i
if not tree._isElementOrXInclude(c_element):
return 0
- c_doc = c_element.doc
c_start_node = c_element
- c_ns_new_cache = NULL
- c_ns_old_cache = NULL
- c_cache_size = 0
- c_cache_last = 0
- c_del_ns = NULL
+ c_del_ns_list = NULL
+
+ c_ns_cache.new = NULL
+ c_ns_cache.old = NULL
+ c_ns_cache.size = 0
+ c_ns_cache.last = 0
while c_element is not NULL:
# 1) cut out namespaces defined here that are already known by
# the ancestors
- c_nsdef = c_element.nsDef
- if c_nsdef is not NULL:
- # start with second nsdef to keep c_element.nsDef for now
- while c_nsdef.next is not NULL:
- if c_nsdef.next is c_element.ns:
- c_nsdef = c_nsdef.next
- continue
- c_ns = tree.xmlSearchNsByHref(
- c_element.doc, c_element.parent, c_nsdef.next.href)
- if c_ns is NULL:
- c_nsdef = c_nsdef.next
- continue
- # cut out c_nsdef.next and prepend it to garbage chain
- c_ns_next = c_nsdef.next.next
- c_nsdef.next.next = c_del_ns
- c_del_ns = c_nsdef.next
- c_nsdef.next = c_ns_next
- # now handle c_element.nsDef
- c_ns = tree.xmlSearchNsByHref(
- c_element.doc, c_element.parent, c_element.nsDef.href)
- if c_ns is not NULL:
- c_ns_next = c_element.nsDef.next
- c_element.nsDef.next = c_del_ns
- c_del_ns = c_element.nsDef
- c_element.nsDef = c_ns_next
+ if c_element.nsDef is not NULL:
+ _stripRedundantNamespaceDeclarations(
+ c_element, &c_ns_cache, &c_del_ns_list)
- # 2) make sure the namespace of an element and its attributes
- # is declared in this document (i.e. the node or its parents)
+ # 2) make sure the namespaces of an element and its attributes
+ # are declared in this document (i.e. on the node or its parents)
c_node = c_element
while c_node is not NULL:
if c_node.ns is not NULL:
- for i from 0 <= i < c_cache_last:
- if c_node.ns is c_ns_old_cache[i]:
- c_node.ns = c_ns_new_cache[i]
+ for i from 0 <= i < c_ns_cache.last:
+ if c_node.ns is c_ns_cache.old[i]:
+ c_node.ns = c_ns_cache.new[i]
break
else:
# not in cache => find a replacement from this document
- c_new_ns = doc._findOrBuildNodeNs(
+ c_ns = doc._findOrBuildNodeNs(
c_element, c_node.ns.href, c_node.ns.prefix)
- if c_cache_last >= c_cache_size:
- # must resize cache
- if c_cache_size == 0:
- c_cache_size = 20
- else:
- c_cache_size *= 2
- c_ns_ptr = <xmlNs**> cstd.realloc(
- c_ns_new_cache, c_cache_size * sizeof(xmlNs*))
- if c_ns_ptr is not NULL:
- c_ns_new_cache = c_ns_ptr
- c_ns_ptr = <xmlNs**> cstd.realloc(
- c_ns_old_cache, c_cache_size * sizeof(xmlNs*))
- if c_ns_ptr is not NULL:
- c_ns_old_cache = c_ns_ptr
- else:
- cstd.free(c_ns_new_cache)
- cstd.free(c_ns_old_cache)
- python.PyErr_NoMemory()
- return -1
- c_ns_new_cache[c_cache_last] = c_new_ns
- c_ns_old_cache[c_cache_last] = c_node.ns
- c_cache_last += 1
- c_node.ns = c_new_ns
+ _appendToNsCache(&c_ns_cache, c_node.ns, c_ns)
+ c_node.ns = c_ns
if c_node is c_element:
# after the element, continue with its attributes
c_node = <xmlNode*>c_element.properties
@@ -330,12 +360,7 @@
# 3) fix _Document reference (may dealloc the original document!)
if c_element._private is not NULL:
- element = <_Element>c_element._private
- if element._doc is not doc:
- python.Py_INCREF(doc)
- python.Py_DECREF(element._doc)
- element._doc = doc
- element._gc_doc = <python.PyObject*>doc
+ _updateProxyDocument(c_element, doc)
if c_element is c_start_node:
break # all done
@@ -353,12 +378,7 @@
# 3) fix _Document reference (may dealloc the original document!)
if c_element._private is not NULL:
- element = <_Element>c_element._private
- if element._doc is not doc:
- python.Py_INCREF(doc)
- python.Py_DECREF(element._doc)
- element._doc = doc
- element._gc_doc = <python.PyObject*>doc
+ _updateProxyDocument(c_element, doc)
if c_element is c_start_node:
break
@@ -372,13 +392,13 @@
c_element = c_node
# free now unused namespace declarations
- if c_del_ns is not NULL:
- tree.xmlFreeNsList(c_del_ns)
+ if c_del_ns_list is not NULL:
+ tree.xmlFreeNsList(c_del_ns_list)
# cleanup
- if c_ns_new_cache is not NULL:
- cstd.free(c_ns_new_cache)
- if c_ns_old_cache is not NULL:
- cstd.free(c_ns_old_cache)
+ if c_ns_cache.new is not NULL:
+ cstd.free(c_ns_cache.new)
+ if c_ns_cache.old is not NULL:
+ cstd.free(c_ns_cache.old)
return 0
More information about the lxml-checkins
mailing list