[Lxml-checkins] r44111 - in lxml/branch/html: . src/lxml src/lxml/html src/lxml/tests

scoder at codespeak.net scoder at codespeak.net
Sat Jun 9 10:06:40 CEST 2007


Author: scoder
Date: Sat Jun  9 10:06:40 2007
New Revision: 44111

Modified:
   lxml/branch/html/CHANGES.txt
   lxml/branch/html/src/lxml/apihelpers.pxi
   lxml/branch/html/src/lxml/etree.pyx
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/tests/test_elementtree.py
   lxml/branch/html/src/lxml/tests/test_etree.py
Log:
support for Comment, PI and Entity in getiterator(tag)

Modified: lxml/branch/html/CHANGES.txt
==============================================================================
--- lxml/branch/html/CHANGES.txt	(original)
+++ lxml/branch/html/CHANGES.txt	Sat Jun  9 10:06:40 2007
@@ -50,6 +50,9 @@
 Bugs fixed
 ----------
 
+* ``Element.getiterator(tag)`` did not accept ``Comment``,
+  ``ProcessingInstruction`` and ``Entity`` as tags
+
 * The XML parser did not report undefined entities as error
 
 * The text in exceptions raised by XML parsers, validators and XPath

Modified: lxml/branch/html/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/branch/html/src/lxml/apihelpers.pxi	(original)
+++ lxml/branch/html/src/lxml/apihelpers.pxi	Sat Jun  9 10:06:40 2007
@@ -459,6 +459,9 @@
     * its name string equals the c_name string
     """
     cdef char* c_node_href
+    if c_node.type != tree.XML_ELEMENT_NODE:
+        # not an element, only succeed if we match everything
+        return c_name is NULL and c_href is NULL
     if c_name is NULL:
         if c_href is NULL:
             # always match

Modified: lxml/branch/html/src/lxml/etree.pyx
==============================================================================
--- lxml/branch/html/src/lxml/etree.pyx	(original)
+++ lxml/branch/html/src/lxml/etree.pyx	Sat Jun  9 10:06:40 2007
@@ -1631,17 +1631,24 @@
 cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
                                        type LxmlElementTagMatcherType ]:
     cdef object _pystrings
+    cdef int _node_type
     cdef char* _href
     cdef char* _name
     cdef _initTagMatch(self, tag):
+        self._href = NULL
+        self._name = NULL
         if tag is None:
-            self._href = NULL
-            self._name = NULL
+            self._node_type = 0
+        elif tag is Comment:
+            self._node_type = tree.XML_COMMENT_NODE
+        elif tag is ProcessingInstruction:
+            self._node_type = tree.XML_PI_NODE
+        elif tag is Entity:
+            self._node_type = tree.XML_ENTITY_REF_NODE
         else:
+            self._node_type = tree.XML_ELEMENT_NODE
             self._pystrings = _getNsTag(tag)
-            if self._pystrings[0] is None:
-                self._href = NULL
-            else:
+            if self._pystrings[0] is not None:
                 self._href = _cstr(self._pystrings[0])
             self._name = _cstr(self._pystrings[1])
             if self._name[0] == c'*' and self._name[1] == c'\0':
@@ -1659,7 +1666,9 @@
         cdef xmlNode* c_node
         c_node = self._next_element(node._c_node)
         while c_node is not NULL and \
-                  not _tagMatches(c_node, self._href, self._name):
+                  self._node_type != 0 and \
+                  (self._node_type != c_node.type or
+                   not _tagMatches(c_node, self._href, self._name)):
             c_node = self._next_element(c_node)
         if c_node is NULL:
             self._node = None
@@ -1690,7 +1699,9 @@
             self._next_element = _nextElement
         if tag is not None:
             while c_node is not NULL and \
-                      not _tagMatches(c_node, self._href, self._name):
+                      self._node_type != 0 and \
+                      (self._node_type != c_node.type or
+                       not _tagMatches(c_node, self._href, self._name)):
                 c_node = self._next_element(c_node)
         if c_node is not NULL:
             # store Python ref:
@@ -1736,14 +1747,15 @@
     # keep next node to return and a depth counter in the tree
     cdef _Element _next_node
     cdef _Element _top_node
-    cdef int _include_all_types
     def __init__(self, _Element node not None, tag=None, inclusive=True):
         self._top_node  = node
         self._next_node = node
         self._initTagMatch(tag)
-        if tag is not None and \
-               not _tagMatches(node._c_node, self._href, self._name) or \
-               not inclusive:
+        if not inclusive or \
+               tag is not None and \
+               self._node_type != 0 and \
+               (self._node_type != node._c_node.type or
+                not _tagMatches(node._c_node, self._href, self._name)):
             # this cannot raise StopIteration, self._next_node != None
             self.next()
 
@@ -1769,7 +1781,8 @@
 
     cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node):
         tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
-        return c_node
+        if self._node_type == 0 or self._node_type == c_node.type:
+            return c_node
         tree.END_FOR_EACH_ELEMENT_FROM(c_node)
         return NULL
 

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Sat Jun  9 10:06:40 2007
@@ -160,9 +160,8 @@
     # comments that could be conditional
     if not comments:
         bad = []
-        for el in doc.getiterator():
-            if (isinstance(el, etree.CommentBase)
-                and _conditional_comment_re.search(el.text)):
+        for el in doc.getiterator(etree.Comment):
+            if _conditional_comment_re.search(el.text):
                 bad.append(el)
         for el in bad:
             el.drop_element()

Modified: lxml/branch/html/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_elementtree.py	(original)
+++ lxml/branch/html/src/lxml/tests/test_elementtree.py	Sat Jun  9 10:06:40 2007
@@ -1419,6 +1419,56 @@
             [a2],
             list(c.getiterator('a')))
 
+    def test_getiterator_filter_comment(self):
+        Element = self.etree.Element
+        Comment = self.etree.Comment
+        SubElement = self.etree.SubElement
+
+        a = Element('a')
+        b = SubElement(a, 'b')
+        comment_b = Comment("TEST-b")
+        b.append(comment_b)
+
+        self.assertEquals(
+            [comment_b],
+            list(a.getiterator(Comment)))
+
+        comment_a = Comment("TEST-a")
+        a.append(comment_a)
+
+        self.assertEquals(
+            [comment_b, comment_a],
+            list(a.getiterator(Comment)))
+
+        self.assertEquals(
+            [comment_b],
+            list(b.getiterator(Comment)))
+
+    def test_getiterator_filter_pi(self):
+        Element = self.etree.Element
+        PI = self.etree.ProcessingInstruction
+        SubElement = self.etree.SubElement
+
+        a = Element('a')
+        b = SubElement(a, 'b')
+        pi_b = PI("TEST-b")
+        b.append(pi_b)
+
+        self.assertEquals(
+            [pi_b],
+            list(a.getiterator(PI)))
+
+        pi_a = PI("TEST-a")
+        a.append(pi_a)
+
+        self.assertEquals(
+            [pi_b, pi_a],
+            list(a.getiterator(PI)))
+
+        self.assertEquals(
+            [pi_b],
+            list(b.getiterator(PI)))
+
     def test_getiterator_with_text(self):
         Element = self.etree.Element
         SubElement = self.etree.SubElement

Modified: lxml/branch/html/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_etree.py	(original)
+++ lxml/branch/html/src/lxml/tests/test_etree.py	Sat Jun  9 10:06:40 2007
@@ -1229,6 +1229,31 @@
             [d, f],
             list(a.getiterator('{b}*')))
 
+    def test_getiterator_filter_entities(self):
+        Element = self.etree.Element
+        Entity = self.etree.Entity
+        SubElement = self.etree.SubElement
+
+        a = Element('a')
+        b = SubElement(a, 'b')
+        entity_b = Entity("TEST-b")
+        b.append(entity_b)
+
+        self.assertEquals(
+            [entity_b],
+            list(a.getiterator(Entity)))
+
+        entity_a = Entity("TEST-a")
+        a.append(entity_a)
+
+        self.assertEquals(
+            [entity_b, entity_a],
+            list(a.getiterator(Entity)))
+
+        self.assertEquals(
+            [entity_b],
+            list(b.getiterator(Entity)))
+
     def test_findall_ns(self):
         XML = self.etree.XML
         root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')


More information about the lxml-checkins mailing list