[Lxml-checkins] r44111 - in lxml/branch/html: . src/lxml src/lxml/html src/lxml/tests
scoder at codespeak.net
scoder at codespeak.net
Sat Jun 9 10:06:40 CEST 2007
Author: scoder
Date: Sat Jun 9 10:06:40 2007
New Revision: 44111
Modified:
lxml/branch/html/CHANGES.txt
lxml/branch/html/src/lxml/apihelpers.pxi
lxml/branch/html/src/lxml/etree.pyx
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/tests/test_elementtree.py
lxml/branch/html/src/lxml/tests/test_etree.py
Log:
support for Comment, PI and Entity in getiterator(tag)
Modified: lxml/branch/html/CHANGES.txt
==============================================================================
--- lxml/branch/html/CHANGES.txt (original)
+++ lxml/branch/html/CHANGES.txt Sat Jun 9 10:06:40 2007
@@ -50,6 +50,9 @@
Bugs fixed
----------
+* ``Element.getiterator(tag)`` did not accept ``Comment`` and
+ ``ProcessingInstruction`` as tags
+
* The XML parser did not report undefined entities as error
* The text in exceptions raised by XML parsers, validators and XPath
Modified: lxml/branch/html/src/lxml/apihelpers.pxi
==============================================================================
--- lxml/branch/html/src/lxml/apihelpers.pxi (original)
+++ lxml/branch/html/src/lxml/apihelpers.pxi Sat Jun 9 10:06:40 2007
@@ -459,6 +459,9 @@
* its name string equals the c_name string
"""
cdef char* c_node_href
+ if c_node.type != tree.XML_ELEMENT_NODE:
+ # not an element, only succeed if we match everything
+ return c_name is NULL and c_href is NULL
if c_name is NULL:
if c_href is NULL:
# always match
Modified: lxml/branch/html/src/lxml/etree.pyx
==============================================================================
--- lxml/branch/html/src/lxml/etree.pyx (original)
+++ lxml/branch/html/src/lxml/etree.pyx Sat Jun 9 10:06:40 2007
@@ -1631,17 +1631,24 @@
cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
type LxmlElementTagMatcherType ]:
cdef object _pystrings
+ cdef int _node_type
cdef char* _href
cdef char* _name
cdef _initTagMatch(self, tag):
+ self._href = NULL
+ self._name = NULL
if tag is None:
- self._href = NULL
- self._name = NULL
+ self._node_type = 0
+ elif tag is Comment:
+ self._node_type = tree.XML_COMMENT_NODE
+ elif tag is ProcessingInstruction:
+ self._node_type = tree.XML_PI_NODE
+ elif tag is Entity:
+ self._node_type = tree.XML_ENTITY_REF_NODE
else:
+ self._node_type = tree.XML_ELEMENT_NODE
self._pystrings = _getNsTag(tag)
- if self._pystrings[0] is None:
- self._href = NULL
- else:
+ if self._pystrings[0] is not None:
self._href = _cstr(self._pystrings[0])
self._name = _cstr(self._pystrings[1])
if self._name[0] == c'*' and self._name[1] == c'\0':
@@ -1659,7 +1666,9 @@
cdef xmlNode* c_node
c_node = self._next_element(node._c_node)
while c_node is not NULL and \
- not _tagMatches(c_node, self._href, self._name):
+ self._node_type != 0 and \
+ (self._node_type != c_node.type or
+ not _tagMatches(c_node, self._href, self._name)):
c_node = self._next_element(c_node)
if c_node is NULL:
self._node = None
@@ -1690,7 +1699,9 @@
self._next_element = _nextElement
if tag is not None:
while c_node is not NULL and \
- not _tagMatches(c_node, self._href, self._name):
+ self._node_type != 0 and \
+ (self._node_type != c_node.type or
+ not _tagMatches(c_node, self._href, self._name)):
c_node = self._next_element(c_node)
if c_node is not NULL:
# store Python ref:
@@ -1736,14 +1747,15 @@
# keep next node to return and a depth counter in the tree
cdef _Element _next_node
cdef _Element _top_node
- cdef int _include_all_types
def __init__(self, _Element node not None, tag=None, inclusive=True):
self._top_node = node
self._next_node = node
self._initTagMatch(tag)
- if tag is not None and \
- not _tagMatches(node._c_node, self._href, self._name) or \
- not inclusive:
+ if not inclusive or \
+ tag is not None and \
+ self._node_type != 0 and \
+ (self._node_type != node._c_node.type or
+ not _tagMatches(node._c_node, self._href, self._name)):
# this cannot raise StopIteration, self._next_node != None
self.next()
@@ -1769,7 +1781,8 @@
cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node):
tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
- return c_node
+ if self._node_type == 0 or self._node_type == c_node.type:
+ return c_node
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
return NULL
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 9 10:06:40 2007
@@ -160,9 +160,8 @@
# comments that could be conditional
if not comments:
bad = []
- for el in doc.getiterator():
- if (isinstance(el, etree.CommentBase)
- and _conditional_comment_re.search(el.text)):
+ for el in doc.getiterator(etree.Comment):
+ if _conditional_comment_re.search(el.text):
bad.append(el)
for el in bad:
el.drop_element()
Modified: lxml/branch/html/src/lxml/tests/test_elementtree.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_elementtree.py (original)
+++ lxml/branch/html/src/lxml/tests/test_elementtree.py Sat Jun 9 10:06:40 2007
@@ -1419,6 +1419,56 @@
[a2],
list(c.getiterator('a')))
+ def test_getiterator_filter_comment(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ comment_b = Comment("TEST-b")
+ b.append(comment_b)
+
+ self.assertEquals(
+ [comment_b],
+ list(a.getiterator(Comment)))
+
+ comment_a = Comment("TEST-a")
+ a.append(comment_a)
+
+ self.assertEquals(
+ [comment_b, comment_a],
+ list(a.getiterator(Comment)))
+
+ self.assertEquals(
+ [comment_b],
+ list(b.getiterator(Comment)))
+
+ def test_getiterator_filter_pi(self):
+ Element = self.etree.Element
+ PI = self.etree.ProcessingInstruction
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ pi_b = PI("TEST-b")
+ b.append(pi_b)
+
+ self.assertEquals(
+ [pi_b],
+ list(a.getiterator(PI)))
+
+ pi_a = PI("TEST-a")
+ a.append(pi_a)
+
+ self.assertEquals(
+ [pi_b, pi_a],
+ list(a.getiterator(PI)))
+
+ self.assertEquals(
+ [pi_b],
+ list(b.getiterator(PI)))
+
def test_getiterator_with_text(self):
Element = self.etree.Element
SubElement = self.etree.SubElement
Modified: lxml/branch/html/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/branch/html/src/lxml/tests/test_etree.py (original)
+++ lxml/branch/html/src/lxml/tests/test_etree.py Sat Jun 9 10:06:40 2007
@@ -1229,6 +1229,31 @@
[d, f],
list(a.getiterator('{b}*')))
+ def test_getiterator_filter_entities(self):
+ Element = self.etree.Element
+ Entity = self.etree.Entity
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ entity_b = Entity("TEST-b")
+ b.append(entity_b)
+
+ self.assertEquals(
+ [entity_b],
+ list(a.getiterator(Entity)))
+
+ entity_a = Entity("TEST-a")
+ a.append(entity_a)
+
+ self.assertEquals(
+ [entity_b, entity_a],
+ list(a.getiterator(Entity)))
+
+ self.assertEquals(
+ [entity_b],
+ list(b.getiterator(Entity)))
+
def test_findall_ns(self):
XML = self.etree.XML
root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')
More information about the lxml-checkins mailing list