[Lxml-checkins] r44057 - lxml/branch/html/src/lxml/html

scoder at codespeak.net scoder at codespeak.net
Wed Jun 6 19:55:51 CEST 2007


Author: scoder
Date: Wed Jun  6 19:55:50 2007
New Revision: 44057

Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
cleanup: replace the custom iteration function with a call to getiterator(), use sets for kill_tags and remove_tags, avoid XPath calls where iteration is done anyway

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Wed Jun  6 19:55:50 2007
@@ -3,6 +3,11 @@
 from lxml.html import defs
 from lxml.html import parse, tostring
 
+try:
+    import set
+except ImportError:
+    from sets import Set as set
+
 __all__ = ['clean_html', 'clean', 'autolink', 'autolink_html',
            'word_break', 'word_break_html']
 
@@ -52,14 +57,6 @@
     clean(doc, **kw)
     return tostring(doc)
 
-def _itertree(el):
-    """
-    Return the element's descendants, and the element itself
-    """
-    yield el
-    for item in el.iterdescendants():
-        yield item
-
 # FIXME: I really have to figure out what a sane set of defaults is
 # for these keyword arguments.  And is this signature out of control?
 # What about if we want things like whitelisting of <object> or other
@@ -148,12 +145,15 @@
 
     This modifies the document *in place*.
     """
+    if hasattr(doc, 'getroot'):
+        # ElementTree
+        doc = doc.getroot()
     # IE conditional comments basically embed HTML that the parser doesn't
     # normally see.  We can't allow anything like that, so we'll kill any
     # comments that could be conditional
     if not comments:
         bad = []
-        for el in _itertree(doc):
+        for el in doc.getiterator():
             if (isinstance(el, etree.CommentBase)
                 and _conditional_comment_re.search(el.text)):
                 bad.append(el)
@@ -163,10 +163,10 @@
     # confuse either this step or later steps.
     for el in doc.xpath('descendant-or-self::image'):
         el.tag = 'img'
-    kill_tags = []
-    remove_tags = list(remove_tags or [])
+    kill_tags = set()
+    remove_tags = set(remove_tags or ())
     if scripts:
-        kill_tags.append('script')
+        kill_tags.add('script')
     if safe_attrs_only:
         safe_attrs = set(defs.safe_attrs)
         for el in doc.getiterator():
@@ -212,58 +212,85 @@
         # FIXME: why either?  I feel like there's some obscure reason
         # because you can put PIs in comments...?  But I've already
         # forgotten it
-        kill_tags.append(etree.Comment)
+        kill_tags.add(etree.Comment)
     if processing_instructions:
-        kill_tags.append(etree.ProcessingInstruction)
+        kill_tags.add(etree.ProcessingInstruction)
 ## SB: Does this actually work? Definitely not the right place to do this.
 #    if processing_instructions:
 #        # FIXME: is this really the right place to remove these attributes?
 #        for el in doc.xpath('descendant-or-self::*[@xmlns]'):
 #            del el.attrib['xmlns']
     if style:
-        kill_tags.append('style')
+        kill_tags.add('style')
         for el in doc.xpath('descendant-or-self::link'):
             if 'stylesheet' in el.attrib.get('rel', '').lower():
                 el.drop_element()
         for el in doc.xpath('descendant-or-self::*[@style]'):
             del el.attrib['style']
     if links:
-        kill_tags.append('link')
+        kill_tags.add('link')
     elif javascript:
         # FIXME: we should get rid of included stylesheets in this
         # case, as you can put Javascript in them
         pass
     if meta:
-        kill_tags.append('meta')
+        kill_tags.add('meta')
     if page_structure:
-        remove_tags.extend(['head', 'html', 'title'])
+        remove_tags.union(('head', 'html', 'title'))
     if embedded:
         # FIXME: is <layer> really embedded?
-        kill_tags.extend(['applet', 'param'])
+        kill_tags.union(('applet', 'param'))
         # The alternate contents that are in an iframe are a good fallback:
         # FIXME: somehow embed seems to be getting data, but from what I
         # can tell the embed tag is supposed to always be empty
-        remove_tags.extend(['iframe', 'object', 'embed', 'layer'])
+        remove_tags.union(('iframe', 'object', 'embed', 'layer'))
     if frames:
-        kill_tags.extend(defs.frame_tags)
+        kill_tags.union(defs.frame_tags)
     if forms:
-        remove_tags.extend(['form'])
-        kill_tags.extend(['button', 'input', 'select', 'textarea'])
+        remove_tags.add('form')
+        kill_tags.union(('button', 'input', 'select', 'textarea'))
     if annoying_tags:
-        remove_tags.extend(['blink', 'marque'])
-    bad = []
-    for el in _itertree(doc):
-        if el.tag in kill_tags:
-            bad.append(el)
-    for el in bad:
+        remove_tags.union(('blink', 'marque'))
+
+    _remove = []
+    if strip_tags:
+        _kill = []
+        for el in doc.getiterator():
+            if el.tag in kill_tags:
+                _kill.append(el)
+            elif el.tag in remove_tags:
+                _remove.append(el)
+    else:
+        kill_tags.update(remove_tags)
+        _kill = [ el for el in doc.getiterator()
+                  if el.tag in kill_tags ]
+
+    if _remove and _remove[0] == doc:
+        # We have to drop the parent-most tag, which we can't
+        # do.  Instead we'll rewrite it:
+        el = _remove.pop(0)
+        el.tag = 'div'
+        el.attrib.clear()
+    elif _kill and _kill[0] == doc:
+        # We have to drop the parent-most element, which we can't
+        # do.  Instead we'll clear it:
+        el = _kill.pop(0)
+        if el.tag != 'html':
+            el.tag = 'div'
+        el.clear()
+
+    for el in _kill:
         el.drop_element()
-    if remove_tags:
+    for el in _remove:
+        el.drop_tag()
+
+    if False and remove_tags:
         xpath = ' | '.join([
             "descendant-or-self::%s" % tag
             for tag in remove_tags])
         for el in doc.xpath(xpath):
             if strip_tags:
-                if el.getparent():
+                if el.getparent() is not None:
                     el.drop_tag()
                 else:
                     # We have to drop the parent-most tag, which we can't
@@ -273,6 +300,7 @@
             else:
                 # FIXME: Should we test if this has been removed because of a parent?
                 el.drop_element()
+
     if remove_unknown_tags:
         if allow_tags:
             raise ValueError(
@@ -280,7 +308,7 @@
         allow_tags = defs.tags
     if allow_tags:
         bad = []
-        for el in _itertree(doc):
+        for el in doc.getiterator():
             if el.tag not in allow_tags:
                 bad.append(el)
         for el in bad:


More information about the lxml-checkins mailing list