[Lxml-checkins] r44088 - lxml/branch/html/src/lxml/html

scoder at codespeak.net scoder at codespeak.net
Thu Jun 7 14:00:52 CEST 2007


Author: scoder
Date: Thu Jun  7 14:00:51 2007
New Revision: 44088

Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
more cleanup: use sets, precompiled XPath expressions and getiterator() where possible

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Thu Jun  7 14:00:51 2007
@@ -48,6 +48,12 @@
 _conditional_comment_re = re.compile(
     r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
 
+_find_styled_elements = etree.XPath(  # every element that carries a style attribute
+    "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(  # XPath is 1-based: substring(s,1,1) is the first char
+    "descendant-or-self::a[string(@href) and substring(@href,1,1) != '#']")
+
 def clean_html(html, **kw):
     """
     Like clean(), but takes a text input document, and returns a text
@@ -165,6 +171,8 @@
         el.tag = 'img'
     kill_tags = set()
     remove_tags = set(remove_tags or ())
+    if allow_tags:
+        allow_tags  = set(allow_tags)
     if scripts:
         kill_tags.add('script')
     if safe_attrs_only:
@@ -186,7 +194,7 @@
         if not style:
             # If we're deleting style then we don't have to remove JS links
             # from styles, otherwise...
-            for el in doc.xpath('descendant-or-self::*[@style]'):
+            for el in _find_styled_elements(doc):
                 old = el.get('style')
                 new = _css_javascript_re.sub('', old)
                 new = _css_import_re.sub('', old)
@@ -195,7 +203,7 @@
                     del el.attrib['style']
                 elif new != old:
                     el.set('style', new)
-            for el in doc.xpath('descendant-or-self::style'):
+            for el in list(doc.getiterator('style')):
                 if el.get('type', '').lower().strip() == 'text/javascript':
                     el.drop_element()
                     continue
@@ -222,10 +230,10 @@
 #            del el.attrib['xmlns']
     if style:
         kill_tags.add('style')
-        for el in doc.xpath('descendant-or-self::link'):
-            if 'stylesheet' in el.attrib.get('rel', '').lower():
+        for el in doc.getiterator('link'):
+            if 'stylesheet' in el.get('rel', '').lower():
                 el.drop_element()
-        for el in doc.xpath('descendant-or-self::*[@style]'):
+        for el in _find_styled_elements(doc):
             del el.attrib['style']
     if links:
         kill_tags.add('link')
@@ -288,25 +296,21 @@
         if allow_tags:
             raise ValueError(
                 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
-        allow_tags = defs.tags
+        allow_tags = set(defs.tags)
     if allow_tags:
         bad = []
         for el in doc.getiterator():
             if el.tag not in allow_tags:
                 bad.append(el)
-        for el in bad:
-            if strip_tags:
+        if strip_tags:
+            for el in bad:
                 el.drop_tag()
-            else:
-                # FIXME: Should we test if this has been removed because of a parent?
+        else:
+            for el in bad:
                 el.drop_element()
     if add_nofollow:
-        for el in doc.xpath('descendant-or-self::a[@href]'):
-            href = el.attrib['href']
-            if not href or href.startswith('#'):
-                # internal link, we don't care
-                continue
-            el.attrib['rel'] = 'nofollow'
+        for el in _find_external_links(doc):
+            el.set('rel', 'nofollow')
 
 def _remove_javascript(link):
     # links like "j a v a s c r i p t:" might be interpreted in IE


More information about the lxml-checkins mailing list