[Lxml-checkins] r43898 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Wed May 30 17:46:38 CEST 2007


Author: ianb
Date: Wed May 30 17:46:38 2007
New Revision: 43898

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/clean.py
Log:
Rename remove_element to drop_element and remove_tag to drop_tag.  Add clean() support for dropping <meta> tags, and drop <applet> along with the other embedded objects.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Wed May 30 17:46:38 2007
@@ -11,7 +11,7 @@
 
 class HtmlMixin(object):
 
-    def remove_element(self):
+    def drop_element(self):
         """
         Removes this element from the tree, including its children and
         text.  The tail text is joined to the previous element or
@@ -28,7 +28,7 @@
                 previous.tail = (previous.tail or '') + self.tail
         parent.remove(self)
 
-    def remove_tag(self):
+    def drop_tag(self):
         """
         Remove the tag, but not its children or text.  The children and text
         are merged into the parent.

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Wed May 30 17:46:38 2007
@@ -4,6 +4,13 @@
 
 __all__ = ['clean_html', 'clean']
 
+# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
+# In CSS/style attribute:
+#   url(javascript:...)
+#   expression(...)
+# Other on* attributes that aren't standard?
+# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
+
 def clean_html(html, **kw):
     """
     Like clean(), but takes a text input document, and returns a text
@@ -20,6 +27,7 @@
           # process instructions?
           style=False,
           links=False,
+          meta=False,
           embedded=True,
           frames=True,
           forms=True,
@@ -48,6 +56,9 @@
     ``links``:
         Remove any ``<link>`` tags
 
+    ``meta``:
+        Remove any ``<meta>`` tags
+
     ``frames``:
         Remove any frame-related tags
 
@@ -99,17 +110,19 @@
             if isinstance(el, etree._Comment):
                 bad.append(el)
         for el in bad:
-            el.remove_element()
+            el.drop_element()
     if style:
         kill_tags.append('style')
         for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
-            el.remove_element()
+            el.drop_element()
         for el in doc.xpath('descendant-or-self::*[@style]'):
             del el.attrib['style']
     if links:
         kill_tags.append('link')
+    if meta:
+        kill_tags.append('meta')
     if embedded:
-        kill_tags.extend(['object', 'embed', 'iframe'])
+        kill_tags.extend(['object', 'embed', 'iframe', 'applet'])
     if frames:
         kill_tags.extend(defs.frame_tags)
     if forms:
@@ -122,17 +135,17 @@
         if el.tag in kill_tags:
             bad.append(el)
     for el in bad:
-        el.remove_element()
+        el.drop_element()
     if remove_tags:
         xpath = ' | '.join([
             "descendant-or-self::%s" % tag
             for tag in remove_tags])
         for el in doc.xpath(xpath):
             if strip_tags:
-                el.remove_tag()
+                el.drop_tag()
             else:
                 # FIXME: Should we test if this has been removed because of a parent?
-                el.remove_element()
+                el.drop_element()
     if remove_unknown_tags:
         if allow_tags:
             raise ValueError(
@@ -145,10 +158,10 @@
                 bad.append(el)
         for el in bad:
             if strip_tags:
-                el.remove_tag()
+                el.drop_tag()
             else:
                 # FIXME: Should we test if this has been removed because of a parent?
-                el.remove_element()
+                el.drop_element()
     if add_nofollow:
         for el in doc.xpath('descendant-or-self::a[@href]'):
             href = el.attrib['href']


More information about the lxml-checkins mailing list