[Lxml-checkins] r47960 - in lxml/trunk: . doc src/lxml/html src/lxml/html/tests

ianb at codespeak.net ianb at codespeak.net
Thu Oct 25 18:31:49 CEST 2007


Author: ianb
Date: Thu Oct 25 18:31:48 2007
New Revision: 47960

Modified:
   lxml/trunk/CHANGES.txt
   lxml/trunk/doc/lxmlhtml.txt
   lxml/trunk/src/lxml/html/clean.py
   lxml/trunk/src/lxml/html/tests/test_clean.py
   lxml/trunk/src/lxml/html/tests/test_clean.txt
Log:
Added a host_whitelist option and some other opt-in options to lxml.html.clean

Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt	(original)
+++ lxml/trunk/CHANGES.txt	Thu Oct 25 18:31:48 2007
@@ -17,6 +17,10 @@
   ``NOPARSE_MARKUP`` (like ``# doctest: +NOPARSE_MARKUP``) to suppress
   the special checking for one test.
 
+* ``lxml.html.clean.Cleaner`` now allows for a ``host_whitelist``, and
+  two overridable methods: ``allow_embedded_url(el, url)`` and the
+  more general ``allow_element(el)``.
+
 Bugs fixed
 ----------
 

Modified: lxml/trunk/doc/lxmlhtml.txt
==============================================================================
--- lxml/trunk/doc/lxmlhtml.txt	(original)
+++ lxml/trunk/doc/lxmlhtml.txt	Thu Oct 25 18:31:48 2007
@@ -531,6 +531,11 @@
       </body>
     </html>
 
+You can also whitelist some otherwise dangerous content with
+``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow
+embedded media from YouTube, while still filtering out embedded media
+from other sites.
+
 See the docstring of ``Cleaner`` for the details of what can be
 cleaned.
 

Modified: lxml/trunk/src/lxml/html/clean.py
==============================================================================
--- lxml/trunk/src/lxml/html/clean.py	(original)
+++ lxml/trunk/src/lxml/html/clean.py	Thu Oct 25 18:31:48 2007
@@ -1,4 +1,5 @@
 import re
+import urlparse
 from lxml import etree
 from lxml.html import defs
 from lxml.html import fromstring, tostring
@@ -124,6 +125,25 @@
     ``add_nofollow``:
         If true, then any <a> tags will have ``rel="nofollow"`` added to them.
 
+    ``host_whitelist``:
+        A list or set of hosts that you can use for embedded content
+        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
+        You can also implement/override the method
+        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
+        implement more complex rules for what can be embedded.
+        Anything that passes this test will be shown, regardless of
+        the value of (for instance) ``embedded``.
+
+        Note that this parameter might not work as intended if you do not
+        make the links absolute before doing the cleaning.
+
+    ``whitelist_tags``:
+        A set of tags that can be included with ``host_whitelist``.
+        The default is ``iframe`` and ``embed``; you may wish to
+        include other tags like ``script``, or you may want to
+        implement ``allow_embedded_url`` for more control.  Set to None to
+        include all tags.
+
     This modifies the document *in place*.
     """
 
@@ -144,6 +164,8 @@
     remove_unknown_tags = True
     safe_attrs_only = True
     add_nofollow = False
+    host_whitelist = ()
+    whitelist_tags = set(['iframe', 'embed'])
 
     def __init__(self, **kw):
         for name, value in kw.items():
@@ -152,12 +174,34 @@
                     "Unknown parameter: %s=%r" % (name, value))
             setattr(self, name, value)
 
+    # Used to lookup the primary URL for a given tag that is up for
+    # removal:
+    _tag_link_attrs = dict(
+        script='src',
+        link='href',
+        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
+        # From what I can tell, both attributes can contain a link:
+        applet=['code', 'object'],
+        iframe='src',
+        embed='src',
+        layer='src',
+        # FIXME: there doesn't really seem to be a general way to figure out what
+        # links an <object> tag uses; links often go in <param> tags with values
+        # that we don't really know.  You'd have to have knowledge about specific
+        # kinds of plugins (probably keyed off classid), and match against those.
+        ##object=?,
+        # FIXME: not looking at the action currently, because it is more complex
+        # than that -- if you keep the form, you should keep the form controls.
+        ##form='action',
+        a='href',
+        )
+
     def __call__(self, doc):
         """
         Cleans the document.
         """
         if hasattr(doc, 'getroot'):
-            # ElementTree
+            # ElementTree instance, instead of an element
             doc = doc.getroot()
         # Normalize a case that IE treats <image> like <img>, and that
         # can confuse either this step or later steps.
@@ -243,12 +287,22 @@
             remove_tags.update(('head', 'html', 'title'))
         if self.embedded:
             # FIXME: is <layer> really embedded?
-            kill_tags.update(('applet', 'param'))
+            # We should get rid of any <param> tags not inside <applet> or
+            # <object>; these are not really valid anyway.
+            for el in list(doc.getiterator('param')):
+                found_parent = False
+                parent = el.getparent()
+                while parent is not None and parent.tag not in ('applet', 'object'):
+                    parent = parent.getparent()
+                if parent is None:
+                    el.drop_tree()
+            kill_tags.update(('applet',))
             # The alternate contents that are in an iframe are a good fallback:
-            # FIXME: somehow embed seems to be getting data, but from what I
-            # can tell the embed tag is supposed to always be empty
-            remove_tags.update(('iframe', 'object', 'embed', 'layer'))
+            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
         if self.frames:
+            # FIXME: ideally we should look at the frame links, but
+            # generally frames don't mix properly with an HTML
+            # fragment anyway.
             kill_tags.update(defs.frame_tags)
         if self.forms:
             remove_tags.add('form')
@@ -260,8 +314,12 @@
         _kill = []
         for el in doc.getiterator():
             if el.tag in kill_tags:
+                if self.allow_element(el):
+                    continue
                 _kill.append(el)
             elif el.tag in remove_tags:
+                if self.allow_element(el):
+                    continue
                 _remove.append(el)
 
         if _remove and _remove[0] == doc:
@@ -298,7 +356,34 @@
                 el.drop_tag()
         if self.add_nofollow:
             for el in _find_external_links(doc):
-                el.set('rel', 'nofollow')
+                if not self.allow_follow(el):
+                    el.set('rel', 'nofollow')
+
+    def allow_follow(self, anchor):
+        """
+        Override to suppress rel="nofollow" on some anchors.
+        """
+        return False
+
+    def allow_element(self, el):
+        if el.tag not in self._tag_link_attrs:
+            return False
+        url = el.get(self._tag_link_attrs[el.tag])
+        if not url:
+            return False
+        return self.allow_embedded_url(el, url)
+
+    def allow_embedded_url(self, el, url):
+        if (self.whitelist_tags is not None
+            and el.tag not in self.whitelist_tags):
+            return False
+        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
+        netloc = netloc.lower().split(':', 1)[0]
+        if scheme not in ('http', 'https'):
+            return False
+        if netloc in self.host_whitelist:
+            return True
+        return False
 
     def kill_conditional_comments(self, doc):
         """

Modified: lxml/trunk/src/lxml/html/tests/test_clean.py
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_clean.py	(original)
+++ lxml/trunk/src/lxml/html/tests/test_clean.py	Thu Oct 25 18:31:48 2007
@@ -5,6 +5,3 @@
     suite = unittest.TestSuite()
     suite.addTests([doctest.DocFileSuite('test_clean.txt')])
     return suite
-
-if __name__ == '__main__':
-    unittest.main()

Modified: lxml/trunk/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/trunk/src/lxml/html/tests/test_clean.txt	(original)
+++ lxml/trunk/src/lxml/html/tests/test_clean.txt	Thu Oct 25 18:31:48 2007
@@ -117,3 +117,29 @@
     <img src="evil!">
   </body>
 </html>
+
+>>> doc_embed = '''<div>
+... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+... <embed src="http://anothersite.com/v/another"></embed>
+... <script src="http://www.youtube.com/example.js"></script>
+... <script src="/something-else.js"></script>
+... </div>'''
+>>> print tostring(fromstring(doc_embed))
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+<embed src="http://anothersite.com/v/another"></embed>
+<script src="http://www.youtube.com/example.js"></script>
+<script src="/something-else.js"></script>
+</div>
+>>> print Cleaner().clean_html(doc_embed)
+<div>
+</div>
+>>> print Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed)
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+</div>
+>>> print Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed)
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+<script src="http://www.youtube.com/example.js"></script>
+</div>


More information about the lxml-checkins mailing list