[Lxml-checkins] r43968 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 08:35:35 CEST 2007


Author: ianb
Date: Fri Jun  1 08:35:34 2007
New Revision: 43968

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/tests/test_clean.txt
Log:
Clean using rewrite_links; catch expression() in styles

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Fri Jun  1 08:35:34 2007
@@ -178,6 +178,9 @@
 
         If you give ``base_href`` then all links passed to
         ``link_repl_func()`` will take that into account.
+
+        If the ``link_repl_func`` returns None, the attribute or
+        tag text will be removed completely.
         """
         if base_href is not None:
             # FIXME: this can be done in one pass with a wrapper
@@ -189,6 +192,13 @@
             new_link = link_repl_func(link)
             if new_link == link:
                 continue
+            if new_link is None:
+                # Remove the attribute or element content
+                if attrib is None:
+                    el.text = ''
+                else:
+                    del el.attrib[attrib]
+                continue
             if attrib is None:
                 new = el.text[:pos] + new_link + el.text[pos+len(link):]
                 el.text = new

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Fri Jun  1 08:35:34 2007
@@ -1,3 +1,4 @@
+import re
 from lxml import etree
 from lxml.html import defs
 from lxml.html import HTML, tostring
@@ -5,9 +6,6 @@
 __all__ = ['clean_html', 'clean']
 
 # FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
-# In CSS/style attribute:
-#   url(javascript:...)
-#   expression(...)
 # Other on* attributes that aren't standard?
 # Try these tests: http://feedparser.org/tests/wellformed/sanitize/
 # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
@@ -19,6 +17,10 @@
 # CSS stuff?
 # remove images?
 
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_css_javascript_re = re.compile(
+    r'expression\(.*?\)', re.S|re.I)
 
 def clean_html(html, **kw):
     """
@@ -108,14 +110,18 @@
         for attrib in defs.event_attrs:
             for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
                 del el.attrib[attrib]
-        for attrib in defs.link_attrs:
-            # FIXME: should call lower-case()
-            # FIXME: starts-with isn't really good either, because
-            #        href="   javascript:..." is also a problem
-            for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib):
-                if isinstance(el, basestring):
-                    assert 0, repr(el)
-                el.attrib[attrib] = ""
+        doc.rewrite_links(_remove_javascript, resolve_base_href=False)
+        if not style:
+            for el in doc.xpath('descendant-or-self::*[@style]'):
+                old = el.attrib['style']
+                new = _css_javascript_re.sub('', old)
+                if new != old:
+                    el.attrib['style'] = new
+            for el in doc.xpath('descendant-or-self::style'):
+                old = el.text or ''
+                new = _css_javascript_re.sub('', old)
+                if new != old:
+                    el.text = new
     if comments:
         # Easier way?
         bad = []
@@ -183,3 +189,9 @@
                 continue
             el.attrib['rel'] = 'nofollow'
 
+def _remove_javascript(link):
+    if link.strip().startswith('javascript:'):
+        # FIXME: should this be None to delete?
+        return ''
+    return link
+

Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_clean.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt	Fri Jun  1 08:35:34 2007
@@ -5,6 +5,10 @@
 ...  <head>
 ...    <script type="text/javascript" src="evil-site"></script>
 ...    <link rel="alternate" type="text/rss" src="evil-rss">
+...    <style>
+...      body {background-image: url(javascript:do_evil)};
+...      div {color: expression(evil)};
+...    </style>
 ...  </head>
 ...  <body onload="evil_function()">
 ...    <!-- I am interpreted for EVIL! -->
@@ -27,6 +31,10 @@
   <head>
     <script type="text/javascript" src="evil-site"></script>
     <link rel="alternate" type="text/rss" src="evil-rss">
+    <style>
+      body {background-image: url(javascript:do_evil)};
+      div {color: expression(evil)};
+    </style>
   </head>
   <body onload="evil_function()">
     <!-- I am interpreted for EVIL! -->
@@ -49,6 +57,10 @@
   <head>
     <script type="text/javascript" src="evil-site"></script>
     <link rel="alternate" type="text/rss" src="evil-rss">
+    <style>
+      body {background-image: url(javascript:do_evil)};
+      div {color: expression(evil)};
+    </style>
   </head>
   <body onload="evil_function()">
     <!-- I am interpreted for EVIL! -->
@@ -70,6 +82,10 @@
 <html>
   <head>
     <link rel="alternate" type="text/rss" src="evil-rss">
+    <style>
+      body {background-image: url()};
+      div {color: };
+    </style>
   </head>
   <body>
     <a href="">a link</a>


More information about the lxml-checkins mailing list