[Lxml-checkins] r44050 - in lxml/branch/html/src/lxml/html: . tests tests/hackers-org-data

ianb at codespeak.net ianb at codespeak.net
Wed Jun 6 10:11:34 CEST 2007


Author: ianb
Date: Wed Jun  6 10:11:34 2007
New Revision: 44050

Modified:
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
   lxml/branch/html/src/lxml/html/tests/test_clean.txt
   lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
Log:
Fix some cleaning-related tests, mostly updating expected output that changed while I wasn't running these tests.  Also detect and remove IE conditional comments.

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Wed Jun  6 10:11:34 2007
@@ -40,6 +40,9 @@
 _whitespace_re = re.compile(r'\s+')
 # FIXME: should data: be blocked?
 
+_conditional_comment_re = re.compile(
+    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+
 def clean_html(html, **kw):
     """
     Like clean(), but takes a text input document, and returns a text
@@ -57,6 +60,10 @@
     for item in el.iterdescendants():
         yield item
 
+# FIXME: I really have to figure out what a sane set of defaults is
+# for these keyword arguments.  And is this signature out of control?
+# What about if we want things like whitelisting of <object> or other
+# controls?  Maybe this has to be more than a function.
 def clean(doc,
           scripts=True,
           javascript=True,
@@ -141,6 +148,17 @@
 
     This modifies the document *in place*.
     """
+    # IE conditional comments basically embed HTML that the parser doesn't
+    # normally see.  We can't allow anything like that, so even when ordinary
+    # comments are kept we kill any comment that could be conditional.
+    if not comments:
+        bad = []
+        for el in _itertree(doc):
+            if (isinstance(el, etree.CommentBase)
+                and _conditional_comment_re.search(el.text)):
+                bad.append(el)
+        for el in bad:
+            el.drop_element()
     # First, handle a case that IE treats <image> like <img>, and that can
     # confuse either this step or later steps.
     for el in doc.xpath('descendant-or-self::image'):
@@ -191,6 +209,9 @@
                 elif new != old:
                     el.text = new
     if comments or processing_instructions:
+        # FIXME: why either?  I feel like there's some obscure reason
+        # because you can put PIs in comments...?  But I've already
+        # forgotten it
         kill_tags.append(etree.Comment)
     if processing_instructions:
         kill_tags.append(etree.ProcessingInstruction)
@@ -201,8 +222,9 @@
 #            del el.attrib['xmlns']
     if style:
         kill_tags.append('style')
-        for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
-            el.drop_element()
+        for el in doc.xpath('descendant-or-self::link'):
+            if 'stylesheet' in el.attrib.get('rel', '').lower():
+                el.drop_element()
         for el in doc.xpath('descendant-or-self::*[@style]'):
             del el.attrib['style']
     if links:

Modified: lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data	(original)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data	Wed Jun  6 10:11:34 2007
@@ -1,11 +1,9 @@
 Description: Downlevel-Hidden-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job
     http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden
-Options: -comments
+Options: -comments, -processing_instructions
 
 <div><!--[if gte IE 4]>
 <SCRIPT>alert('XSS');</SCRIPT>
 <![endif]--></div>
 ----------
-<div>[if gte IE 4]>
-&lt;SCRIPT&gt;alert('XSS');&lt;/SCRIPT&gt;
-&lt;![endif]</div>
+<div></div>

Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_clean.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt	Wed Jun  6 10:11:34 2007
@@ -1,4 +1,4 @@
->>> from lxml.html import HTML, tostring
+>>> from lxml.html import parse, tostring
 >>> from lxml.html.clean import clean, clean_html
 >>> from lxml.html import usedoctest
 >>> doc = '''<html>
@@ -52,7 +52,7 @@
     <image src="evil!">
   </body>
 </html>
->>> print tostring(HTML(doc))
+>>> print tostring(parse(doc))
 <html>
   <head>
     <script type="text/javascript" src="evil-site"></script>
@@ -78,27 +78,25 @@
     <image src="evil!">
   </body>
 </html>
->>> print clean_html(doc)
+>>> print clean_html(doc, page_structure=False, safe_attrs_only=False)
 <html>
   <head>
-    <link rel="alternate" type="text/rss" src="evil-rss">
-    <style>
-      body {background-image: url()};
-      div {color: };
-    </style>
+    <style>/* deleted */</style>
   </head>
   <body>
     <a href="">a link</a>
     <a href="#">another link</a>
     <p>a paragraph</p>
     <div style="display: none">secret EVIL!</div>
+    of EVIL!
     Password:
-    <blink>annoying EVIL!</blink>
+    annoying EVIL!
     <a href="evil-site">spam spam SPAM!</a>
     <img src="evil!">
   </body>
 </html>
->>> print clean_html(doc, style=True, links=True, add_nofollow=True)
+>>> print clean_html(doc, style=True, links=True, add_nofollow=True,
+...                  page_structure=False, safe_attrs_only=False)
 <html>
   <head>
   </head>
@@ -107,8 +105,9 @@
     <a href="#">another link</a>
     <p>a paragraph</p>
     <div>secret EVIL!</div>
+    of EVIL!
     Password:
-    <blink>annoying EVIL!</blink>
+    annoying EVIL!
     <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
     <img src="evil!">
   </body>

Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	Wed Jun  6 10:11:34 2007
@@ -75,7 +75,7 @@
 is something embedded).  It returns a generator of ``(element, attrib,
 link)``, which is awkward to test here, so we'll make a printer::
 
-    >>> from lxml.html import iter_links
+    >>> from lxml.html import iter_links, HTML, tostring
     >>> def print_iter(seq):
     ...     for element, attrib, link, pos in seq:
     ...         if pos:


More information about the lxml-checkins mailing list