[Lxml-checkins] r44100 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Thu Jun 7 18:56:07 CEST 2007


Author: ianb
Date: Thu Jun  7 18:56:06 2007
New Revision: 44100

Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
improve autolink_html a bit; add some comments and doc stuff

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Thu Jun  7 18:56:06 2007
@@ -45,6 +45,7 @@
 _whitespace_re = re.compile(r'\s+')
 # FIXME: should data: be blocked?
 
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
 _conditional_comment_re = re.compile(
     r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
 
@@ -440,9 +441,19 @@
     return leading_text, links
                 
 def autolink_html(html, *args, **kw):
-    doc = parse(html)
+    if isinstance(html, basestring):
+        doc = parse(html)
+        return_string = True
+    else:
+        doc = copy.deepcopy(html)
+        return_string = False
     autolink(doc, *args, **kw)
-    return tostring(doc)
+    if return_string:
+        return tostring(doc)
+    else:
+        return doc
+
+autolink_html.__doc__ = autolink.__doc__
 
 _avoid_word_break_elements = ['pre', 'textarea', 'code']
 _avoid_word_break_classes = ['nobreak']
@@ -455,7 +466,7 @@
     Breaks any long words found in the body of the text (not attributes).
 
     Doesn't effect any of the tags in avoid_elements, by default
-    textarea and pre
+    ``<textarea>`` and ``<pre>``
 
     Breaks words by inserting &#8203;, which is a unicode character
     for Zero Width Space character.  This generally takes up no space
@@ -524,9 +535,20 @@
 _decomment_re = re.compile(r'/\*.*?\*/', re.S)
 
 def _has_sneaky_javascript(style):
+    """
+    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
+    checks for attempts to do stuff like this.
+
+    Typically the response will be to kill the entire style; if you
+    have just a bit of Javascript in the style another rule will catch
+    that and remove only the Javascript from the style; this catches
+    more sneaky attempts.
+    """
     style = _decomment_re.sub('', style)
     style = style.replace('\\', '')
     style = _whitespace_re.sub('', style)
+    style = style.lower()
     if 'javascript:' in style:
         return True
     if 'expression(' in style:


More information about the lxml-checkins mailing list