[Lxml-checkins] r44100 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Thu Jun 7 18:56:07 CEST 2007
Author: ianb
Date: Thu Jun 7 18:56:06 2007
New Revision: 44100
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
improve autolink_html a bit; add some comments and doc stuff
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 18:56:06 2007
@@ -45,6 +45,7 @@
_whitespace_re = re.compile(r'\s+')
# FIXME: should data: be blocked?
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
_conditional_comment_re = re.compile(
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
@@ -440,9 +441,19 @@
return leading_text, links
def autolink_html(html, *args, **kw):
- doc = parse(html)
+ if isinstance(html, basestring):
+ doc = parse(html)
+ return_string = True
+ else:
+ doc = copy.deepcopy(html)
+ return_string = False
autolink(doc, *args, **kw)
- return tostring(doc)
+ if return_string:
+ return tostring(doc)
+ else:
+ return doc
+
+autolink_html.__doc__ = autolink.__doc__
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
@@ -455,7 +466,7 @@
Breaks any long words found in the body of the text (not attributes).
Doesn't affect any of the tags in avoid_elements, by default
- textarea and pre
+ ``<textarea>`` and ``<pre>``
Breaks words by inserting &#8203;, which is a unicode character
for Zero Width Space character. This generally takes up no space
@@ -524,9 +535,20 @@
_decomment_re = re.compile(r'/\*.*?\*/', re.S)
def _has_sneaky_javascript(style):
+ """
+ Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+ can get interpreted, or ``expre/* stuff */ssion(...)``. This
+ checks for attempts to do stuff like this.
+
+ Typically the response will be to kill the entire style; if you
+ have just a bit of Javascript in the style another rule will catch
+ that and remove only the Javascript from the style; this catches
+ more sneaky attempts.
+ """
style = _decomment_re.sub('', style)
style = style.replace('\\', '')
style = _whitespace_re.sub('', style)
+ style = style.lower()
if 'javascript:' in style:
return True
if 'expression(' in style:
More information about the lxml-checkins
mailing list