[Lxml-checkins] r43979 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 21:43:00 CEST 2007
Author: ianb
Date: Fri Jun 1 21:42:58 2007
New Revision: 43979
Added:
lxml/branch/html/src/lxml/html/tests/test_autolink.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_autolink.txt (contents, props changed)
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
Added an autolinking function
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 21:42:58 2007
@@ -3,23 +3,30 @@
from lxml.html import defs
from lxml.html import parse, tostring
-__all__ = ['clean_html', 'clean']
+__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html']
-# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
-# Other on* attributes that aren't standard?
-# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
-# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-# max width for words
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# I have multiple kinds of schemes searched; but should schemes be
+# whitelisted instead?
+# max width for words (but not in pre or textarea)
# max height?
-# autolink?
-# CSS stuff?
-# remove images?
+# autolink? (don't autolink in textarea, pre, code)
+# remove images? Also in CSS? background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+# allow *just* embedded YouTube movies)
+# Log what was deleted and why?
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
r'expression\(.*?\)', re.S|re.I)
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_javascript_scheme_re = re.compile(
+ r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I)
+_whitespace_re = re.compile(r'\s+')
+
def clean_html(html, **kw):
"""
Like clean(), but takes a text input document, and returns a text
@@ -93,7 +100,7 @@
Remove any form tags
``annoying_tags``:
- Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?)
+ Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>``
``remove_tags``:
A list of tags to remove.
@@ -190,7 +197,7 @@
remove_tags.extend(['form'])
kill_tags.extend(['button', 'input', 'select', 'textarea'])
if annoying_tags:
- remove_tags.extend(['blink'])
+ remove_tags.extend(['blink', 'marque'])
bad = []
for el in _itertree(doc):
if el.tag in kill_tags:
@@ -238,8 +245,136 @@
el.attrib['rel'] = 'nofollow'
def _remove_javascript(link):
- if link.strip().startswith('javascript:'):
+ # links like "j a v a s c r i p t:" might be interpreted in IE
+ new = _whitespace_re.sub('', link)
+ if _javascript_scheme_re.search(new):
# FIXME: should this be None to delete?
return ''
return link
+_link_regexes = [
+ re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
+ # This is conservative, but autolinking can be a bit conservative:
+ re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
+ ]
+
+_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
+
+_avoid_hosts = [
+ re.compile(r'^localhost', re.I),
+ re.compile(r'\bexample\.(?:com|org|net)$', re.I),
+ re.compile(r'^127\.0\.0\.1$'),
+ ]
+
+_avoid_classes = ['nolink']
+
+def autolink(el, link_regexes=_link_regexes,
+ avoid_elements=_avoid_elements,
+ avoid_hosts=_avoid_hosts,
+ avoid_classes=_avoid_classes):
+ """
+ Turn any URLs into links.
+
+ It will search for links identified by the given regular
+ expressions (by default mailto and http(s) links).
+
+ It won't link text in an element in avoid_elements, or an element
+ with a class in avoid_classes. It won't link to anything with a
+ host that matches one of the regular expressions in avoid_hosts
+ (default localhost, 127.0.0.1 and example.com/.org/.net).
+
+ If you pass in an element, the element's tail will not be
+ substituted, only the contents of the element.
+ """
+ if el.tag in avoid_elements:
+ return
+ class_name = el.attrib.get('class')
+ if class_name:
+ class_name = class_name.split()
+ for match_class in avoid_classes:
+ if match_class in class_name:
+ return
+ for child in list(el):
+ autolink(child, link_regexes=link_regexes,
+ avoid_elements=avoid_elements,
+ avoid_hosts=avoid_hosts,
+ avoid_classes=avoid_classes)
+ if child.tail:
+ text, tail_children = _link_text(
+ child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
+ if tail_children:
+ child.tail = text
+ index = el.index(child)
+ el[index+1:index+1] = tail_children
+ if el.text:
+ text, pre_children = _link_text(
+ el.text, link_regexes, avoid_hosts, factory=el.makeelement)
+ if pre_children:
+ el.text = text
+ el[:0] = pre_children
+
+def _link_text(text, link_regexes, avoid_hosts, factory):
+ leading_text = ''
+ links = []
+ last_pos = 0
+ while 1:
+ best_match, best_pos = None, None
+ for regex in link_regexes:
+ regex_pos = last_pos
+ while 1:
+ match = regex.search(text, pos=regex_pos)
+ if match is None:
+ break
+ host = match.group('host')
+ for host_regex in avoid_hosts:
+ if host_regex.search(host):
+ regex_pos = match.end()
+ break
+ else:
+ break
+ if match is None:
+ continue
+ if best_pos is None or match.start() < best_pos:
+ best_match = match
+ best_pos = match.start()
+ if best_match is None:
+ # No more matches
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = text
+ else:
+ assert not leading_text
+ leading_text = text
+ break
+ link = best_match.group(0)
+ end = best_match.end()
+ if link.endswith('.') or link.endswith(','):
+ # These punctuation marks shouldn't end a link
+ end -= 1
+ link = link[:-1]
+ prev_text = text[:best_match.start()]
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = prev_text
+ else:
+ assert not leading_text
+ leading_text = prev_text
+ anchor = factory('a')
+ anchor.attrib['href'] = link
+ body = best_match.group('body')
+ if not body:
+ body = link
+ if body.endswith('.') or body.endswith(','):
+ body = body[:-1]
+ anchor.text = body
+ links.append(anchor)
+ text = text[end:]
+ return leading_text, links
+
+def autolink_html(html, *args, **kw):
+ doc = parse(html)
+ autolink(doc, *args, **kw)
+ return tostring(doc)
+
+
+
Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.py Fri Jun 1 21:42:58 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_autolink.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/branch/html/src/lxml/html/tests/test_autolink.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 21:42:58 2007
@@ -0,0 +1,37 @@
+This tests autolink::
+
+ >>> from lxml.html import usedoctest
+ >>> from lxml.html.clean import autolink_html
+ >>> print autolink_html('''
+ ... <div>Link here: http://test.com/foo.html.</div>
+ ... ''')
+ <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div>
+ >>> print autolink_html('''
+ ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div>
+ ... ''')
+ <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a>
+ or <a href="http://myhome.com">http://myhome.com</a></div>
+ >>> print autolink_html('''
+ ... <div>The <b>great</b> thing is the http://link.com links <i>and</i>
+ ... the http://foobar.com links.</div>''')
+ <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i>
+ the <a href="http://foobar.com">http://foobar.com</a> links.</div>
+
+Some cases that won't be caught (on purpose)::
+
+ >>> print autolink_html('''
+ ... <div>A link to http://localhost/foo/bar won't, but a link to
+ ... http://test.com will</div>''')
+ <div>A link to http://localhost/foo/bar won't, but a link to
+ <a href="http://test.com">http://test.com</a> will</div>
+ >>> print autolink_html('''
+ ... <div>A link in <textarea>http://test.com</textarea></div>''')
+ <div>A link in <textarea>http://test.com</textarea></div>
+ >>> print autolink_html('''
+ ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>''')
+ <div>A link in <a href="http://foo.com">http://bar.com</a></div>
+ >>> print autolink_html('''
+ ... <div>A link in <code>http://foo.com</code> or
+ ... <span class="nolink">http://bar.com</span></div>''')
+ <div>A link in <code>http://foo.com</code> or
+ <span class="nolink">http://bar.com</span></div>
More information about the lxml-checkins
mailing list