[Lxml-checkins] r43979 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 21:43:00 CEST 2007


Author: ianb
Date: Fri Jun  1 21:42:58 2007
New Revision: 43979

Added:
   lxml/branch/html/src/lxml/html/tests/test_autolink.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_autolink.txt   (contents, props changed)
Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
Added an autolinking function

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Fri Jun  1 21:42:58 2007
@@ -3,23 +3,30 @@
 from lxml.html import defs
 from lxml.html import parse, tostring
 
-__all__ = ['clean_html', 'clean']
+__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html']
 
-# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
-# Other on* attributes that aren't standard?
-# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
-# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-# max width for words
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# I have multiple kinds of schemes searched; but should schemes be
+#   whitelisted instead?
+# max width for words (but not in pre or textarea)
 # max height?
-# autolink?
-# CSS stuff?
-# remove images?
+# autolink? (don't autolink in textarea, pre, code)
+# remove images?  Also in CSS?  background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+#   allow *just* embedded YouTube movies)
+# Log what was deleted and why?
 
 # This is an IE-specific construct you can have in a stylesheet to
 # run some Javascript:
 _css_javascript_re = re.compile(
     r'expression\(.*?\)', re.S|re.I)
 
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_javascript_scheme_re = re.compile(
+    r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I)
+_whitespace_re = re.compile(r'\s+')
+
 def clean_html(html, **kw):
     """
     Like clean(), but takes a text input document, and returns a text
@@ -93,7 +100,7 @@
         Remove any form tags
 
     ``annoying_tags``:
-        Tags that aren't *wrong*, but are annoying.  ``<blink>`` (FIXME: marquee?)
+        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marque>``
 
     ``remove_tags``:
         A list of tags to remove.
@@ -190,7 +197,7 @@
         remove_tags.extend(['form'])
         kill_tags.extend(['button', 'input', 'select', 'textarea'])
     if annoying_tags:
-        remove_tags.extend(['blink'])
+        remove_tags.extend(['blink', 'marque'])
     bad = []
     for el in _itertree(doc):
         if el.tag in kill_tags:
@@ -238,8 +245,136 @@
             el.attrib['rel'] = 'nofollow'
 
 def _remove_javascript(link):
-    if link.strip().startswith('javascript:'):
+    # links like "j a v a s c r i p t:" might be interpreted in IE
+    new = _whitespace_re.sub('', link)
+    if _javascript_scheme_re.search(new):
         # FIXME: should this be None to delete?
         return ''
     return link
 
+_link_regexes = [
+    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
+    # This is conservative, but autolinking can be a bit conservative:
+    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
+    ]
+
+_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
+
+_avoid_hosts = [
+    re.compile(r'^localhost', re.I),
+    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
+    re.compile(r'^127\.0\.0\.1$'),
+    ]
+
+_avoid_classes = ['nolink']
+
+def autolink(el, link_regexes=_link_regexes,
+             avoid_elements=_avoid_elements,
+             avoid_hosts=_avoid_hosts,
+             avoid_classes=_avoid_classes):
+    """
+    Turn any URLs into links.
+
+    It will search for links identified by the given regular
+    expressions (by default mailto and http(s) links).
+
+    It won't link text in an element in avoid_elements, or an element
+    with a class in avoid_classes.  It won't link to anything with a
+    host that matches one of the regular expressions in avoid_hosts
+    (by default localhost, 127.0.0.1, and example.com/org/net).
+
+    If you pass in an element, the element's tail will not be
+    substituted, only the contents of the element.
+    """
+    if el.tag in avoid_elements:
+        return
+    class_name = el.attrib.get('class')
+    if class_name:
+        class_name = class_name.split()
+        for match_class in avoid_classes:
+            if match_class in class_name:
+                return
+    for child in list(el):
+        autolink(child, link_regexes=link_regexes,
+                 avoid_elements=avoid_elements,
+                 avoid_hosts=avoid_hosts,
+                 avoid_classes=avoid_classes)
+        if child.tail:
+            text, tail_children = _link_text(
+                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
+            if tail_children:
+                child.tail = text
+                index = el.index(child)
+                el[index+1:index+1] = tail_children
+    if el.text:
+        text, pre_children = _link_text(
+            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
+        if pre_children:
+            el.text = text
+            el[:0] = pre_children
+
+def _link_text(text, link_regexes, avoid_hosts, factory):
+    leading_text = ''
+    links = []
+    last_pos = 0
+    while 1:
+        best_match, best_pos = None, None
+        for regex in link_regexes:
+            regex_pos = last_pos
+            while 1:
+                match = regex.search(text, pos=regex_pos)
+                if match is None:
+                    break
+                host = match.group('host')
+                for host_regex in avoid_hosts:
+                    if host_regex.search(host):
+                        regex_pos = match.end()
+                        break
+                else:
+                    break
+            if match is None:
+                continue
+            if best_pos is None or match.start() < best_pos:
+                best_match = match
+                best_pos = match.start()
+        if best_match is None:
+            # No more matches
+            if links:
+                assert not links[-1].tail
+                links[-1].tail = text
+            else:
+                assert not leading_text
+                leading_text = text
+            break
+        link = best_match.group(0)
+        end = best_match.end()
+        if link.endswith('.') or link.endswith(','):
+            # These punctuation marks shouldn't end a link
+            end -= 1
+            link = link[:-1]
+        prev_text = text[:best_match.start()]
+        if links:
+            assert not links[-1].tail
+            links[-1].tail = prev_text
+        else:
+            assert not leading_text
+            leading_text = prev_text
+        anchor = factory('a')
+        anchor.attrib['href'] = link
+        body = best_match.group('body')
+        if not body:
+            body = link
+        if body.endswith('.') or body.endswith(','):
+            body = body[:-1]
+        anchor.text = body
+        links.append(anchor)
+        text = text[end:]
+    return leading_text, links
+                
+def autolink_html(html, *args, **kw):
+    doc = parse(html)
+    autolink(doc, *args, **kw)
+    return tostring(doc)
+
+            
+        

Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.py	Fri Jun  1 21:42:58 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+    suite = unittest.TestSuite()
+    suite.addTests([doctest.DocFileSuite('test_autolink.txt')])
+    return suite
+
+if __name__ == '__main__':
+    unittest.main()

Added: lxml/branch/html/src/lxml/html/tests/test_autolink.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt	Fri Jun  1 21:42:58 2007
@@ -0,0 +1,37 @@
+This tests autolink::
+
+    >>> from lxml.html import usedoctest
+    >>> from lxml.html.clean import autolink_html
+    >>> print autolink_html('''
+    ... <div>Link here: http://test.com/foo.html.</div>
+    ... ''')
+    <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div>
+    >>> print autolink_html('''
+    ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div>
+    ... ''')
+    <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a>
+    or <a href="http://myhome.com">http://myhome.com</a></div>
+    >>> print autolink_html('''
+    ... <div>The <b>great</b> thing is the http://link.com links <i>and</i>
+    ... the http://foobar.com links.</div>''')
+    <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i>
+    the <a href="http://foobar.com">http://foobar.com</a> links.</div>
+
+Some cases that won't be caught (on purpose)::
+
+    >>> print autolink_html('''
+    ... <div>A link to http://localhost/foo/bar won't, but a link to
+    ...  http://test.com will</div>''')
+    <div>A link to http://localhost/foo/bar won't, but a link to
+    <a href="http://test.com">http://test.com</a> will</div>
+    >>> print autolink_html('''
+    ... <div>A link in <textarea>http://test.com</textarea></div>''')
+    <div>A link in <textarea>http://test.com</textarea></div>
+    >>> print autolink_html('''
+    ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>''')
+    <div>A link in <a href="http://foo.com">http://bar.com</a></div>
+    >>> print autolink_html('''
+    ... <div>A link in <code>http://foo.com</code> or
+    ... <span class="nolink">http://bar.com</span></div>''')
+    <div>A link in <code>http://foo.com</code> or
+    <span class="nolink">http://bar.com</span></div>


More information about the lxml-checkins mailing list