From ianb at codespeak.net Fri Jun 1 06:18:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:18:44 +0200 (CEST) Subject: [Lxml-checkins] r43954 - lxml/branch/html/src/lxml Message-ID: <20070601041844.70BB48093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:18:43 2007 New Revision: 43954 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: Change the diff output a bit; only parse if both got *and* want look like HTML/XML Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 06:18:43 2007 @@ -78,9 +78,11 @@ parser = HTML elif PARSE_XML & optionflags: parser = etree.XML - elif want.strip().lower().startswith('' % tag @@ -297,7 +299,7 @@ if not got: return '' return self.format_text(got, strip) - text = '%s (not %s)' % (got, want) + text = '%s (got: %s)' % (want, got) return self.format_text(text, strip) class LHTMLOutputChecker(LXMLOutputChecker): From ianb at codespeak.net Fri Jun 1 06:25:27 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:25:27 +0200 (CEST) Subject: [Lxml-checkins] r43955 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601042527.A7A778093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:25:27 2007 New Revision: 43955 Removed: lxml/branch/html/src/lxml/html/rewritelinks.py Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: Move all the link functions directly into __init__; change rewriting to all use iter_links Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:25:27 2007 @@ -1,13 +1,19 @@ import threading import re +import urlparse from lxml import etree +from lxml.html import defs -__all__ = ['HTML', 'tostring', 'Element'] +__all__ = ['HTML', 'tostring', 'Element', 'defs', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iter_links', 'rewrite_links'] _rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_css_url_re = re.compile(r'url\((.*?)\)', re.I) +_css_import_re = re.compile(r'@import "(.*?)"') class HtmlMixin(object): @@ -110,8 +116,11 @@ tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. 
""" - from lxml.html.rewritelinks import make_links_absolute - make_links_absolute(self, base_href, resolve_base_href=resolve_base_href) + if resolve_base_href: + self.resolve_base_href() + def link_repl(href): + return urlparse.urljoin(base_href, href) + self.rewrite_links(link_repl) def resolve_base_href(self): """ @@ -119,25 +128,38 @@ values to all links found in the document. Also remove the tag once it has been applied. """ - from lxml.html.rewritelinks import resolve_base_href - resolve_base_href(self) - - def iter_links(self, in_order=True): - """ - Iterate over all the links in the document, yielding - ``(element, attribute, link)``. - - The ``element`` contains the link. ``attribute`` is a string - like ``'href'`` or ``'src'``. It may be None, which means - that the link is in the body of the element. The only type - this occurs is with `` - ... - ... - ... - ... - ... - ... - ...
- ... - ... Hi world! - ...
- ... ''', False)) - link href="style.css" - a href="/test.html" - a href="/other.html" - script src="/js-funcs.js" - img src="/logo.gif" - style None="/bg.gif"@40 - style None="/other-styles.css"@69 - td style="/td-bg.png"@22 From ianb at codespeak.net Fri Jun 1 06:35:35 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:35:35 +0200 (CEST) Subject: [Lxml-checkins] r43956 - lxml/branch/html/src/lxml/html Message-ID: <20070601043535.E300D809C@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:35:35 2007 New Revision: 43956 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: rename get_text_content to text_content. Add docstring and notes Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:35:35 2007 @@ -1,6 +1,7 @@ import threading import re import urlparse +import copy from lxml import etree from lxml.html import defs @@ -96,7 +97,7 @@ except IndexError: return default - def get_text_content(self): + def text_content(self): """ Return the text content of the tag (and the text in any children). """ @@ -200,6 +201,19 @@ class _MethodFunc(object): + """ + An object that represents a method on an element as a function; + the function takes either an element or an HTML string. It + returns whatever the function normally returns, or if the function + works in-place (and so returns None) it returns a serialized form + of the resulting document. + """ + # FIXME: the None test is a bit sloppy FIXME: this is basically + # functional if you use it with a string; should it be a + # functional equivalent for working with elements too? It has to + # make a copy of the document. The problem is it changes the + # return type, as it should return the copied document and not a + # serialization. Is that odd? 
def __init__(self, name, fragment=False, source_class=HtmlMixin): self.name = name self.fragment = fragment From ianb at codespeak.net Fri Jun 1 06:41:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:41:44 +0200 (CEST) Subject: [Lxml-checkins] r43957 - lxml/branch/html/src/lxml/html Message-ID: <20070601044144.31619809F@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:41:43 2007 New Revision: 43957 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: remove the sub-module function wrappers Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:41:43 2007 @@ -243,29 +243,6 @@ iter_links = _MethodFunc('iter_links') rewrite_links = _MethodFunc('rewrite_links') -class _SubmoduleFunc(object): - def __init__(self, module, name, doc=None): - self.module = module - self.name = name - self.obj = None - if doc is None: - doc = 'See %s.%s' % (module, name) - self.__doc__ = doc - def __call__(self, *args, **kw): - if self.obj is None: - import sys - __import__(self.module) - mod = sys.modules(self.module) - self.obj = getattr(mod, self.name) - self.__doc__ = self.obj.__doc__ - return self.obj(*args, **kw) - -# FIXME: Damn module names conflict with the function names :( -#clean = _SubmoduleFunc('lxml.html.clean', 'clean') -#clean_html = _SubmoduleFunc('lxml.html.clean', 'clean_html') -#htmldiff = _SubmoduleFunc('lxml.html.htmldiff', 'htmldiff') -#html_annotate = _SubmoduleFunc('lxml.html.htmldiff', 'html_annotate') - class HtmlComment(etree.CommentBase, HtmlMixin): pass From ianb at codespeak.net Fri Jun 1 06:41:55 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:41:55 +0200 (CEST) Subject: [Lxml-checkins] r43958 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070601044155.EC494809F@code0.codespeak.net> 
Author: ianb Date: Fri Jun 1 06:41:55 2007 New Revision: 43958 Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: remove references to now-gone rewritelinks module Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Fri Jun 1 06:41:55 2007 @@ -1,52 +1,13 @@ -These are tests of relocateresponse:: +We'll define a link translation function: - >>> from lxml.html.rewritelinks import Relocator - -In all these examples we'll be using ``http://old`` for the old -(to-be-replaced) URL and ``https://new`` for the new URL (note the -scheme change). To test the rewriting we'll use this handy rewriter -that rewrites everything from one base to another base:: - - >>> relocate_href = Relocator( - ... base_href='http://old/base/path.html', - ... old_href='http://old/', - ... new_href='https://new/') - -Now lets look at simple href rewriting. Normal rewrite:: - - >>> relocate_href('http://old/bar') - 'https://new/bar' - -Note that the trailing / doesn't matter in this one case (since -``http://old`` and ``http://old/`` are entirely equivalent):: - - >>> relocate_href('http://old') - 'https://new/' - -The trailing / does matter in other cases:: - - >>> Relocator( - ... base_href='', - ... old_href='http://old-test/foo/', - ... new_href='https://new', - ... )('http://old-test/foo') - 'http://old-test/foo' - >>> Relocator( - ... base_href='', - ... old_href='http://old-test/foo/', - ... new_href='https://new', - ... 
)('http://old-test/foo/') - 'https://new' - -Rewriting a link that doesn't match old_href is a no-op:: - - >>> relocate_href('http://foo/bar') - 'http://foo/bar' - -Relative links are handled:: - - >>> relocate_href('index.html') - 'https://new/base/index.html' + >>> base_href = 'http://old/base/path.html' + >>> import urlparse + >>> def relocate_href(link): + ... link = urlparse.urljoin(base_href, link) + ... if link.startswith('http://old'): + ... return 'https://new' + link[len('http://old'):] + ... else: + ... return link Now for content. First, to make it easier on us, we need to trim the normalized HTML we get from these functions:: From ianb at codespeak.net Fri Jun 1 06:58:08 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:58:08 +0200 (CEST) Subject: [Lxml-checkins] r43959 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601045808.E8E2C80B0@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:58:08 2007 New Revision: 43959 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: Fix find_rel_links Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:58:08 2007 @@ -9,7 +9,7 @@ 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iter_links', 'rewrite_links'] -_rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]") +_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = 
etree.XPath("descendant-or-self::*[@id=$id]") @@ -71,13 +71,15 @@ """ Find any links like ``...``; returns a list of elements. """ - return _rel_links_xpath(self, rel=rel.lower()) + rel = rel.lower() + return [el for el in _rel_links_xpath(self) + if el.attrib['rel'].lower() == rel] def find_class(self, class_name): """ Find any elements with the given class name. """ - return _class_xpath(self, class_name=class_name.lower()) + return _class_xpath(self, class_name=class_name) def get_element_by_id(self, id, default=None): """ Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Fri Jun 1 06:58:08 2007 @@ -27,16 +27,14 @@ Also added is a get_rel_links, which you can use to search for links like ````: - >>> h = HTML(''' - ... test 1 - ... - ... item 3 - ... item 4''') - >>> print [e.attrib['href'] for e in h.find_rel_links('tag')] - ['2'] - >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] - [] - -FIXME: actually that should have returned ['2', '4'] + >>> h = HTML(''' + ... test 1 + ... + ... item 3 + ... 
item 4''') + >>> print [e.attrib['href'] for e in h.find_rel_links('tag')] + ['2', '4'] + >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] + [] From ianb at codespeak.net Fri Jun 1 07:09:12 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:09:12 +0200 (CEST) Subject: [Lxml-checkins] r43960 - lxml/branch/html/src/lxml Message-ID: <20070601050912.E7C5C80A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:09:12 2007 New Revision: 43960 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: avoid treating a single repr() of an object like markup Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 07:09:12 2007 @@ -31,6 +31,9 @@ import doctest import cgi +__all__ = ['PARSE_HTML', 'PARSE_XML', 'LXMLOutputChecker', + 'LHTMLOutputChecker', 'install', 'temp_install'] + PARSE_HTML = doctest.register_optionflag('PARSE_HTML') PARSE_XML = doctest.register_optionflag('PARSE_XML') @@ -42,6 +45,9 @@ else: return v.strip() +# We use this to distinguish repr()s from elements: +_repr_re = re.compile(r'^<[^>]+ (at|object) ') + class LXMLOutputChecker(OutputChecker): empty_tags = ( @@ -81,11 +87,16 @@ elif (want.strip().lower().startswith(' Author: ianb Date: Fri Jun 1 07:09:38 2007 New Revision: 43961 Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: added some more tests for basic functionality Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Fri Jun 1 07:09:38 2007 @@ -1,6 +1,6 @@ lxml.html adds a find_class method to elements:: - >>> from lxml.html import HTML, tostring + >>> from lxml.html import HTML, 
tostring, parse_element >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest >>> h = HTML(''' @@ -25,7 +25,7 @@ ['P1', 'P2'] Also added is a get_rel_links, which you can use to search for links -like ````: +like ````:: >>> h = HTML(''' ... test 1 @@ -37,4 +37,46 @@ >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] [] +Another method is ``get_element_by_id`` that does what it says:: + >>> print tostring(HTML(''' + ...
+ ... stuff + ...
''').get_element_by_id('test')) + stuff + +Or to get the content of an element without the tags, use text_content():: + + >>> el = parse_element(''' + ...
This is a bold link
''') + >>> el.text_content() + 'This is a bold link' + +Or drop both tags (leaving content) or the entire element, like:: + + >>> doc = HTML(''' + ... + ... + ...
+ ... This is a test of stuff. + ...
+ ...
footer
+ ... + ... ''') + >>> doc.get_element_by_id('link').drop_tag() + >>> print tostring(doc) + + +
+ This is a test of stuff. +
+
footer
+ + + >>> doc.get_element_by_id('body').drop_element() + >>> print tostring(doc) + + +
footer
+ + From ianb at codespeak.net Fri Jun 1 07:10:51 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:10:51 +0200 (CEST) Subject: [Lxml-checkins] r43962 - lxml/branch/html/src/lxml/html Message-ID: <20070601051051.EDCF180A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:10:51 2007 New Revision: 43962 Modified: lxml/branch/html/src/lxml/html/clean.py Log: added some more tests for basic functionality Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 07:10:51 2007 @@ -97,6 +97,10 @@ This modifies the document *in place*. """ + # First, handle a case that IE treats like , and that can + # confuse either this step or later steps. + for el in doc.xpath('descendant-or-self::image'): + el.tag = 'img' kill_tags = [] remove_tags = list(remove_tags or []) if scripts: From ianb at codespeak.net Fri Jun 1 07:12:30 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:12:30 +0200 (CEST) Subject: [Lxml-checkins] r43963 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601051230.F0A0680A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:12:30 2007 New Revision: 43963 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: Handle in clean Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 07:12:30 2007 @@ -11,7 +11,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# IE treats like # ...? 
# and is fishy in a fragment # max width for words Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Fri Jun 1 07:12:30 2007 @@ -19,6 +19,7 @@ ... </form> ... <blink>annoying EVIL!</blink> ... <a href="evil-site">spam spam SPAM!</a> +... <image src="evil!"> ... </body> ... </html>''' >>> print doc @@ -40,6 +41,7 @@ </form> <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <image src="evil!"> </body> </html> >>> print tostring(HTML(doc)) @@ -61,6 +63,7 @@ </form> <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <image src="evil!"> </body> </html> >>> print clean_html(doc) @@ -76,6 +79,7 @@ Password: <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <img src="evil!"> </body> </html> >>> print clean_html(doc, style=True, links=True, add_nofollow=True) @@ -90,5 +94,6 @@ Password: <blink>annoying EVIL!</blink> <a href="evil-site" rel="nofollow">spam spam SPAM!</a> + <img src="evil!"> </body> </html> From ianb at codespeak.net Fri Jun 1 08:34:17 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:34:17 +0200 (CEST) Subject: [Lxml-checkins] r43967 - lxml/branch/html/src/lxml Message-ID: <20070601063417.C100880A1@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:34:17 2007 New Revision: 43967 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: normalize whitespace before comparing text Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 08:34:17 2007 @@ -45,8 +45,12 @@ else: return v.strip() +def norm_whitespace(v): + return _norm_whitespace_re.sub(' ', v) + # We 
use this to distinguish repr()s from elements: _repr_re = re.compile(r'^<[^>]+ (at|object) ') +_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') class LXMLOutputChecker(OutputChecker): @@ -130,8 +134,8 @@ want = want or '' got = got or '' if strip: - want = want.strip() - got = got.strip() + want = norm_whitespace(want).strip() + got = norm_whitespace(got).strip() want = '^%s$' % re.escape(want) want = want.replace(r'\.\.\.', '.*') if re.search(want, got): From ianb at codespeak.net Fri Jun 1 08:35:35 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:35:35 +0200 (CEST) Subject: [Lxml-checkins] r43968 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601063535.3C58E80A1@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:35:34 2007 New Revision: 43968 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: Clean using rewrite_links; catch expression() in styles Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 08:35:34 2007 @@ -178,6 +178,9 @@ If you give ``base_href`` then all links passed to ``link_repl_func()`` will take that into account. + + If the ``link_repl_func`` returns None, the attribute or + tag text will be removed completely. 
""" if base_href is not None: # FIXME: this can be done in one pass with a wrapper @@ -189,6 +192,13 @@ new_link = link_repl_func(link) if new_link == link: continue + if new_link is None: + # Remove the attribute or element content + if attrib is None: + el.text = '' + else: + del el.attrib[attrib] + continue if attrib is None: new = el.text[:pos] + new_link + el.text[pos+len(link):] el.text = new Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:35:34 2007 @@ -1,3 +1,4 @@ +import re from lxml import etree from lxml.html import defs from lxml.html import HTML, tostring @@ -5,9 +6,6 @@ __all__ = ['clean_html', 'clean'] # FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html -# In CSS/style attribute: -# url(javascript:...) -# expression(...) # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl @@ -19,6 +17,10 @@ # CSS stuff? # remove images? +# This is an IE-specific construct you can have in a stylesheet to +# run some Javascript: +_css_javascript_re = re.compile( + r'expression\(.*?\)', re.S|re.I) def clean_html(html, **kw): """ @@ -108,14 +110,18 @@ for attrib in defs.event_attrs: for el in doc.xpath('descendant-or-self::*[@%s]' % attrib): del el.attrib[attrib] - for attrib in defs.link_attrs: - # FIXME: should call lower-case() - # FIXME: starts-with isn't really good either, because - # href=" javascript:..." 
is also a problem - for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib): - if isinstance(el, basestring): - assert 0, repr(el) - el.attrib[attrib] = "" + doc.rewrite_links(_remove_javascript, resolve_base_href=False) + if not style: + for el in doc.xpath('descendant-or-self::*[@style]'): + old = el.attrib['style'] + new = _css_javascript_re.sub('', old) + if new != old: + el.attrib['style'] = new + for el in doc.xpath('descendant-or-self::style'): + old = el.text or '' + new = _css_javascript_re.sub('', old) + if new != old: + el.text = new if comments: # Easier way? bad = [] @@ -183,3 +189,9 @@ continue el.attrib['rel'] = 'nofollow' +def _remove_javascript(link): + if link.strip().startswith('javascript:'): + # FIXME: should this be None to delete? + return '' + return link + Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Fri Jun 1 08:35:34 2007 @@ -5,6 +5,10 @@ ... <head> ... <script type="text/javascript" src="evil-site"></script> ... <link rel="alternate" type="text/rss" src="evil-rss"> +... <style> +... body {background-image: url(javascript:do_evil)}; +... div {color: expression(evil)}; +... </style> ... </head> ... <body onload="evil_function()"> ... <!-- I am interpreted for EVIL! --> @@ -27,6 +31,10 @@ <head> <script type="text/javascript" src="evil-site"></script> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url(javascript:do_evil)}; + div {color: expression(evil)}; + </style> </head> <body onload="evil_function()"> <!-- I am interpreted for EVIL! 
--> @@ -49,6 +57,10 @@ <head> <script type="text/javascript" src="evil-site"></script> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url(javascript:do_evil)}; + div {color: expression(evil)}; + </style> </head> <body onload="evil_function()"> <!-- I am interpreted for EVIL! --> @@ -70,6 +82,10 @@ <html> <head> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url()}; + div {color: }; + </style> </head> <body> <a href="">a link</a> From ianb at codespeak.net Fri Jun 1 08:39:16 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:39:16 +0200 (CEST) Subject: [Lxml-checkins] r43969 - lxml/branch/html/src/lxml/html Message-ID: <20070601063916.536348077@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:39:16 2007 New Revision: 43969 Modified: lxml/branch/html/src/lxml/html/clean.py Log: don't delete fieldset and legend. Do remove <layer> Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:39:16 2007 @@ -9,7 +9,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# <layer>...? # <head> and <title> is fishy in a fragment # max width for words # max height? @@ -141,13 +140,12 @@ if meta: kill_tags.append('meta') if embedded: - kill_tags.extend(['object', 'embed', 'iframe', 'applet']) + # FIXME: is <layer> really embedded? + kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) if frames: kill_tags.extend(defs.frame_tags) if forms: - # FIXME: do I even care about fieldset and legend? I don't - # care about label. 
- remove_tags.extend(['form', 'fieldset', 'legend']) + remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) bad = [] for el in doc.iterdescendants(): From ianb at codespeak.net Fri Jun 1 08:41:26 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:41:26 +0200 (CEST) Subject: [Lxml-checkins] r43970 - lxml/branch/html/src/lxml/html Message-ID: <20070601064126.67BE9809F@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:41:25 2007 New Revision: 43970 Modified: lxml/branch/html/src/lxml/html/clean.py Log: add page_structure removal; for clean_html parse the html as a fragment Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:41:25 2007 @@ -1,7 +1,7 @@ import re from lxml import etree from lxml.html import defs -from lxml.html import HTML, tostring +from lxml.html import parse_element, tostring __all__ = ['clean_html', 'clean'] @@ -9,7 +9,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# <head> and <title> is fishy in a fragment # max width for words # max height? # autolink? @@ -26,7 +25,7 @@ Like clean(), but takes a text input document, and returns a text document. 
""" - doc = HTML(html) + doc = parse_element(html, create_parent=True) clean(doc, **kw) return tostring(doc) @@ -38,6 +37,7 @@ style=False, links=False, meta=False, + page_structure=False, embedded=True, frames=True, forms=True, @@ -69,12 +69,15 @@ ``meta``: Remove any ``<meta>`` tags - ``frames``: - Remove any frame-related tags + ``page_structure``: + Structural parts of a page: ``<head>``, ``<html>``, ``<title>`` ``embedded``: Remove any embedded objects (flash, iframes) + ``frames``: + Remove any frame-related tags + ``forms``: Remove any form tags @@ -139,6 +142,8 @@ kill_tags.append('link') if meta: kill_tags.append('meta') + if page_structure: + remove_tags.extend(['head', 'html', 'title']) if embedded: # FIXME: is <layer> really embedded? kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) From ianb at codespeak.net Fri Jun 1 20:11:25 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 20:11:25 +0200 (CEST) Subject: [Lxml-checkins] r43976 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601181125.702FA80A2@code0.codespeak.net> Author: ianb Date: Fri Jun 1 20:11:25 2007 New Revision: 43976 Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (contents, props changed) lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py (contents, props changed) Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/defs.py Log: Added tests from feedparser. Make sure to traverse the root element as well as children (_itertree). Keep contents of some tags like <iframe>. Add filter for <blink>. Add new parser that handles random HTML a bit better. 
Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 20:11:25 2007 @@ -269,7 +269,7 @@ # FIXME: should this notice a fragment and parse accordingly? value = etree.HTML(html, html_parser) if value is None: - raise ParserError( + raise etree.ParserError( "Could not parse document") return value @@ -283,15 +283,18 @@ of only elements. """ # FIXME: check what happens when you give html with a body, head, etc. - html = '<html><body>%s</body></html>' % html + start = html[:20].lstrip().lower() + if not start.startswith('<html') and not start.startswith('<!doctype'): + # FIXME: That test doesn't work with a doctype or PI + html = '<html><body>%s</body></html>' % html doc = HTML(html) assert doc.tag == 'html' bodies = [e for e in doc if e.tag == 'body'] - assert len(bodies) == 1 + assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) body = bodies[0] elements = [] if no_leading_text and body.text and body.text.strip(): - raise ParserError( + raise etree.ParserError( "There is leading text: %r" % body.text) if body.text and body.text.strip(): elements.append(body.text) @@ -313,21 +316,81 @@ if not isinstance(create_parent, basestring): create_parent = 'div' return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent)) + else: + print '----------\n', html elements = parse_elements(html, no_leading_text=True) if not elements: - raise ParserError( + raise etree.ParserError( "No elements found") if len(elements) > 1: - raise ParserError( + raise etree.ParserError( "Multiple elements found (%s)" - % ', '.join([e.tag for e in elements])) + % ', '.join([_element_name(e) for e in elements])) el = elements[0] if el.tail and el.tail.strip(): - raise ParserError( + raise etree.ParserError( "Element followed by text: %r" % el.tail) el.tail = None return el +def 
parse(html): + """ + Parse the html, returning a single element/document. + + This tries to minimally parse the chunk of text, without knowing if it + is a fragment or a document. + """ + start = html[:10].lstrip().lower() + if start.startswith('<html') or start.startswith('<!doctype'): + # Looks like a full HTML document + return HTML(html) + # otherwise, lets parse it out... + doc = HTML(html) + bodies = doc.findall('body') + body = bodies[0] + if len(bodies) > 1: + # Somehow there are multiple bodies, which is bad, but just + # smash them into one body + for other_body in bodies[1:]: + if other_body.text: + if len(body): + body[-1].tail = (body[-1].tail or '') + other_body.text + else: + body.text = (body.text or '') + other_body.text + body.extend(other_body) + # We'll ignore tail + # I guess we are ignoring attributes too + other_body.drop_element() + heads = doc.findall('head') + if heads: + # Well, we have some sort of structure, so lets keep it all + head = heads[0] + if len(heads) > 1: + for other_head in heads[1:]: + head.extend(other_head) + # We don't care about text or tail in a head + other_head.drop_element() + return doc + + if (len(body) == 1 and (not body.text or not body.text.strip()) + and (not body[-1].tail or not body[-1].tail.strip())): + # The body has just one element, so it was probably a single + # element passed in + return body[0] + # Now we have a body which represents a bunch of tags which have the + # content that was passed in. We will create a fake container, which + # is the body tag, except body implies too much structure. 
+ body.tag = 'div' + return body + +def _element_name(el): + if isinstance(el, etree.CommentBase): + return 'comment' + elif isinstance(el, basestring): + return 'string' + else: + return el.tag + def Element(*args, **kw): v = html_parser.makeelement(*args, **kw) return v Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 20:11:25 2007 @@ -1,7 +1,7 @@ import re from lxml import etree from lxml.html import defs -from lxml.html import parse_element, tostring +from lxml.html import parse, tostring __all__ = ['clean_html', 'clean'] @@ -25,26 +25,36 @@ Like clean(), but takes a text input document, and returns a text document. """ - doc = parse_element(html, create_parent=True) + doc = parse(html) clean(doc, **kw) return tostring(doc) +def _itertree(el): + """ + Return the element's descendants, and the element itself + """ + yield el + for item in el.iterdescendants(): + yield item + def clean(doc, scripts=True, javascript=True, comments=True, # process instructions? style=False, - links=False, - meta=False, - page_structure=False, + links=True, + meta=True, + page_structure=True, embedded=True, frames=True, forms=True, + annoying_tags=True, remove_tags=None, allow_tags=None, strip_tags=True, remove_unknown_tags=True, + safe_attrs_only=True, add_nofollow=False, # callbacks? ): @@ -70,7 +80,8 @@ Remove any ``<meta>`` tags ``page_structure``: - Structural parts of a page: ``<head>``, ``<html>``, ``<title>`` + Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. + Also xmlns attributes are removed with this. ``embedded``: Remove any embedded objects (flash, iframes) @@ -81,6 +92,9 @@ ``forms``: Remove any form tags + ``annoying_tags``: + Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?) + ``remove_tags``: A list of tags to remove. 
@@ -95,6 +109,11 @@ ``remove_unknown_tags``: Remove any tags that aren't standard parts of HTML. + ``safe_attrs_only``: + If true, only include 'safe' attributes (specifically the list + from `feedparser + <http://feedparser.org/docs/html-sanitization.html>`_). + ``add_nofollow``: If true, then any <a> tags will have ``rel="nofollow"`` added to them. @@ -108,12 +127,23 @@ remove_tags = list(remove_tags or []) if scripts: kill_tags.append('script') + if safe_attrs_only: + safe_attrs = set(defs.safe_attrs) + for el in _itertree(doc): + for aname in el.attrib.keys(): + if aname not in defs.safe_attrs: + del el.attrib[aname] if javascript: - for attrib in defs.event_attrs: - for el in doc.xpath('descendant-or-self::*[@%s]' % attrib): - del el.attrib[attrib] + if not safe_attrs_only: + # safe_attrs handles events attributes itself + for el in _itertree(doc): + for aname in el.attrib.keys(): + if aname.startswith('on'): + del el.attrib[aname] doc.rewrite_links(_remove_javascript, resolve_base_href=False) if not style: + # If we're deleting style then we don't have to remove JS links + # from styles, otherwise... for el in doc.xpath('descendant-or-self::*[@style]'): old = el.attrib['style'] new = _css_javascript_re.sub('', old) @@ -127,7 +157,7 @@ if comments: # Easier way? bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if isinstance(el, etree._Comment): bad.append(el) for el in bad: @@ -144,16 +174,25 @@ kill_tags.append('meta') if page_structure: remove_tags.extend(['head', 'html', 'title']) + # FIXME: is this really the right place to remove these attributes? + for el in doc.xpath('descendant-or-self::*[@xmlns]'): + del el.attrib['xmlns'] if embedded: # FIXME: is <layer> really embedded? 
- kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) + kill_tags.extend(['applet', 'param']) + # The alternate contents that are in an iframe are a good fallback: + # FIXME: somehow embed seems to be getting data, but from what I + # can tell the embed tag is supposed to always be empty + remove_tags.extend(['iframe', 'object', 'embed', 'layer']) if frames: kill_tags.extend(defs.frame_tags) if forms: remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) + if annoying_tags: + remove_tags.extend(['blink']) bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if el.tag in kill_tags: bad.append(el) for el in bad: @@ -164,7 +203,13 @@ for tag in remove_tags]) for el in doc.xpath(xpath): if strip_tags: - el.drop_tag() + if el.getparent(): + el.drop_tag() + else: + # We have to drop the parent-most tag, which we can't + # do. Instead we'll rewrite it: + el.tag = 'div' + el.attrib.clear() else: # FIXME: Should we test if this has been removed because of a parent? 
el.drop_element() @@ -175,7 +220,7 @@ allow_tags = defs.tags if allow_tags: bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if el.tag not in allow_tags: bad.append(el) for el in bad: Modified: lxml/branch/html/src/lxml/html/defs.py ============================================================================== --- lxml/branch/html/src/lxml/html/defs.py (original) +++ lxml/branch/html/src/lxml/html/defs.py Fri Jun 1 20:11:25 2007 @@ -19,14 +19,27 @@ 'usemap'] # Not in the HTML 4 spec: -# onerror +# onerror, onresize event_attrs = [ 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', - 'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload', + 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit', + 'onunload', ] +safe_attrs = [ + 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', + 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', + 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', + 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', + 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', + 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', + 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', + 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', + 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', + 'type', 'usemap', 'valign', 'value', 'vspace', 'width'] + # From http://htmlhelp.com/reference/html40/olist.html top_level_tags = [ 'html', 'head', 'body', 'frameset', Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Fri Jun 1 20:11:25 2007 @@ -0,0 +1,83 @@ 
+import os +import re +import rfc822 +import unittest +from lxml.tests.common_imports import doctest +from lxml.doctestcompare import LHTMLOutputChecker + +from lxml.html import HTML, parse_element +from lxml.html.clean import clean, clean_html + +feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data') +bar_re = re.compile(r"-----+") + +class DummyInput: + def __init__(self, **kw): + for name, value in kw.items(): + setattr(self, name, value) + +class FeedTestCase(unittest.TestCase): + + def __init__(self, filename): + self.filename = filename + unittest.TestCase.__init__(self) + + def parse(self): + f = open(self.filename, 'rb') + headers = rfc822.Message(f) + c = f.read() + f.close() + if not headers.keys(): + raise Exception( + "File %s has no headers" % self.filename) + self.description = headers['Description'] + self.expect = headers['Expect'] + self.ignore = headers.get('Ignore') + self.options = [ + o.strip() for o in headers['Options'].split(',') + if o.strip()] + parts = bar_re.split(c) + self.input = parts[0].rstrip() + '\n' + if parts[1:]: + self.expect = parts[1].rstrip() + '\n' + else: + self.expect = None + + def runTest(self): + self.parse() + if self.ignore: + # We've marked this test to be ignored. 
+ return + kw = {} + for name in self.options: + if name.startswith('-'): + kw[name[1:]] = False + else: + kw[name] = True + transformed = clean_html(self.input, **kw) + assert self.expect is not None, ( + "No expected output in %s" % self.filename) + checker = LHTMLOutputChecker() + if not checker.check_output(self.expect, transformed, 0): + result = checker.output_difference( + DummyInput(want=self.expect), transformed, 0) + #result += '\noptions: %s %r' % (', '.join(self.options), kw) + #result += repr(transformed) + raise Exception("\n"+result) + + def shortDescription(self): + return self.filename + +def test_suite(): + suite = unittest.TestSuite() + for fn in os.listdir(feed_dir): + fn = os.path.join(feed_dir, fn) + if fn.endswith('.data'): + case = FeedTestCase(fn) + suite.addTests([case]) + # This is my lazy way of stopping on first error: + try: + case.runTest() + except: + break + return suite Added: lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py Fri Jun 1 20:11:25 2007 @@ -0,0 +1,110 @@ +""" +This takes the feedparser tests from here: + + http://feedparser.org/tests/wellformed/sanitize/ + +and rewrites them to be easier to handle (not using the internal model +of feedparser). The input format is:: + + <!-- + Description: {description} + Expect: {expression} + --> + ... + <content ...>{content}</content> + ... + +The Expect expression is checked for +``entries[0]['content'][0]['value'] == {data}``. 
+ +The output format is:: + + Description: {description} + Expect: {expression} (if data couldn't be parsed) + Options: + + {content, unescaped} + ---------- + {data, unescaped, if found} + +""" + +import re +import os +import traceback + +_desc_re = re.compile(r'\s*Description:\s*(.*)') +_expect_re = re.compile(r'\s*Expect:\s*(.*)') +_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)") +_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)") + +def parse_content(content): + match = _desc_re.search(content) + desc = match.group(1) + match = _expect_re.search(content) + expect = match.group(1) + data = None + for regex in [_data_expect_re, _feed_data_expect_re]: + match = regex.search(expect) + if match: + # Icky, but I'll trust it + data = eval(match.group(1).strip()) + break + c = None + for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']: + regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S) + match = regex.search(content) + if match: + c = match.group(1) + break + assert c is not None + # Seems like body isn't quoted + if tag != 'body': + c = c.replace('&lt;', '<') + c = c.replace('&amp;', '&') + # FIXME: I should really do more unescaping...
+ return { + 'Description': desc, + 'Expect': expect, + 'data': data, + 'content': c} + +def serialize_content(d): + s = '''\ +Description: %(Description)s +Expect: %(Expect)s +Options: + +%(content)s +''' % d + if d.get('data') is not None: + s += '----------\n%s' % d['data'] + return s + +def translate_file(filename): + f = open(filename, 'rb') + c = f.read() + f.close() + try: + output = serialize_content(parse_content(c)) + except: + print 'Bad data in %s:' % filename + print c + traceback.print_exc() + print '-'*60 + return + new = os.path.splitext(filename)[0] + '.data' + f = open(new, 'wb') + f.write(output) + f.close() + +def translate_all(dir): + for fn in os.listdir(dir): + fn = os.path.join(dir, fn) + if fn.endswith('.xml'): + translate_file(fn) + +if __name__ == '__main__': + import sys + translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data')) + From ianb at codespeak.net Fri Jun 1 20:23:36 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 20:23:36 +0200 (CEST) Subject: [Lxml-checkins] r43977 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601182336.70C0280AD@code0.codespeak.net> Author: ianb Date: Fri Jun 1 20:23:35 2007 New Revision: 43977 Added: lxml/branch/html/src/lxml/html/diff.py - copied, changed from r43962, lxml/branch/html/src/lxml/html/htmldiff.py lxml/branch/html/src/lxml/html/tests/test_diff.py - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py lxml/branch/html/src/lxml/html/tests/test_diff.txt - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Removed: lxml/branch/html/src/lxml/html/htmldiff.py lxml/branch/html/src/lxml/html/tests/test_htmldiff.py lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Modified: lxml/branch/html/src/lxml/html/__init__.py Log: Remove debugging print; rename htmldiff to diff (lxml.html.diff) Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 20:23:35 2007 @@ -316,8 +316,6 @@ if not isinstance(create_parent, basestring): create_parent = 'div' return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent)) - else: - print '----------\n', html elements = parse_elements(html, no_leading_text=True) if not elements: raise etree.ParserError( Copied: lxml/branch/html/src/lxml/html/diff.py (from r43962, lxml/branch/html/src/lxml/html/htmldiff.py) ============================================================================== --- lxml/branch/html/src/lxml/html/htmldiff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Fri Jun 1 20:23:35 2007 @@ -770,7 +770,8 @@ if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=tag) - _merge_element_contents(el) + el.drop_tag() + #_merge_element_contents(el) def _contains_block_level_tag(el): """True if the element contains any block-level elements, like <p>, <td>, etc. 
Deleted: /lxml/branch/html/src/lxml/html/htmldiff.py ============================================================================== --- /lxml/branch/html/src/lxml/html/htmldiff.py Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,890 +0,0 @@ -import difflib -from lxml import etree -from lxml.html import parse_element -import cgi -import re - -__all__ = ['html_annotate', 'htmldiff'] - - -############################################################ -## Annotation -############################################################ - -def default_markup(text, version): - return '<span title="%s">%s</span>' % ( - cgi.escape(unicode(version), 1), text) - -def html_annotate(doclist, markup=default_markup): - """ - doclist should be ordered from oldest to newest, like:: - - >>> version1 = 'Hello World' - >>> version2 = 'Goodbye World' - >>> html_annotate([(version1, 'version 1'), - ... (version2, 'version 2')]) - u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>' - - The documents must be *fragments* (str/UTF8 or unicode), not - complete documents - - The markup argument is a function to markup the spans of words. - This function is called like markup('Hello', 'version 2'), and - returns HTML. The first argument is text and never includes any - markup. The default uses a span with a title: - - >>> default_markup('Some Text', 'by Joe') - u'<span title="by Joe">Some Text</span>' - """ - # The basic strategy we have is to split the documents up into - # logical tokens (which are words with attached markup). We then - # do diffs of each of the versions to track when a token first - # appeared in the document; the annotation attached to the token - # is the version where it first appeared. 
- tokenlist = [tokenize_annotated(doc, version) - for doc, version in doclist] - cur_tokens = tokenlist[0] - for tokens in tokenlist[1:]: - html_annotate_merge_annotations(cur_tokens, tokens) - cur_tokens = tokens - - # After we've tracked all the tokens, we can combine spans of text - # that are adjacent and have the same annotation - cur_tokens = compress_tokens(cur_tokens) - # And finally add markup - result = markup_serialize_tokens(cur_tokens, markup) - return ''.join(result).strip() - -def tokenize_annotated(doc, annotation): - """Tokenize a document and add an annotation attribute to each token - """ - tokens = tokenize(doc, include_hrefs=False) - for tok in tokens: - tok.annotation = annotation - return tokens - -def html_annotate_merge_annotations(tokens_old, tokens_new): - """Merge the annotations from tokens_old into tokens_new, when the - tokens in the new document already existed in the old document. - """ - s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) - commands = s.get_opcodes() - - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - eq_old = tokens_old[i1:i2] - eq_new = tokens_new[j1:j2] - copy_annotations(eq_old, eq_new) - -def copy_annotations(src, dest): - """ - Copy annotations from the tokens listed in src to the tokens in dest - """ - assert len(src) == len(dest) - for src_tok, dest_tok in zip(src, dest): - dest_tok.annotation = src_tok.annotation - -def compress_tokens(tokens): - """ - Combine adjacent tokens when there is no HTML between the tokens, - and they share an annotation - """ - result = [tokens[0]] - for tok in tokens[1:]: - if (not result[-1].post_tags and - not tok.pre_tags and - result[-1].annotation == tok.annotation): - compress_merge_back(result, tok) - else: - result.append(tok) - return result - -def compress_merge_back(tokens, tok): - """ Merge tok into the last element of tokens (modifying the list of - tokens in-place). 
""" - last = tokens[-1] - if type(last) is not token or type(tok) is not token: - tokens.append(tok) - else: - text = unicode(last) - if last.trailing_whitespace: - text += ' ' - text += tok - merged = token(text, - pre_tags=last.pre_tags, - post_tags=tok.post_tags, - trailing_whitespace=tok.trailing_whitespace) - merged.annotation = last.annotation - tokens[-1] = merged - -def markup_serialize_tokens(tokens, markup_func): - """ - Serialize the list of tokens into a list of text chunks, calling - markup_func around text to add annotations. - """ - for token in tokens: - for pre in token.pre_tags: - yield pre - html = token.html() - html = markup_func(html, token.annotation) - if token.trailing_whitespace: - html += ' ' - yield html - for post in token.post_tags: - yield post - - -############################################################ -## HTML Diffs -############################################################ - -def htmldiff(old_html, new_html): - """ Do a diff of the old and new document. The documents are HTML - *fragments* (str/UTF8 or unicode), they are not complete documents - (i.e., no <html> tag). - - Returns HTML with <ins> and <del> tags added around the - appropriate text. - - Markup is generally ignored, with the markup from new_html - preserved, and possibly some markup from old_html (though it is - considered acceptable to lose some of the old markup). Only the - words in the HTML are diffed. The exception is <img> tags, which - are treated like words, and the href attribute of <a> tags, which - are noted inside the tag itself when there are changes. - """ - old_html_tokens = tokenize(old_html) - new_html_tokens = tokenize(new_html) - result = htmldiff_tokens(old_html_tokens, new_html_tokens) - result = ''.join(result).strip() - return fixup_ins_del_tags(result) - -def htmldiff_tokens(html1_tokens, html2_tokens): - """ Does a diff on the tokens themselves, returning a list of text - chunks (not tokens). 
- """ - # There are several passes as we do the differences. The tokens - # isolate the portion of the content we care to diff; difflib does - # all the actual hard work at that point. - # - # Then we must create a valid document from pieces of both the old - # document and the new document. We generally prefer to take - # markup from the new document, and only do a best effort attempt - # to keep markup from the old document; anything that we can't - # resolve we throw away. Also we try to put the deletes as close - # to the location where we think they would have been -- because - # we are only keeping the markup from the new document, it can be - # fuzzy where in the new document the old text would have gone. - # Again we just do a best effort attempt. - s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) - commands = s.get_opcodes() - result = [] - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) - continue - if command == 'insert' or command == 'replace': - ins_tokens = expand_tokens(html2_tokens[j1:j2]) - merge_insert(ins_tokens, result) - if command == 'delete' or command == 'replace': - del_tokens = expand_tokens(html1_tokens[i1:i2]) - merge_delete(del_tokens, result) - # If deletes were inserted directly as <del> then we'd have an - # invalid document at this point. Instead we put in special - # markers, and when the complete diffed document has been created - # we try to move the deletes around and resolve any problems. - result = cleanup_delete(result) - - return result - -def expand_tokens(tokens, equal=False): - """Given a list of tokens, return a generator of the chunks of - text for the data in the tokens. 
- """ - for token in tokens: - for pre in token.pre_tags: - yield pre - if not equal or not token.hide_when_equal: - if token.trailing_whitespace: - yield token.html() + ' ' - else: - yield token.html() - for post in token.post_tags: - yield post - -def merge_insert(ins_chunks, doc): - """ doc is the already-handled document (as a list of text chunks); - here we add <ins>ins_chunks</ins> to the end of that. """ - # Though we don't throw away unbalanced_start or unbalanced_end - # (we assume there is accompanying markup later or earlier in the - # document), we only put <ins> around the balanced portion. - unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) - doc.extend(unbalanced_start) - if doc and not doc[-1].endswith(' '): - # Fix up the case where the word before the insert didn't end with - # a space - doc[-1] += ' ' - doc.append('<ins>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </ins> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</ins> ') - doc.extend(unbalanced_end) - -# These are sentinals to represent the start and end of a <del> -# segment, until we do the cleanup phase to turn them into proper -# markup: -class DEL_START: - pass -class DEL_END: - pass - -class NoDeletes(Exception): - """ Raised when the document no longer contains any pending deletes - (DEL_START/DEL_END) """ - -def merge_delete(del_chunks, doc): - """ Adds the text chunks in del_chunks to the document doc (another - list of text chunks) with marker to show it is a delete. - cleanup_delete later resolves these markers into <del> tags.""" - doc.append(DEL_START) - doc.extend(del_chunks) - doc.append(DEL_END) - -def cleanup_delete(chunks): - """ Cleans up any DEL_START/DEL_END markers in the document, replacing - them with <del></del>. To do this while keeping the document - valid, it may need to drop some tags (either start or end tags). 
- - It may also move the del into adjacent tags to try to move it to a - similar location where it was originally located (e.g., moving a - delete into preceding <div> tag, if the del looks like (DEL_START, - 'Text</div>', DEL_END)""" - while 1: - # Find a pending DEL_START/DEL_END, splitting the document - # into stuff-preceding-DEL_START, stuff-inside, and - # stuff-following-DEL_END - try: - pre_delete, delete, post_delete = split_delete(chunks) - except NoDeletes: - # Nothing found, we've cleaned up the entire doc - break - # The stuff-inside-DEL_START/END may not be well balanced - # markup. First we figure out what unbalanced portions there are: - unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) - # Then we move the span forward and/or backward based on these - # unbalanced portions: - locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) - locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) - doc = pre_delete - if doc and not doc[-1].endswith(' '): - # Fix up case where the word before us didn't have a trailing space - doc[-1] += ' ' - doc.append('<del>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </del> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</del> ') - doc.extend(post_delete) - chunks = doc - return chunks - -def split_unbalanced(chunks): - """Return (unbalanced_start, balanced, unbalanced_end), where each is - a list of text and tag chunks. - - unbalanced_start is a list of all the tags that are opened, but - not closed in this span. Similarly, unbalanced_end is a list of - tags that are closed but were not opened. 
Extracting these might - mean some reordering of the chunks.""" - start = [] - end = [] - tag_stack = [] - balanced = [] - for chunk in chunks: - if not chunk.startswith('<'): - balanced.append(chunk) - continue - endtag = chunk[1] == '/' - name = chunk.split()[0].strip('<>/') - if name in empty_tags: - assert not endtag, ( - "Empty tag %r should have no end tag" % chunk) - balanced.append(chunk) - continue - if endtag: - if tag_stack and tag_stack[-1][0] == name: - balanced.append(chunk) - name, pos, tag = tag_stack.pop() - balanced[pos] = tag - elif tag_stack: - start.extend(tag for name, pos, tag in tag_stack) - tag_stack = [] - end.append(chunk) - else: - end.append(chunk) - else: - tag_stack.append((name, len(balanced), chunk)) - balanced.append(None) - start.extend( - [chunk for name, pos, chunk in tag_stack]) - balanced = [chunk for chunk in balanced if chunk is not None] - return start, balanced, end - -def split_delete(chunks): - """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, - stuff_after_DEL_END). Returns the first case found (there may be - more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if - there's no DEL_START found. """ - try: - pos = chunks.index(DEL_START) - except ValueError: - raise NoDeletes - pos2 = chunks.index(DEL_END) - return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] - -def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete): - """ pre_delete and post_delete implicitly point to a place in the - document (where the two were split). This moves that point (by - popping items from one and pushing them onto the other). It moves - the point to try to find a place where unbalanced_start applies. 
- - As an example:: - - >>> unbalanced_start = ['<div>'] - >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] - >>> pre, post = doc[:3], doc[3:] - >>> pre, post - (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) - >>> locate_unbalanced_start(unbalanced_start, pre, post) - >>> pre, post - (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) - - As you can see, we moved the point so that the dangling <div> that - we found will be effectively replaced by the div in the original - document. If this doesn't work out, we just throw away - unbalanced_start without doing anything. - """ - while 1: - if not unbalanced_start: - # We have totally succeded in finding the position - break - finding = unbalanced_start[0] - finding_name = finding.split()[0].strip('<>') - if not post_delete: - break - next = post_delete[0] - if next is DEL_START or not next.startswith('<'): - # Reached a word, we can't move the delete text forward - break - if next[1] == '/': - # Reached a closing tag, can we go further? Maybe not... - break - name = next.split()[0].strip('<>') - if name == 'ins': - # Can't move into an insert - break - assert name != 'del', ( - "Unexpected delete tag: %r" % next) - if name == finding_name: - unbalanced_start.pop(0) - pre_delete.append(post_delete.pop(0)) - else: - # Found a tag that doesn't match - break - -def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete): - """ like locate_unbalanced_start, except handling end tags and - possibly moving the point earlier in the document. 
""" - while 1: - if not unbalanced_end: - # Success - break - finding = unbalanced_end[-1] - finding_name = finding.split()[0].strip('<>/') - if not pre_delete: - break - next = pre_delete[-1] - if next is DEL_END or not next.startswith('</'): - # A word or a start tag - break - name = next.split()[0].strip('<>/') - if name == 'ins' or name == 'del': - # Can't move into an insert or delete - break - if name == finding_name: - unbalanced_end.pop() - post_delete.insert(0, pre_delete.pop()) - else: - # Found a tag that doesn't match - break - -class token(unicode): - """ Represents a diffable token, generally a word that is displayed to - the user. Opening tags are attached to this token when they are - adjacent (pre_tags) and closing tags that follow the word - (post_tags). Some exceptions occur when there are empty tags - adjacent to a word, so there may be close tags in pre_tags, or - open tags in post_tags. - - We also keep track of whether the word was originally followed by - whitespace, even though we do not want to treat the word as - equivalent to a similar word that does not have a trailing - space.""" - - # When this is true, the token will be eliminated from the - # displayed diff if no change has occurred: - hide_when_equal = False - - def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False): - obj = unicode.__new__(cls, text) - - if pre_tags is not None: - obj.pre_tags = pre_tags - else: - obj.pre_tags = [] - - if post_tags is not None: - obj.post_tags = post_tags - else: - obj.post_tags = [] - - obj.trailing_whitespace = trailing_whitespace - - return obj - - def __repr__(self): - return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags) - - def html(self): - return unicode(self) - -class tag_token(token): - - """ Represents a token that is actually a tag. Currently this is just - the <img> tag, which takes up visible space just like a word but - is only represented in a document by a tag. 
""" - - def __new__(cls, tag, data, html_repr, pre_tags=None, - post_tags=None, trailing_whitespace=False): - obj = token.__new__(cls, "%s: %s" % (type, data), - pre_tags=pre_tags, - post_tags=post_tags, - trailing_whitespace=trailing_whitespace) - obj.tag = tag - obj.data = data - obj.html_repr = html_repr - return obj - - def __repr__(self): - return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % ( - self.tag, - self.data, - self.html_repr, - self.pre_tags, - self.post_tags, - self.trailing_whitespace) - def html(self): - return self.html_repr - -class href_token(token): - - """ Represents the href in an anchor tag. Unlike other words, we only - show the href when it changes. """ - - hide_when_equal = True - - def html(self): - return 'Link: %s' % self - -def tokenize(html, include_hrefs=True): - """ - Parse the given HTML and returns token objects (words with attached tags). - - This parses only the content of a page; anything in the head is - ignored, and the <head> and <body> elements are themselves - optional. The content is then parsed by lxml, which ensures the - validity of the resulting parsed document (though lxml may make - incorrect guesses when the markup is particular bad). - - <ins> and <del> tags are also eliminated from the document, as - that gets confusing. - - If include_hrefs is true, then the href attribute of <a> tags is - included as a special kind of diffable token.""" - body_el = parse_html(html, cleanup=True) - # Then we split the document into text chunks for each tag, word, and end tag: - chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs) - # Finally re-joining them into token objects: - return fixup_chunks(chunks) - -def parse_html(html, cleanup=True): - """ - Parses an HTML fragment, returning an lxml element. Note that the HTML will be - wrapped in a <div> tag that was not in the original document. 
- - If cleanup is true, make sure there's no <head> or <body>, and get - rid of any <ins> and <del> tags. - """ - if cleanup: - # This removes any extra markup or structure like <head>: - html = cleanup_html(html) - return parse_element(html, create_parent=True) - -_body_re = re.compile(r'<body.*?>', re.I|re.S) -_end_body_re = re.compile(r'</body.*?>', re.I|re.S) -_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) - -def cleanup_html(html): - """ This 'cleans' the HTML, meaning that any page structure is removed - (only the contents of <body> are used, if there is any <body). - Also <ins> and <del> tags are removed. """ - match = _body_re.search(html) - if match: - html = html[match.end():] - match = _end_body_re.search(html) - if match: - html = html[:match.start()] - html = _ins_del_re.sub('', html) - return html - - -end_whitespace_re = re.compile(r'[ \t\n\r]$') - -def fixup_chunks(chunks): - """ - This function takes a list of chunks and produces a list of tokens. - """ - tag_accum = [] - cur_word = None - result = [] - for chunk in chunks: - if isinstance(chunk, tuple): - if chunk[0] == 'img': - src = chunk[1] - tag = chunk[2] - if tag.endswith(' '): - tag = tag[:-1] - trailing_whitespace = True - else: - trailing_whitespace = False - cur_word = tag_token('img', src, html_repr=tag, - pre_tags=tag_accum, - trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - elif chunk[0] == 'href': - href = chunk[1] - cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True) - tag_accum = [] - result.append(cur_word) - continue - if is_word(chunk): - if chunk.endswith(' '): - chunk = chunk[:-1] - trailing_whitespace = True - else: - trailing_whitespace = False - cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - elif is_start_tag(chunk): - tag_accum.append(chunk) - elif is_end_tag(chunk): - if tag_accum: - tag_accum.append(chunk) - else: - 
assert cur_word, ( - "Weird state, cur_word=%r, result=%r, chunks=%r of %r" - % (cur_word, result, chunk, chunks)) - cur_word.post_tags.append(chunk) - else: - assert(0) - - if not result: - return [token('', pre_tags=tag_accum)] - else: - result[-1].post_tags.extend(tag_accum) - - return result - - -# All the tags in HTML that don't require end tags: -empty_tags = ( - 'param', 'img', 'area', 'br', 'basefont', 'input', - 'base', 'meta', 'link', 'col') - -block_level_tags = ( - 'address', - 'blockquote', - 'center', - 'dir', - 'div', - 'dl', - 'fieldset', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'hr', - 'isindex', - 'menu', - 'noframes', - 'noscript', - 'ol', - 'p', - 'pre', - 'table', - 'ul', - ) - -block_level_container_tags = ( - 'dd', - 'dt', - 'frameset', - 'li', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ) - - -def flatten_el(el, include_hrefs, drop_tag=False): - """ Takes an lxml element el, and generates all the text chunks for - that tag. Each start tag is a chunk, each word is a chunk, and each - end tag is a chunk. - - If drop_tag is true, then the outermost container tag is - not returned (just its contents).""" - if not drop_tag: - if el.tag == 'img': - yield ('img', el.attrib['src'], start_tag(el)) - else: - yield start_tag(el) - if el.tag in empty_tags and not el.text and not len(el): - return - start_words = split_words(el.text) - for word in start_words: - yield cgi.escape(word) - for child in el: - for item in flatten_el(child, include_hrefs=include_hrefs): - yield item - if el.tag == 'a' and el.attrib.get('href') and include_hrefs: - yield ('href', el.attrib['href']) - if not drop_tag: - yield end_tag(el) - end_words = split_words(el.tail) - for word in end_words: - yield cgi.escape(word) - -def split_words(text): - """ Splits some text into words. Includes trailing whitespace (one - space) on each word when appropriate. 
""" - if not text or not text.strip(): - return [] - words = [w + ' ' for w in text.strip().split()] - if not end_whitespace_re.search(text): - words[-1] = words[-1][:-1] - return words - -start_whitespace_re = re.compile(r'^[ \t\n\r]') - -def start_tag(el): - """ - The text representation of the start tag for a tag. - """ - return '<%s%s>' % ( - el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True)) - for name, value in el.attrib.items())) - -def end_tag(el): - """ The text representation of an end tag for a tag. Includes - trailing whitespace when appropriate. """ - if el.tail and start_whitespace_re.search(el.tail): - extra = ' ' - else: - extra = '' - return '</%s>%s' % (el.tag, extra) - -def is_word(tok): - return not tok.startswith('<') - -def is_end_tag(tok): - return tok.startswith('</') - -def is_start_tag(tok): - return tok.startswith('<') and not tok.startswith('</') - -def fixup_ins_del_tags(html): - """ Given an html string, move any <ins> or <del> tags inside of any - block-level elements, e.g. transform <ins><p>word</p></ins> to - <p><ins>word</ins></p> """ - doc = parse_html(html, cleanup=False) - _fixup_ins_del_tags(doc) - html = serialize_html_fragment(doc, drop_outer=True) - return html - -def serialize_html_fragment(el, drop_outer=False): - """ Serialize a single lxml element as HTML. The serialized form - includes the elements tail. 
- - If drop_outer is true, then don't serialize the outermost tag - """ - - html_xsl = """\ -<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> - <xsl:output method="html" encoding="UTF-8" /> - <xsl:template match="/"> - <xsl:copy-of select="."/> - </xsl:template> -</xsl:transform> -""" - transform = etree.XSLT(etree.XML(html_xsl)) - assert not isinstance(el, basestring), ( - "You should pass in an element, not a string like %r" % el) - html = str(transform(el)) - if drop_outer: - # Get rid of the extra starting tag: - html = html[html.find('>')+1:] - if drop_outer: - # Get rid of the extra end tag: - html = html[:html.rfind('<')] - if drop_outer: - return html.strip() - else: - return html.lstrip() - -def _fixup_ins_del_tags(doc): - """fixup_ins_del_tags that works on an lxml document in-place - """ - for tag in ['ins', 'del']: - for el in doc.xpath('descendant-or-self::%s' % tag): - if not _contains_block_level_tag(el): - continue - _move_el_inside_block(el, tag=tag) - _merge_element_contents(el) - -def _contains_block_level_tag(el): - """True if the element contains any block-level elements, like <p>, <td>, etc. - """ - if el.tag in block_level_tags or el.tag in block_level_container_tags: - return True - for child in el: - if _contains_block_level_tag(child): - return True - return False - -def _move_el_inside_block(el, tag): - """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags - and moves them inside any block-level tags. 
""" - for child in el: - if _contains_block_level_tag(child): - break - else: - import sys - # No block-level tags in any child - children_tag = etree.Element(tag) - children_tag.text = el.text - el.text = None - children_tag.extend(list(el)) - el[:] = [children_tag] - return - for child in list(el): - if _contains_block_level_tag(child): - _move_el_inside_block(child, tag) - if child.tail: - tail_tag = etree.Element(tag) - tail_tag.text = child.tail - child.tail = None - el.insert(el.index(child)+1, tail_tag) - else: - child_tag = etree.Element(tag) - el.replace(child, child_tag) - child_tag.append(child) - if el.text: - text_tag = etree.Element(tag) - text_tag.text = el.text - el.text = None - el.insert(0, text_tag) - -def _merge_element_contents(el): - """ - Removes an element, but merges its contents into its place, e.g., - given <p>Hi <i>there!</i></p>, if you remove the <i> element you get - <p>Hi there!</p> - """ - parent = el.getparent() - text = el.text or '' - if el.tail: - if not len(el): - text += el.tail - else: - if el[-1].tail: - el[-1].tail += el.tail - else: - el[-1].tail = el.tail - index = parent.index(el) - if text: - if index == 0: - previous = None - else: - previous = parent[index-1] - if previous is None: - if parent.text: - parent.text += text - else: - parent.text = text - else: - if previous.tail: - previous.tail += text - else: - previous.tail = text - parent[index:index+1] = el.getchildren() - -class InsensitiveSequenceMatcher(difflib.SequenceMatcher): - """ - Acts like SequenceMatcher, but tries not to find very small equal - blocks amidst large spans of changes - """ - - threshold = 2 - - def get_matching_blocks(self): - size = min(len(self.b), len(self.b)) - threshold = min(self.threshold, size / 4) - actual = difflib.SequenceMatcher.get_matching_blocks(self) - return [item for item in actual - if item[2] > threshold - or not item[2]] - -# def get_matching_blocks(self): -# size = min(len(self.b), len(self.b)) -# threshold = 
min(self.threshold, size / 4) -# actual = difflib.SequenceMatcher.get_matching_blocks(self) -# last_equal_a = 0 -# eliminate = [] -# for i in xrange(1, len(actual)-1): -# start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2]) -# end_diff_length = actual[i+1][0] -# for a_pos, b_pos, length in actual: -# if (last_equal_a - a_pos is big -# and length is small -# and next_equal_a is far away): -# continue -# result.append((a_pos, b_pos, length)) -# last_equal_a = a_pos+length -# return result - - -if __name__ == '__main__': - import doctest - doctest.testmod() - Copied: lxml/branch/html/src/lxml/html/tests/test_diff.py (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py) ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_htmldiff.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_diff.py Fri Jun 1 20:23:35 2007 @@ -1,12 +1,12 @@ import unittest from lxml.tests.common_imports import doctest -from lxml.html import htmldiff +from lxml.html import diff def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'), - doctest.DocTestSuite(htmldiff)]) + suite.addTests([doctest.DocFileSuite('test_diff.txt'), + doctest.DocTestSuite(diff)]) return suite if __name__ == '__main__': Copied: lxml/branch/html/src/lxml/html/tests/test_diff.txt (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt) ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_diff.txt Fri Jun 1 20:23:35 2007 @@ -1,4 +1,4 @@ -htmldiff does HTML comparisons. These are word-based comparisons. +lxml.html.diff does HTML comparisons. These are word-based comparisons. 
First, a handy function for normalizing whitespace and doing word wrapping:: @@ -12,7 +12,7 @@ Example:: - >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate + >>> from lxml.html.diff import htmldiff, html_annotate >>> html1 = '<p>This is some test text with some changes and some same stuff</p>' >>> html2 = '''<p>This is some test textual writing with some changed stuff ... and some same stuff</p>''' @@ -187,36 +187,13 @@ <p><a href="/foo"><span version="0">Hey</span> <span version="1">Guy</span></a></p> +Internals +--------- -Here's a test of a utility function!: +Some utility functions:: - >>> from lxml.html.htmldiff import _merge_element_contents - >>> from lxml import etree - >>> doc = '''<html><body><div> - ... <div id="c1">a b <span id="d1">content</span> c d</div> - ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div> - ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div> - ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div> - ... </div></body></html>''' - >>> doc = etree.HTML(doc) - >>> def show_result(id): - ... el = doc.xpath("//*[@id='d%s']" % id)[0] - ... _merge_element_contents(el) - ... container = doc.xpath("//*[@id='c%s']" % id)[0] - ... print etree.tostring(container).strip() - >>> show_result(1) - <div id="c1">a b content c d</div> - >>> show_result(2) - <div id="c2">content <b>and more</b> stuff trailing</div> - >>> show_result(3) - <div id="c3"><b>hi</b><i>content</i></div> - >>> show_result(4) - <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div> - -More utility: - - >>> from lxml.html.htmldiff import fixup_ins_del_tags + >>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced >>> def pfixup(text): ... 
print fixup_ins_del_tags(text).strip() >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>') @@ -233,7 +210,7 @@ </tr></table> -Testing split_unbalanced: +Testing split_unbalanced:: >>> split_unbalanced(['<a href="blah">', 'hey', '</a>']) ([], ['<a href="blah">', 'hey', '</a>'], []) Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py ============================================================================== --- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,13 +0,0 @@ -import unittest -from lxml.tests.common_imports import doctest - -from lxml.html import htmldiff - -def test_suite(): - suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'), - doctest.DocTestSuite(htmldiff)]) - return suite - -if __name__ == '__main__': - unittest.main() Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt ============================================================================== --- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,248 +0,0 @@ -htmldiff does HTML comparisons. These are word-based comparisons. - -First, a handy function for normalizing whitespace and doing word wrapping:: - - >>> import re, textwrap - >>> def pwrapped(text): - ... text = re.sub(r'[ \n\t\r]+', ' ', text) - ... text = textwrap.fill(text) - ... print text - >>> def pdiff(text1, text2): - ... pwrapped(htmldiff(text1, text2)) - -Example:: - - >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate - >>> html1 = '<p>This is some test text with some changes and some same stuff</p>' - >>> html2 = '''<p>This is some test textual writing with some changed stuff - ... 
and some same stuff</p>''' - >>> pdiff(html1, html2) - <p>This is some test <ins>textual writing with some changed - stuff</ins> <del>text with some changes</del> and some same stuff</p> - -Style tags are largely ignored in terms of differences, though markup is not eliminated:: - - >>> html1 = '<p>Hi <i>you guys</i></p>' - >>> html2 = '<p>Hi <i>you</i> guys</p>' - >>> pdiff(html1, html2) - <p>Hi <i>you</i> guys</p> - >>> pdiff('text', '<p>text</p>') - <p>text</p> - >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!') - <i>Hi <ins>guy</ins> <del>guys</del> </i> !! - >>> pdiff('H<i>i</i>', 'Hi') - <ins>Hi</ins> <del>H<i>i</i></del> - >>> pdiff('<i>A B</i> C', '<i>A</i> C') - <i>A <del>B</del> </i> C - >>> pdiff('<i>A B</i> C', '<i>B</i> C') - <i> <del>A</del> B</i> C - >>> pdiff('<p></p>', '<p></p>') - <p></p> - >>> pdiff('<p>Hi</p>', '<p>Bye</p>') - <p><ins>Bye</ins></p> <p><del>Hi</del></p> - >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>') - <p> <ins>Bye</ins> <del>Hi</del> Guy</p> - >>> pdiff('<p>Hey there</p>', '') - <ins></ins> <p><del>Hey there</del></p> - -Whitespace is ignored, as it's not meaningful in HTML:: - - >>> pdiff('<div>Hi\n\nguys</div>', '<div>Hi guy</div>') - <div>Hi <ins>guy</ins> <del>guys</del> </div> - -Movement between paragraphs is ignored, as tag-based changes are generally ignored:: - >>> - >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>') - <p>Hello World</p> - -As a special case, changing the href of a link is displayed, and -images are treated like words: - - >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>') - <a href="http://google.com">search <ins>Link: http://google.com</ins> - <del>Link: http://yahoo.com</del> </a> - >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>') - <p>Print this <del><img src="print.gif"></del> </p> - >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>') - <a href="http://yahoo.com">search</a> - -The 
sixteen combinations:: - -First "insert start" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>D B C</b') - <b> <ins>D</ins> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>D A C</b>') - <b> <ins>D</ins> A <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>D A B</b>') - <b> <ins>D</ins> A B <del>C</del> </b> - >>> pdiff('<b>A B C</b>', '<b>D A B C</b>') - <b> <ins>D</ins> A B C</b> - -Next, "insert middle" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>D B C</b>') - <b> <ins>D</ins> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>A D C</b>') - <b>A <ins>D</ins> <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>A D B</b>') - <b>A <ins>D</ins> B <del>C</del> </b> - -This one case hits the threshold of our insensitive matching: - - >>> pdiff('<b>A B C</b>', '<b>A D B C</b>') - <b> <ins>A D</ins> <del>A</del> B C</b> - - -Then "insert end" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>B C D</b>') - <b> <del>A</del> B C <ins>D</ins> </b> - >>> pdiff('<b>A B C</b>', '<b>A C D</b>') - <b>A <del>B</del> C <ins>D</ins> </b> - >>> pdiff('<b>A B C</b>', '<b>A B D</b>') - <b>A B <ins>D</ins> <del>C</del> </b> - >>> pdiff('<b>A B C</b>', '<b>A B C D</b>') - <b>A B C <ins>D</ins> </b> - -Then no insert (del start/middle/end): - - >>> pdiff('<b>A B C</b>', '<b>B C</b>') - <b> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>A C</b>') - <b>A <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>A B</b>') - <b>A B <del>C</del> </b> - - >>> pdiff('<b>A B</b> C', '<b>A B</b>') - <b>A B</b> <del>C</del> - >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>') - <b>A B</b> <del><b>C</b></del> - >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A') - A <p><del><b>hey there</b> <i>how are you?</i></del></p> - -Testing a larger document, to make sure there are not weird -unnecessary parallels found: - - >>> pdiff(''' - ... <p>This is a test document with many words in it that goes on - ... 
for a while and doesn't have anything do to with the next - ... document that we match this against</p>''', ''' - ... <p>This is another document with few similarities to the preceding - ... one, but enough that it may have overlap that could turn into - ... a confusing series of deletes and inserts. - ... </p>''') - <p><ins>This is another document with few similarities to the - preceding one, but enough that it may have overlap that could turn - into a confusing series of deletes and inserts. </ins></p> - <p><del>This is a test document with many words in it that goes on for - a while and doesn't have anything do to with the next document that we - match this against</del></p> - - - -Annotation of content can also be done, where every bit of content is -marked up with information about where it came from. - -First, some setup; note that html_annotate is called with a sequence -of documents and the annotation associated with that document. We'll -just use indexes, but you could use author or timestamp information. - - >>> def markup(text, annotation): - ... return '<span version="%s">%s</span>' % (annotation, text) - >>> def panno(*docs): - ... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)], - ... markup=markup)) - -Now, a sequence of documents: - - >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world') - <span version="1">Hi</span> <span version="0">world</span> - >>> panno('A similar document', 'A similar document', - ... 
'A similar document here') - <span version="0">A similar document</span> <span - version="2">here</span> - >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>') - <p><span version="0">P1 para</span></p><p><span version="1">P3 - foo</span></p> - >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>') - <span version="0">Hello</span><p><span version="0">There</span> <span - version="1">Town</span></p> - >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town') - <p><span version="0">Hello</span></p><span version="0">There</span> - <span version="1">Town</span> - >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>') - <p><span version="0">Hello</span></p><p><span version="0">There</span> - <span version="1">Town</span></p> - >>> panno('<p>Hi <img src="/foo"> You</p>', - ... '<p>Hi You</p>', - ... '<p>Hi You <img src="/bar"></p>') - <p><span version="0">Hi</span> <span version="1">You</span> <span - version="2"><img src="/bar"></span></p> - >>> panno('<p><a href="/foo">Hey</a></p>', - ... '<p><a href="/bar">Hey</a></p>') - <p><a href="/bar"><span version="0">Hey</span></a></p> - >>> panno('<p><a href="/foo">Hey You</a></p>', - ... '<p><a href="/foo">Hey Guy</a></p>') - <p><a href="/foo"><span version="0">Hey</span> <span - version="1">Guy</span></a></p> - - - -Here's a test of a utility function!: - - >>> from lxml.html.htmldiff import _merge_element_contents - >>> from lxml import etree - >>> doc = '''<html><body><div> - ... <div id="c1">a b <span id="d1">content</span> c d</div> - ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div> - ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div> - ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div> - ... </div></body></html>''' - >>> doc = etree.HTML(doc) - >>> def show_result(id): - ... el = doc.xpath("//*[@id='d%s']" % id)[0] - ... _merge_element_contents(el) - ... 
container = doc.xpath("//*[@id='c%s']" % id)[0] - ... print etree.tostring(container).strip() - >>> show_result(1) - <div id="c1">a b content c d</div> - >>> show_result(2) - <div id="c2">content <b>and more</b> stuff trailing</div> - >>> show_result(3) - <div id="c3"><b>hi</b><i>content</i></div> - >>> show_result(4) - <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div> - -More utility: - - >>> from lxml.html.htmldiff import fixup_ins_del_tags - >>> def pfixup(text): - ... print fixup_ins_del_tags(text).strip() - >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>') - <p><ins>some text <b>and more text</b> and more</ins></p> - >>> pfixup('<p><ins>Hi!</ins> you</p>') - <p><ins>Hi!</ins> you</p> - >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>') - <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div> - >>> pfixup(''' - ... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''') - <table><tr> - <td><ins>One table</ins></td> - <td><ins>More stuff</ins></td> - </tr></table> - - -Testing split_unbalanced: - - >>> split_unbalanced(['<a href="blah">', 'hey', '</a>']) - ([], ['<a href="blah">', 'hey', '</a>'], []) - >>> split_unbalanced(['<a href="blah">', 'hey']) - (['<a href="blah">'], ['hey'], []) - >>> split_unbalanced(['Hey', '</i>', 'You', '</b>']) - ([], ['Hey', 'You'], ['</i>', '</b>']) - >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>']) - ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>']) - >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There']) - (['<b>'], ['So', 'Hi', 'There'], ['</i>']) - From ianb at codespeak.net Fri Jun 1 21:43:00 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 21:43:00 +0200 (CEST) Subject: [Lxml-checkins] r43979 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601194300.9C17E80AE@code0.codespeak.net> Author: ianb Date: Fri Jun 1 21:42:58 2007 New Revision: 43979 Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py (contents, props changed) lxml/branch/html/src/lxml/html/tests/test_autolink.txt (contents, props changed) Modified: lxml/branch/html/src/lxml/html/clean.py Log: Added an autolinking function Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 21:42:58 2007 @@ -3,23 +3,30 @@ from lxml.html import defs from lxml.html import parse, tostring -__all__ = ['clean_html', 'clean'] +__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html'] -# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html -# Other on* attributes that aren't standard? -# Try these tests: http://feedparser.org/tests/wellformed/sanitize/ -# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# max width for words +# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# I have multiple kinds of schemes searched; but should schemes be +# whitelisted instead? +# max width for words (but not in pre or textarea) # max height? -# autolink? -# CSS stuff? -# remove images? +# autolink? (don't autolink in textarea, pre, code) +# remove images? Also in CSS? background attribute? +# Some way to whitelist object, iframe, etc (e.g., if you want to +# allow *just* embedded YouTube movies) +# Log what was deleted and why? 
# This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( r'expression\(.*?\)', re.S|re.I) +# All kinds of schemes besides just javascript: that can cause +# execution: +_javascript_scheme_re = re.compile( + r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I) +_whitespace_re = re.compile(r'\s+') + def clean_html(html, **kw): """ Like clean(), but takes a text input document, and returns a text @@ -93,7 +100,7 @@ Remove any form tags ``annoying_tags``: - Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?) + Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>`` ``remove_tags``: A list of tags to remove. @@ -190,7 +197,7 @@ remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) if annoying_tags: - remove_tags.extend(['blink']) + remove_tags.extend(['blink', 'marque']) bad = [] for el in _itertree(doc): if el.tag in kill_tags: @@ -238,8 +245,136 @@ el.attrib['rel'] = 'nofollow' def _remove_javascript(link): - if link.strip().startswith('javascript:'): + # links like "j a v a s c r i p t:" might be interpreted in IE + new = _whitespace_re.sub('', link) + if _javascript_scheme_re.search(new): # FIXME: should this be None to delete? 
return '' return link +_link_regexes = [ + re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I), + # This is conservative, but autolinking can be a bit conservative: + re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I), + ] + +_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] + +_avoid_hosts = [ + re.compile(r'^localhost', re.I), + re.compile(r'\bexample\.(?:com|org|net)$', re.I), + re.compile(r'^127\.0\.0\.1$'), + ] + +_avoid_classes = ['nolink'] + +def autolink(el, link_regexes=_link_regexes, + avoid_elements=_avoid_elements, + avoid_hosts=_avoid_hosts, + avoid_classes=_avoid_classes): + """ + Turn any URLs into links. + + It will search for links identified by the given regular + expressions (by default mailto and http(s) links). + + It won't link text in an element in avoid_elements, or an element + with a class in avoid_classes. It won't link to anything with a + host that matches one of the regular expressions in avoid_hosts + (default localhost and 127.0.0.1). + + If you pass in an element, the elements tail will not be + substituted, only the contents of the element. 
+ """ + if el.tag in avoid_elements: + return + class_name = el.attrib.get('class') + if class_name: + class_name = class_name.split() + for match_class in avoid_classes: + if match_class in class_name: + return + for child in list(el): + autolink(child, link_regexes=link_regexes, + avoid_elements=avoid_elements, + avoid_hosts=avoid_hosts, + avoid_classes=avoid_classes) + if child.tail: + text, tail_children = _link_text( + child.tail, link_regexes, avoid_hosts, factory=el.makeelement) + if tail_children: + child.tail = text + index = el.index(child) + el[index+1:index+1] = tail_children + if el.text: + text, pre_children = _link_text( + el.text, link_regexes, avoid_hosts, factory=el.makeelement) + if pre_children: + el.text = text + el[:0] = pre_children + +def _link_text(text, link_regexes, avoid_hosts, factory): + leading_text = '' + links = [] + last_pos = 0 + while 1: + best_match, best_pos = None, None + for regex in link_regexes: + regex_pos = last_pos + while 1: + match = regex.search(text, pos=regex_pos) + if match is None: + break + host = match.group('host') + for host_regex in avoid_hosts: + if host_regex.search(host): + regex_pos = match.end() + break + else: + break + if match is None: + continue + if best_pos is None or match.start() < best_pos: + best_match = match + best_pos = match.start() + if best_match is None: + # No more matches + if links: + assert not links[-1].tail + links[-1].tail = text + else: + assert not leading_text + leading_text = text + break + link = best_match.group(0) + end = best_match.end() + if link.endswith('.') or link.endswith(','): + # These punctuation marks shouldn't end a link + end -= 1 + link = link[:-1] + prev_text = text[:best_match.start()] + if links: + assert not links[-1].tail + links[-1].tail = prev_text + else: + assert not leading_text + leading_text = prev_text + anchor = factory('a') + anchor.attrib['href'] = link + body = best_match.group('body') + if not body: + body = link + if body.endswith('.') or 
body.endswith(','): + body = body[:-1] + anchor.text = body + links.append(anchor) + text = text[end:] + return leading_text, links + +def autolink_html(html, *args, **kw): + doc = parse(html) + autolink(doc, *args, **kw) + return tostring(doc) + + + Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.py Fri Jun 1 21:42:58 2007 @@ -0,0 +1,10 @@ +import unittest +from lxml.tests.common_imports import doctest + +def test_suite(): + suite = unittest.TestSuite() + suite.addTests([doctest.DocFileSuite('test_autolink.txt')]) + return suite + +if __name__ == '__main__': + unittest.main() Added: lxml/branch/html/src/lxml/html/tests/test_autolink.txt ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 21:42:58 2007 @@ -0,0 +1,37 @@ +This tests autolink:: + + >>> from lxml.html import usedoctest + >>> from lxml.html.clean import autolink_html + >>> print autolink_html(''' + ... <div>Link here: http://test.com/foo.html.</div> + ... ''') + <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div> + >>> print autolink_html(''' + ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div> + ... ''') + <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a> + or <a href="http://myhome.com">http://myhome.com</a></div> + >>> print autolink_html(''' + ... <div>The <b>great</b> thing is the http://link.com links <i>and</i> + ... the http://foobar.com links.</div>''') + <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i> + the <a href="http://foobar.com">http://foobar.com</a> links.</div> + +Some cases that won't be caught (on purpose):: + + >>> print autolink_html(''' + ... 
<div>A link to http://localhost/foo/bar won't, but a link to + ... http://test.com will</div>''') + <div>A link to http://localhost/foo/bar won't, but a link to + <a href="http://test.com">http://test.com</a> will</div> + >>> print autolink_html(''' + ... <div>A link in <textarea>http://test.com</textarea></div>''') + <div>A link in <textarea>http://test.com</textarea></div> + >>> print autolink_html(''' + ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>''') + <div>A link in <a href="http://foo.com">http://bar.com</a></div> + >>> print autolink_html(''' + ... <div>A link in <code>http://foo.com</code> or + ... <span class="nolink">http://bar.com</span></div>''') + <div>A link in <code>http://foo.com</code> or + <span class="nolink">http://bar.com</span></div> From ianb at codespeak.net Fri Jun 1 22:40:07 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 22:40:07 +0200 (CEST) Subject: [Lxml-checkins] r43982 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601204007.30FE5807C@code0.codespeak.net> Author: ianb Date: Fri Jun 1 22:40:06 2007 New Revision: 43982 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_autolink.txt Log: Added long word breaking Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 22:40:06 2007 @@ -3,14 +3,14 @@ from lxml.html import defs from lxml.html import parse, tostring -__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html'] +__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html', + 'word_break', 'word_break_html'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# Particularly the CSS cleaning; most of the tag cleaning is integrated now # I have multiple kinds of schemes searched; but should schemes be # whitelisted instead? -# max width for words (but not in pre or textarea) # max height? -# autolink? (don't autolink in textarea, pre, code) # remove images? Also in CSS? background attribute? # Some way to whitelist object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) @@ -376,5 +376,82 @@ autolink(doc, *args, **kw) return tostring(doc) - - +_avoid_word_break_elements = ['pre', 'textarea', 'code'] +_avoid_word_break_classes = ['nobreak'] + +def word_break(el, max_width=40, + avoid_elements=_avoid_word_break_elements, + avoid_classes=_avoid_word_break_classes, + break_character=u'\u200b'): + """ + Breaks any long words found in the body of the text (not attributes). + + Doesn't effect any of the tags in avoid_elements, by default + textarea and pre + + Breaks words by inserting &#8203;, which is a unicode character + for Zero Width Space character.
This generally takes up no space + in rendering, but does copy as a space, and in monospace contexts + usually takes up space. + + See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion + """ + # Character suggestion of ​ comes from: + # http://www.cs.tut.fi/~jkorpela/html/nobr.html + if el.tag in _avoid_word_break_elements: + return + class_name = el.attrib.get('class') + if class_name: + dont_break = False + class_name = class_name.split() + for avoid in avoid_classes: + if avoid in class_name: + dont_break = True + break + if dont_break: + return + if el.text: + el.text = _break_text(el.text, max_width, break_character) + for child in el: + word_break(child, max_width=max_width, + avoid_elements=avoid_elements, + avoid_classes=avoid_classes, + break_character=break_character) + if child.tail: + child.tail = _break_text(child.tail, max_width, break_character) + +def word_break_html(html, *args, **kw): + doc = parse(html) + word_break(doc, *args, **kw) + return tostring(doc) + +def _break_text(text, max_width, break_character): + words = text.split() + for word in words: + if len(word) > max_width: + replacement = _insert_break(word, max_width, break_character) + text = text.replace(word, replacement) + return text + +_break_prefer_re = re.compile(r'[^a-z]', re.I) + +def _insert_break(word, width, break_character): + orig_word = word + result = '' + while len(word) > width: + start = word[:width] + breaks = list(_break_prefer_re.finditer(start)) + if breaks: + last_break = breaks[-1] + # Only walk back up to 10 characters to find a nice break: + if last_break.end() > width-10: + # FIXME: should the break character be at the end of the + # chunk, or the beginning of the next chunk? 
+ start = word[:last_break.end()] + result += start + break_character + word = word[len(start):] + result += word + return result + + + Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_autolink.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 22:40:06 2007 @@ -35,3 +35,28 @@ ... <span class="nolink">http://bar.com</span></div>''') <div>A link in <code>http://foo.com</code> or <span class="nolink">http://bar.com</span></div> + +There's also a word wrapping function, that should probably be run +after autolink:: + + >>> from lxml.html.clean import word_break_html + >>> def pascii(s): + ... print s.decode('utf8').encode('ascii', 'xmlcharrefreplace') + >>> pascii(word_break_html(''' + ... <div>Hey you + ... 12345678901234567890123456789012345678901234567890</div>''')) + <div>Hey you + 1234567890123456789012345678901234567890&#8203;1234567890</div> + +Not everything is broken: + + >>> pascii(word_break_html(''' + ... <div>Hey you + ... <code>12345678901234567890123456789012345678901234567890</code></div>''')) + <div>Hey you + <code>12345678901234567890123456789012345678901234567890</code></div> + >>> pascii(word_break_html(''' + ... 
<a href="12345678901234567890123456789012345678901234567890">text</a>''')) + <a href="12345678901234567890123456789012345678901234567890">text</a> + + \ No newline at end of file From ianb at codespeak.net Fri Jun 1 22:43:28 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 22:43:28 +0200 (CEST) Subject: [Lxml-checkins] r43983 - lxml/branch/html/src/lxml/html Message-ID: <20070601204328.580968091@code0.codespeak.net> Author: ianb Date: Fri Jun 1 22:43:22 2007 New Revision: 43983 Modified: lxml/branch/html/src/lxml/html/clean.py Log: allow whitespace after expression () Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 22:43:22 2007 @@ -19,7 +19,7 @@ # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( - r'expression\(.*?\)', re.S|re.I) + r'expression\s*\(.*?\)', re.S|re.I) # All kinds of schemes besides just javascript: that can cause # execution: From ianb at codespeak.net Sat Jun 2 02:43:06 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 2 Jun 2007 02:43:06 +0200 (CEST) Subject: [Lxml-checkins] r43986 - lxml/branch/html/sr