[Lxml-checkins] r43854 - in lxml/branch/html: . src/lxml src/lxml/html src/lxml/html/tests

ianb at codespeak.net ianb at codespeak.net
Tue May 29 17:05:52 CEST 2007


Author: ianb
Date: Tue May 29 17:05:51 2007
New Revision: 43854

Added:
   lxml/branch/html/
      - copied from r43853, lxml/trunk/
   lxml/branch/html/src/lxml/doctestcompare.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/
   lxml/branch/html/src/lxml/html/__init__.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/clean.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/defs.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/htmldiff.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/rewritelinks.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/
   lxml/branch/html/src/lxml/html/tests/__init__.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_basic.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_basic.txt   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_clean.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_clean.txt   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_htmldiff.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt   (contents, props changed)
   lxml/branch/html/src/lxml/html/usedoctest.py   (contents, props changed)
   lxml/branch/html/src/lxml/usedoctest.py   (contents, props changed)
Modified:
   lxml/branch/html/setup.py
Log:
Branch with lxml.html work


Modified: lxml/branch/html/setup.py
==============================================================================
--- lxml/trunk/setup.py	(original)
+++ lxml/branch/html/setup.py	Tue May 29 17:05:51 2007
@@ -70,7 +70,7 @@
     ],
 
     package_dir = {'': 'src'},
-    packages = ['lxml'],
+    packages = ['lxml', 'lxml.html'],
     zip_safe = False,
     ext_modules = setupinfo.ext_modules(
     STATIC_INCLUDE_DIRS, STATIC_LIBRARY_DIRS, STATIC_CFLAGS),

Added: lxml/branch/html/src/lxml/doctestcompare.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/doctestcompare.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,395 @@
+"""
+lxml-based doctest output comparison.
+
+To use this you must call ``doctestcompare.install()``, which will cause
+doctest to use this in all subsequent calls.
+
+This changes the way output is checked and comparisons are made for
+XML or HTML-like content.
+
+XML or HTML content is noticed because the example starts with ``<``
+(it's HTML if it starts with ``<html``).  You can also use the
+``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
+
+Some rough wildcard-like things are allowed.  Whitespace is generally
+ignored (except in attributes).  In text (attributes and text in the
+body) you can use ``...`` as a wildcard.  In an example it also
+matches any trailing tags in the element, though it does not match
+leading tags.  You may create a tag ``<any>`` or include an ``any``
+attribute in the tag.  An ``any`` tag matches any tag, while the
+attribute matches any and all attributes.
+
+When a match fails, the reformatted example and the actual output are
+displayed (indented), and a rough diff-like output is given.  Anything
+marked with ``-`` is in the output but wasn't supposed to be, and
+similarly ``+`` means it's in the example but wasn't in the output.
+"""
+
+from lxml import etree
+from lxml.html import HTML
+import re
+import doctest
+import cgi
+
+PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
+PARSE_XML = doctest.register_optionflag('PARSE_XML')
+
+OutputChecker = doctest.OutputChecker
+
+def strip(v):
+    if v is None:
+        return None
+    else:
+        return v.strip()
+
+class LXMLOutputChecker(OutputChecker):
+
+    empty_tags = (
+        'param', 'img', 'area', 'br', 'basefont', 'input',
+        'base', 'meta', 'link', 'col')
+
+    default_parser = etree.XML
+
+    def check_output(self, want, got, optionflags):
+        alt_self = getattr(self, '_temp_override_self', None)
+        if alt_self is not None:
+            super_method = self._temp_call_super_check_output
+            self = alt_self
+        else:
+            super_method = OutputChecker.check_output
+        parser = self.get_parser(want, got, optionflags)
+        if not parser:
+            return super_method(
+                self, want, got, optionflags)
+        try:
+            want_doc = parser(want)
+        except etree.XMLSyntaxError:
+            return False
+        try:
+            got_doc = parser(got)
+        except etree.XMLSyntaxError:
+            return False
+        return self.compare_docs(want_doc, got_doc)
+
+    def get_parser(self, want, got, optionflags):
+        parser = None
+        if PARSE_HTML & optionflags:
+            parser = HTML
+        elif PARSE_XML & optionflags:
+            parser = etree.XML
+        elif want.strip().lower().startswith('<html'):
+            parser = HTML
+        elif want.strip().startswith('<'):
+            parser = self.default_parser
+        return parser
+
+    def compare_docs(self, want, got):
+        if want.tag != got.tag and want.tag != 'any':
+            return False
+        if not self.text_compare(want.text, got.text, True):
+            return False
+        if not self.text_compare(want.tail, got.tail, True):
+            return False
+        if 'any' not in want.attrib:
+            want_keys = sorted(want.attrib.keys())
+            got_keys = sorted(got.attrib.keys())
+            if want_keys != got_keys:
+                return False
+            for key in want_keys:
+                if not self.text_compare(want.attrib[key], got.attrib[key], False):
+                    return False
+        if want.text != '...' or len(want):
+            want_children = list(want)
+            got_children = list(got)
+            while want_children or got_children:
+                if not want_children or not got_children:
+                    return False
+                want_first = want_children.pop(0)
+                got_first = got_children.pop(0)
+                if not self.compare_docs(want_first, got_first):
+                    return False
+                if not got_children and want_first.tail == '...':
+                    break
+        return True
+
+    def text_compare(self, want, got, strip):
+        want = want or ''
+        got = got or ''
+        if strip:
+            want = want.strip()
+            got = got.strip()
+        want = '^%s$' % re.escape(want)
+        want = want.replace(r'\.\.\.', '.*')
+        if re.search(want, got):
+            return True
+        else:
+            return False
+
+    def output_difference(self, example, got, optionflags):
+        want = example.want
+        parser = self.get_parser(want, got, optionflags)
+        errors = []
+        if parser is not None:
+            try:
+                want_doc = parser(want)
+            except etree.XMLSyntaxError, e:
+                errors.append('In example: %s' % e)
+            try:
+                got_doc = parser(got)
+            except etree.XMLSyntaxError, e:
+                errors.append('In actual output: %s' % e)
+        if parser is None or errors:
+            value = OutputChecker.output_difference(
+                self, example, got, optionflags)
+            if errors:
+                errors.append(value)
+                return '\n'.join(errors)
+            else:
+                return value
+        html = parser is etree.HTML
+        diff_parts = []
+        diff_parts.append('Expected:')
+        diff_parts.append(self.format_doc(want_doc, html, 2))
+        diff_parts.append('Got:')
+        diff_parts.append(self.format_doc(got_doc, html, 2))
+        diff_parts.append('Diff:')
+        diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
+        return '\n'.join(diff_parts)
+
+    def html_empty_tag(self, el, html=True):
+        if not html:
+            return False
+        if el.tag not in self.empty_tags:
+            return False
+        if el.text or len(el):
+            # This shouldn't happen (contents in an empty tag)
+            return False
+        return True
+
+    def format_doc(self, doc, html, indent, prefix=''):
+        parts = []
+        if not len(doc):
+            # No children...
+            parts.append(' '*indent)
+            parts.append(prefix)
+            parts.append(self.format_tag(doc))
+            if not self.html_empty_tag(doc, html):
+                if strip(doc.text):
+                    parts.append(self.format_text(doc.text))
+                parts.append(self.format_end_tag(doc))
+            if strip(doc.tail):
+                parts.append(self.format_text(doc.tail))
+            parts.append('\n')
+            return ''.join(parts)
+        parts.append(' '*indent)
+        parts.append(prefix)
+        parts.append(self.format_tag(doc))
+        if not self.html_empty_tag(doc, html):
+            parts.append('\n')
+            if strip(doc.text):
+                parts.append(' '*indent)
+                parts.append(self.format_text(doc.text))
+                parts.append('\n')
+            for el in doc:
+                parts.append(self.format_doc(el, html, indent+2))
+            parts.append(' '*indent)
+            parts.append(self.format_end_tag(doc))
+            parts.append('\n')
+        if strip(doc.tail):
+            parts.append(' '*indent)
+            parts.append(self.format_text(doc.tail))
+            parts.append('\n')
+        return ''.join(parts)
+
+    def format_text(self, text, strip=True):
+        if text is None:
+            return ''
+        if strip:
+            text = text.strip()
+        return cgi.escape(text, 1)
+
+    def format_tag(self, el):
+        attrs = []
+        for name, value in sorted(el.attrib.items()):
+            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
+        if not attrs:
+            return '<%s>' % el.tag
+        return '<%s %s>' % (el.tag, ' '.join(attrs))
+    
+    def format_end_tag(self, el):
+        return '</%s>' % el.tag
+
+    def collect_diff(self, want, got, html, indent):
+        parts = []
+        if not len(want) and not len(got):
+            parts.append(' '*indent)
+            parts.append(self.collect_diff_tag(want, got))
+            if not self.html_empty_tag(got, html):
+                parts.append(self.collect_diff_text(want.text, got.text))
+                parts.append(self.collect_diff_end_tag(want, got))
+            parts.append(self.collect_diff_text(want.tail, got.tail))
+            parts.append('\n')
+            return ''.join(parts)
+        parts.append(' '*indent)
+        parts.append(self.collect_diff_tag(want, got))
+        parts.append('\n')
+        if strip(want.text) or strip(got.text):
+            parts.append(' '*indent)
+            parts.append(self.collect_diff_text(want.text, got.text))
+            parts.append('\n')
+        want_children = list(want)
+        got_children = list(got)
+        while want_children or got_children:
+            if not want_children:
+                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
+                continue
+            if not got_children:
+                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
+                continue
+            parts.append(self.collect_diff(
+                want_children.pop(0), got_children.pop(0), html, indent+2))
+        parts.append(' '*indent)
+        parts.append(self.collect_diff_end_tag(want, got))
+        parts.append('\n')
+        if strip(want.tail) or strip(got.tail):
+            parts.append(' '*indent)
+            parts.append(self.collect_diff_text(want.tail, got.tail))
+            parts.append('\n')
+        return ''.join(parts)
+
+    def collect_diff_tag(self, want, got):
+        if want.tag != got.tag and want.tag != 'any':
+            tag = '%s (not %s)' % (got.tag, want.tag)
+        else:
+            tag = got.tag
+        attrs = []
+        any = want.tag == 'any' or 'any' in want.attrib
+        for name, value in sorted(got.attrib.items()):
+            if name not in want.attrib and not any:
+                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
+            else:
+                if name in want.attrib:
+                    text = self.collect_diff_text(value, want.attrib[name], False)
+                else:
+                    text = self.format_text(value, False)
+                attrs.append('%s="%s"' % (name, text))
+        if not any:
+            for name, value in sorted(got.attrib.items()):
+                if name in got.attrib:
+                    continue
+                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
+        if attrs:
+            tag = '<%s %s>' % (tag, ' '.join(attrs))
+        else:
+            tag = '<%s>' % tag
+        return tag
+
+    def collect_diff_end_tag(self, want, got):
+        if want.tag != got.tag:
+            tag = '%s (not %s)' % (got.tag, want.tag)
+        else:
+            tag = got.tag
+        return '</%s>' % tag
+
+    def collect_diff_text(self, want, got, strip=True):
+        if self.text_compare(want, got, strip):
+            if not got:
+                return ''
+            return self.format_text(got, strip)
+        text = '%s (not %s)' % (got, want)
+        return self.format_text(text, strip)
+
+class LHTMLOutputChecker(LXMLOutputChecker):
+    default_parser = HTML
+    
+def install(html=False):
+    """
+    Install doctestcompare for all future doctests.
+
+    If html is true, then by default the HTML parser will be used;
+    otherwise the XML parser is used.
+    """
+    if html:
+        doctest.OutputChecker = LHTMLOutputChecker
+    else:
+        doctest.OutputChecker = LXMLOutputChecker
+
+def temp_install(html=False):
+    """
+    Use this *inside* a doctest to enable this checker for this
+    doctest only.
+
+    If html is true, then by default the HTML parser will be used;
+    otherwise the XML parser is used.
+    """
+    if html:
+        Checker = LHTMLOutputChecker
+    else:
+        Checker = LXMLOutputChecker
+    frame = _find_doctest_frame()
+    dt_self = frame.f_locals['self']
+    checker = Checker()
+    old_checker = dt_self._checker
+    dt_self._checker = checker
+    # The unfortunate thing is that there is a local variable 'check'
+    # in the function that runs the doctests, that is a bound method
+    # into the output checker.  We have to update that.  We can't
+    # modify the frame, so we have to modify the object in place.  The
+    # only way to do this is to actually change the func_code
+    # attribute of the method.  We change it, and then wait for
+    # __record_outcome to be run, which signals the end of the __run
+    # method, at which point we restore the previous check_output
+    # implementation.
+    check_func = frame.f_locals['check'].im_func
+    # Because we can't patch up func_globals, this is the only global
+    # in check_output that we care about:
+    doctest.etree = etree
+    _RestoreChecker(dt_self, old_checker, checker,
+                    check_func, checker.check_output.im_func)
+
+class _RestoreChecker(object):
+    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func):
+        self.dt_self = dt_self
+        self.checker = old_checker
+        self.checker._temp_call_super_check_output = self.call_super
+        self.checker._temp_override_self = new_checker
+        self.check_func = check_func
+        self.clone_func = clone_func
+        self.install_clone()
+        self.install_dt_self()
+    def install_clone(self):
+        self.func_code = self.check_func.func_code
+        self.func_globals = self.check_func.func_globals
+        self.check_func.func_code = self.clone_func.func_code
+    def uninstall_clone(self):
+        self.check_func.func_code = self.func_code
+    def install_dt_self(self):
+        self.prev_func = self.dt_self._DocTestRunner__record_outcome
+        self.dt_self._DocTestRunner__record_outcome = self
+    def uninstall_dt_self(self):
+        self.dt_self._DocTestRunner__record_outcome = self.prev_func
+    def __call__(self, *args, **kw):
+        self.uninstall_clone()
+        self.uninstall_dt_self()
+        del self.checker._temp_override_self
+        del self.checker._temp_call_super_check_output
+        return self.prev_func(*args, **kw)
+    def call_super(self, *args, **kw):
+        self.uninstall_clone()
+        try:
+            return self.check_func(*args, **kw)
+        finally:
+            self.install_clone()
+            
+def _find_doctest_frame():
+    import sys
+    frame = sys._getframe(1)
+    while frame:
+        l = frame.f_locals
+        if 'BOOM' in l:
+            # Sign of doctest
+            return frame
+        frame = frame.f_back
+    raise LookupError(
+        "Could not find doctest (only use this function *inside* a doctest)")
+    

Added: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/__init__.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,201 @@
+import threading
+import re
+from lxml import etree
+
+__all__ = ['HTML', 'tostring']
+
+_rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]")
+#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
+_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+
+class HtmlMixin(object):
+
+    def remove_element(self):
+        """
+        Removes this element from the tree, including its children and
+        text.  The tail text is joined to the previous element or
+        parent.
+        """
+        parent = self.getparent()
+        assert parent
+        index = parent.index(self)
+        if self.tail:
+            if index == 0:
+                parent.text = (parent.text or '') + self.tail
+            else:
+                previous = parent[index-1]
+                previous.tail = (previous.tail or '') + self.tail
+        parent.remove(self)
+
+    def remove_tag(self):
+        """
+        Remove the tag, but not its children or text.  The children and text
+        are merged into the parent.
+        """
+        parent = self.getparent()
+        assert parent
+        index = parent.index(self)
+        if self.text:
+            if index == 0:
+                parent.text = (parent.text or '') + self.text
+            else:
+                prev = parent[index-1]
+                prev.tail = (prev.tail or '') + self.text
+        if self.tail:
+            if len(self):
+                last = self[-1]
+                last.tail = (last.tail or '') + self.tail
+            elif index == 0:
+                parent.text = (parent.text or '') + self.tail
+            else:
+                prev = parent[index-1]
+                prev.tail = (prev.tail or '') + self.tail
+        parent[index:index+1] = list(self)
+
+    def find_rel_links(self, rel):
+        return _rel_links_xpath(self, rel=rel.lower())
+
+    def find_class(self, class_name):
+        return _class_xpath(self, class_name=class_name.lower())
+
+class HtmlComment(etree._Comment, HtmlMixin):
+    pass
+
+class HtmlElement(etree.ElementBase, HtmlMixin):
+    pass
+
+class HtmlLookup(etree.CustomElementClassLookup):
+
+    def lookup(self, node_type, document, namespace, name):
+        if node_type == 'element':
+            return HtmlElement
+        elif node_type == 'comment':
+            return HtmlComment
+        else:
+            # Delegate
+            return None
+
+html_parser = etree.HTMLParser()
+html_parser.setElementClassLookup(HtmlLookup())
+
+def HTML(html):
+    # FIXME: should this notice a fragment and parse accordingly?
+    value = etree.HTML(html, html_parser)
+    if value is None:
+        raise ParserError(
+            "Could not parse document")
+    return value
+
+def parse_elements(html, no_leading_text=False):
+    """
+    Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string (though leading
+    whitespace is removed).  If no_leading_text is true, then it will
+    be an error if there is leading text.
+    """
+    # FIXME: check what happens when you give html with a body, head, etc.
+    html = '<html><head></head><body>%s</body></html>' % html
+    doc = HTML(html)
+    assert doc.tag == 'html'
+    bodies = [e for e in doc if e.tag == 'body']
+    assert len(bodies) == 1
+    body = bodies[0]
+    elements = []
+    if no_leading_text and body.text and body.text.strip():
+        raise ParserError(
+            "There is leading text: %r" % body.text)
+    if body.text and body.text.strip():
+        elements.append(body.text)
+    elements.extend(body)
+    # FIXME: removing the reference to the parent artificial document
+    # would be nice
+    return elements
+
+def parse_element(html, create_parent=False):
+    """
+    Parses a single HTML element; it is an error if there is more than
+    one element, or if anything but whitespace precedes or follows the
+    element.
+
+    If create_parent is true (or is a tag name) then a parent node
+    will be created to encapsulate the HTML in a single element.
+    """
+    if create_parent:
+        if not isinstance(create_parent, basestring):
+            create_parent = 'div'
+        return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent))
+    elements = parse_elements(html, no_leading_text=True)
+    if not elements:
+        raise ParserError(
+            "No elements found")
+    if len(elements) > 1:
+        raise ParserError(
+            "Multiple elements found (%s)"
+            % ', '.join([e.tag for e in elements]))
+    el = elements[0]
+    if el.tail and el.tail.strip():
+        raise ParserError(
+            "Element followed by text: %r" % el.tail)
+    el.tail = None
+    return el
+
+def Element(*args, **kw):
+    # FIXME: this is totally broken; segfaults
+    v = HtmlElement(*args, **kw)
+    return v
+
+############################################################
+## Serialization
+############################################################
+
+_html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="html" encoding="UTF-8" />
+  <xsl:template match="/">
+    <xsl:copy-of select="."/>
+  </xsl:template>
+</xsl:transform>
+"""
+
+_pretty_html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="html" encoding="UTF-8" indent="yes" />
+  <xsl:template match="/">
+    <xsl:copy-of select="."/>
+  </xsl:template>
+</xsl:transform>
+"""
+
+_local_transforms = threading.local()
+# FIXME: should we just lazily compile these?
+_local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
+_local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))
+
+# This isn't a general match, but it's a match for what XSLT specifically creates:
+_meta_content_type_re = re.compile(
+    r'<meta http-equiv="Content-Type".*?>')
+
+def tostring(doc, pretty=False, include_meta_content_type=False):
+    """
+    return HTML string representation of the document given 
+ 
+    note: serializing creates a meta http-equiv="Content-Type" tag in the head (and may
+    replace any that are present); it is removed again unless include_meta_content_type is true
+    """
+    assert doc is not None
+    if pretty:
+        try:
+            pretty_html_transform = _local_transforms.pretty_html_transform
+        except AttributeError:
+            pretty_html_transform = _local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))
+        html = str(pretty_html_transform(doc))
+    else:
+        try:
+            html_transform = _local_transforms.html_transform
+        except AttributeError:
+            html_transform = _local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
+        html = str(html_transform(doc))
+    if not include_meta_content_type:
+        html = _meta_content_type_re.sub('', html)
+    return html

Added: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/clean.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,157 @@
+from lxml import etree
+from lxml.html import defs
+from lxml.html import HTML, tostring
+
+__all__ = ['clean_html', 'clean']
+
+def clean_html(html, **kw):
+    """
+    Like clean(), but takes a text input document, and returns a text
+    document.
+    """
+    doc = HTML(html)
+    clean(doc, **kw)
+    return tostring(doc)
+
+def clean(doc,
+          scripts=True,
+          javascript=True,
+          comments=True,
+          # process instructions?
+          style=False,
+          links=False,
+          embedded=True,
+          frames=True,
+          forms=True,
+          remove_tags=None,
+          allow_tags=None,
+          strip_tags=True,
+          remove_unknown_tags=True,
+          add_nofollow=False,
+          # callbacks?
+          ):
+    """
+    Cleans the document of each of the possible offending elements:
+
+    ``scripts``:
+        Any ``<script>`` tags.
+
+    ``javascript``:
+        Any Javascript, like an ``onclick`` attribute.
+
+    ``comments``:
+        Any comments.
+
+    ``style``:
+        Any style tags or attributes.
+
+    ``links``:
+        Remove any ``<link>`` tags
+
+    ``frames``:
+        Remove any frame-related tags
+
+    ``embedded``:
+        Remove any embedded objects (flash, iframes)
+
+    ``forms``:
+        Remove any form tags
+
+    ``remove_tags``:
+        A list of tags to remove.
+
+    ``allow_tags``:
+        A list of tags to include (default include all).
+
+    ``strip_tags``:
+        If true, then any tag taken out by remove_tags or allow_tags will
+        leave its text in place; if false, then the tag and its content are
+        removed.
+
+    ``remove_unknown_tags``:
+        Remove any tags that aren't standard parts of HTML.
+
+    ``add_nofollow``:
+        If true, then any <a> tags will have ``rel="nofollow"`` added to them.
+
+    This modifies the document *in place*.
+    """
+    kill_tags = []
+    remove_tags = list(remove_tags or [])
+    if scripts:
+        kill_tags.append('script')
+    if javascript:
+        for attrib in defs.event_attrs:
+            for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
+                del el.attrib[attrib]
+        for attrib in defs.link_attrs:
+            # FIXME: should call lower-case()
+            for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib):
+                if isinstance(el, basestring):
+                    assert 0, repr(el)
+                el.attrib[attrib] = ""
+    if comments:
+        # Easier way?
+        bad = []
+        for el in doc.iterdescendants():
+            if isinstance(el, etree._Comment):
+                bad.append(el)
+        for el in bad:
+            el.remove_element()
+    if style:
+        kill_tags.append('style')
+        for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
+            el.remove_element()
+        for el in doc.xpath('descendant-or-self::*[@style]'):
+            del el.attrib['style']
+    if links:
+        kill_tags.append('link')
+    if embedded:
+        kill_tags.extend(['object', 'embed', 'iframe'])
+    if frames:
+        kill_tags.extend(defs.frame_tags)
+    if forms:
+        # FIXME: do I even care about fieldset and legend?  I don't
+        # care about label.
+        remove_tags.extend(['form', 'fieldset', 'legend'])
+        kill_tags.extend(['button', 'input', 'select', 'textarea'])
+    bad = []
+    for el in doc.iterdescendants():
+        if el.tag in kill_tags:
+            bad.append(el)
+    for el in bad:
+        el.remove_element()
+    if remove_tags:
+        xpath = ' | '.join([
+            "descendant-or-self::%s" % tag
+            for tag in remove_tags])
+        for el in doc.xpath(xpath):
+            if strip_tags:
+                el.remove_tag()
+            else:
+                # FIXME: Should we test if this has been removed because of a parent?
+                el.remove_element()
+    if remove_unknown_tags:
+        if allow_tags:
+            raise ValueError(
+                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+        allow_tags = defs.tags
+    if allow_tags:
+        bad = []
+        for el in doc.iterdescendants():
+            if el.tag not in allow_tags:
+                bad.append(el)
+        for el in bad:
+            if strip_tags:
+                el.remove_tag()
+            else:
+                # FIXME: Should we test if this has been removed because of a parent?
+                el.remove_element()
+    if add_nofollow:
+        for el in doc.xpath('descendant-or-self::a[@href]'):
+            href = el.attrib['href']
+            if not href or href.startswith('#'):
+                # internal link, we don't care
+                continue
+            el.attrib['rel'] = 'nofollow'
+

Added: lxml/branch/html/src/lxml/html/defs.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/defs.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,100 @@
+# Data taken from http://www.w3.org/TR/html401/index/elements.html
+
+empty_tags = [
+    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+    'img', 'input', 'isindex', 'link', 'meta', 'param']
+
+deprecated_tags = [
+    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
+    'menu', 's', 'strike', 'u']
+
+# archive actually takes a space-separated list of URIs
+link_attrs = [
+    'action', 'archive', 'background', 'cite', 'classid',
+    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
+    'usemap']
+
+# Not in the HTML 4 spec:
+# onerror
+event_attrs = [
+    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
+    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
+    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
+    'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload',
+    ]
+
+# From http://htmlhelp.com/reference/html40/olist.html
+top_level_tags = [
+    'html', 'head', 'body', 'frameset',
+    ]
+
+head_tags = [
+    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
+    ]
+
+general_block_tags = [
+    'address',
+    'blockquote',
+    'center',
+    'del',
+    'div',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'hr',
+    'ins',
+    'isindex',
+    'noscript',
+    'p',
+    'pre',
+    ]
+
+list_tags = [
+    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
+    ]
+
+table_tags = [
+    'table', 'caption', 'colgroup', 'col',
+    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
+    ]
+
+# just this one from
+# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
+block_tags = general_block_tags + list_tags + table_tags + [
+    # Partial form tags
+    'fieldset', 'form', 'legend', 'optgroup', 'option',
+    ]
+
+form_tags = [
+    'form', 'button', 'fieldset', 'legend', 'input', 'label',
+    'select', 'optgroup', 'option', 'textarea',
+    ]
+
+special_inline_tags = [
+    'a', 'applet', 'basefont', 'bdo', 'br', 'font', 'iframe',
+    'img', 'map', 'area', 'object', 'param', 'q', 'script',
+    'span', 'sub', 'sup',
+    ]
+
+phrase_tags = [
+    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
+    'ins', 'kbd', 'samp', 'strong', 'var',
+    ]
+
+font_style_tags = [
+    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
+    ]
+
+frame_tags = [
+    'frameset', 'frame', 'noframes',
+    ]
+
+# These tags aren't standard
+nonstandard_tags = ['blink', 'marque']
+
+tags = (top_level_tags + head_tags + general_block_tags + list_tags
+        + table_tags + form_tags + special_inline_tags + phrase_tags
+        + font_style_tags + nonstandard_tags)

Added: lxml/branch/html/src/lxml/html/htmldiff.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/htmldiff.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,890 @@
+import difflib
+from lxml import etree
+from lxml.html import parse_element
+import cgi
+import re
+
+__all__ = ['html_annotate', 'htmldiff']
+
+
+############################################################
+## Annotation
+############################################################
+
+def default_markup(text, version):
+    """Wrap text in a <span> whose title attribute carries the version.
+
+    The version is converted to unicode and HTML-escaped (quotes
+    included) before it is placed in the attribute; text is inserted
+    unchanged.
+    """
+    title = cgi.escape(unicode(version), 1)
+    return '<span title="%s">%s</span>' % (title, text)
+
+def html_annotate(doclist, markup=default_markup):
+    """
+    Annotate the newest document with the version each word came from.
+
+    doclist is a sequence of (document, version) pairs ordered from
+    oldest to newest, like::
+
+        >>> version1 = 'Hello World'
+        >>> version2 = 'Goodbye World'
+        >>> html_annotate([(version1, 'version 1'),
+        ...                (version2, 'version 2')])
+        u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'
+
+    The documents must be *fragments* (str/UTF8 or unicode), not
+    complete documents
+
+    markup is the function used to mark up each span of words.  It is
+    called like markup('Hello', 'version 2') and returns HTML.  Its
+    first argument is text and never includes any markup.  The default
+    uses a span with a title:
+
+        >>> default_markup('Some Text', 'by Joe')
+        u'<span title="by Joe">Some Text</span>'
+    """
+    # Strategy: every document is split into tokens (words with their
+    # attached markup).  Each version is then diffed against its
+    # successor, and tokens that survive unchanged inherit the
+    # annotation of the version in which they first appeared.
+    versions = []
+    for doc, version in doclist:
+        versions.append(tokenize_annotated(doc, version))
+    latest = versions[0]
+    for newer in versions[1:]:
+        html_annotate_merge_annotations(latest, newer)
+        latest = newer
+
+    # Adjacent words that share an annotation are folded into one span ...
+    merged = compress_tokens(latest)
+    # ... and only then is the markup function applied.
+    chunks = markup_serialize_tokens(merged, markup)
+    return ''.join(chunks).strip()
+
+def tokenize_annotated(doc, annotation):
+    """Tokenize doc (hrefs excluded) and stamp every resulting token
+    with the given annotation.  Returns the token list.
+    """
+    annotated = tokenize(doc, include_hrefs=False)
+    for item in annotated:
+        item.annotation = annotation
+    return annotated
+
+def html_annotate_merge_annotations(tokens_old, tokens_new):
+    """Carry annotations forward from tokens_old to tokens_new.
+
+    Every run of tokens that the matcher reports as equal in both
+    lists gets the annotations of the old tokens copied onto the new
+    tokens (tokens_new is modified in place).
+    """
+    matcher = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
+    for opcode in matcher.get_opcodes():
+        if opcode[0] != 'equal':
+            # Inserted/replaced/deleted spans keep their own annotation
+            continue
+        old_start, old_end, new_start, new_end = opcode[1:]
+        copy_annotations(tokens_old[old_start:old_end],
+                         tokens_new[new_start:new_end])
+
+def copy_annotations(src, dest):
+    """
+    Give each token in dest the annotation of the token at the same
+    position in src.  Both sequences must have the same length.
+    """
+    assert len(src) == len(dest)
+    for position, original in enumerate(src):
+        dest[position].annotation = original.annotation
+
+def compress_tokens(tokens):
+    """
+    Fold neighbouring tokens together when no HTML separates them and
+    both carry the same annotation.  Returns a new list; tokens must
+    contain at least one item.
+    """
+    merged = [tokens[0]]
+    for current in tokens[1:]:
+        previous = merged[-1]
+        if previous.post_tags or current.pre_tags:
+            # Markup sits between the two words, keep them apart
+            merged.append(current)
+        elif previous.annotation == current.annotation:
+            compress_merge_back(merged, current)
+        else:
+            merged.append(current)
+    return merged
+
+def compress_merge_back(tokens, tok):
+    """ Fold tok into the final element of tokens, changing the list
+    in place.  If either item is not a plain token, tok is simply
+    appended instead.  """
+    previous = tokens[-1]
+    # Exact type comparison on purpose: subclasses of token are never
+    # merged, only appended.
+    if type(previous) is not token or type(tok) is not token:
+        tokens.append(tok)
+        return
+    if previous.trailing_whitespace:
+        separator = ' '
+    else:
+        separator = ''
+    combined = token(unicode(previous) + separator + tok,
+                     pre_tags=previous.pre_tags,
+                     post_tags=tok.post_tags,
+                     trailing_whitespace=tok.trailing_whitespace)
+    combined.annotation = previous.annotation
+    tokens[-1] = combined
+    
+def markup_serialize_tokens(tokens, markup_func):
+    """
+    Generate the text chunks for a list of tokens.  For each token the
+    pre tags come first, then the token's HTML passed through
+    markup_func together with its annotation (followed by one space if
+    the token had trailing whitespace), then the post tags.
+    """
+    for tok in tokens:
+        for pre in tok.pre_tags:
+            yield pre
+        annotated = markup_func(tok.html(), tok.annotation)
+        if tok.trailing_whitespace:
+            # The space stays outside whatever markup_func produced
+            annotated += ' '
+        yield annotated
+        for post in tok.post_tags:
+            yield post
+
+
+############################################################
+## HTML Diffs
+############################################################
+
+def htmldiff(old_html, new_html):
+    """ Diff two HTML *fragments* (str/UTF8 or unicode; not complete
+    documents, i.e., no <html> tag).
+
+    The result is HTML in which <ins> and <del> tags surround the
+    text that was added or removed.
+
+    Only the words are compared.  Markup is mostly ignored: the markup
+    of new_html is preserved, and some markup of old_html may survive
+    (losing part of the old markup is considered acceptable).  Two
+    exceptions: <img> tags are treated like words, and the href
+    attribute of <a> tags is noted inside the tag itself when it
+    changes.
+    """
+    old_tokens = tokenize(old_html)
+    new_tokens = tokenize(new_html)
+    chunks = htmldiff_tokens(old_tokens, new_tokens)
+    joined = ''.join(chunks).strip()
+    return fixup_ins_del_tags(joined)
+
+def htmldiff_tokens(html1_tokens, html2_tokens):
+    """ Diff two token lists and return the result as a list of text
+    chunks (not tokens).
+    """
+    # The tokens isolate the content we want to compare; the sequence
+    # matcher does the actual diffing.
+    #
+    # A valid document then has to be assembled from pieces of both
+    # inputs.  Markup of the new document is preferred; markup of the
+    # old document is kept on a best-effort basis and dropped when it
+    # cannot be resolved.  Deleted text is placed as near as possible
+    # to where it used to be, which is necessarily fuzzy because only
+    # the new document's structure is kept.
+    matcher = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
+    result = []
+    for command, i1, i2, j1, j2 in matcher.get_opcodes():
+        if command == 'equal':
+            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
+        else:
+            # For a 'replace' the inserted text is emitted first, then
+            # the deleted text.
+            if command in ('insert', 'replace'):
+                merge_insert(expand_tokens(html2_tokens[j1:j2]), result)
+            if command in ('delete', 'replace'):
+                merge_delete(expand_tokens(html1_tokens[i1:i2]), result)
+    # Deletes are only marked with placeholders so far; writing <del>
+    # directly could produce an invalid document.  With the whole
+    # document assembled the placeholders can now be moved around and
+    # resolved.
+    return cleanup_delete(result)
+
+def expand_tokens(tokens, equal=False):
+    """Generate the text chunks (pre tags, word, post tags) for each
+    token.  When equal is true, the word of any token flagged
+    hide_when_equal is left out; its tags are still produced.
+    """
+    for tok in tokens:
+        for pre in tok.pre_tags:
+            yield pre
+        if not (equal and tok.hide_when_equal):
+            text = tok.html()
+            if tok.trailing_whitespace:
+                text = text + ' '
+            yield text
+        for post in tok.post_tags:
+            yield post
+
+def merge_insert(ins_chunks, doc):
+    """ Append <ins>ins_chunks</ins> to doc, the list of text chunks
+    handled so far (modified in place).  """
+    # Unbalanced start/end tags are kept (matching markup is assumed
+    # to exist elsewhere in the document), but only the balanced part
+    # is wrapped in <ins>.
+    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
+    doc.extend(unbalanced_start)
+    if doc:
+        preceding = doc[-1]
+        if not preceding.endswith(' '):
+            # The chunk before the insert lacked a trailing space
+            doc[-1] = preceding + ' '
+    doc.append('<ins>')
+    if balanced:
+        final = balanced[-1]
+        if final.endswith(' '):
+            # The space is emitted after </ins> instead
+            balanced[-1] = final[:-1]
+    doc.extend(balanced)
+    doc.append('</ins> ')
+    doc.extend(unbalanced_end)
+
+# These are sentinels to represent the start and end of a <del>
+# segment, until we do the cleanup phase to turn them into proper
+# markup.  The classes themselves (not instances) are put into the
+# chunk list by merge_delete and are recognised by identity/equality:
+class DEL_START:
+    pass
+class DEL_END:
+    pass
+
+class NoDeletes(Exception):
+    """ Raised by split_delete when the document no longer contains any
+    pending deletes (DEL_START/DEL_END); cleanup_delete catches it to
+    end its loop. """
+
+def merge_delete(del_chunks, doc):
+    """ Append the chunks of del_chunks to doc (a list of text chunks,
+    modified in place), bracketed by the DEL_START and DEL_END
+    markers.  cleanup_delete later turns these markers into <del>
+    tags."""
+    doc.append(DEL_START)
+    for chunk in del_chunks:
+        doc.append(chunk)
+    doc.append(DEL_END)
+
+def cleanup_delete(chunks):
+    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
+    them with <del></del>.  To do this while keeping the document
+    valid, it may need to drop some tags (either start or end tags).
+
+    It may also move the del into adjacent tags to try to move it to a
+    similar location where it was originally located (e.g., moving a
+    delete into preceding <div> tag, if the del looks like (DEL_START,
+    'Text</div>', DEL_END)
+
+    chunks is the list of text chunks and markers; the return value is
+    a list of text chunks with every marker pair resolved, one pair
+    per loop iteration."""
+    while 1:
+        # Find a pending DEL_START/DEL_END, splitting the document
+        # into stuff-preceding-DEL_START, stuff-inside, and
+        # stuff-following-DEL_END
+        try:
+            pre_delete, delete, post_delete = split_delete(chunks)
+        except NoDeletes:
+            # Nothing found, we've cleaned up the entire doc
+            break
+        # The stuff-inside-DEL_START/END may not be well balanced
+        # markup.  First we figure out what unbalanced portions there are:
+        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
+        # Then we move the span forward and/or backward based on these
+        # unbalanced portions (both calls shift chunks between
+        # pre_delete and post_delete in place; whatever unbalanced tags
+        # remain afterwards are not emitted):
+        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
+        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
+        # The result is built by appending to pre_delete itself
+        doc = pre_delete
+        if doc and not doc[-1].endswith(' '):
+            # Fix up case where the word before us didn't have a trailing space
+            doc[-1] += ' '
+        doc.append('<del>')
+        if balanced and balanced[-1].endswith(' '):
+            # We move space outside of </del>
+            balanced[-1] = balanced[-1][:-1]
+        doc.extend(balanced)
+        doc.append('</del> ')
+        doc.extend(post_delete)
+        # Go round again: post_delete may still hold further markers
+        chunks = doc
+    return chunks
+
+def split_unbalanced(chunks):
+    """Return (unbalanced_start, balanced, unbalanced_end), where each is
+    a list of text and tag chunks.
+
+    unbalanced_start is a list of all the tags that are opened, but
+    not closed in this span.  Similarly, unbalanced_end is a list of
+    tags that are closed but were not opened.  Extracting these might
+    mean some reordering of the chunks."""
+    start = []
+    end = []
+    # Stack of (tag name, index into balanced, start-tag chunk) for the
+    # start tags seen so far that have not been closed
+    tag_stack = []
+    balanced = []
+    for chunk in chunks:
+        if not chunk.startswith('<'):
+            # Plain text is always part of the balanced portion
+            balanced.append(chunk)
+            continue
+        endtag = chunk[1] == '/'
+        name = chunk.split()[0].strip('<>/')
+        if name in empty_tags:
+            # Tags without content (<br>, <img>, ...) need no partner
+            assert not endtag, (
+                "Empty tag %r should have no end tag" % chunk)
+            balanced.append(chunk)
+            continue
+        if endtag:
+            if tag_stack and tag_stack[-1][0] == name:
+                # Closes the innermost open tag: keep the end tag and
+                # fill in the placeholder reserved for its start tag
+                balanced.append(chunk)
+                name, pos, tag = tag_stack.pop()
+                balanced[pos] = tag
+            elif tag_stack:
+                # Mismatch: every tag still open is declared
+                # unbalanced (their placeholders stay None and are
+                # filtered out below), as is this end tag
+                start.extend(tag for name, pos, tag in tag_stack)
+                tag_stack = []
+                end.append(chunk)
+            else:
+                # End tag with nothing open in this span
+                end.append(chunk)
+        else:
+            # Start tag: reserve a slot in balanced; it is only filled
+            # in if the matching end tag shows up
+            tag_stack.append((name, len(balanced), chunk))
+            balanced.append(None)
+    # Whatever is still open at the end was never closed in this span
+    start.extend(
+        [chunk for name, pos, chunk in tag_stack])
+    balanced = [chunk for chunk in balanced if chunk is not None]
+    return start, balanced, end
+
+def split_delete(chunks):
+    """ Split chunks around the first DEL_START/DEL_END pair and return
+    (chunks_before_DEL_START, chunks_between_the_markers,
+    chunks_after_DEL_END).  The last part may contain further
+    DEL_STARTs.  Raises NoDeletes when there is no DEL_START. """
+    try:
+        opening = chunks.index(DEL_START)
+    except ValueError:
+        raise NoDeletes
+    closing = chunks.index(DEL_END)
+    before = chunks[:opening]
+    inside = chunks[opening+1:closing]
+    after = chunks[closing+1:]
+    return before, inside, after
+
+def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
+    """ pre_delete and post_delete implicitly point to a place in the
+    document (where the two were split).  This moves that point (by
+    popping items from one and pushing them onto the other).  It moves
+    the point to try to find a place where unbalanced_start applies.
+
+    As an example::
+
+        >>> unbalanced_start = ['<div>']
+        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
+        >>> pre, post = doc[:3], doc[3:]
+        >>> pre, post
+        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
+        >>> locate_unbalanced_start(unbalanced_start, pre, post)
+        >>> pre, post
+        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
+
+    As you can see, we moved the point so that the dangling <div> that
+    we found will be effectively replaced by the div in the original
+    document.  If this doesn't work out, we just throw away
+    unbalanced_start without doing anything.
+
+    All three lists are modified in place; nothing is returned.
+    """
+    while 1:
+        if not unbalanced_start:
+            # We have totally succeeded in finding the position
+            break
+        finding = unbalanced_start[0]
+        finding_name = finding.split()[0].strip('<>')
+        if not post_delete:
+            # Nothing left after the split point to move past
+            break
+        next = post_delete[0]
+        if next is DEL_START or not next.startswith('<'):
+            # Reached a word, we can't move the delete text forward
+            break
+        if next[1] == '/':
+            # Reached a closing tag, can we go further?  Maybe not...
+            break
+        name = next.split()[0].strip('<>')
+        if name == 'ins':
+            # Can't move into an insert
+            break
+        assert name != 'del', (
+            "Unexpected delete tag: %r" % next)
+        if name == finding_name:
+            # Same tag name: step over it and look for the next
+            # unbalanced start tag
+            unbalanced_start.pop(0)
+            pre_delete.append(post_delete.pop(0))
+        else:
+            # Found a tag that doesn't match
+            break
+
+def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
+    """ like locate_unbalanced_start, except handling end tags and
+    possibly moving the point earlier in the document.
+
+    Works from the last entry of unbalanced_end backwards, comparing it
+    with the last chunk of pre_delete.  All three lists are modified in
+    place; nothing is returned.  """
+    while 1:
+        if not unbalanced_end:
+            # Success
+            break
+        finding = unbalanced_end[-1]
+        finding_name = finding.split()[0].strip('<>/')
+        if not pre_delete:
+            # Nothing before the split point to move back over
+            break
+        next = pre_delete[-1]
+        if next is DEL_END or not next.startswith('</'):
+            # A word or a start tag
+            break
+        name = next.split()[0].strip('<>/')
+        if name == 'ins' or name == 'del':
+            # Can't move into an insert or delete
+            break
+        if name == finding_name:
+            # Same tag name: move the end tag behind the split point
+            unbalanced_end.pop()
+            post_delete.insert(0, pre_delete.pop())
+        else:
+            # Found a tag that doesn't match
+            break
+
+class token(unicode):
+    """ A diffable token, generally a word shown to the user.  The
+    opening tags adjacent to the word are attached as pre_tags, the
+    closing tags that follow it as post_tags.  When empty tags sit
+    next to a word there can be exceptions: close tags may end up in
+    pre_tags, or open tags in post_tags.
+
+    Whether the word was originally followed by whitespace is
+    recorded in trailing_whitespace; it is not part of the token's
+    text, so it plays no role when tokens are compared."""
+
+    # When this is true, the token will be eliminated from the
+    # displayed diff if no change has occurred:
+    hide_when_equal = False
+
+    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
+        # A fresh list per instance whenever none was supplied
+        if pre_tags is None:
+            pre_tags = []
+        if post_tags is None:
+            post_tags = []
+        obj = unicode.__new__(cls, text)
+        obj.pre_tags = pre_tags
+        obj.post_tags = post_tags
+        obj.trailing_whitespace = trailing_whitespace
+        return obj
+
+    def __repr__(self):
+        text_repr = unicode.__repr__(self)
+        return 'token(%s, %r, %r)' % (text_repr, self.pre_tags, self.post_tags)
+
+    def html(self):
+        """The text to emit for this token in the output."""
+        return unicode(self)
+
+class tag_token(token):
+
+    """ Represents a token that is actually a tag.  Currently this is just
+    the <img> tag, which takes up visible space just like a word but
+    is only represented in a document by a tag.
+
+    The text of the token (what the diff compares) is "<tag>: <data>";
+    html_repr is the markup emitted for it by html().  """
+
+    def __new__(cls, tag, data, html_repr, pre_tags=None, 
+                post_tags=None, trailing_whitespace=False):
+        # The comparison text is built from the tag argument.  (It used
+        # to interpolate the builtin ``type``, which put the repr of
+        # the type object into every tag token.)
+        obj = token.__new__(cls, "%s: %s" % (tag, data), 
+                            pre_tags=pre_tags, 
+                            post_tags=post_tags, 
+                            trailing_whitespace=trailing_whitespace)
+        obj.tag = tag
+        obj.data = data
+        obj.html_repr = html_repr
+        return obj
+
+    def __repr__(self):
+        # Values are passed in the order the labels appear in the
+        # format string (post_tags first, then pre_tags); previously
+        # the two were swapped relative to their labels.
+        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
+            self.tag, 
+            self.data, 
+            self.html_repr, 
+            self.post_tags, 
+            self.pre_tags, 
+            self.trailing_whitespace)
+
+    def html(self):
+        """Return the stored markup for the tag."""
+        return self.html_repr
+
+class href_token(token):
+
+    """ The href of an anchor tag as a token.  In contrast to ordinary
+    words it is only displayed when it has changed.  """
+
+    # Dropped from the output wherever the diff reports no change
+    hide_when_equal = True
+
+    def html(self):
+        """Return the href prefixed with 'Link: '."""
+        return 'Link: %s' % (self,)
+
+def tokenize(html, include_hrefs=True):
+    """
+    Parse the given HTML and return token objects (words with attached tags).
+
+    Only the content of a page is parsed; anything in the head is
+    ignored, and the <head> and <body> elements are themselves
+    optional.  The content is then parsed by lxml, which ensures the
+    validity of the resulting parsed document (though lxml may make
+    incorrect guesses when the markup is particularly bad).
+
+    <ins> and <del> tags are also removed from the document, as
+    they would be confusing.
+
+    If include_hrefs is true, the href attribute of <a> tags is
+    included as a special kind of diffable token."""
+    root = parse_html(html, cleanup=True)
+    # Flatten the tree into one chunk per start tag, word and end tag ...
+    pieces = flatten_el(root, drop_tag=True, include_hrefs=include_hrefs)
+    # ... and regroup those chunks into token objects
+    return fixup_chunks(pieces)
+
+def parse_html(html, cleanup=True):
+    """
+    Parse an HTML fragment into an lxml element.  The fragment is
+    wrapped in a <div> tag that was not in the original document.
+
+    With cleanup true, the fragment is first passed through
+    cleanup_html, which removes any <head>/<body> structure and any
+    <ins> and <del> tags.
+    """
+    source = html
+    if cleanup:
+        source = cleanup_html(source)
+    return parse_element(source, create_parent=True)
+
+_body_re = re.compile(r'<body.*?>', re.I|re.S)
+_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
+_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+def cleanup_html(html):
+    """ 'Clean' the HTML: page structure is stripped (if there is a
+    <body>, only its contents are kept) and all <ins> and <del> tags
+    are removed.  Returns the cleaned string.  """
+    opening = _body_re.search(html)
+    if opening is not None:
+        # Drop everything up to and including the <body ...> tag
+        html = html[opening.end():]
+    closing = _end_body_re.search(html)
+    if closing is not None:
+        # Drop </body> and everything after it
+        html = html[:closing.start()]
+    return _ins_del_re.sub('', html)
+    
+
+# Matches when a string ends with a space, tab, newline or carriage
+# return; split_words uses it to decide whether the last word keeps
+# its trailing space.
+end_whitespace_re = re.compile(r'[ \t\n\r]$')
+
+def fixup_chunks(chunks):
+    """
+    This function takes a list of chunks and produces a list of tokens.
+
+    A chunk is either a string (a word, a start tag or an end tag) or
+    a tuple, ('img', src, tag) or ('href', href), as generated by
+    flatten_el.  Start tags are collected and attached as pre_tags to
+    the next word; end tags are attached as post_tags to the previous
+    word unless start tags are currently being collected.
+    """
+    # Tags seen since the last word, waiting to be attached
+    tag_accum = []
+    # The most recently created token
+    cur_word = None
+    result = []
+    for chunk in chunks:
+        if isinstance(chunk, tuple):
+            if chunk[0] == 'img':
+                src = chunk[1]
+                tag = chunk[2]
+                # A trailing space on the tag text becomes a flag on
+                # the token
+                if tag.endswith(' '):
+                    tag = tag[:-1]
+                    trailing_whitespace = True
+                else:
+                    trailing_whitespace = False
+                cur_word = tag_token('img', src, html_repr=tag,
+                                     pre_tags=tag_accum,
+                                     trailing_whitespace=trailing_whitespace)
+                tag_accum = []
+                result.append(cur_word)
+            elif chunk[0] == 'href':
+                href = chunk[1]
+                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
+                tag_accum = []
+                result.append(cur_word)
+            # Tuples of any other kind are ignored
+            continue
+        if is_word(chunk):
+            # As for images: the trailing space is stored as a flag
+            if chunk.endswith(' '):
+                chunk = chunk[:-1]
+                trailing_whitespace = True
+            else:
+                trailing_whitespace = False
+            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
+            tag_accum = []
+            result.append(cur_word)
+        elif is_start_tag(chunk):
+            tag_accum.append(chunk)
+        elif is_end_tag(chunk):
+            if tag_accum:
+                # An end tag following pending start tags (an element
+                # without words) travels with them to the next word
+                tag_accum.append(chunk)
+            else:
+                assert cur_word, (
+                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
+                    % (cur_word, result, chunk, chunks))
+                cur_word.post_tags.append(chunk)
+        else:
+            # Every string chunk is a word, a start tag or an end tag
+            assert(0)
+
+    if not result:
+        # No words at all: one empty token carries whatever tags there were
+        return [token('', pre_tags=tag_accum)]
+    else:
+        # Tags left over after the last word are attached behind it
+        result[-1].post_tags.extend(tag_accum)
+
+    return result
+
+
+# All the tags in HTML that don't require end tags
+# (consulted by split_unbalanced and flatten_el):
+empty_tags = (
+    'param', 'img', 'area', 'br', 'basefont', 'input',
+    'base', 'meta', 'link', 'col')
+
+# Tag names that _contains_block_level_tag treats as block-level.
+block_level_tags = (
+    'address',
+    'blockquote',
+    'center',
+    'dir',
+    'div',
+    'dl',
+    'fieldset',
+    'form',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'hr',
+    'isindex',
+    'menu',
+    'noframes',
+    'noscript',
+    'ol',
+    'p',
+    'pre',
+    'table',
+    'ul',
+    )
+
+# Further tag names that _contains_block_level_tag also counts as
+# block-level (list items, table parts, frameset).
+block_level_container_tags = (
+    'dd',
+    'dt',
+    'frameset',
+    'li',
+    'tbody',
+    'td',
+    'tfoot',
+    'th',
+    'thead',
+    'tr',
+    )
+
+
+def flatten_el(el, include_hrefs, drop_tag=False):
+    """ Takes an lxml element el, and generates all the text chunks for
+    that tag.  Each start tag is a chunk, each word is a chunk, and each
+    end tag is a chunk.
+
+    An <img> start tag is generated as the tuple ('img', src, start tag
+    text); src is None when the attribute is missing.  If include_hrefs
+    is true, an <a> element with a non-empty href additionally produces
+    ('href', href) just before its end tag.
+
+    If drop_tag is true, then the outermost container tag is
+    not returned (just its contents), and neither is its tail."""
+    if not drop_tag:
+        if el.tag == 'img':
+            # .get(): an <img> without a src attribute must not abort
+            # the whole diff with a KeyError
+            yield ('img', el.attrib.get('src'), start_tag(el))
+        else:
+            yield start_tag(el)
+    if el.tag in empty_tags and not el.text and not len(el):
+        # Empty element: there is no end tag to generate, but the text
+        # following it (its tail) still belongs to the document and
+        # must not be lost.
+        if not drop_tag:
+            for word in split_words(el.tail):
+                yield cgi.escape(word)
+        return
+    start_words = split_words(el.text)
+    for word in start_words:
+        yield cgi.escape(word)
+    for child in el:
+        for item in flatten_el(child, include_hrefs=include_hrefs):
+            yield item
+    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
+        yield ('href', el.attrib['href'])
+    if not drop_tag:
+        yield end_tag(el)
+        end_words = split_words(el.tail)
+        for word in end_words:
+            yield cgi.escape(word)
+
+def split_words(text):
+    """ Split text into a list of words.  Every word except the last
+    gets one trailing space; the last word gets one only if text
+    itself ends in whitespace.  Empty or all-whitespace text (or
+    None) gives an empty list.  """
+    if not text:
+        return []
+    stripped = text.strip()
+    if not stripped:
+        return []
+    words = []
+    for word in stripped.split():
+        words.append(word + ' ')
+    if end_whitespace_re.search(text) is None:
+        words[-1] = words[-1][:-1]
+    return words
+
+# Matches when a string starts with a space, tab, newline or carriage
+# return; end_tag uses it to test whether an element's tail begins
+# with whitespace.
+start_whitespace_re = re.compile(r'^[ \t\n\r]')
+
+def start_tag(el):
+    """
+    Build the text of the start tag for el: the tag name followed by
+    every attribute as name="value", with the value HTML-escaped
+    (quotes included).
+    """
+    attributes = []
+    for name, value in el.attrib.items():
+        attributes.append(' %s="%s"' % (name, cgi.escape(value, True)))
+    return '<%s%s>' % (el.tag, ''.join(attributes))
+
+def end_tag(el):
+    """ Build the text of the end tag for el.  A single space is
+    appended when the element's tail starts with whitespace.  """
+    suffix = ''
+    if el.tail and start_whitespace_re.search(el.tail):
+        suffix = ' '
+    return '</%s>%s' % (el.tag, suffix)
+
+def is_word(tok):
+    """True if the chunk is text, i.e. does not begin with '<'."""
+    return tok[:1] != '<'
+
+def is_end_tag(tok):
+    """True if the chunk begins with '</'."""
+    return tok[:2] == '</'
+
+def is_start_tag(tok):
+    """True if the chunk begins with '<' but not with '</'."""
+    return tok[:1] == '<' and tok[1:2] != '/'
+
+def fixup_ins_del_tags(html):
+    """ Take an html string and push every <ins> or <del> tag inside
+    the block-level elements it wraps, e.g. <ins><p>word</p></ins>
+    becomes <p><ins>word</ins></p>.  Returns the new html string. """
+    root = parse_html(html, cleanup=False)
+    _fixup_ins_del_tags(root)
+    return serialize_html_fragment(root, drop_outer=True)
+
+def serialize_html_fragment(el, drop_outer=False):
+    """ Serialize one lxml element as HTML by running it through an
+    XSLT identity copy with HTML output.  The serialized form
+    includes the element's tail.
+
+    If drop_outer is true, the outermost start and end tags are cut
+    off and the result is stripped of surrounding whitespace;
+    otherwise only leading whitespace is removed.
+    """
+    
+    html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="html" encoding="UTF-8" />
+  <xsl:template match="/">
+    <xsl:copy-of select="."/>
+  </xsl:template>
+</xsl:transform>
+"""
+    transform = etree.XSLT(etree.XML(html_xsl))
+    assert not isinstance(el, basestring), (
+        "You should pass in an element, not a string like %r" % el)
+    html = str(transform(el))
+    if not drop_outer:
+        return html.lstrip()
+    # Cut everything up to the end of the first tag ...
+    html = html[html.find('>')+1:]
+    # ... and everything from the start of the last tag
+    html = html[:html.rfind('<')]
+    return html.strip()
+
+def _fixup_ins_del_tags(doc):
+    """In-place variant of fixup_ins_del_tags operating on an lxml
+    document: each <ins>/<del> that wraps block-level content is
+    pushed inside the blocks and then dissolved.
+    """
+    for tag in ['ins', 'del']:
+        matches = doc.xpath('descendant-or-self::%s' % tag)
+        for el in matches:
+            if _contains_block_level_tag(el):
+                _move_el_inside_block(el, tag=tag)
+                _merge_element_contents(el)
+
+def _contains_block_level_tag(el):
+    """True if the element is itself a block-level element (like <p>,
+    <td>, etc.) or has one anywhere among its descendants.
+    """
+    tag = el.tag
+    if tag in block_level_tags:
+        return True
+    if tag in block_level_container_tags:
+        return True
+    for child in el:
+        if _contains_block_level_tag(child):
+            return True
+    return False
+
+def _move_el_inside_block(el, tag):
+    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
+    and moves them inside any block-level tags.
+
+    el is modified in place.  If no child contains a block-level tag,
+    the whole content of el (text and children) is wrapped in one new
+    <tag> element.  Otherwise children with block-level content are
+    processed recursively, and every other child, every tail following
+    a block child, and el's own leading text are each wrapped in a new
+    <tag> element.  """
+    for child in el:
+        if _contains_block_level_tag(child):
+            break
+    else:
+        # No block-level tags in any child
+        children_tag = etree.Element(tag)
+        children_tag.text = el.text
+        el.text = None
+        children_tag.extend(list(el))
+        el[:] = [children_tag]
+        return
+    # Iterate over a copy: el is modified inside the loop
+    for child in list(el):
+        if _contains_block_level_tag(child):
+            _move_el_inside_block(child, tag)
+            if child.tail:
+                # Text after the block gets a wrapper of its own
+                tail_tag = etree.Element(tag)
+                tail_tag.text = child.tail
+                child.tail = None
+                el.insert(el.index(child)+1, tail_tag)
+        else:
+            # Inline child: wrap it where it stands
+            child_tag = etree.Element(tag)
+            el.replace(child, child_tag)
+            child_tag.append(child)
+    if el.text:
+        text_tag = etree.Element(tag)
+        text_tag.text = el.text
+        el.text = None
+        el.insert(0, text_tag)
+            
+def _merge_element_contents(el):
+    """
+    Removes an element, but merges its contents into its place, e.g.,
+    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
+    <p>Hi there!</p>
+
+    The parent of el is modified in place; el must have a parent.
+    """
+    parent = el.getparent()
+    text = el.text or ''
+    if el.tail:
+        if not len(el):
+            # No children: the tail simply follows the element's text
+            text += el.tail
+        else:
+            # Otherwise the tail goes behind the last child
+            if el[-1].tail:
+                el[-1].tail += el.tail
+            else:
+                el[-1].tail = el.tail
+    index = parent.index(el)
+    if text:
+        # The text is appended to whatever precedes el: the tail of
+        # the previous sibling or, for a first child, the parent's text
+        if index == 0:
+            previous = None
+        else:
+            previous = parent[index-1]
+        if previous is None:
+            if parent.text:
+                parent.text += text
+            else:
+                parent.text = text
+        else:
+            if previous.tail:
+                previous.tail += text
+            else:
+                previous.tail = text
+    # Replace el by its children
+    parent[index:index+1] = el.getchildren()
+
+class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+    """
+    Acts like SequenceMatcher, but tries not to find very small equal
+    blocks amidst large spans of changes
+    """
+
+    # Upper bound for the cutoff: matching blocks of at most this many
+    # items are discarded (fewer when one of the sequences is short).
+    threshold = 2
+
+    def get_matching_blocks(self):
+        """Return the matching blocks of SequenceMatcher without the
+        very short ones.
+
+        The cutoff is the smaller of self.threshold and a quarter of
+        the length of the shorter sequence (rounded down).  Blocks
+        longer than the cutoff are kept, as is the zero-length block
+        that terminates the list.
+        """
+        # Use the shorter of *both* sequences (this used to look at
+        # self.b twice, so a short self.a never lowered the cutoff)
+        size = min(len(self.a), len(self.b))
+        # Explicit floor division gives the same integer cutoff
+        # whatever the division semantics in effect
+        threshold = min(self.threshold, size // 4)
+        actual = difflib.SequenceMatcher.get_matching_blocks(self)
+        return [item for item in actual
+                if item[2] > threshold
+                or not item[2]]
+    
+# def get_matching_blocks(self):
+#         size = min(len(self.b), len(self.b))
+#         threshold = min(self.threshold, size / 4)
+#         actual = difflib.SequenceMatcher.get_matching_blocks(self)
+#         last_equal_a = 0
+#         eliminate = []
+#         for i in xrange(1, len(actual)-1):
+#             start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2])
+#             end_diff_length = actual[i+1][0]
+#         for a_pos, b_pos, length in actual:
+#             if (last_equal_a - a_pos is big
+#                 and length is small
+#                 and next_equal_a is far away):
+#                 continue
+#             result.append((a_pos, b_pos, length))
+#             last_equal_a = a_pos+length
+#         return result
+            
+
+# When run as a script, execute the doctests embedded in the
+# docstrings of this module.
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
+

Added: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,167 @@
+"""
+Utilities for manipulating HTML links.
+
+Links are the ``href`` and ``src`` attributes of any element, plus the
+``url(...)`` and ``@import "..."`` references in ``<head>`` style
+elements and the ``url(...)`` references in ``style`` attributes.
+"""
+
+
+from lxml.html import tostring, HTML
+import urlparse
+import re
+
+__all__ = ['make_links_absolute', 'make_links_absolute_html',
+           'rewrite_links', 'rewrite_links_html',
+           'Relocator']
+
+def make_links_absolute(doc, base_href):
+    """
+    Takes a given document (already parsed by lxml) and modifies it
+    in-place, joining every link with base_href (as urlparse.urljoin
+    does) so that relative links become absolute.
+    """
+    def link_repl(href):
+        return urlparse.urljoin(base_href, href)
+    rewrite_links(doc, link_repl)
+
+def make_links_absolute_html(html, base_href):
+    """
+    make_links_absolute(), but works on text and returns text
+    """
+    doc = HTML(html)
+    make_links_absolute(doc, base_href)
+    return tostring(doc)
+
+def rewrite_links_html(html, link_repl_func, remove_base_tags=True):
+    """
+    rewrite_links(), but works on text and returns text
+    """
+    doc = HTML(html)
+    rewrite_links(doc, link_repl_func, remove_base_tags=remove_base_tags)
+    return tostring(doc)
+
+def rewrite_links(doc, link_repl_func,
+                remove_base_tags=True):
+    """
+    Takes a given document (already parsed by lxml) and modifies it
+    in-place.  Every link is passed through link_repl_func, and the
+    output of that function replaces the link.
+
+    If remove_base_tags is true, any <base href=""> tags are removed
+    and applied to the links first (see resolve_base_href).
+    """
+    if remove_base_tags:
+        resolve_base_href(doc)
+
+    for attrib in 'href', 'src':
+        els = doc.xpath('//*[@%s]' % attrib)
+        for el in els:
+            el.attrib[attrib] = link_repl_func(el.attrib[attrib])
+
+    rewrite_css_links(doc, link_repl_func)
+    rewrite_style_links(doc, link_repl_func)
+
+def resolve_base_href(doc):
+    """
+    Removes all html <base href=""> tags from the document given.
+    If there were any, every link in the document is then joined
+    with the href of the last one, to make up for its removal.
+    """
+    base_href = None
+    basetags = doc.xpath('//base[@href]')
+    for b in basetags:
+        base_href = b.attrib['href']
+        b.getparent().remove(b)
+    if base_href is None:
+        return
+    # Now that we have a base_href (blech) we have to fix up all the
+    # links in the document with this new information.
+    def link_repl(href):
+        return urlparse.urljoin(base_href, href)
+    rewrite_links(doc, link_repl, remove_base_tags=False)
+
+CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
+CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"')
+
+def _css_url_repl(link_repl_func):
+    """
+    Returns a callback for CSS_URL_PAT.sub() that passes the URL in
+    each url(...) through link_repl_func.
+    """
+    def absuri(matchobj):
+        url = matchobj.group(1).strip()
+        quote = ''
+        # url("x") and url('x') are as valid as url(x); the quotes are
+        # CSS syntax, not part of the link, so link_repl_func must not
+        # see them.  They are put back around the result.
+        if len(url) >= 2 and url[0] in '\'"' and url[-1] == url[0]:
+            quote = url[0]
+            url = url[1:-1]
+        return 'url(%s%s%s)' % (quote, link_repl_func(url), quote)
+    return absuri
+
+def rewrite_css_links(doc, link_repl_func):
+    """
+    Fixes up any url(...) and @import "..." links in the CSS style
+    elements in the document's head
+    """
+    absuri = _css_url_repl(link_repl_func)
+    def absimport(matchobj):
+        return '@import "%s"' % link_repl_func(matchobj.group(1))
+    els = doc.xpath('//head/style')
+    for el in els:
+        if el.text:
+            el.text = CSS_URL_PAT.sub(absuri, el.text)
+            el.text = CSS_IMPORT_PAT.sub(absimport, el.text)
+
+def rewrite_style_links(doc, link_repl_func):
+    """
+    Fixes up any url(...) links in style attributes
+    """
+    absuri = _css_url_repl(link_repl_func)
+    for el in doc.xpath("//*[contains(@style, 'url(')]"):
+        el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
+
+class Relocator(object):
+    """
+    This helper can be used to move all links in a document from one
+    location to another.  Typically you use this like::
+
+        rewrite_links_html(
+            html, Relocator('http://old-domain/', 'http://new-domain/',
+                            base_href='http://old-domain/foo/bar.html'))
+
+    This means that the document was located at
+    ``http://old-domain/foo/bar.html`` (used to resolve relative
+    links), and that you want to change every occurrence of
+    ``http://old-domain/`` at the start of a link to
+    ``http://new-domain/``.  Links that do not start with the old
+    location are left alone.  base_href is optional; without it links
+    are compared with the old location just as they are written.
+    """
+    # This catches the case of http://foo, which is equivalent to
+    # http://foo/ :
+    _domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
+
+    def __init__(self, old_href, new_href, base_href=None):
+        self.old_href = old_href
+        self.new_href = new_href
+        self.base_href = base_href
+
+    def __call__(self, href):
+        """
+        Returns the relocated form of href, or href itself if it is
+        not located under old_href.
+        """
+        # Without a base_href the link is taken as is; real_href has to
+        # be bound on that path too.
+        real_href = href
+        if self.base_href is not None:
+            real_href = urlparse.urljoin(self.base_href, href)
+        if self._domain_no_slash_re.search(real_href):
+            real_href += '/'
+        if not real_href.startswith(self.old_href):
+            # A link somewhere else entirely
+            return href
+        return self.new_href + real_href[len(self.old_href):]

Added: lxml/branch/html/src/lxml/html/tests/__init__.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/__init__.py	Tue May 29 17:05:51 2007
@@ -0,0 +1 @@
+#

Added: lxml/branch/html/src/lxml/html/tests/test_basic.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,12 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+from lxml.html import HTML
+
+def test_suite():
+    """Builds the suite of doctests found in test_basic.txt"""
+    doctests = [doctest.DocFileSuite('test_basic.txt')]
+    return unittest.TestSuite(doctests)
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')  # no TestCase classes to find here

Added: lxml/branch/html/src/lxml/html/tests/test_basic.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.txt	Tue May 29 17:05:51 2007
@@ -0,0 +1,42 @@
+lxml.html adds a find_class method to elements::
+
+    >>> from lxml.html import HTML, tostring
+    >>> from lxml.html.clean import clean, clean_html
+    >>> from lxml.html import usedoctest
+    >>> h = HTML('''
+    ... <html><head></head>
+    ... <body>
+    ...   <a class="vcard
+    ... fn   url" href="foobar">P1</a>
+    ...   <a class="not-fn vcard" href="baz">P2</a>
+    ... </body></html>''')
+    >>> print tostring(h)
+    <html>
+      <head></head>
+      <body>
+        <a class="vcard
+    fn   url" href="foobar">P1</a>
+        <a class="not-fn vcard" href="baz">P2</a>
+      </body>
+    </html>
+    >>> print [e.text for e in h.find_class('fn')]
+    ['P1']
+    >>> print [e.text for e in h.find_class('vcard')]
+    ['P1', 'P2']
+
+Also added is a find_rel_links method, which you can use to search for
+links like ``<a rel="$something">``:
+
+     >>> h = HTML('''
+     ... <a href="1">test 1</a>
+     ... <a href="2" rel="tag">item 2</a>
+     ... <a href="3" rel="tagging">item 3</a>
+     ... <a href="4" rel="TAG">item 4</a>''')
+     >>> print [e.attrib['href'] for e in h.find_rel_links('tag')]
+     ['2']
+     >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')]
+     []
+
+FIXME: find_rel_links('tag') should actually have returned ['2', '4']
+
+

Added: lxml/branch/html/src/lxml/html/tests/test_clean.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+    """Builds the suite of doctests found in test_clean.txt"""
+    doctests = [doctest.DocFileSuite('test_clean.txt')]
+    return unittest.TestSuite(doctests)
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')  # no TestCase classes to find here

Added: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt	Tue May 29 17:05:51 2007
@@ -0,0 +1,94 @@
+>>> from lxml.html import HTML, tostring
+>>> from lxml.html.clean import clean, clean_html
+>>> from lxml.html import usedoctest
+>>> doc = '''<html>
+...  <head>
+...    <script type="text/javascript" src="evil-site"></script>
+...    <link rel="alternate" type="text/rss" src="evil-rss">
+...  </head>
+...  <body onload="evil_function()">
+...    <!-- I am interpreted for EVIL! -->
+...    <a href="javascript:evil_function()">a link</a>
+...    <a href="#" onclick="evil_function()">another link</a>
+...    <p onclick="evil_function()">a paragraph</p>
+...    <div style="display: none">secret EVIL!</div>
+...    <object> of EVIL! </object>
+...    <iframe src="evil-site"></iframe>
+...    <form action="evil-site">
+...      Password: <input type="password" name="password">
+...    </form>
+...    <blink>annoying EVIL!</blink>
+...    <a href="evil-site">spam spam SPAM!</a>
+...  </body>
+... </html>'''
+>>> print doc
+<html>
+  <head>
+    <script type="text/javascript" src="evil-site"></script>
+    <link rel="alternate" type="text/rss" src="evil-rss">
+  </head>
+  <body onload="evil_function()">
+    <!-- I am interpreted for EVIL! -->
+    <a href="javascript:evil_function()">a link</a>
+    <a href="#" onclick="evil_function()">another link</a>
+    <p onclick="evil_function()">a paragraph</p>
+    <div style="display: none">secret EVIL!</div>
+    <object> of EVIL! </object>
+    <iframe src="evil-site"></iframe>
+    <form action="evil-site">
+      Password: <input type="password" name="password">
+    </form>
+    <blink>annoying EVIL!</blink>
+    <a href="evil-site">spam spam SPAM!</a>
+  </body>
+</html>
+>>> print tostring(HTML(doc))
+<html>
+  <head>
+    <script type="text/javascript" src="evil-site"></script>
+    <link rel="alternate" type="text/rss" src="evil-rss">
+  </head>
+  <body onload="evil_function()">
+    <!-- I am interpreted for EVIL! -->
+    <a href="javascript:evil_function()">a link</a>
+    <a href="#" onclick="evil_function()">another link</a>
+    <p onclick="evil_function()">a paragraph</p>
+    <div style="display: none">secret EVIL!</div>
+    <object> of EVIL! </object>
+    <iframe src="evil-site"></iframe>
+    <form action="evil-site">
+      Password: <input type="password" name="password">
+    </form>
+    <blink>annoying EVIL!</blink>
+    <a href="evil-site">spam spam SPAM!</a>
+  </body>
+</html>
+>>> print clean_html(doc)
+<html>
+  <head>
+    <link rel="alternate" type="text/rss" src="evil-rss">
+  </head>
+  <body>
+    <a href="">a link</a>
+    <a href="#">another link</a>
+    <p>a paragraph</p>
+    <div style="display: none">secret EVIL!</div>
+    Password:
+    <blink>annoying EVIL!</blink>
+    <a href="evil-site">spam spam SPAM!</a>
+  </body>
+</html>
+>>> print clean_html(doc, style=True, links=True, add_nofollow=True)
+<html>
+  <head>
+  </head>
+  <body>
+    <a href="">a link</a>
+    <a href="#">another link</a>
+    <p>a paragraph</p>
+    <div>secret EVIL!</div>
+    Password:
+    <blink>annoying EVIL!</blink>
+    <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
+  </body>
+</html>

Added: lxml/branch/html/src/lxml/html/tests/test_htmldiff.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_htmldiff.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,13 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+from lxml.html import htmldiff
+
+def test_suite():
+    """Builds the suite of htmldiff doctests (text file and module)"""
+    doctests = [doctest.DocFileSuite('test_htmldiff.txt'),
+                doctest.DocTestSuite(htmldiff)]
+    return unittest.TestSuite(doctests)
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')  # no TestCase classes to find here

Added: lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt	Tue May 29 17:05:51 2007
@@ -0,0 +1,248 @@
+htmldiff does HTML comparisons.  These are word-based comparisons.
+
+First, a handy function for normalizing whitespace and doing word wrapping::
+
+    >>> import re, textwrap
+    >>> def pwrapped(text):
+    ...     text = re.sub(r'[ \n\t\r]+', ' ', text)
+    ...     text = textwrap.fill(text)
+    ...     print text
+    >>> def pdiff(text1, text2):
+    ...     pwrapped(htmldiff(text1, text2))
+
+Example::
+
+    >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate
+    >>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
+    >>> html2 = '''<p>This is some test textual writing with some changed stuff 
+    ... and some same stuff</p>'''
+    >>> pdiff(html1, html2)
+    <p>This is some test <ins>textual writing with some changed
+    stuff</ins> <del>text with some changes</del> and some same stuff</p>
+
+Style tags are largely ignored in terms of differences, though markup is not eliminated::
+
+    >>> html1 = '<p>Hi <i>you guys</i></p>'
+    >>> html2 = '<p>Hi <i>you</i> guys</p>'
+    >>> pdiff(html1, html2)
+    <p>Hi <i>you</i> guys</p>
+    >>> pdiff('text', '<p>text</p>')
+    <p>text</p>
+    >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
+    <i>Hi <ins>guy</ins> <del>guys</del> </i> !!
+    >>> pdiff('H<i>i</i>', 'Hi')
+    <ins>Hi</ins> <del>H<i>i</i></del>
+    >>> pdiff('<i>A B</i> C', '<i>A</i> C')
+    <i>A <del>B</del> </i> C
+    >>> pdiff('<i>A B</i> C', '<i>B</i> C')
+    <i> <del>A</del> B</i> C
+    >>> pdiff('<p></p>', '<p></p>')
+    <p></p>
+    >>> pdiff('<p>Hi</p>', '<p>Bye</p>')
+    <p><ins>Bye</ins></p> <p><del>Hi</del></p>
+    >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
+    <p> <ins>Bye</ins> <del>Hi</del> Guy</p>
+    >>> pdiff('<p>Hey there</p>', '')
+    <ins></ins> <p><del>Hey there</del></p>
+
+Whitespace is ignored, as it's not meaningful in HTML::
+
+    >>> pdiff('<div>Hi\n\nguys</div>', '<div>Hi guy</div>')
+    <div>Hi <ins>guy</ins> <del>guys</del> </div>
+
+Movement between paragraphs is ignored, as tag-based changes are generally ignored::
+    >>> 
+    >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
+    <p>Hello World</p>
+
+As a special case, changing the href of a link is displayed, and
+images are treated like words:
+
+    >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
+    <a href="http://google.com">search <ins>Link: http://google.com</ins>
+    <del>Link: http://yahoo.com</del> </a>
+    >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
+    <p>Print this <del><img src="print.gif"></del> </p>
+    >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
+    <a href="http://yahoo.com">search</a>
+
+The sixteen combinations::
+
+First "insert start" (del start/middle/end/none):
+
+    >>> pdiff('<b>A B C</b>', '<b>D B C</b')
+    <b> <ins>D</ins> <del>A</del> B C</b>
+    >>> pdiff('<b>A B C</b>', '<b>D A C</b>')
+    <b> <ins>D</ins> A <del>B</del> C</b>
+    >>> pdiff('<b>A B C</b>', '<b>D A B</b>')
+    <b> <ins>D</ins> A B <del>C</del> </b>
+    >>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
+    <b> <ins>D</ins> A B C</b>
+
+Next, "insert middle" (del start/middle/end/none):
+
+    >>> pdiff('<b>A B C</b>', '<b>D B C</b>')
+    <b> <ins>D</ins> <del>A</del> B C</b>
+    >>> pdiff('<b>A B C</b>', '<b>A D C</b>')
+    <b>A <ins>D</ins> <del>B</del> C</b>
+    >>> pdiff('<b>A B C</b>', '<b>A D B</b>')
+    <b>A <ins>D</ins> B <del>C</del> </b>
+
+This one case hits the threshold of our insensitive matching:
+
+    >>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
+    <b> <ins>A D</ins> <del>A</del> B C</b>
+
+
+Then "insert end" (del start/middle/end/none):
+
+    >>> pdiff('<b>A B C</b>', '<b>B C D</b>')
+    <b> <del>A</del> B C <ins>D</ins> </b>
+    >>> pdiff('<b>A B C</b>', '<b>A C D</b>')
+    <b>A <del>B</del> C <ins>D</ins> </b>
+    >>> pdiff('<b>A B C</b>', '<b>A B D</b>')
+    <b>A B <ins>D</ins> <del>C</del> </b>
+    >>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
+    <b>A B C <ins>D</ins> </b>
+
+Then no insert (del start/middle/end):
+
+    >>> pdiff('<b>A B C</b>', '<b>B C</b>')
+    <b> <del>A</del> B C</b>
+    >>> pdiff('<b>A B C</b>', '<b>A C</b>')
+    <b>A <del>B</del> C</b>
+    >>> pdiff('<b>A B C</b>', '<b>A B</b>')
+    <b>A B <del>C</del> </b>
+
+    >>> pdiff('<b>A B</b> C', '<b>A B</b>')
+    <b>A B</b> <del>C</del>
+    >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
+    <b>A B</b> <del><b>C</b></del>
+    >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
+    A <p><del><b>hey there</b> <i>how are you?</i></del></p>
+    
+Testing a larger document, to make sure there are not weird
+unnecessary parallels found:
+
+    >>> pdiff('''
+    ... <p>This is a test document with many words in it that goes on
+    ... for a while and doesn't have anything do to with the next
+    ... document that we match this against</p>''', '''
+    ... <p>This is another document with few similarities to the preceding
+    ... one, but enough that it may have overlap that could turn into
+    ... a confusing series of deletes and inserts.
+    ... </p>''')
+    <p><ins>This is another document with few similarities to the
+    preceding one, but enough that it may have overlap that could turn
+    into a confusing series of deletes and inserts. </ins></p>
+    <p><del>This is a test document with many words in it that goes on for
+    a while and doesn't have anything do to with the next document that we
+    match this against</del></p>
+
+
+
+Annotation of content can also be done, where every bit of content is
+marked up with information about where it came from.
+
+First, some setup; note that html_annotate is called with a sequence
+of documents and the annotation associated with that document.  We'll
+just use indexes, but you could use author or timestamp information.
+
+    >>> def markup(text, annotation):
+    ...     return '<span version="%s">%s</span>' % (annotation, text)
+    >>> def panno(*docs):
+    ...     pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
+    ...                            markup=markup))
+
+Now, a sequence of documents:
+
+    >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
+    <span version="1">Hi</span> <span version="0">world</span>
+    >>> panno('A similar document', 'A similar document',
+    ...       'A similar document here')
+    <span version="0">A similar document</span> <span
+    version="2">here</span>
+    >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
+    <p><span version="0">P1 para</span></p><p><span version="1">P3
+    foo</span></p>
+    >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
+    <span version="0">Hello</span><p><span version="0">There</span> <span
+    version="1">Town</span></p>
+    >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
+    <p><span version="0">Hello</span></p><span version="0">There</span>
+    <span version="1">Town</span>
+    >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
+    <p><span version="0">Hello</span></p><p><span version="0">There</span>
+    <span version="1">Town</span></p>
+    >>> panno('<p>Hi <img src="/foo"> You</p>',
+    ...       '<p>Hi You</p>',
+    ...       '<p>Hi You <img src="/bar"></p>')
+    <p><span version="0">Hi</span> <span version="1">You</span> <span
+    version="2"><img src="/bar"></span></p>
+    >>> panno('<p><a href="/foo">Hey</a></p>',
+    ...       '<p><a href="/bar">Hey</a></p>')
+    <p><a href="/bar"><span version="0">Hey</span></a></p>
+    >>> panno('<p><a href="/foo">Hey You</a></p>',
+    ...       '<p><a href="/foo">Hey Guy</a></p>')
+    <p><a href="/foo"><span version="0">Hey</span> <span
+    version="1">Guy</span></a></p>
+
+
+
+Here's a test of a utility function!:
+
+    >>> from lxml.html.htmldiff import _merge_element_contents
+    >>> from lxml import etree
+    >>> doc = '''<html><body><div>
+    ... <div id="c1">a b <span id="d1">content</span> c d</div>
+    ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div>
+    ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div>
+    ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div>
+    ... </div></body></html>'''
+    >>> doc = etree.HTML(doc)
+    >>> def show_result(id):
+    ...     el = doc.xpath("//*[@id='d%s']" % id)[0]
+    ...     _merge_element_contents(el)
+    ...     container = doc.xpath("//*[@id='c%s']" % id)[0]
+    ...     print etree.tostring(container).strip()
+    >>> show_result(1)
+    <div id="c1">a b content c d</div>
+    >>> show_result(2)
+    <div id="c2">content <b>and more</b> stuff trailing</div>
+    >>> show_result(3)
+    <div id="c3"><b>hi</b><i>content</i></div>
+    >>> show_result(4)
+    <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div>
+
+More utility:
+
+    >>> from lxml.html.htmldiff import fixup_ins_del_tags
+    >>> def pfixup(text):
+    ...     print fixup_ins_del_tags(text).strip()
+    >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
+    <p><ins>some text <b>and more text</b> and more</ins></p>
+    >>> pfixup('<p><ins>Hi!</ins> you</p>')
+    <p><ins>Hi!</ins> you</p>
+    >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
+    <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
+    >>> pfixup('''
+    ...    <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
+    <table><tr>
+    <td><ins>One table</ins></td>
+    <td><ins>More stuff</ins></td>
+    </tr></table>
+
+
+Testing split_unbalanced:
+
+    >>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
+    ([], ['<a href="blah">', 'hey', '</a>'], [])
+    >>> split_unbalanced(['<a href="blah">', 'hey'])
+    (['<a href="blah">'], ['hey'], [])
+    >>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
+    ([], ['Hey', 'You'], ['</i>', '</b>'])
+    >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
+    ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
+    >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
+    (['<b>'], ['So', 'Hi', 'There'], ['</i>'])
+    

Added: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+    """Builds the suite of doctests found in test_rewritelinks.txt"""
+    doctests = [doctest.DocFileSuite('test_rewritelinks.txt')]
+    return unittest.TestSuite(doctests)
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')  # no TestCase classes to find here

Added: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	Tue May 29 17:05:51 2007
@@ -0,0 +1,79 @@
+These are tests of lxml.html.rewritelinks::
+
+    >>> from lxml.html.rewritelinks import *
+
+In all these examples we'll be using ``http://old`` for the old
+(to-be-replaced) URL and ``https://new`` for the new URL (note the
+scheme change).  Out of laziness we'll define some keywords we use
+with all these transformations::
+
+    >>> relocate_href = Relocator(
+    ...     base_href='http://old/base/path.html',
+    ...     old_href='http://old/',
+    ...     new_href='https://new/')
+
+Now let's look at simple href rewriting.
+
+Normal rewrite::
+
+    >>> relocate_href('http://old/bar')
+    'https://new/bar'
+
+Note that the trailing / doesn't matter in this one case (since
+``http://old`` and ``http://old/`` are entirely equivalent)::
+
+    >>> relocate_href('http://old')
+    'https://new/'
+
+The trailing / does matter in other cases::
+
+    >>> Relocator(
+    ...     base_href='',
+    ...     old_href='http://old-test/foo/',
+    ...     new_href='https://new',
+    ...     )('http://old-test/foo')
+    'http://old-test/foo'
+    >>> Relocator(
+    ...     base_href='',
+    ...     old_href='http://old-test/foo/',
+    ...     new_href='https://new',
+    ...     )('http://old-test/foo/')
+    'https://new'
+
+Rewriting a link that doesn't match old_href is a no-op::
+
+    >>> relocate_href('http://foo/bar')
+    'http://foo/bar'
+
+Relative links are handled::
+
+    >>> relocate_href('index.html')
+    'https://new/base/index.html'
+
+Now for content.  First, to make it easier on us, we need to trim the
+normalized HTML we get from these functions::
+
+    >>> import re
+    >>> def pr_html(html):
+    ...     html = re.sub(r'</?(?:html|head|body)>', '', html)
+    ...     html = re.sub(r'<meta.*?>', '', html)
+    ...     print html.strip()
+
+Some basics::
+
+    >>> pr_html(rewrite_links_html(
+    ...     '<a href="http://old/blah/blah.html">link</a>', relocate_href))
+    <a href="https://new/blah/blah.html">link</a>
+    >>> pr_html(rewrite_links_html(
+    ...     '<script src="http://old/foo.js"></script>', relocate_href))
+    <script src="https://new/foo.js"></script>
+    >>> pr_html(rewrite_links_html(
+    ...     '<link href="foo.css">', relocate_href))
+    <link href="https://new/base/foo.css">
+    >>> pr_html(rewrite_links_html('''\
+    ... <base href="http://blah/stuff/index.html">
+    ... <link href="foo.css">
+    ... <a href="http://old/bar.html">x</a>\
+    ... ''', relocate_href))
+    <link href="http://blah/stuff/foo.css">
+    <a href="https://new/bar.html">x</a>

Added: lxml/branch/html/src/lxml/html/usedoctest.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/usedoctest.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,3 @@
+from lxml import doctestcompare
+
+doctestcompare.temp_install(html=True)

Added: lxml/branch/html/src/lxml/usedoctest.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/usedoctest.py	Tue May 29 17:05:51 2007
@@ -0,0 +1,3 @@
+from lxml import doctestcompare
+
+doctestcompare.temp_install()


More information about the lxml-checkins mailing list