[Lxml-checkins] r43977 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 20:23:36 CEST 2007


Author: ianb
Date: Fri Jun  1 20:23:35 2007
New Revision: 43977

Added:
   lxml/branch/html/src/lxml/html/diff.py
      - copied, changed from r43962, lxml/branch/html/src/lxml/html/htmldiff.py
   lxml/branch/html/src/lxml/html/tests/test_diff.py
      - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py
   lxml/branch/html/src/lxml/html/tests/test_diff.txt
      - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt
Removed:
   lxml/branch/html/src/lxml/html/htmldiff.py
   lxml/branch/html/src/lxml/html/tests/test_htmldiff.py
   lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt
Modified:
   lxml/branch/html/src/lxml/html/__init__.py
Log:
Remove debugging print; rename htmldiff to diff (lxml.html.diff)

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Fri Jun  1 20:23:35 2007
@@ -316,8 +316,6 @@
         if not isinstance(create_parent, basestring):
             create_parent = 'div'
         return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent))
-    else:
-        print '----------\n', html
     elements = parse_elements(html, no_leading_text=True)
     if not elements:
         raise etree.ParserError(

Copied: lxml/branch/html/src/lxml/html/diff.py (from r43962, lxml/branch/html/src/lxml/html/htmldiff.py)
==============================================================================
--- lxml/branch/html/src/lxml/html/htmldiff.py	(original)
+++ lxml/branch/html/src/lxml/html/diff.py	Fri Jun  1 20:23:35 2007
@@ -770,7 +770,8 @@
             if not _contains_block_level_tag(el):
                 continue
             _move_el_inside_block(el, tag=tag)
-            _merge_element_contents(el)
+            el.drop_tag()
+            #_merge_element_contents(el)
 
 def _contains_block_level_tag(el):
     """True if the element contains any block-level elements, like <p>, <td>, etc.

Deleted: /lxml/branch/html/src/lxml/html/htmldiff.py
==============================================================================
--- /lxml/branch/html/src/lxml/html/htmldiff.py	Fri Jun  1 20:23:35 2007
+++ (empty file)
@@ -1,890 +0,0 @@
-import difflib
-from lxml import etree
-from lxml.html import parse_element
-import cgi
-import re
-
-__all__ = ['html_annotate', 'htmldiff']
-
-
-############################################################
-## Annotation
-############################################################
-
-def default_markup(text, version):
-    return '<span title="%s">%s</span>' % (
-        cgi.escape(unicode(version), 1), text)
-
-def html_annotate(doclist, markup=default_markup):
-    """
-    doclist should be ordered from oldest to newest, like::
-
-        >>> version1 = 'Hello World'
-        >>> version2 = 'Goodbye World'
-        >>> html_annotate([(version1, 'version 1'),
-        ...                (version2, 'version 2')])
-        u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'
-
-    The documents must be *fragments* (str/UTF8 or unicode), not
-    complete documents
-
-    The markup argument is a function to markup the spans of words.
-    This function is called like markup('Hello', 'version 2'), and
-    returns HTML.  The first argument is text and never includes any
-    markup.  The default uses a span with a title:
-
-        >>> default_markup('Some Text', 'by Joe')
-        u'<span title="by Joe">Some Text</span>'
-    """
-    # The basic strategy we have is to split the documents up into
-    # logical tokens (which are words with attached markup).  We then
-    # do diffs of each of the versions to track when a token first
-    # appeared in the document; the annotation attached to the token
-    # is the version where it first appeared.
-    tokenlist = [tokenize_annotated(doc, version)
-                 for doc, version in doclist]
-    cur_tokens = tokenlist[0]
-    for tokens in tokenlist[1:]:
-        html_annotate_merge_annotations(cur_tokens, tokens)
-        cur_tokens = tokens
-
-    # After we've tracked all the tokens, we can combine spans of text
-    # that are adjacent and have the same annotation
-    cur_tokens = compress_tokens(cur_tokens)
-    # And finally add markup
-    result = markup_serialize_tokens(cur_tokens, markup)
-    return ''.join(result).strip()
-
-def tokenize_annotated(doc, annotation): 
-    """Tokenize a document and add an annotation attribute to each token
-    """
-    tokens = tokenize(doc, include_hrefs=False)
-    for tok in tokens: 
-        tok.annotation = annotation
-    return tokens
-
-def html_annotate_merge_annotations(tokens_old, tokens_new): 
-    """Merge the annotations from tokens_old into tokens_new, when the
-    tokens in the new document already existed in the old document.
-    """
-    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
-    commands = s.get_opcodes()
-
-    for command, i1, i2, j1, j2 in commands:
-        if command == 'equal': 
-            eq_old = tokens_old[i1:i2]
-            eq_new = tokens_new[j1:j2]
-            copy_annotations(eq_old, eq_new)
-
-def copy_annotations(src, dest): 
-    """
-    Copy annotations from the tokens listed in src to the tokens in dest
-    """
-    assert len(src) == len(dest)
-    for src_tok, dest_tok in zip(src, dest): 
-        dest_tok.annotation = src_tok.annotation
-
-def compress_tokens(tokens):
-    """
-    Combine adjacent tokens when there is no HTML between the tokens, 
-    and they share an annotation
-    """
-    result = [tokens[0]] 
-    for tok in tokens[1:]: 
-        if (not result[-1].post_tags and 
-            not tok.pre_tags and 
-            result[-1].annotation == tok.annotation): 
-            compress_merge_back(result, tok)
-        else: 
-            result.append(tok)
-    return result
-
-def compress_merge_back(tokens, tok): 
-    """ Merge tok into the last element of tokens (modifying the list of
-    tokens in-place).  """
-    last = tokens[-1]
-    if type(last) is not token or type(tok) is not token: 
-        tokens.append(tok)
-    else:
-        text = unicode(last)
-        if last.trailing_whitespace:
-            text += ' '
-        text += tok
-        merged = token(text,
-                       pre_tags=last.pre_tags,
-                       post_tags=tok.post_tags,
-                       trailing_whitespace=tok.trailing_whitespace)
-        merged.annotation = last.annotation
-        tokens[-1] = merged
-    
-def markup_serialize_tokens(tokens, markup_func):
-    """
-    Serialize the list of tokens into a list of text chunks, calling
-    markup_func around text to add annotations.
-    """
-    for token in tokens:
-        for pre in token.pre_tags:
-            yield pre
-        html = token.html()
-        html = markup_func(html, token.annotation)
-        if token.trailing_whitespace:
-            html += ' '
-        yield html
-        for post in token.post_tags:
-            yield post
-
-
-############################################################
-## HTML Diffs
-############################################################
-
-def htmldiff(old_html, new_html):
-    """ Do a diff of the old and new document.  The documents are HTML
-    *fragments* (str/UTF8 or unicode), they are not complete documents
-    (i.e., no <html> tag).
-
-    Returns HTML with <ins> and <del> tags added around the
-    appropriate text.  
-
-    Markup is generally ignored, with the markup from new_html
-    preserved, and possibly some markup from old_html (though it is
-    considered acceptable to lose some of the old markup).  Only the
-    words in the HTML are diffed.  The exception is <img> tags, which
-    are treated like words, and the href attribute of <a> tags, which
-    are noted inside the tag itself when there are changes.
-    """ 
-    old_html_tokens = tokenize(old_html)
-    new_html_tokens = tokenize(new_html)
-    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
-    result = ''.join(result).strip()
-    return fixup_ins_del_tags(result)
-
-def htmldiff_tokens(html1_tokens, html2_tokens):
-    """ Does a diff on the tokens themselves, returning a list of text
-    chunks (not tokens).
-    """
-    # There are several passes as we do the differences.  The tokens
-    # isolate the portion of the content we care to diff; difflib does
-    # all the actual hard work at that point.  
-    #
-    # Then we must create a valid document from pieces of both the old
-    # document and the new document.  We generally prefer to take
-    # markup from the new document, and only do a best effort attempt
-    # to keep markup from the old document; anything that we can't
-    # resolve we throw away.  Also we try to put the deletes as close
-    # to the location where we think they would have been -- because
-    # we are only keeping the markup from the new document, it can be
-    # fuzzy where in the new document the old text would have gone.
-    # Again we just do a best effort attempt.
-    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
-    commands = s.get_opcodes()
-    result = []
-    for command, i1, i2, j1, j2 in commands:
-        if command == 'equal':
-            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
-            continue
-        if command == 'insert' or command == 'replace':
-            ins_tokens = expand_tokens(html2_tokens[j1:j2])
-            merge_insert(ins_tokens, result)
-        if command == 'delete' or command == 'replace':
-            del_tokens = expand_tokens(html1_tokens[i1:i2])
-            merge_delete(del_tokens, result)
-    # If deletes were inserted directly as <del> then we'd have an
-    # invalid document at this point.  Instead we put in special
-    # markers, and when the complete diffed document has been created
-    # we try to move the deletes around and resolve any problems.
-    result = cleanup_delete(result)
-
-    return result
-
-def expand_tokens(tokens, equal=False):
-    """Given a list of tokens, return a generator of the chunks of
-    text for the data in the tokens.
-    """
-    for token in tokens:
-        for pre in token.pre_tags:
-            yield pre
-        if not equal or not token.hide_when_equal:
-            if token.trailing_whitespace:
-                yield token.html() + ' '
-            else:
-                yield token.html()
-        for post in token.post_tags:
-            yield post
-
-def merge_insert(ins_chunks, doc):
-    """ doc is the already-handled document (as a list of text chunks);
-    here we add <ins>ins_chunks</ins> to the end of that.  """
-    # Though we don't throw away unbalanced_start or unbalanced_end
-    # (we assume there is accompanying markup later or earlier in the
-    # document), we only put <ins> around the balanced portion.
-    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
-    doc.extend(unbalanced_start)
-    if doc and not doc[-1].endswith(' '):
-        # Fix up the case where the word before the insert didn't end with 
-        # a space
-        doc[-1] += ' '
-    doc.append('<ins>')
-    if balanced and balanced[-1].endswith(' '):
-        # We move space outside of </ins>
-        balanced[-1] = balanced[-1][:-1]
-    doc.extend(balanced)
-    doc.append('</ins> ')
-    doc.extend(unbalanced_end)
-
-# These are sentinels to represent the start and end of a <del>
-# segment, until we do the cleanup phase to turn them into proper
-# markup:
-class DEL_START:
-    pass
-class DEL_END:
-    pass
-
-class NoDeletes(Exception):
-    """ Raised when the document no longer contains any pending deletes
-    (DEL_START/DEL_END) """
-
-def merge_delete(del_chunks, doc):
-    """ Adds the text chunks in del_chunks to the document doc (another
-    list of text chunks) with marker to show it is a delete.
-    cleanup_delete later resolves these markers into <del> tags."""
-    doc.append(DEL_START)
-    doc.extend(del_chunks)
-    doc.append(DEL_END)
-
-def cleanup_delete(chunks):
-    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
-    them with <del></del>.  To do this while keeping the document
-    valid, it may need to drop some tags (either start or end tags).
-
-    It may also move the del into adjacent tags to try to move it to a
-    similar location where it was originally located (e.g., moving a
-    delete into preceding <div> tag, if the del looks like (DEL_START,
-    'Text</div>', DEL_END)"""
-    while 1:
-        # Find a pending DEL_START/DEL_END, splitting the document
-        # into stuff-preceding-DEL_START, stuff-inside, and
-        # stuff-following-DEL_END
-        try:
-            pre_delete, delete, post_delete = split_delete(chunks)
-        except NoDeletes:
-            # Nothing found, we've cleaned up the entire doc
-            break
-        # The stuff-inside-DEL_START/END may not be well balanced
-        # markup.  First we figure out what unbalanced portions there are:
-        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
-        # Then we move the span forward and/or backward based on these
-        # unbalanced portions:
-        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
-        doc = pre_delete
-        if doc and not doc[-1].endswith(' '):
-            # Fix up case where the word before us didn't have a trailing space
-            doc[-1] += ' '
-        doc.append('<del>')
-        if balanced and balanced[-1].endswith(' '):
-            # We move space outside of </del>
-            balanced[-1] = balanced[-1][:-1]
-        doc.extend(balanced)
-        doc.append('</del> ')
-        doc.extend(post_delete)
-        chunks = doc
-    return chunks
-
-def split_unbalanced(chunks):
-    """Return (unbalanced_start, balanced, unbalanced_end), where each is
-    a list of text and tag chunks.
-
-    unbalanced_start is a list of all the tags that are opened, but
-    not closed in this span.  Similarly, unbalanced_end is a list of
-    tags that are closed but were not opened.  Extracting these might
-    mean some reordering of the chunks."""
-    start = []
-    end = []
-    tag_stack = []
-    balanced = []
-    for chunk in chunks:
-        if not chunk.startswith('<'):
-            balanced.append(chunk)
-            continue
-        endtag = chunk[1] == '/'
-        name = chunk.split()[0].strip('<>/')
-        if name in empty_tags:
-            assert not endtag, (
-                "Empty tag %r should have no end tag" % chunk)
-            balanced.append(chunk)
-            continue
-        if endtag:
-            if tag_stack and tag_stack[-1][0] == name:
-                balanced.append(chunk)
-                name, pos, tag = tag_stack.pop()
-                balanced[pos] = tag
-            elif tag_stack:
-                start.extend(tag for name, pos, tag in tag_stack)
-                tag_stack = []
-                end.append(chunk)
-            else:
-                end.append(chunk)
-        else:
-            tag_stack.append((name, len(balanced), chunk))
-            balanced.append(None)
-    start.extend(
-        [chunk for name, pos, chunk in tag_stack])
-    balanced = [chunk for chunk in balanced if chunk is not None]
-    return start, balanced, end
-
-def split_delete(chunks):
-    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
-    stuff_after_DEL_END).  Returns the first case found (there may be
-    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
-    there's no DEL_START found. """
-    try:
-        pos = chunks.index(DEL_START)
-    except ValueError:
-        raise NoDeletes
-    pos2 = chunks.index(DEL_END)
-    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
-
-def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-    """ pre_delete and post_delete implicitly point to a place in the
-    document (where the two were split).  This moves that point (by
-    popping items from one and pushing them onto the other).  It moves
-    the point to try to find a place where unbalanced_start applies.
-
-    As an example::
-
-        >>> unbalanced_start = ['<div>']
-        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
-        >>> pre, post = doc[:3], doc[3:]
-        >>> pre, post
-        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
-        >>> locate_unbalanced_start(unbalanced_start, pre, post)
-        >>> pre, post
-        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
-
-    As you can see, we moved the point so that the dangling <div> that
-    we found will be effectively replaced by the div in the original
-    document.  If this doesn't work out, we just throw away
-    unbalanced_start without doing anything.
-    """
-    while 1:
-        if not unbalanced_start:
-            # We have totally succeeded in finding the position
-            break
-        finding = unbalanced_start[0]
-        finding_name = finding.split()[0].strip('<>')
-        if not post_delete:
-            break
-        next = post_delete[0]
-        if next is DEL_START or not next.startswith('<'):
-            # Reached a word, we can't move the delete text forward
-            break
-        if next[1] == '/':
-            # Reached a closing tag, can we go further?  Maybe not...
-            break
-        name = next.split()[0].strip('<>')
-        if name == 'ins':
-            # Can't move into an insert
-            break
-        assert name != 'del', (
-            "Unexpected delete tag: %r" % next)
-        if name == finding_name:
-            unbalanced_start.pop(0)
-            pre_delete.append(post_delete.pop(0))
-        else:
-            # Found a tag that doesn't match
-            break
-
-def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-    """ like locate_unbalanced_start, except handling end tags and
-    possibly moving the point earlier in the document.  """
-    while 1:
-        if not unbalanced_end:
-            # Success
-            break
-        finding = unbalanced_end[-1]
-        finding_name = finding.split()[0].strip('<>/')
-        if not pre_delete:
-            break
-        next = pre_delete[-1]
-        if next is DEL_END or not next.startswith('</'):
-            # A word or a start tag
-            break
-        name = next.split()[0].strip('<>/')
-        if name == 'ins' or name == 'del':
-            # Can't move into an insert or delete
-            break
-        if name == finding_name:
-            unbalanced_end.pop()
-            post_delete.insert(0, pre_delete.pop())
-        else:
-            # Found a tag that doesn't match
-            break
-
-class token(unicode):
-    """ Represents a diffable token, generally a word that is displayed to
-    the user.  Opening tags are attached to this token when they are
-    adjacent (pre_tags) and closing tags that follow the word
-    (post_tags).  Some exceptions occur when there are empty tags
-    adjacent to a word, so there may be close tags in pre_tags, or
-    open tags in post_tags.
-
-    We also keep track of whether the word was originally followed by
-    whitespace, even though we do not want to treat the word as
-    equivalent to a similar word that does not have a trailing
-    space."""
-
-    # When this is true, the token will be eliminated from the
-    # displayed diff if no change has occurred:
-    hide_when_equal = False
-
-    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
-        obj = unicode.__new__(cls, text)
-
-        if pre_tags is not None:
-            obj.pre_tags = pre_tags
-        else:
-            obj.pre_tags = []
-
-        if post_tags is not None:
-            obj.post_tags = post_tags
-        else:
-            obj.post_tags = []
-
-        obj.trailing_whitespace = trailing_whitespace
-
-        return obj
-
-    def __repr__(self):
-        return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags)
-
-    def html(self):
-        return unicode(self)
-
-class tag_token(token):
-
-    """ Represents a token that is actually a tag.  Currently this is just
-    the <img> tag, which takes up visible space just like a word but
-    is only represented in a document by a tag.  """
-
-    def __new__(cls, tag, data, html_repr, pre_tags=None, 
-                post_tags=None, trailing_whitespace=False):
-        obj = token.__new__(cls, "%s: %s" % (type, data), 
-                            pre_tags=pre_tags, 
-                            post_tags=post_tags, 
-                            trailing_whitespace=trailing_whitespace)
-        obj.tag = tag
-        obj.data = data
-        obj.html_repr = html_repr
-        return obj
-
-    def __repr__(self):
-        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
-            self.tag, 
-            self.data, 
-            self.html_repr, 
-            self.pre_tags, 
-            self.post_tags, 
-            self.trailing_whitespace)
-    def html(self):
-        return self.html_repr
-
-class href_token(token):
-
-    """ Represents the href in an anchor tag.  Unlike other words, we only
-    show the href when it changes.  """
-
-    hide_when_equal = True
-
-    def html(self):
-        return 'Link: %s' % self
-
-def tokenize(html, include_hrefs=True):
-    """
-    Parse the given HTML and returns token objects (words with attached tags).
-
-    This parses only the content of a page; anything in the head is
-    ignored, and the <head> and <body> elements are themselves
-    optional.  The content is then parsed by lxml, which ensures the
-    validity of the resulting parsed document (though lxml may make
-    incorrect guesses when the markup is particularly bad).
-
-    <ins> and <del> tags are also eliminated from the document, as
-    that gets confusing.
-
-    If include_hrefs is true, then the href attribute of <a> tags is
-    included as a special kind of diffable token."""
-    body_el = parse_html(html, cleanup=True)
-    # Then we split the document into text chunks for each tag, word, and end tag:
-    chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs)
-    # Finally re-joining them into token objects:
-    return fixup_chunks(chunks)
-
-def parse_html(html, cleanup=True):
-    """
-    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
-    wrapped in a <div> tag that was not in the original document.
-
-    If cleanup is true, make sure there's no <head> or <body>, and get
-    rid of any <ins> and <del> tags.
-    """
-    if cleanup:
-        # This removes any extra markup or structure like <head>:
-        html = cleanup_html(html)
-    return parse_element(html, create_parent=True)
-
-_body_re = re.compile(r'<body.*?>', re.I|re.S)
-_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
-_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
-
-def cleanup_html(html):
-    """ This 'cleans' the HTML, meaning that any page structure is removed
-    (only the contents of <body> are used, if there is any <body>).
-    Also <ins> and <del> tags are removed.  """
-    match = _body_re.search(html)
-    if match:
-        html = html[match.end():]
-    match = _end_body_re.search(html)
-    if match:
-        html = html[:match.start()]
-    html = _ins_del_re.sub('', html)
-    return html
-    
-
-end_whitespace_re = re.compile(r'[ \t\n\r]$')
-
-def fixup_chunks(chunks):
-    """
-    This function takes a list of chunks and produces a list of tokens.
-    """
-    tag_accum = []
-    cur_word = None
-    result = []
-    for chunk in chunks:
-        if isinstance(chunk, tuple):
-            if chunk[0] == 'img':
-                src = chunk[1]
-                tag = chunk[2]
-                if tag.endswith(' '):
-                    tag = tag[:-1]
-                    trailing_whitespace = True
-                else:
-                    trailing_whitespace = False
-                cur_word = tag_token('img', src, html_repr=tag,
-                                     pre_tags=tag_accum,
-                                     trailing_whitespace=trailing_whitespace)
-                tag_accum = []
-                result.append(cur_word)
-            elif chunk[0] == 'href':
-                href = chunk[1]
-                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
-                tag_accum = []
-                result.append(cur_word)
-            continue
-        if is_word(chunk):
-            if chunk.endswith(' '):
-                chunk = chunk[:-1]
-                trailing_whitespace = True
-            else:
-                trailing_whitespace = False
-            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
-            tag_accum = []
-            result.append(cur_word)
-        elif is_start_tag(chunk):
-            tag_accum.append(chunk)
-        elif is_end_tag(chunk):
-            if tag_accum:
-                tag_accum.append(chunk)
-            else:
-                assert cur_word, (
-                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
-                    % (cur_word, result, chunk, chunks))
-                cur_word.post_tags.append(chunk)
-        else:
-            assert(0)
-
-    if not result:
-        return [token('', pre_tags=tag_accum)]
-    else:
-        result[-1].post_tags.extend(tag_accum)
-
-    return result
-
-
-# All the tags in HTML that don't require end tags:
-empty_tags = (
-    'param', 'img', 'area', 'br', 'basefont', 'input',
-    'base', 'meta', 'link', 'col')
-
-block_level_tags = (
-    'address',
-    'blockquote',
-    'center',
-    'dir',
-    'div',
-    'dl',
-    'fieldset',
-    'form',
-    'h1',
-    'h2',
-    'h3',
-    'h4',
-    'h5',
-    'h6',
-    'hr',
-    'isindex',
-    'menu',
-    'noframes',
-    'noscript',
-    'ol',
-    'p',
-    'pre',
-    'table',
-    'ul',
-    )
-
-block_level_container_tags = (
-    'dd',
-    'dt',
-    'frameset',
-    'li',
-    'tbody',
-    'td',
-    'tfoot',
-    'th',
-    'thead',
-    'tr',
-    )
-
-
-def flatten_el(el, include_hrefs, drop_tag=False):
-    """ Takes an lxml element el, and generates all the text chunks for
-    that tag.  Each start tag is a chunk, each word is a chunk, and each
-    end tag is a chunk.
-
-    If drop_tag is true, then the outermost container tag is
-    not returned (just its contents)."""
-    if not drop_tag:
-        if el.tag == 'img':
-            yield ('img', el.attrib['src'], start_tag(el))
-        else:
-            yield start_tag(el)
-    if el.tag in empty_tags and not el.text and not len(el):
-        return
-    start_words = split_words(el.text)
-    for word in start_words:
-        yield cgi.escape(word)
-    for child in el:
-        for item in flatten_el(child, include_hrefs=include_hrefs):
-            yield item
-    if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
-        yield ('href', el.attrib['href'])
-    if not drop_tag:
-        yield end_tag(el)
-        end_words = split_words(el.tail)
-        for word in end_words:
-            yield cgi.escape(word)
-
-def split_words(text):
-    """ Splits some text into words. Includes trailing whitespace (one
-    space) on each word when appropriate.  """
-    if not text or not text.strip():
-        return []
-    words = [w + ' ' for w in text.strip().split()]
-    if not end_whitespace_re.search(text):
-        words[-1] = words[-1][:-1]
-    return words
-
-start_whitespace_re = re.compile(r'^[ \t\n\r]')
-
-def start_tag(el):
-    """
-    The text representation of the start tag for a tag.
-    """
-    return '<%s%s>' % (
-        el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True))
-                        for name, value in el.attrib.items()))
-
-def end_tag(el):
-    """ The text representation of an end tag for a tag.  Includes
-    trailing whitespace when appropriate.  """
-    if el.tail and start_whitespace_re.search(el.tail):
-        extra = ' '
-    else:
-        extra = ''
-    return '</%s>%s' % (el.tag, extra)
-
-def is_word(tok):
-    return not tok.startswith('<')
-
-def is_end_tag(tok):
-    return tok.startswith('</')
-
-def is_start_tag(tok):
-    return tok.startswith('<') and not tok.startswith('</')
-
-def fixup_ins_del_tags(html):
-    """ Given an html string, move any <ins> or <del> tags inside of any
-    block-level elements, e.g. transform <ins><p>word</p></ins> to
-    <p><ins>word</ins></p> """
-    doc = parse_html(html, cleanup=False)
-    _fixup_ins_del_tags(doc)
-    html = serialize_html_fragment(doc, drop_outer=True)
-    return html
-
-def serialize_html_fragment(el, drop_outer=False):
-    """ Serialize a single lxml element as HTML.  The serialized form
-    includes the element's tail.
-
-    If drop_outer is true, then don't serialize the outermost tag
-    """
-    
-    html_xsl = """\
-<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-  <xsl:output method="html" encoding="UTF-8" />
-  <xsl:template match="/">
-    <xsl:copy-of select="."/>
-  </xsl:template>
-</xsl:transform>
-"""
-    transform = etree.XSLT(etree.XML(html_xsl))
-    assert not isinstance(el, basestring), (
-        "You should pass in an element, not a string like %r" % el)
-    html = str(transform(el))
-    if drop_outer:
-        # Get rid of the extra starting tag:
-        html = html[html.find('>')+1:]
-    if drop_outer:
-        # Get rid of the extra end tag:
-        html = html[:html.rfind('<')]
-    if drop_outer:
-        return html.strip()
-    else:
-        return html.lstrip()
-
-def _fixup_ins_del_tags(doc):
-    """fixup_ins_del_tags that works on an lxml document in-place
-    """
-    for tag in ['ins', 'del']:
-        for el in doc.xpath('descendant-or-self::%s' % tag):
-            if not _contains_block_level_tag(el):
-                continue
-            _move_el_inside_block(el, tag=tag)
-            _merge_element_contents(el)
-
-def _contains_block_level_tag(el):
-    """True if the element contains any block-level elements, like <p>, <td>, etc.
-    """
-    if el.tag in block_level_tags or el.tag in block_level_container_tags:
-        return True
-    for child in el:
-        if _contains_block_level_tag(child):
-            return True
-    return False
-
-def _move_el_inside_block(el, tag):
-    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
-    and moves them inside any block-level tags.  """
-    for child in el:
-        if _contains_block_level_tag(child):
-            break
-    else:
-        import sys
-        # No block-level tags in any child
-        children_tag = etree.Element(tag)
-        children_tag.text = el.text
-        el.text = None
-        children_tag.extend(list(el))
-        el[:] = [children_tag]
-        return
-    for child in list(el):
-        if _contains_block_level_tag(child):
-            _move_el_inside_block(child, tag)
-            if child.tail:
-                tail_tag = etree.Element(tag)
-                tail_tag.text = child.tail
-                child.tail = None
-                el.insert(el.index(child)+1, tail_tag)
-        else:
-            child_tag = etree.Element(tag)
-            el.replace(child, child_tag)
-            child_tag.append(child)
-    if el.text:
-        text_tag = etree.Element(tag)
-        text_tag.text = el.text
-        el.text = None
-        el.insert(0, text_tag)
-            
-def _merge_element_contents(el):
-    """
-    Removes an element, but merges its contents into its place, e.g.,
-    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
-    <p>Hi there!</p>
-    """
-    parent = el.getparent()
-    text = el.text or ''
-    if el.tail:
-        if not len(el):
-            text += el.tail
-        else:
-            if el[-1].tail:
-                el[-1].tail += el.tail
-            else:
-                el[-1].tail = el.tail
-    index = parent.index(el)
-    if text:
-        if index == 0:
-            previous = None
-        else:
-            previous = parent[index-1]
-        if previous is None:
-            if parent.text:
-                parent.text += text
-            else:
-                parent.text = text
-        else:
-            if previous.tail:
-                previous.tail += text
-            else:
-                previous.tail = text
-    parent[index:index+1] = el.getchildren()
-
-class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
-    """
-    Acts like SequenceMatcher, but tries not to find very small equal
-    blocks amidst large spans of changes
-    """
-
-    threshold = 2
-    
-    def get_matching_blocks(self):
-        size = min(len(self.b), len(self.b))
-        threshold = min(self.threshold, size / 4)
-        actual = difflib.SequenceMatcher.get_matching_blocks(self)
-        return [item for item in actual
-                if item[2] > threshold
-                or not item[2]]
-    
-# def get_matching_blocks(self):
-#         size = min(len(self.b), len(self.b))
-#         threshold = min(self.threshold, size / 4)
-#         actual = difflib.SequenceMatcher.get_matching_blocks(self)
-#         last_equal_a = 0
-#         eliminate = []
-#         for i in xrange(1, len(actual)-1):
-#             start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2])
-#             end_diff_length = actual[i+1][0]
-#         for a_pos, b_pos, length in actual:
-#             if (last_equal_a - a_pos is big
-#                 and length is small
-#                 and next_equal_a is far away):
-#                 continue
-#             result.append((a_pos, b_pos, length))
-#             last_equal_a = a_pos+length
-#         return result
-            
-
-if __name__ == '__main__':
-    import doctest
-    doctest.testmod()
-

Copied: lxml/branch/html/src/lxml/html/tests/test_diff.py (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py)
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_htmldiff.py	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_diff.py	Fri Jun  1 20:23:35 2007
@@ -1,12 +1,12 @@
 import unittest
 from lxml.tests.common_imports import doctest
 
-from lxml.html import htmldiff
+from lxml.html import diff
 
 def test_suite():
     suite = unittest.TestSuite()
-    suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'),
-                    doctest.DocTestSuite(htmldiff)])
+    suite.addTests([doctest.DocFileSuite('test_diff.txt'),
+                    doctest.DocTestSuite(diff)])
     return suite
 
 if __name__ == '__main__':

Copied: lxml/branch/html/src/lxml/html/tests/test_diff.txt (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt)
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_diff.txt	Fri Jun  1 20:23:35 2007
@@ -1,4 +1,4 @@
-htmldiff does HTML comparisons.  These are word-based comparisons.
+lxml.html.diff does HTML comparisons.  These are word-based comparisons.
 
 First, a handy function for normalizing whitespace and doing word wrapping::
 
@@ -12,7 +12,7 @@
 
 Example::
 
-    >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate
+    >>> from lxml.html.diff import htmldiff, html_annotate
     >>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
     >>> html2 = '''<p>This is some test textual writing with some changed stuff 
     ... and some same stuff</p>'''
@@ -187,36 +187,13 @@
     <p><a href="/foo"><span version="0">Hey</span> <span
     version="1">Guy</span></a></p>
 
+Internals
+---------
 
 
-Here's a test of a utility function!:
+Some utility functions::
 
-    >>> from lxml.html.htmldiff import _merge_element_contents
-    >>> from lxml import etree
-    >>> doc = '''<html><body><div>
-    ... <div id="c1">a b <span id="d1">content</span> c d</div>
-    ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div>
-    ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div>
-    ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div>
-    ... </div></body></html>'''
-    >>> doc = etree.HTML(doc)
-    >>> def show_result(id):
-    ...     el = doc.xpath("//*[@id='d%s']" % id)[0]
-    ...     _merge_element_contents(el)
-    ...     container = doc.xpath("//*[@id='c%s']" % id)[0]
-    ...     print etree.tostring(container).strip()
-    >>> show_result(1)
-    <div id="c1">a b content c d</div>
-    >>> show_result(2)
-    <div id="c2">content <b>and more</b> stuff trailing</div>
-    >>> show_result(3)
-    <div id="c3"><b>hi</b><i>content</i></div>
-    >>> show_result(4)
-    <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div>
-
-More utility:
-
-    >>> from lxml.html.htmldiff import fixup_ins_del_tags
+    >>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced
     >>> def pfixup(text):
     ...     print fixup_ins_del_tags(text).strip()
     >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
@@ -233,7 +210,7 @@
     </tr></table>
 
 
-Testing split_unbalanced:
+Testing split_unbalanced::
 
     >>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
     ([], ['<a href="blah">', 'hey', '</a>'], [])

Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py
==============================================================================
--- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py	Fri Jun  1 20:23:35 2007
+++ (empty file)
@@ -1,13 +0,0 @@
-import unittest
-from lxml.tests.common_imports import doctest
-
-from lxml.html import htmldiff
-
-def test_suite():
-    suite = unittest.TestSuite()
-    suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'),
-                    doctest.DocTestSuite(htmldiff)])
-    return suite
-
-if __name__ == '__main__':
-    unittest.main()

Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt
==============================================================================
--- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt	Fri Jun  1 20:23:35 2007
+++ (empty file)
@@ -1,248 +0,0 @@
-htmldiff does HTML comparisons.  These are word-based comparisons.
-
-First, a handy function for normalizing whitespace and doing word wrapping::
-
-    >>> import re, textwrap
-    >>> def pwrapped(text):
-    ...     text = re.sub(r'[ \n\t\r]+', ' ', text)
-    ...     text = textwrap.fill(text)
-    ...     print text
-    >>> def pdiff(text1, text2):
-    ...     pwrapped(htmldiff(text1, text2))
-
-Example::
-
-    >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate
-    >>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
-    >>> html2 = '''<p>This is some test textual writing with some changed stuff 
-    ... and some same stuff</p>'''
-    >>> pdiff(html1, html2)
-    <p>This is some test <ins>textual writing with some changed
-    stuff</ins> <del>text with some changes</del> and some same stuff</p>
-
-Style tags are largely ignored in terms of differences, though markup is not eliminated::
-
-    >>> html1 = '<p>Hi <i>you guys</i></p>'
-    >>> html2 = '<p>Hi <i>you</i> guys</p>'
-    >>> pdiff(html1, html2)
-    <p>Hi <i>you</i> guys</p>
-    >>> pdiff('text', '<p>text</p>')
-    <p>text</p>
-    >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
-    <i>Hi <ins>guy</ins> <del>guys</del> </i> !!
-    >>> pdiff('H<i>i</i>', 'Hi')
-    <ins>Hi</ins> <del>H<i>i</i></del>
-    >>> pdiff('<i>A B</i> C', '<i>A</i> C')
-    <i>A <del>B</del> </i> C
-    >>> pdiff('<i>A B</i> C', '<i>B</i> C')
-    <i> <del>A</del> B</i> C
-    >>> pdiff('<p></p>', '<p></p>')
-    <p></p>
-    >>> pdiff('<p>Hi</p>', '<p>Bye</p>')
-    <p><ins>Bye</ins></p> <p><del>Hi</del></p>
-    >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
-    <p> <ins>Bye</ins> <del>Hi</del> Guy</p>
-    >>> pdiff('<p>Hey there</p>', '')
-    <ins></ins> <p><del>Hey there</del></p>
-
-Whitespace is ignored, as it's not meaningful in HTML::
-
-    >>> pdiff('<div>Hi\n\nguys</div>', '<div>Hi guy</div>')
-    <div>Hi <ins>guy</ins> <del>guys</del> </div>
-
-Movement between paragraphs is ignored, as tag-based changes are generally ignored::
-    >>> 
-    >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
-    <p>Hello World</p>
-
-As a special case, changing the href of a link is displayed, and
-images are treated like words:
-
-    >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
-    <a href="http://google.com">search <ins>Link: http://google.com</ins>
-    <del>Link: http://yahoo.com</del> </a>
-    >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
-    <p>Print this <del><img src="print.gif"></del> </p>
-    >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
-    <a href="http://yahoo.com">search</a>
-
-The sixteen combinations::
-
-First "insert start" (del start/middle/end/none):
-
-    >>> pdiff('<b>A B C</b>', '<b>D B C</b')
-    <b> <ins>D</ins> <del>A</del> B C</b>
-    >>> pdiff('<b>A B C</b>', '<b>D A C</b>')
-    <b> <ins>D</ins> A <del>B</del> C</b>
-    >>> pdiff('<b>A B C</b>', '<b>D A B</b>')
-    <b> <ins>D</ins> A B <del>C</del> </b>
-    >>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
-    <b> <ins>D</ins> A B C</b>
-
-Next, "insert middle" (del start/middle/end/none):
-
-    >>> pdiff('<b>A B C</b>', '<b>D B C</b>')
-    <b> <ins>D</ins> <del>A</del> B C</b>
-    >>> pdiff('<b>A B C</b>', '<b>A D C</b>')
-    <b>A <ins>D</ins> <del>B</del> C</b>
-    >>> pdiff('<b>A B C</b>', '<b>A D B</b>')
-    <b>A <ins>D</ins> B <del>C</del> </b>
-
-This one case hits the threshold of our insensitive matching:
-
-    >>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
-    <b> <ins>A D</ins> <del>A</del> B C</b>
-
-
-Then "insert end" (del start/middle/end/none):
-
-    >>> pdiff('<b>A B C</b>', '<b>B C D</b>')
-    <b> <del>A</del> B C <ins>D</ins> </b>
-    >>> pdiff('<b>A B C</b>', '<b>A C D</b>')
-    <b>A <del>B</del> C <ins>D</ins> </b>
-    >>> pdiff('<b>A B C</b>', '<b>A B D</b>')
-    <b>A B <ins>D</ins> <del>C</del> </b>
-    >>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
-    <b>A B C <ins>D</ins> </b>
-
-Then no insert (del start/middle/end):
-
-    >>> pdiff('<b>A B C</b>', '<b>B C</b>')
-    <b> <del>A</del> B C</b>
-    >>> pdiff('<b>A B C</b>', '<b>A C</b>')
-    <b>A <del>B</del> C</b>
-    >>> pdiff('<b>A B C</b>', '<b>A B</b>')
-    <b>A B <del>C</del> </b>
-
-    >>> pdiff('<b>A B</b> C', '<b>A B</b>')
-    <b>A B</b> <del>C</del>
-    >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
-    <b>A B</b> <del><b>C</b></del>
-    >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
-    A <p><del><b>hey there</b> <i>how are you?</i></del></p>
-    
-Testing a larger document, to make sure there are not weird
-unnecessary parallels found:
-
-    >>> pdiff('''
-    ... <p>This is a test document with many words in it that goes on
-    ... for a while and doesn't have anything do to with the next
-    ... document that we match this against</p>''', '''
-    ... <p>This is another document with few similarities to the preceding
-    ... one, but enough that it may have overlap that could turn into
-    ... a confusing series of deletes and inserts.
-    ... </p>''')
-    <p><ins>This is another document with few similarities to the
-    preceding one, but enough that it may have overlap that could turn
-    into a confusing series of deletes and inserts. </ins></p>
-    <p><del>This is a test document with many words in it that goes on for
-    a while and doesn't have anything do to with the next document that we
-    match this against</del></p>
-
-
-
-Annotation of content can also be done, where every bit of content is
-marked up with information about where it came from.
-
-First, some setup; note that html_annotate is called with a sequence
-of documents and the annotation associated with that document.  We'll
-just use indexes, but you could use author or timestamp information.
-
-    >>> def markup(text, annotation):
-    ...     return '<span version="%s">%s</span>' % (annotation, text)
-    >>> def panno(*docs):
-    ...     pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
-    ...                            markup=markup))
-
-Now, a sequence of documents:
-
-    >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
-    <span version="1">Hi</span> <span version="0">world</span>
-    >>> panno('A similar document', 'A similar document',
-    ...       'A similar document here')
-    <span version="0">A similar document</span> <span
-    version="2">here</span>
-    >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
-    <p><span version="0">P1 para</span></p><p><span version="1">P3
-    foo</span></p>
-    >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
-    <span version="0">Hello</span><p><span version="0">There</span> <span
-    version="1">Town</span></p>
-    >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
-    <p><span version="0">Hello</span></p><span version="0">There</span>
-    <span version="1">Town</span>
-    >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
-    <p><span version="0">Hello</span></p><p><span version="0">There</span>
-    <span version="1">Town</span></p>
-    >>> panno('<p>Hi <img src="/foo"> You</p>',
-    ...       '<p>Hi You</p>',
-    ...       '<p>Hi You <img src="/bar"></p>')
-    <p><span version="0">Hi</span> <span version="1">You</span> <span
-    version="2"><img src="/bar"></span></p>
-    >>> panno('<p><a href="/foo">Hey</a></p>',
-    ...       '<p><a href="/bar">Hey</a></p>')
-    <p><a href="/bar"><span version="0">Hey</span></a></p>
-    >>> panno('<p><a href="/foo">Hey You</a></p>',
-    ...       '<p><a href="/foo">Hey Guy</a></p>')
-    <p><a href="/foo"><span version="0">Hey</span> <span
-    version="1">Guy</span></a></p>
-
-
-
-Here's a test of a utility function!:
-
-    >>> from lxml.html.htmldiff import _merge_element_contents
-    >>> from lxml import etree
-    >>> doc = '''<html><body><div>
-    ... <div id="c1">a b <span id="d1">content</span> c d</div>
-    ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div>
-    ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div>
-    ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div>
-    ... </div></body></html>'''
-    >>> doc = etree.HTML(doc)
-    >>> def show_result(id):
-    ...     el = doc.xpath("//*[@id='d%s']" % id)[0]
-    ...     _merge_element_contents(el)
-    ...     container = doc.xpath("//*[@id='c%s']" % id)[0]
-    ...     print etree.tostring(container).strip()
-    >>> show_result(1)
-    <div id="c1">a b content c d</div>
-    >>> show_result(2)
-    <div id="c2">content <b>and more</b> stuff trailing</div>
-    >>> show_result(3)
-    <div id="c3"><b>hi</b><i>content</i></div>
-    >>> show_result(4)
-    <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div>
-
-More utility:
-
-    >>> from lxml.html.htmldiff import fixup_ins_del_tags
-    >>> def pfixup(text):
-    ...     print fixup_ins_del_tags(text).strip()
-    >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
-    <p><ins>some text <b>and more text</b> and more</ins></p>
-    >>> pfixup('<p><ins>Hi!</ins> you</p>')
-    <p><ins>Hi!</ins> you</p>
-    >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
-    <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
-    >>> pfixup('''
-    ...    <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
-    <table><tr>
-    <td><ins>One table</ins></td>
-    <td><ins>More stuff</ins></td>
-    </tr></table>
-
-
-Testing split_unbalanced:
-
-    >>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
-    ([], ['<a href="blah">', 'hey', '</a>'], [])
-    >>> split_unbalanced(['<a href="blah">', 'hey'])
-    (['<a href="blah">'], ['hey'], [])
-    >>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
-    ([], ['Hey', 'You'], ['</i>', '</b>'])
-    >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
-    ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
-    >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
-    (['<b>'], ['So', 'Hi', 'There'], ['</i>'])
-    


More information about the lxml-checkins mailing list