[Lxml-checkins] r43854 - in lxml/branch/html: . src/lxml src/lxml/html src/lxml/html/tests
ianb at codespeak.net
ianb at codespeak.net
Tue May 29 17:05:52 CEST 2007
Author: ianb
Date: Tue May 29 17:05:51 2007
New Revision: 43854
Added:
lxml/branch/html/
- copied from r43853, lxml/trunk/
lxml/branch/html/src/lxml/doctestcompare.py (contents, props changed)
lxml/branch/html/src/lxml/html/
lxml/branch/html/src/lxml/html/__init__.py (contents, props changed)
lxml/branch/html/src/lxml/html/clean.py (contents, props changed)
lxml/branch/html/src/lxml/html/defs.py (contents, props changed)
lxml/branch/html/src/lxml/html/htmldiff.py (contents, props changed)
lxml/branch/html/src/lxml/html/rewritelinks.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/
lxml/branch/html/src/lxml/html/tests/__init__.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_basic.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_basic.txt (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_clean.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_clean.txt (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_htmldiff.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (contents, props changed)
lxml/branch/html/src/lxml/html/usedoctest.py (contents, props changed)
lxml/branch/html/src/lxml/usedoctest.py (contents, props changed)
Modified:
lxml/branch/html/setup.py
Log:
Branch with lxml.html work
Modified: lxml/branch/html/setup.py
==============================================================================
--- lxml/trunk/setup.py (original)
+++ lxml/branch/html/setup.py Tue May 29 17:05:51 2007
@@ -70,7 +70,7 @@
],
package_dir = {'': 'src'},
- packages = ['lxml'],
+ packages = ['lxml', 'lxml.html'],
zip_safe = False,
ext_modules = setupinfo.ext_modules(
STATIC_INCLUDE_DIRS, STATIC_LIBRARY_DIRS, STATIC_CFLAGS),
Added: lxml/branch/html/src/lxml/doctestcompare.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/doctestcompare.py Tue May 29 17:05:51 2007
@@ -0,0 +1,395 @@
+"""
+lxml-based doctest output comparison.
+
+To use this you must call ``lxmldoctest.install()``, which will cause
+doctest to use this in all subsequent calls.
+
+This changes the way output is checked and comparisons are made for
+XML or HTML-like content.
+
+XML or HTML content is noticed because the example starts with ``<``
+(it's HTML if it starts with ``<html``). You can also use the
+``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
+
+Some rough wildcard-like things are allowed. Whitespace is generally
+ignored (except in attributes). In text (attributes and text in the
+body) you can use ``...`` as a wildcard. In an example it also
+matches any trailing tags in the element, though it does not match
+leading tags. You may create a tag ``<any>`` or include an ``any``
+attribute in the tag. An ``any`` tag matches any tag, while the
+attribute matches any and all attributes.
+
+When a match fails, the reformatted example and the actual output are
+displayed (indented), followed by a rough diff-like output. Anything
+marked with ``-`` is in the output but wasn't supposed to be, and
+similarly ``+`` means it's in the example but wasn't in the output.
+"""
+
+from lxml import etree
+from lxml.html import HTML
+import re
+import doctest
+import cgi
+
+PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
+PARSE_XML = doctest.register_optionflag('PARSE_XML')
+
+OutputChecker = doctest.OutputChecker
+
+def strip(v):
+    """
+    Return ``v`` without surrounding whitespace; ``None`` is passed
+    through unchanged.
+    """
+    if v is not None:
+        return v.strip()
+    return None
+
+class LXMLOutputChecker(OutputChecker):
+
+ empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+ default_parser = etree.XML
+
+    def check_output(self, want, got, optionflags):
+        """
+        Return True if ``got`` matches the example output ``want``.
+
+        When ``get_parser`` selects a parser, both strings are parsed and
+        compared structurally with ``compare_docs``; otherwise the plain
+        doctest comparison is used.  Output that fails to parse never
+        matches.
+        """
+        # temp_install() copies this function's code into the check_output
+        # of an already-existing checker.  In that case ``self`` is the old
+        # checker, and _RestoreChecker has stored the real checker and a
+        # way to reach the original implementation on it.
+        alt_self = getattr(self, '_temp_override_self', None)
+        if alt_self is not None:
+            super_method = self._temp_call_super_check_output
+            self = alt_self
+        else:
+            super_method = OutputChecker.check_output
+        parser = self.get_parser(want, got, optionflags)
+        if not parser:
+            # Not XML/HTML-looking: fall back to the standard comparison.
+            return super_method(
+                self, want, got, optionflags)
+        try:
+            want_doc = parser(want)
+        except etree.XMLSyntaxError:
+            return False
+        try:
+            got_doc = parser(got)
+        except etree.XMLSyntaxError:
+            return False
+        return self.compare_docs(want_doc, got_doc)
+
+    def get_parser(self, want, got, optionflags):
+        """
+        Pick the parser for an example, or None for plain text comparison.
+
+        The ``PARSE_HTML`` and ``PARSE_XML`` option flags take precedence
+        (in that order); otherwise the decision is based on how the
+        example output ``want`` starts.
+        """
+        if optionflags & PARSE_HTML:
+            return HTML
+        if optionflags & PARSE_XML:
+            return etree.XML
+        example = want.strip()
+        if example.lower().startswith('<html'):
+            return HTML
+        if example.startswith('<'):
+            return self.default_parser
+        return None
+
+    def compare_docs(self, want, got):
+        """
+        Recursively compare the expected element ``want`` with the actual
+        element ``got``; return True if they match.
+
+        A ``want`` tag named ``any`` matches every tag, and an ``any``
+        attribute on ``want`` disables attribute comparison.
+        """
+        if want.tag != got.tag and want.tag != 'any':
+            return False
+        text_pairs = ((want.text, got.text), (want.tail, got.tail))
+        for expected, actual in text_pairs:
+            if not self.text_compare(expected, actual, True):
+                return False
+        if 'any' not in want.attrib:
+            expected_names = sorted(want.attrib.keys())
+            if expected_names != sorted(got.attrib.keys()):
+                return False
+            for name in expected_names:
+                if not self.text_compare(
+                    want.attrib[name], got.attrib[name], False):
+                    return False
+        if want.text == '...' and not len(want):
+            # A bare ``...`` body matches whatever the element contains.
+            return True
+        want_children = list(want)
+        got_children = list(got)
+        while want_children or got_children:
+            if not (want_children and got_children):
+                # One side ran out of children before the other.
+                return False
+            want_first = want_children.pop(0)
+            got_first = got_children.pop(0)
+            if not self.compare_docs(want_first, got_first):
+                return False
+            if want_first.tail == '...' and not got_children:
+                break
+        return True
+
+    def text_compare(self, want, got, strip):
+        """
+        Compare two text values, treating ``...`` in ``want`` as a
+        wildcard.  ``None`` counts as the empty string; if ``strip`` is
+        true, surrounding whitespace is ignored on both sides.
+        """
+        expected = want or ''
+        actual = got or ''
+        if strip:
+            expected = expected.strip()
+            actual = actual.strip()
+        # Escape everything, then turn the escaped ellipsis into a wildcard.
+        pattern = '^%s$' % re.escape(expected)
+        pattern = pattern.replace(r'\.\.\.', '.*')
+        return re.search(pattern, actual) is not None
+
+    def output_difference(self, example, got, optionflags):
+        """
+        Return the failure report for ``example`` given actual output
+        ``got``.
+
+        If no parser applies, or either side fails to parse, the standard
+        doctest report is returned (preceded by the parse errors, if any).
+        Otherwise the report contains the reformatted expected document,
+        the reformatted actual document and a rough diff of the two.
+        """
+        want = example.want
+        parser = self.get_parser(want, got, optionflags)
+        errors = []
+        if parser is not None:
+            try:
+                want_doc = parser(want)
+            except etree.XMLSyntaxError, e:
+                errors.append('In example: %s' % e)
+            try:
+                got_doc = parser(got)
+            except etree.XMLSyntaxError, e:
+                errors.append('In actual output: %s' % e)
+        if parser is None or errors:
+            value = OutputChecker.output_difference(
+                self, example, got, optionflags)
+            if errors:
+                errors.append(value)
+                return '\n'.join(errors)
+            else:
+                return value
+        # get_parser() hands out lxml.html's HTML wrapper, never etree.HTML,
+        # so that is the object to test against to detect HTML mode.
+        html = parser is HTML
+        diff_parts = []
+        diff_parts.append('Expected:')
+        diff_parts.append(self.format_doc(want_doc, html, 2))
+        diff_parts.append('Got:')
+        diff_parts.append(self.format_doc(got_doc, html, 2))
+        diff_parts.append('Diff:')
+        diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
+        return '\n'.join(diff_parts)
+
+    def html_empty_tag(self, el, html=True):
+        """
+        Return True if ``el`` should be shown without an end tag: HTML
+        mode is on, the tag is listed in ``empty_tags`` and the element
+        really has no text or children.
+        """
+        if not html or el.tag not in self.empty_tags:
+            return False
+        # Contents in an empty tag shouldn't happen; if they do, the
+        # element is treated as a normal one.
+        return not (el.text or len(el))
+
+    def format_doc(self, doc, html, indent, prefix=''):
+        """
+        Serialize ``doc`` as indented text, one element per line.
+
+        ``indent`` is the number of leading spaces and ``prefix`` is put
+        directly before the start tag of ``doc`` itself (it is not passed
+        on to children).  Elements without children are written on a
+        single line.
+        """
+        pad = ' '*indent
+        out = [pad, prefix, self.format_tag(doc)]
+        no_end_tag = self.html_empty_tag(doc, html)
+        if not len(doc):
+            # No children: start tag, text, end tag and tail share a line.
+            if not no_end_tag:
+                if strip(doc.text):
+                    out.append(self.format_text(doc.text))
+                out.append(self.format_end_tag(doc))
+            if strip(doc.tail):
+                out.append(self.format_text(doc.tail))
+            out.append('\n')
+            return ''.join(out)
+        if not no_end_tag:
+            out.append('\n')
+            if strip(doc.text):
+                out.extend([pad, self.format_text(doc.text), '\n'])
+            for child in doc:
+                out.append(self.format_doc(child, html, indent+2))
+            out.extend([pad, self.format_end_tag(doc), '\n'])
+        if strip(doc.tail):
+            out.extend([pad, self.format_text(doc.tail), '\n'])
+        return ''.join(out)
+
+    def format_text(self, text, strip=True):
+        """
+        Return ``text`` escaped for display (quotes included); ``None``
+        becomes the empty string.  Surrounding whitespace is removed
+        unless ``strip`` is false.
+        """
+        if text is None:
+            return ''
+        shown = text
+        if strip:
+            shown = shown.strip()
+        return cgi.escape(shown, 1)
+
+    def format_tag(self, el):
+        """
+        Return the start tag of ``el`` with its attributes sorted by name.
+        """
+        attrs = ['%s="%s"' % (name, self.format_text(value, False))
+                 for name, value in sorted(el.attrib.items())]
+        if attrs:
+            return '<%s %s>' % (el.tag, ' '.join(attrs))
+        return '<%s>' % el.tag
+
+    def format_end_tag(self, el):
+        """Return the end tag of ``el``."""
+        return '</%s>' % el.tag
+
+    def collect_diff(self, want, got, html, indent):
+        """
+        Return a rough, indented diff of the expected element ``want``
+        and the actual element ``got``.
+
+        Children are paired up by position; children present only in
+        ``got`` are printed with a ``-`` prefix, children present only in
+        ``want`` with a ``+`` prefix.
+        """
+        pad = ' '*indent
+        out = [pad, self.collect_diff_tag(want, got)]
+        if not len(want) and not len(got):
+            # Neither side has children: keep everything on one line.
+            if not self.html_empty_tag(got, html):
+                out.append(self.collect_diff_text(want.text, got.text))
+                out.append(self.collect_diff_end_tag(want, got))
+            out.append(self.collect_diff_text(want.tail, got.tail))
+            out.append('\n')
+            return ''.join(out)
+        out.append('\n')
+        if strip(want.text) or strip(got.text):
+            out.extend(
+                [pad, self.collect_diff_text(want.text, got.text), '\n'])
+        want_children = list(want)
+        got_children = list(got)
+        while want_children or got_children:
+            if want_children and got_children:
+                out.append(self.collect_diff(
+                    want_children.pop(0), got_children.pop(0),
+                    html, indent+2))
+            elif got_children:
+                out.append(self.format_doc(
+                    got_children.pop(0), html, indent+2, '-'))
+            else:
+                out.append(self.format_doc(
+                    want_children.pop(0), html, indent+2, '+'))
+        out.extend([pad, self.collect_diff_end_tag(want, got), '\n'])
+        if strip(want.tail) or strip(got.tail):
+            out.extend(
+                [pad, self.collect_diff_text(want.tail, got.tail), '\n'])
+        return ''.join(out)
+
+    def collect_diff_tag(self, want, got):
+        """
+        Return the start tag of ``got`` annotated with its differences
+        from ``want``.
+
+        A differing tag name is shown as ``got (not want)``.  Attributes
+        only in ``got`` are prefixed with ``-``, attributes only in
+        ``want`` with ``+``, and differing values are shown as
+        ``got (not want)``.  If ``want`` is an ``any`` tag or carries an
+        ``any`` attribute, unexpected or missing attributes are not
+        flagged.
+        """
+        if want.tag != got.tag and want.tag != 'any':
+            tag = '%s (not %s)' % (got.tag, want.tag)
+        else:
+            tag = got.tag
+        attrs = []
+        match_any = want.tag == 'any' or 'any' in want.attrib
+        for name, value in sorted(got.attrib.items()):
+            if name not in want.attrib and not match_any:
+                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
+            else:
+                if name in want.attrib:
+                    # collect_diff_text() takes (want, got), in that order.
+                    text = self.collect_diff_text(want.attrib[name], value, False)
+                else:
+                    text = self.format_text(value, False)
+                attrs.append('%s="%s"' % (name, text))
+        if not match_any:
+            # Attributes the example expected but the output lacks.
+            for name, value in sorted(want.attrib.items()):
+                if name in got.attrib:
+                    continue
+                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
+        if attrs:
+            tag = '<%s %s>' % (tag, ' '.join(attrs))
+        else:
+            tag = '<%s>' % tag
+        return tag
+
+    def collect_diff_end_tag(self, want, got):
+        """
+        Return the end tag of ``got``, shown as ``got (not want)`` when
+        the tag names differ.  As for the start tag, a ``want`` tag named
+        ``any`` matches every tag and is not reported as a difference.
+        """
+        if want.tag != got.tag and want.tag != 'any':
+            tag = '%s (not %s)' % (got.tag, want.tag)
+        else:
+            tag = got.tag
+        return '</%s>' % tag
+
+    def collect_diff_text(self, want, got, strip=True):
+        """
+        Return ``got`` formatted for the diff output; if it does not
+        match ``want`` the result reads ``got (not want)``.
+        """
+        if not self.text_compare(want, got, strip):
+            mismatch = '%s (not %s)' % (got, want)
+            return self.format_text(mismatch, strip)
+        if not got:
+            return ''
+        return self.format_text(got, strip)
+
+class LHTMLOutputChecker(LXMLOutputChecker):
+    """
+    Output checker whose default parser (the one used for examples that
+    merely start with ``<``) is the HTML parser instead of the XML parser.
+    """
+    default_parser = HTML
+
+def install(html=False):
+    """
+    Install doctestcompare for all future doctests.
+
+    If html is true, then by default the HTML parser will be used;
+    otherwise the XML parser is used.
+    """
+    checker_class = LXMLOutputChecker
+    if html:
+        checker_class = LHTMLOutputChecker
+    doctest.OutputChecker = checker_class
+
+def temp_install(html=False):
+    """
+    Use this *inside* a doctest to enable this checker for this
+    doctest only.
+
+    If html is true, then by default the HTML parser will be used;
+    otherwise the XML parser is used.
+
+    Raises LookupError (from ``_find_doctest_frame``) when no doctest
+    frame is found on the call stack.
+    """
+    if html:
+        Checker = LHTMLOutputChecker
+    else:
+        Checker = LXMLOutputChecker
+    # The frame of the doctest runner method currently executing examples;
+    # its local ``self`` is the runner object.
+    frame = _find_doctest_frame()
+    dt_self = frame.f_locals['self']
+    checker = Checker()
+    old_checker = dt_self._checker
+    dt_self._checker = checker
+    # The unfortunate thing is that there is a local variable 'check'
+    # in the function that runs the doctests, that is a bound method
+    # into the output checker. We have to update that. We can't
+    # modify the frame, so we have to modify the object in place. The
+    # only way to do this is to actually change the func_code
+    # attribute of the method. We change it, and then wait for
+    # __record_outcome to be run, which signals the end of the __run
+    # method, at which point we restore the previous check_output
+    # implementation.
+    check_func = frame.f_locals['check'].im_func
+    # Because we can't patch up func_globals, this is the only global
+    # in check_output that we care about:
+    doctest.etree = etree
+    # The instance is not kept here: its constructor registers it on the
+    # runner and on the old checker, which keeps it alive until it has
+    # undone the patching.
+    _RestoreChecker(dt_self, old_checker, checker,
+                    check_func, checker.check_output.im_func)
+
+class _RestoreChecker(object):
+    """
+    Helper for ``temp_install``: swaps the code of the running
+    ``check_output`` function for the lxml-aware one and puts everything
+    back when the runner's ``__record_outcome`` hook is called.
+    """
+    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func):
+        # dt_self: the doctest runner; check_func: the function whose code
+        # is replaced; clone_func: the function supplying the new code.
+        self.dt_self = dt_self
+        self.checker = old_checker
+        # These two attributes are what the cloned check_output looks for
+        # to find the real checker and the original implementation.
+        self.checker._temp_call_super_check_output = self.call_super
+        self.checker._temp_override_self = new_checker
+        self.check_func = check_func
+        self.clone_func = clone_func
+        self.install_clone()
+        self.install_dt_self()
+    def install_clone(self):
+        # Remember the original code object, then overwrite it in place.
+        self.func_code = self.check_func.func_code
+        # NOTE(review): func_globals is saved but never read in this class.
+        self.func_globals = self.check_func.func_globals
+        self.check_func.func_code = self.clone_func.func_code
+    def uninstall_clone(self):
+        self.check_func.func_code = self.func_code
+    def install_dt_self(self):
+        # Take the place of the runner's name-mangled __record_outcome.
+        self.prev_func = self.dt_self._DocTestRunner__record_outcome
+        self.dt_self._DocTestRunner__record_outcome = self
+    def uninstall_dt_self(self):
+        self.dt_self._DocTestRunner__record_outcome = self.prev_func
+    def __call__(self, *args, **kw):
+        # Called in place of __record_outcome: undo all patching, then
+        # delegate to the original hook.
+        self.uninstall_clone()
+        self.uninstall_dt_self()
+        del self.checker._temp_override_self
+        del self.checker._temp_call_super_check_output
+        return self.prev_func(*args, **kw)
+    def call_super(self, *args, **kw):
+        # Run the original check_output code, then re-install the clone.
+        self.uninstall_clone()
+        try:
+            return self.check_func(*args, **kw)
+        finally:
+            self.install_clone()
+
+def _find_doctest_frame():
+    """
+    Walk up the call stack and return the first frame that has a local
+    variable named ``BOOM`` (taken as the sign of the doctest runner).
+
+    Raises LookupError if no such frame exists.
+    """
+    import sys
+    candidate = sys._getframe(1)
+    while candidate is not None:
+        if 'BOOM' in candidate.f_locals:
+            # Sign of doctest
+            return candidate
+        candidate = candidate.f_back
+    raise LookupError(
+        "Could not find doctest (only use this function *inside* a doctest)")
+
Added: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/__init__.py Tue May 29 17:05:51 2007
@@ -0,0 +1,201 @@
+import threading
+import re
+from lxml import etree
+
+__all__ = ['HTML', 'tostring']
+
+# XPath 1.0 has no case-folding function, so this only selects the
+# candidate links; the case-insensitive comparison of @rel is done in
+# Python by HtmlMixin.find_rel_links().
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
+#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
+_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+
+class HtmlMixin(object):
+    """
+    HTML-specific helper methods shared by the element classes of this
+    module.
+    """
+
+    def remove_element(self):
+        """
+        Removes this element from the tree, including its children and
+        text. The tail text is joined to the previous element or
+        parent.
+        """
+        parent = self.getparent()
+        # Test for existence explicitly instead of relying on the truth
+        # value of the element.
+        assert parent is not None
+        index = parent.index(self)
+        if self.tail:
+            if index == 0:
+                parent.text = (parent.text or '') + self.tail
+            else:
+                previous = parent[index-1]
+                previous.tail = (previous.tail or '') + self.tail
+        parent.remove(self)
+
+    def remove_tag(self):
+        """
+        Remove the tag, but not its children or text. The children and text
+        are merged into the parent.
+        """
+        parent = self.getparent()
+        assert parent is not None
+        index = parent.index(self)
+        if self.text:
+            # Leading text moves to whatever precedes this element.
+            if index == 0:
+                parent.text = (parent.text or '') + self.text
+            else:
+                prev = parent[index-1]
+                prev.tail = (prev.tail or '') + self.text
+        if self.tail:
+            # The tail follows the last child if there is one; otherwise
+            # it follows the text that was just moved.
+            if len(self):
+                last = self[-1]
+                last.tail = (last.tail or '') + self.tail
+            elif index == 0:
+                parent.text = (parent.text or '') + self.tail
+            else:
+                prev = parent[index-1]
+                prev.tail = (prev.tail or '') + self.tail
+        parent[index:index+1] = list(self)
+
+    def find_rel_links(self, rel):
+        """
+        Return the ``<a>`` elements (this element included) whose ``rel``
+        attribute equals ``rel``, ignoring case.
+        """
+        rel = rel.lower()
+        return [el for el in _rel_links_xpath(self)
+                if el.get('rel').lower() == rel]
+
+    def find_class(self, class_name):
+        """
+        Return the elements (this element included) whose ``class``
+        attribute contains ``class_name`` as one of its
+        whitespace-separated names.
+        """
+        # The XPath compares against @class as written, so the name must
+        # not be lower-cased here or mixed-case class names never match.
+        return _class_xpath(self, class_name=class_name)
+
+class HtmlComment(etree._Comment, HtmlMixin):
+    """Comment node class that also provides the HtmlMixin helpers."""
+    pass
+
+class HtmlElement(etree.ElementBase, HtmlMixin):
+    """Element class that also provides the HtmlMixin helpers."""
+    pass
+
+class HtmlLookup(etree.CustomElementClassLookup):
+    """
+    Class lookup that maps elements to HtmlElement and comments to
+    HtmlComment.
+    """
+
+    _classes = {'element': HtmlElement, 'comment': HtmlComment}
+
+    def lookup(self, node_type, document, namespace, name):
+        # Returning None for any other node type delegates the decision.
+        return self._classes.get(node_type)
+
+html_parser = etree.HTMLParser()
+html_parser.setElementClassLookup(HtmlLookup())
+
+class ParserError(Exception):
+    """
+    Raised by HTML(), parse_elements() and parse_element() when the input
+    cannot be parsed into what was asked for.
+    """
+
+def HTML(html):
+    """
+    Parse an HTML document with this module's parser and return its root
+    element.
+
+    Raises ParserError if the parser produces no document.
+    """
+    # FIXME: should this notice a fragment and parse accordingly?
+    value = etree.HTML(html, html_parser)
+    if value is None:
+        raise ParserError(
+            "Could not parse document")
+    return value
+
+def parse_elements(html, no_leading_text=False):
+    """
+    Parses several HTML elements, returning a list of elements.
+
+    The first item in the list may be a string (leading text that
+    consists only of whitespace is dropped). If no_leading_text is
+    true, then it will be an error (ParserError) if there is leading
+    text.
+    """
+    # FIXME: check what happens when you give html with a body, head, etc.
+    html = '<html><head></head><body>%s</body></html>' % html
+    doc = HTML(html)
+    assert doc.tag == 'html'
+    bodies = [e for e in doc if e.tag == 'body']
+    assert len(bodies) == 1
+    body = bodies[0]
+    elements = []
+    has_leading_text = bool(body.text and body.text.strip())
+    if has_leading_text:
+        if no_leading_text:
+            raise ParserError(
+                "There is leading text: %r" % body.text)
+        elements.append(body.text)
+    elements.extend(body)
+    # FIXME: removing the reference to the parent artificial document
+    # would be nice
+    return elements
+
+def parse_element(html, create_parent=False):
+    """
+    Parses a single HTML element; it is an error if there is more than
+    one element, or if anything but whitespace precedes or follows the
+    element.
+
+    If create_parent is true (or is a tag name) then a parent node
+    will be created to encapsulate the HTML in a single element.
+
+    Raises ParserError when these conditions are not met.
+    """
+    if create_parent:
+        if not isinstance(create_parent, basestring):
+            create_parent = 'div'
+        return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent))
+    elements = parse_elements(html, no_leading_text=True)
+    if not elements:
+        raise ParserError(
+            "No elements found")
+    if len(elements) > 1:
+        # %-format each tag so a non-string tag cannot turn this report
+        # into a TypeError.
+        raise ParserError(
+            "Multiple elements found (%s)"
+            % ', '.join(['%s' % (e.tag,) for e in elements]))
+    el = elements[0]
+    if el.tail and el.tail.strip():
+        raise ParserError(
+            "Element followed by text: %r" % el.tail)
+    el.tail = None
+    return el
+
+def Element(*args, **kw):
+    """Create an HtmlElement from the given arguments."""
+    # FIXME: this is totally broken; segfaults
+    return HtmlElement(*args, **kw)
+
+############################################################
+## Serialization
+############################################################
+
+_html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="html" encoding="UTF-8" />
+ <xsl:template match="/">
+ <xsl:copy-of select="."/>
+ </xsl:template>
+</xsl:transform>
+"""
+
+_pretty_html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="html" encoding="UTF-8" indent="yes" />
+ <xsl:template match="/">
+ <xsl:copy-of select="."/>
+ </xsl:template>
+</xsl:transform>
+"""
+
+_local_transforms = threading.local()
+# FIXME: should we just lazily compile these?
+_local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
+_local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))
+
+# This isn't a general match, but it's a match for what XSLT specifically creates:
+_meta_content_type_re = re.compile(
+ r'<meta http-equiv="Content-Type".*?>')
+
+def tostring(doc, pretty=False, include_meta_content_type=False):
+    """
+    Return the HTML string representation of the given document.
+
+    If pretty is true the output is indented.  Serialization creates a
+    ``<meta http-equiv="Content-Type">`` tag in the head (and may replace
+    any that are present); it is stripped from the result again unless
+    include_meta_content_type is true.
+    """
+    assert doc is not None
+    if pretty:
+        slot, stylesheet = 'pretty_html_transform', _pretty_html_xsl
+    else:
+        slot, stylesheet = 'html_transform', _html_xsl
+    # The transforms live in thread-local storage, so a thread that has
+    # not serialized anything yet has to compile its own copy first.
+    transform = getattr(_local_transforms, slot, None)
+    if transform is None:
+        transform = etree.XSLT(etree.XML(stylesheet))
+        setattr(_local_transforms, slot, transform)
+    html = str(transform(doc))
+    if not include_meta_content_type:
+        html = _meta_content_type_re.sub('', html)
+    return html
Added: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/clean.py Tue May 29 17:05:51 2007
@@ -0,0 +1,157 @@
+from lxml import etree
+from lxml.html import defs
+from lxml.html import HTML, tostring
+
+__all__ = ['clean_html', 'clean']
+
+def clean_html(html, **kw):
+    """
+    Text-in, text-out variant of clean(): parses ``html``, cleans the
+    resulting document with the given keyword options and returns it
+    serialized again.
+    """
+    document = HTML(html)
+    clean(document, **kw)
+    return tostring(document)
+
+def clean(doc,
+          scripts=True,
+          javascript=True,
+          comments=True,
+          # process instructions?
+          style=False,
+          links=False,
+          embedded=True,
+          frames=True,
+          forms=True,
+          remove_tags=None,
+          allow_tags=None,
+          strip_tags=True,
+          remove_unknown_tags=True,
+          add_nofollow=False,
+          # callbacks?
+          ):
+    """
+    Cleans the document of each of the possible offending elements:
+
+    ``scripts``:
+        Any ``<script>`` tags.
+
+    ``javascript``:
+        Any Javascript, like an ``onclick`` attribute or a
+        ``javascript:`` link.
+
+    ``comments``:
+        Any comments.
+
+    ``style``:
+        Any style tags or attributes, and stylesheet links.
+
+    ``links``:
+        Remove any ``<link>`` tags
+
+    ``frames``:
+        Remove any frame-related tags
+
+    ``embedded``:
+        Remove any embedded objects (flash, iframes)
+
+    ``forms``:
+        Remove any form tags
+
+    ``remove_tags``:
+        A list of tags to remove.
+
+    ``allow_tags``:
+        A list of tags to include (default include all).
+
+    ``strip_tags``:
+        If true, then any tag taken out by remove_tags or allow_tags will
+        leave its text in place; if false, then the tag and its content are
+        removed.
+
+    ``remove_unknown_tags``:
+        Remove any tags that aren't standard parts of HTML.
+
+    ``add_nofollow``:
+        If true, then any <a> tags will have ``rel="nofollow"`` added to them.
+
+    This modifies the document *in place*.
+
+    Raises ValueError if both ``allow_tags`` and ``remove_unknown_tags``
+    are given.
+    """
+    # Reject contradictory options before touching the document, so a
+    # failing call does not leave it half-cleaned.
+    if remove_unknown_tags and allow_tags:
+        raise ValueError(
+            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+    kill_tags = []
+    remove_tags = list(remove_tags or [])
+    if scripts:
+        kill_tags.append('script')
+    if javascript:
+        for attrib in defs.event_attrs:
+            for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
+                del el.attrib[attrib]
+        for attrib in defs.link_attrs:
+            for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
+                # XPath 1.0 cannot fold case, so test the scheme in Python,
+                # ignoring case and whitespace inside the value.
+                # NOTE(review): this is a best-effort filter, not a
+                # complete defence against script URLs.
+                value = ''.join(el.attrib[attrib].split()).lower()
+                if value.startswith('javascript:'):
+                    el.attrib[attrib] = ""
+    if comments:
+        # Easier way?
+        bad = []
+        for el in doc.iterdescendants():
+            if isinstance(el, etree._Comment):
+                bad.append(el)
+        for el in bad:
+            el.remove_element()
+    if style:
+        kill_tags.append('style')
+        # rel is matched in Python: lower-case() is not an XPath 1.0
+        # function.  rel may hold several space-separated link types.
+        for el in doc.xpath('descendant-or-self::link[@rel]'):
+            if 'stylesheet' in el.attrib['rel'].lower().split():
+                el.remove_element()
+        for el in doc.xpath('descendant-or-self::*[@style]'):
+            del el.attrib['style']
+    if links:
+        kill_tags.append('link')
+    if embedded:
+        kill_tags.extend(['object', 'embed', 'iframe'])
+    if frames:
+        kill_tags.extend(defs.frame_tags)
+    if forms:
+        # FIXME: do I even care about fieldset and legend?  I don't
+        # care about label.
+        remove_tags.extend(['form', 'fieldset', 'legend'])
+        kill_tags.extend(['button', 'input', 'select', 'textarea'])
+    # Collect first, remove afterwards: the tree must not change while it
+    # is being iterated.
+    bad = []
+    for el in doc.iterdescendants():
+        if el.tag in kill_tags:
+            bad.append(el)
+    for el in bad:
+        el.remove_element()
+    if remove_tags:
+        xpath = ' | '.join([
+            "descendant-or-self::%s" % tag
+            for tag in remove_tags])
+        for el in doc.xpath(xpath):
+            if strip_tags:
+                el.remove_tag()
+            else:
+                # FIXME: Should we test if this has been removed because of a parent?
+                el.remove_element()
+    if remove_unknown_tags:
+        allow_tags = defs.tags
+    if allow_tags:
+        bad = []
+        for el in doc.iterdescendants():
+            # Only real elements have a string tag; comments are governed
+            # by the ``comments`` option and must not be unwrapped into
+            # the surrounding text here.
+            if isinstance(el.tag, basestring) and el.tag not in allow_tags:
+                bad.append(el)
+        for el in bad:
+            if strip_tags:
+                el.remove_tag()
+            else:
+                # FIXME: Should we test if this has been removed because of a parent?
+                el.remove_element()
+    if add_nofollow:
+        for el in doc.xpath('descendant-or-self::a[@href]'):
+            href = el.attrib['href']
+            if not href or href.startswith('#'):
+                # internal link, we don't care
+                continue
+            el.attrib['rel'] = 'nofollow'
+
Added: lxml/branch/html/src/lxml/html/defs.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/defs.py Tue May 29 17:05:51 2007
@@ -0,0 +1,100 @@
+# Data taken from http://www.w3.org/TR/html401/index/elements.html
+
+empty_tags = [
+    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+    'img', 'input', 'isindex', 'link', 'meta', 'param']
+
+deprecated_tags = [
+    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
+    'menu', 's', 'strike', 'u']
+
+# archive actually takes a space-separated list of URIs
+link_attrs = [
+    'action', 'archive', 'background', 'cite', 'classid',
+    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
+    'usemap']
+
+# Not in the HTML 4 spec:
+# onerror
+event_attrs = [
+    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
+    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
+    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
+    'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload',
+    ]
+
+# From http://htmlhelp.com/reference/html40/olist.html
+top_level_tags = [
+    'html', 'head', 'body', 'frameset',
+    ]
+
+head_tags = [
+    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
+    ]
+
+general_block_tags = [
+    'address',
+    'blockquote',
+    'center',
+    'del',
+    'div',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'hr',
+    'ins',
+    'isindex',
+    'noscript',
+    'p',
+    'pre',
+    ]
+
+list_tags = [
+    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
+    ]
+
+table_tags = [
+    'table', 'caption', 'colgroup', 'col',
+    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
+    ]
+
+# just this one from
+# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
+block_tags = general_block_tags + list_tags + table_tags + [
+    # Partial form tags
+    'fieldset', 'form', 'legend', 'optgroup', 'option',
+    ]
+
+form_tags = [
+    'form', 'button', 'fieldset', 'legend', 'input', 'label',
+    'select', 'optgroup', 'option', 'textarea',
+    ]
+
+special_inline_tags = [
+    'a', 'applet', 'basefont', 'bdo', 'br', 'font', 'iframe',
+    'img', 'map', 'area', 'object', 'param', 'q', 'script',
+    'span', 'sub', 'sup',
+    ]
+
+phrase_tags = [
+    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
+    'ins', 'kbd', 'samp', 'strong', 'var',
+    ]
+
+font_style_tags = [
+    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
+    ]
+
+frame_tags = [
+    'frameset', 'frame', 'noframes',
+    ]
+
+# These tags aren't standard
+nonstandard_tags = ['blink', 'marquee']
+
+# All tags treated as known.  frame_tags is not part of this list.
+tags = (top_level_tags + head_tags + general_block_tags + list_tags
+        + table_tags + form_tags + special_inline_tags + phrase_tags
+        + font_style_tags + nonstandard_tags)
Added: lxml/branch/html/src/lxml/html/htmldiff.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/htmldiff.py Tue May 29 17:05:51 2007
@@ -0,0 +1,890 @@
+import difflib
+from lxml import etree
+from lxml.html import parse_element
+import cgi
+import re
+
+__all__ = ['html_annotate', 'htmldiff']
+
+
+############################################################
+## Annotation
+############################################################
+
+def default_markup(text, version):
+ return '<span title="%s">%s</span>' % (
+ cgi.escape(unicode(version), 1), text)
+
+def html_annotate(doclist, markup=default_markup):
+ """
+ doclist should be ordered from oldest to newest, like::
+
+ >>> version1 = 'Hello World'
+ >>> version2 = 'Goodbye World'
+ >>> html_annotate([(version1, 'version 1'),
+ ... (version2, 'version 2')])
+ u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'
+
+ The documents must be *fragments* (str/UTF8 or unicode), not
+ complete documents
+
+ The markup argument is a function to markup the spans of words.
+ This function is called like markup('Hello', 'version 2'), and
+ returns HTML. The first argument is text and never includes any
+ markup. The default uses a span with a title:
+
+ >>> default_markup('Some Text', 'by Joe')
+ u'<span title="by Joe">Some Text</span>'
+ """
+ # The basic strategy we have is to split the documents up into
+ # logical tokens (which are words with attached markup). We then
+ # do diffs of each of the versions to track when a token first
+ # appeared in the document; the annotation attached to the token
+ # is the version where it first appeared.
+ tokenlist = [tokenize_annotated(doc, version)
+ for doc, version in doclist]
+ cur_tokens = tokenlist[0]
+ for tokens in tokenlist[1:]:
+ html_annotate_merge_annotations(cur_tokens, tokens)
+ cur_tokens = tokens
+
+ # After we've tracked all the tokens, we can combine spans of text
+ # that are adjacent and have the same annotation
+ cur_tokens = compress_tokens(cur_tokens)
+ # And finally add markup
+ result = markup_serialize_tokens(cur_tokens, markup)
+ return ''.join(result).strip()
+
+def tokenize_annotated(doc, annotation):
+ """Tokenize a document and add an annotation attribute to each token
+ """
+ tokens = tokenize(doc, include_hrefs=False)
+ for tok in tokens:
+ tok.annotation = annotation
+ return tokens
+
+def html_annotate_merge_annotations(tokens_old, tokens_new):
+ """Merge the annotations from tokens_old into tokens_new, when the
+ tokens in the new document already existed in the old document.
+ """
+ s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
+ commands = s.get_opcodes()
+
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ eq_old = tokens_old[i1:i2]
+ eq_new = tokens_new[j1:j2]
+ copy_annotations(eq_old, eq_new)
+
+def copy_annotations(src, dest):
+ """
+ Copy annotations from the tokens listed in src to the tokens in dest
+ """
+ assert len(src) == len(dest)
+ for src_tok, dest_tok in zip(src, dest):
+ dest_tok.annotation = src_tok.annotation
+
+def compress_tokens(tokens):
+ """
+ Combine adjacent tokens when there is no HTML between the tokens,
+ and they share an annotation
+ """
+ result = [tokens[0]]
+ for tok in tokens[1:]:
+ if (not result[-1].post_tags and
+ not tok.pre_tags and
+ result[-1].annotation == tok.annotation):
+ compress_merge_back(result, tok)
+ else:
+ result.append(tok)
+ return result
+
+def compress_merge_back(tokens, tok):
+ """ Merge tok into the last element of tokens (modifying the list of
+ tokens in-place). """
+ last = tokens[-1]
+ if type(last) is not token or type(tok) is not token:
+ tokens.append(tok)
+ else:
+ text = unicode(last)
+ if last.trailing_whitespace:
+ text += ' '
+ text += tok
+ merged = token(text,
+ pre_tags=last.pre_tags,
+ post_tags=tok.post_tags,
+ trailing_whitespace=tok.trailing_whitespace)
+ merged.annotation = last.annotation
+ tokens[-1] = merged
+
+def markup_serialize_tokens(tokens, markup_func):
+ """
+ Serialize the list of tokens into a list of text chunks, calling
+ markup_func around text to add annotations.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ html = token.html()
+ html = markup_func(html, token.annotation)
+ if token.trailing_whitespace:
+ html += ' '
+ yield html
+ for post in token.post_tags:
+ yield post
+
+
+############################################################
+## HTML Diffs
+############################################################
+
+def htmldiff(old_html, new_html):
+ """ Do a diff of the old and new document. The documents are HTML
+ *fragments* (str/UTF8 or unicode), they are not complete documents
+ (i.e., no <html> tag).
+
+ Returns HTML with <ins> and <del> tags added around the
+ appropriate text.
+
+ Markup is generally ignored, with the markup from new_html
+ preserved, and possibly some markup from old_html (though it is
+ considered acceptable to lose some of the old markup). Only the
+ words in the HTML are diffed. The exception is <img> tags, which
+ are treated like words, and the href attribute of <a> tags, which
+ are noted inside the tag itself when there are changes.
+ """
+ old_html_tokens = tokenize(old_html)
+ new_html_tokens = tokenize(new_html)
+ result = htmldiff_tokens(old_html_tokens, new_html_tokens)
+ result = ''.join(result).strip()
+ return fixup_ins_del_tags(result)
+
+def htmldiff_tokens(html1_tokens, html2_tokens):
+ """ Does a diff on the tokens themselves, returning a list of text
+ chunks (not tokens).
+ """
+ # There are several passes as we do the differences. The tokens
+ # isolate the portion of the content we care to diff; difflib does
+ # all the actual hard work at that point.
+ #
+ # Then we must create a valid document from pieces of both the old
+ # document and the new document. We generally prefer to take
+ # markup from the new document, and only do a best effort attempt
+ # to keep markup from the old document; anything that we can't
+ # resolve we throw away. Also we try to put the deletes as close
+ # to the location where we think they would have been -- because
+ # we are only keeping the markup from the new document, it can be
+ # fuzzy where in the new document the old text would have gone.
+ # Again we just do a best effort attempt.
+ s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
+ commands = s.get_opcodes()
+ result = []
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
+ continue
+ if command == 'insert' or command == 'replace':
+ ins_tokens = expand_tokens(html2_tokens[j1:j2])
+ merge_insert(ins_tokens, result)
+ if command == 'delete' or command == 'replace':
+ del_tokens = expand_tokens(html1_tokens[i1:i2])
+ merge_delete(del_tokens, result)
+ # If deletes were inserted directly as <del> then we'd have an
+ # invalid document at this point. Instead we put in special
+ # markers, and when the complete diffed document has been created
+ # we try to move the deletes around and resolve any problems.
+ result = cleanup_delete(result)
+
+ return result
+
+def expand_tokens(tokens, equal=False):
+ """Given a list of tokens, return a generator of the chunks of
+ text for the data in the tokens.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ if not equal or not token.hide_when_equal:
+ if token.trailing_whitespace:
+ yield token.html() + ' '
+ else:
+ yield token.html()
+ for post in token.post_tags:
+ yield post
+
+def merge_insert(ins_chunks, doc):
+ """ doc is the already-handled document (as a list of text chunks);
+ here we add <ins>ins_chunks</ins> to the end of that. """
+ # Though we don't throw away unbalanced_start or unbalanced_end
+ # (we assume there is accompanying markup later or earlier in the
+ # document), we only put <ins> around the balanced portion.
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
+ doc.extend(unbalanced_start)
+ if doc and not doc[-1].endswith(' '):
+ # Fix up the case where the word before the insert didn't end with
+ # a space
+ doc[-1] += ' '
+ doc.append('<ins>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </ins>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</ins> ')
+ doc.extend(unbalanced_end)
+
+# These are sentinels to represent the start and end of a <del>
+# segment, until we do the cleanup phase to turn them into proper
+# markup:
+class DEL_START:
+ pass
+class DEL_END:
+ pass
+
+class NoDeletes(Exception):
+ """ Raised when the document no longer contains any pending deletes
+ (DEL_START/DEL_END) """
+
+def merge_delete(del_chunks, doc):
+ """ Adds the text chunks in del_chunks to the document doc (another
+ list of text chunks) with marker to show it is a delete.
+ cleanup_delete later resolves these markers into <del> tags."""
+ doc.append(DEL_START)
+ doc.extend(del_chunks)
+ doc.append(DEL_END)
+
+def cleanup_delete(chunks):
+ """ Cleans up any DEL_START/DEL_END markers in the document, replacing
+ them with <del></del>. To do this while keeping the document
+ valid, it may need to drop some tags (either start or end tags).
+
+ It may also move the del into adjacent tags to try to move it to a
+ similar location where it was originally located (e.g., moving a
+ delete into preceding <div> tag, if the del looks like (DEL_START,
+ 'Text</div>', DEL_END)"""
+ while 1:
+ # Find a pending DEL_START/DEL_END, splitting the document
+ # into stuff-preceding-DEL_START, stuff-inside, and
+ # stuff-following-DEL_END
+ try:
+ pre_delete, delete, post_delete = split_delete(chunks)
+ except NoDeletes:
+ # Nothing found, we've cleaned up the entire doc
+ break
+ # The stuff-inside-DEL_START/END may not be well balanced
+ # markup. First we figure out what unbalanced portions there are:
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
+ # Then we move the span forward and/or backward based on these
+ # unbalanced portions:
+ locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
+ locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
+ doc = pre_delete
+ if doc and not doc[-1].endswith(' '):
+ # Fix up case where the word before us didn't have a trailing space
+ doc[-1] += ' '
+ doc.append('<del>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </del>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</del> ')
+ doc.extend(post_delete)
+ chunks = doc
+ return chunks
+
+def split_unbalanced(chunks):
+ """Return (unbalanced_start, balanced, unbalanced_end), where each is
+ a list of text and tag chunks.
+
+ unbalanced_start is a list of all the tags that are opened, but
+ not closed in this span. Similarly, unbalanced_end is a list of
+ tags that are closed but were not opened. Extracting these might
+ mean some reordering of the chunks."""
+ start = []
+ end = []
+ tag_stack = []
+ balanced = []
+ for chunk in chunks:
+ if not chunk.startswith('<'):
+ balanced.append(chunk)
+ continue
+ endtag = chunk[1] == '/'
+ name = chunk.split()[0].strip('<>/')
+ if name in empty_tags:
+ assert not endtag, (
+ "Empty tag %r should have no end tag" % chunk)
+ balanced.append(chunk)
+ continue
+ if endtag:
+ if tag_stack and tag_stack[-1][0] == name:
+ balanced.append(chunk)
+ name, pos, tag = tag_stack.pop()
+ balanced[pos] = tag
+ elif tag_stack:
+ start.extend(tag for name, pos, tag in tag_stack)
+ tag_stack = []
+ end.append(chunk)
+ else:
+ end.append(chunk)
+ else:
+ tag_stack.append((name, len(balanced), chunk))
+ balanced.append(None)
+ start.extend(
+ [chunk for name, pos, chunk in tag_stack])
+ balanced = [chunk for chunk in balanced if chunk is not None]
+ return start, balanced, end
+
+def split_delete(chunks):
+ """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
+ stuff_after_DEL_END). Returns the first case found (there may be
+ more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
+ there's no DEL_START found. """
+ try:
+ pos = chunks.index(DEL_START)
+ except ValueError:
+ raise NoDeletes
+ pos2 = chunks.index(DEL_END)
+ return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
+
+def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
+ """ pre_delete and post_delete implicitly point to a place in the
+ document (where the two were split). This moves that point (by
+ popping items from one and pushing them onto the other). It moves
+ the point to try to find a place where unbalanced_start applies.
+
+ As an example::
+
+ >>> unbalanced_start = ['<div>']
+ >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
+ >>> pre, post = doc[:3], doc[3:]
+ >>> pre, post
+ (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
+ >>> locate_unbalanced_start(unbalanced_start, pre, post)
+ >>> pre, post
+ (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
+
+ As you can see, we moved the point so that the dangling <div> that
+ we found will be effectively replaced by the div in the original
+ document. If this doesn't work out, we just throw away
+ unbalanced_start without doing anything.
+ """
+ while 1:
+ if not unbalanced_start:
+ # We have totally succeeded in finding the position
+ break
+ finding = unbalanced_start[0]
+ finding_name = finding.split()[0].strip('<>')
+ if not post_delete:
+ break
+ next = post_delete[0]
+ if next is DEL_START or not next.startswith('<'):
+ # Reached a word, we can't move the delete text forward
+ break
+ if next[1] == '/':
+ # Reached a closing tag, can we go further? Maybe not...
+ break
+ name = next.split()[0].strip('<>')
+ if name == 'ins':
+ # Can't move into an insert
+ break
+ assert name != 'del', (
+ "Unexpected delete tag: %r" % next)
+ if name == finding_name:
+ unbalanced_start.pop(0)
+ pre_delete.append(post_delete.pop(0))
+ else:
+ # Found a tag that doesn't match
+ break
+
+def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
+ """ like locate_unbalanced_start, except handling end tags and
+ possibly moving the point earlier in the document. """
+ while 1:
+ if not unbalanced_end:
+ # Success
+ break
+ finding = unbalanced_end[-1]
+ finding_name = finding.split()[0].strip('<>/')
+ if not pre_delete:
+ break
+ next = pre_delete[-1]
+ if next is DEL_END or not next.startswith('</'):
+ # A word or a start tag
+ break
+ name = next.split()[0].strip('<>/')
+ if name == 'ins' or name == 'del':
+ # Can't move into an insert or delete
+ break
+ if name == finding_name:
+ unbalanced_end.pop()
+ post_delete.insert(0, pre_delete.pop())
+ else:
+ # Found a tag that doesn't match
+ break
+
+class token(unicode):
+ """ Represents a diffable token, generally a word that is displayed to
+ the user. Opening tags are attached to this token when they are
+ adjacent (pre_tags) and closing tags that follow the word
+ (post_tags). Some exceptions occur when there are empty tags
+ adjacent to a word, so there may be close tags in pre_tags, or
+ open tags in post_tags.
+
+ We also keep track of whether the word was originally followed by
+ whitespace, even though we do not want to treat the word as
+ equivalent to a similar word that does not have a trailing
+ space."""
+
+ # When this is true, the token will be eliminated from the
+ # displayed diff if no change has occurred:
+ hide_when_equal = False
+
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
+ obj = unicode.__new__(cls, text)
+
+ if pre_tags is not None:
+ obj.pre_tags = pre_tags
+ else:
+ obj.pre_tags = []
+
+ if post_tags is not None:
+ obj.post_tags = post_tags
+ else:
+ obj.post_tags = []
+
+ obj.trailing_whitespace = trailing_whitespace
+
+ return obj
+
+ def __repr__(self):
+ return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags)
+
+ def html(self):
+ return unicode(self)
+
+class tag_token(token):
+
+    """ Represents a token that is actually a tag.  Currently this is just
+    the <img> tag, which takes up visible space just like a word but
+    is only represented in a document by a tag.
+
+    The text value of the token (what is compared when diffing) is
+    "<tag>: <data>"; html() returns html_repr instead. """
+
+    def __new__(cls, tag, data, html_repr, pre_tags=None,
+                post_tags=None, trailing_whitespace=False):
+        # Build the token text from the tag name that was passed in,
+        # not from the builtin `type`.
+        obj = token.__new__(cls, "%s: %s" % (tag, data),
+                            pre_tags=pre_tags,
+                            post_tags=post_tags,
+                            trailing_whitespace=trailing_whitespace)
+        obj.tag = tag
+        obj.data = data
+        obj.html_repr = html_repr
+        return obj
+
+    def __repr__(self):
+        # NOTE(review): the post_tags=/pre_tags= labels are paired with
+        # self.pre_tags/self.post_tags respectively, i.e. they look
+        # swapped.  Left as-is because doctest output may depend on the
+        # exact repr -- confirm before changing.
+        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
+            self.tag,
+            self.data,
+            self.html_repr,
+            self.pre_tags,
+            self.post_tags,
+            self.trailing_whitespace)
+
+    def html(self):
+        # A tag token is displayed as its serialized start tag
+        return self.html_repr
+
+class href_token(token):
+
+ """ Represents the href in an anchor tag. Unlike other words, we only
+ show the href when it changes. """
+
+ hide_when_equal = True
+
+ def html(self):
+ return 'Link: %s' % self
+
+def tokenize(html, include_hrefs=True):
+ """
+ Parse the given HTML and returns token objects (words with attached tags).
+
+ This parses only the content of a page; anything in the head is
+ ignored, and the <head> and <body> elements are themselves
+ optional. The content is then parsed by lxml, which ensures the
+ validity of the resulting parsed document (though lxml may make
+ incorrect guesses when the markup is particularly bad).
+
+ <ins> and <del> tags are also eliminated from the document, as
+ that gets confusing.
+
+ If include_hrefs is true, then the href attribute of <a> tags is
+ included as a special kind of diffable token."""
+ body_el = parse_html(html, cleanup=True)
+ # Then we split the document into text chunks for each tag, word, and end tag:
+ chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs)
+ # Finally re-joining them into token objects:
+ return fixup_chunks(chunks)
+
+def parse_html(html, cleanup=True):
+ """
+ Parses an HTML fragment, returning an lxml element. Note that the HTML will be
+ wrapped in a <div> tag that was not in the original document.
+
+ If cleanup is true, make sure there's no <head> or <body>, and get
+ rid of any <ins> and <del> tags.
+ """
+ if cleanup:
+ # This removes any extra markup or structure like <head>:
+ html = cleanup_html(html)
+ return parse_element(html, create_parent=True)
+
+_body_re = re.compile(r'<body.*?>', re.I|re.S)
+_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
+_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+def cleanup_html(html):
+ """ This 'cleans' the HTML, meaning that any page structure is removed
+ (only the contents of <body> are used, if there is any <body>).
+ Also <ins> and <del> tags are removed. """
+ match = _body_re.search(html)
+ if match:
+ html = html[match.end():]
+ match = _end_body_re.search(html)
+ if match:
+ html = html[:match.start()]
+ html = _ins_del_re.sub('', html)
+ return html
+
+
+end_whitespace_re = re.compile(r'[ \t\n\r]$')
+
+def fixup_chunks(chunks):
+ """
+ This function takes a list of chunks and produces a list of tokens.
+ """
+ tag_accum = []
+ cur_word = None
+ result = []
+ for chunk in chunks:
+ if isinstance(chunk, tuple):
+ if chunk[0] == 'img':
+ src = chunk[1]
+ tag = chunk[2]
+ if tag.endswith(' '):
+ tag = tag[:-1]
+ trailing_whitespace = True
+ else:
+ trailing_whitespace = False
+ cur_word = tag_token('img', src, html_repr=tag,
+ pre_tags=tag_accum,
+ trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+ elif chunk[0] == 'href':
+ href = chunk[1]
+ cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
+ tag_accum = []
+ result.append(cur_word)
+ continue
+ if is_word(chunk):
+ if chunk.endswith(' '):
+ chunk = chunk[:-1]
+ trailing_whitespace = True
+ else:
+ trailing_whitespace = False
+ cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+ elif is_start_tag(chunk):
+ tag_accum.append(chunk)
+ elif is_end_tag(chunk):
+ if tag_accum:
+ tag_accum.append(chunk)
+ else:
+ assert cur_word, (
+ "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
+ % (cur_word, result, chunk, chunks))
+ cur_word.post_tags.append(chunk)
+ else:
+ assert(0)
+
+ if not result:
+ return [token('', pre_tags=tag_accum)]
+ else:
+ result[-1].post_tags.extend(tag_accum)
+
+ return result
+
+
+# All the tags in HTML that don't require end tags:
+empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+block_level_tags = (
+ 'address',
+ 'blockquote',
+ 'center',
+ 'dir',
+ 'div',
+ 'dl',
+ 'fieldset',
+ 'form',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'isindex',
+ 'menu',
+ 'noframes',
+ 'noscript',
+ 'ol',
+ 'p',
+ 'pre',
+ 'table',
+ 'ul',
+ )
+
+block_level_container_tags = (
+ 'dd',
+ 'dt',
+ 'frameset',
+ 'li',
+ 'tbody',
+ 'td',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ )
+
+
+def flatten_el(el, include_hrefs, drop_tag=False):
+ """ Takes an lxml element el, and generates all the text chunks for
+ that tag. Each start tag is a chunk, each word is a chunk, and each
+ end tag is a chunk.
+
+ If drop_tag is true, then the outermost container tag is
+ not returned (just its contents)."""
+ if not drop_tag:
+ if el.tag == 'img':
+ yield ('img', el.attrib['src'], start_tag(el))
+ else:
+ yield start_tag(el)
+ if el.tag in empty_tags and not el.text and not len(el):
+ return
+ start_words = split_words(el.text)
+ for word in start_words:
+ yield cgi.escape(word)
+ for child in el:
+ for item in flatten_el(child, include_hrefs=include_hrefs):
+ yield item
+ if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
+ yield ('href', el.attrib['href'])
+ if not drop_tag:
+ yield end_tag(el)
+ end_words = split_words(el.tail)
+ for word in end_words:
+ yield cgi.escape(word)
+
+def split_words(text):
+ """ Splits some text into words. Includes trailing whitespace (one
+ space) on each word when appropriate. """
+ if not text or not text.strip():
+ return []
+ words = [w + ' ' for w in text.strip().split()]
+ if not end_whitespace_re.search(text):
+ words[-1] = words[-1][:-1]
+ return words
+
+start_whitespace_re = re.compile(r'^[ \t\n\r]')
+
+def start_tag(el):
+ """
+ The text representation of the start tag for a tag.
+ """
+ return '<%s%s>' % (
+ el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True))
+ for name, value in el.attrib.items()))
+
+def end_tag(el):
+ """ The text representation of an end tag for a tag. Includes
+ trailing whitespace when appropriate. """
+ if el.tail and start_whitespace_re.search(el.tail):
+ extra = ' '
+ else:
+ extra = ''
+ return '</%s>%s' % (el.tag, extra)
+
+def is_word(tok):
+ return not tok.startswith('<')
+
+def is_end_tag(tok):
+ return tok.startswith('</')
+
+def is_start_tag(tok):
+ return tok.startswith('<') and not tok.startswith('</')
+
+def fixup_ins_del_tags(html):
+ """ Given an html string, move any <ins> or <del> tags inside of any
+ block-level elements, e.g. transform <ins><p>word</p></ins> to
+ <p><ins>word</ins></p> """
+ doc = parse_html(html, cleanup=False)
+ _fixup_ins_del_tags(doc)
+ html = serialize_html_fragment(doc, drop_outer=True)
+ return html
+
+def serialize_html_fragment(el, drop_outer=False):
+ """ Serialize a single lxml element as HTML. The serialized form
+ includes the elements tail.
+
+ If drop_outer is true, then don't serialize the outermost tag
+ """
+
+ html_xsl = """\
+<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="html" encoding="UTF-8" />
+ <xsl:template match="/">
+ <xsl:copy-of select="."/>
+ </xsl:template>
+</xsl:transform>
+"""
+ transform = etree.XSLT(etree.XML(html_xsl))
+ assert not isinstance(el, basestring), (
+ "You should pass in an element, not a string like %r" % el)
+ html = str(transform(el))
+ if drop_outer:
+ # Get rid of the extra starting tag:
+ html = html[html.find('>')+1:]
+ if drop_outer:
+ # Get rid of the extra end tag:
+ html = html[:html.rfind('<')]
+ if drop_outer:
+ return html.strip()
+ else:
+ return html.lstrip()
+
+def _fixup_ins_del_tags(doc):
+ """fixup_ins_del_tags that works on an lxml document in-place
+ """
+ for tag in ['ins', 'del']:
+ for el in doc.xpath('descendant-or-self::%s' % tag):
+ if not _contains_block_level_tag(el):
+ continue
+ _move_el_inside_block(el, tag=tag)
+ _merge_element_contents(el)
+
+def _contains_block_level_tag(el):
+ """True if the element contains any block-level elements, like <p>, <td>, etc.
+ """
+ if el.tag in block_level_tags or el.tag in block_level_container_tags:
+ return True
+ for child in el:
+ if _contains_block_level_tag(child):
+ return True
+ return False
+
+def _move_el_inside_block(el, tag):
+    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
+    and moves them inside any block-level tags.
+
+    el is modified in-place; tag is the name ('ins' or 'del') of the
+    wrapper elements that are created. """
+    for child in el:
+        if _contains_block_level_tag(child):
+            break
+    else:
+        # No block-level tags in any child: wrap all of el's content
+        # (text and children) in a single new <tag> element
+        children_tag = etree.Element(tag)
+        children_tag.text = el.text
+        el.text = None
+        children_tag.extend(list(el))
+        el[:] = [children_tag]
+        return
+    # Iterate over a copy, because el is modified while we go
+    for child in list(el):
+        if _contains_block_level_tag(child):
+            # Push the wrapper further down into the block-level child
+            _move_el_inside_block(child, tag)
+            if child.tail:
+                # Text following the child gets its own wrapper
+                tail_tag = etree.Element(tag)
+                tail_tag.text = child.tail
+                child.tail = None
+                el.insert(el.index(child)+1, tail_tag)
+        else:
+            # An inline child is wrapped as a whole
+            child_tag = etree.Element(tag)
+            el.replace(child, child_tag)
+            child_tag.append(child)
+    if el.text:
+        # Leading text of el gets its own wrapper as the first child
+        text_tag = etree.Element(tag)
+        text_tag.text = el.text
+        el.text = None
+        el.insert(0, text_tag)
+
+def _merge_element_contents(el):
+ """
+ Removes an element, but merges its contents into its place, e.g.,
+ given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
+ <p>Hi there!</p>
+ """
+ parent = el.getparent()
+ text = el.text or ''
+ if el.tail:
+ if not len(el):
+ text += el.tail
+ else:
+ if el[-1].tail:
+ el[-1].tail += el.tail
+ else:
+ el[-1].tail = el.tail
+ index = parent.index(el)
+ if text:
+ if index == 0:
+ previous = None
+ else:
+ previous = parent[index-1]
+ if previous is None:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+ else:
+ if previous.tail:
+ previous.tail += text
+ else:
+ previous.tail = text
+ parent[index:index+1] = el.getchildren()
+
+class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+    """
+    Acts like SequenceMatcher, but tries not to find very small equal
+    blocks amidst large spans of changes
+    """
+
+    # Matching blocks of this many items or fewer are dropped (the
+    # value is capped at a quarter of the shorter sequence, see below)
+    threshold = 2
+
+    def get_matching_blocks(self):
+        """Return the matching blocks that are longer than the
+        threshold, plus any zero-length block (difflib terminates the
+        list with one)."""
+        # Scale the threshold by the shorter of the *two* sequences, so
+        # short documents don't lose all of their matches
+        size = min(len(self.a), len(self.b))
+        threshold = min(self.threshold, size // 4)
+        actual = difflib.SequenceMatcher.get_matching_blocks(self)
+        return [item for item in actual
+                if item[2] > threshold
+                or not item[2]]
+
+# def get_matching_blocks(self):
+# size = min(len(self.b), len(self.b))
+# threshold = min(self.threshold, size / 4)
+# actual = difflib.SequenceMatcher.get_matching_blocks(self)
+# last_equal_a = 0
+# eliminate = []
+# for i in xrange(1, len(actual)-1):
+# start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2])
+# end_diff_length = actual[i+1][0]
+# for a_pos, b_pos, length in actual:
+# if (last_equal_a - a_pos is big
+# and length is small
+# and next_equal_a is far away):
+# continue
+# result.append((a_pos, b_pos, length))
+# last_equal_a = a_pos+length
+# return result
+
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
+
Added: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py Tue May 29 17:05:51 2007
@@ -0,0 +1,121 @@
+"""
+utilities for manipulating html links
+"""
+
+
+from lxml.html import tostring, HTML
+import urlparse
+import re
+
+__all__ = ['make_links_absolute', 'make_links_absolute_html',
+ 'rewrite_links', 'rewrite_links_html',
+ 'Relocator']
+
+def make_links_absolute(doc, base_href):
+    """
+    Makes every link in doc absolute by resolving it against base_href.
+    The links are rewritten with rewrite_links().
+    """
+    def link_repl(href):
+        return urlparse.urljoin(base_href, href)
+    rewrite_links(doc, link_repl)
+
+def make_links_absolute_html(html, base_href):
+ doc = HTML(html)
+ make_links_absolute(doc, base_href)
+ return tostring(doc)
+
+def rewrite_links_html(html, link_repl_func, remove_base_tags=True):
+ """
+ rewrite_links(), but work on text and returns text
+ """
+ doc = HTML(html)
+ rewrite_links(doc, link_repl_func, remove_base_tags=remove_base_tags)
+ return tostring(doc)
+
+def rewrite_links(doc, link_repl_func,
+ remove_base_tags=True):
+ """
+ Takes a given document (already parsed by lxml) and modifies it
+ in-place. Every link is passed through link_repl_func, and the
+ output of that function replaces the link.
+ """
+ if remove_base_tags:
+ resolve_base_href(doc)
+
+ for attrib in 'href', 'src':
+ els = doc.xpath('//*[@%s]' % attrib)
+ for el in els:
+ el.attrib[attrib] = link_repl_func(el.attrib[attrib])
+
+ rewrite_css_links(doc, link_repl_func)
+ rewrite_style_links(doc, link_repl_func)
+
+def resolve_base_href(doc):
+ """
+ removes all html <base href=""> tags
+ from the document given.
+ """
+ base_href = None
+ basetags = doc.xpath('//base[@href]')
+ for b in basetags:
+ base_href = b.attrib['href']
+ b.getparent().remove(b)
+ if base_href is None:
+ return
+ # Now that we have a base_href (blech) we have to fix up all the
+ # links in the document with this new information.
+ def link_repl(href):
+ return urlparse.urljoin(base_href, href)
+ rewrite_links(doc, link_repl, remove_base_tags=False)
+
+CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
+CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"')
+def rewrite_css_links(doc, link_repl_func):
+ """
+ Fixes up any url(...) links in CSS style elements
+ """
+ def absuri(matchobj):
+ return 'url(%s)' % link_repl_func(matchobj.group(1))
+ def absimport(matchobj):
+ return '@import "%s"' % link_repl_func(matchobj.group(1))
+ els = doc.xpath('//head/style')
+ for el in els:
+ if el.text:
+ el.text = CSS_URL_PAT.sub(absuri, el.text)
+ el.text = CSS_IMPORT_PAT.sub(absimport, el.text)
+
+def rewrite_style_links(doc, link_repl_func):
+ def absuri(matchobj):
+ return 'url(%s)' % link_repl_func(matchobj.group(1))
+ for el in doc.xpath("//*[contains(@style, 'url(')]"):
+ el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
+
+class Relocator(object):
+    """
+    This helper can be used to move all links in a document from one
+    location to another.  Typically you use this like::
+
+        rewrite_links_html(
+            html, Relocator('http://old-domain/', 'http://new-domain',
+                            base_href='http://old-domain/foo/bar.html'))
+
+    This means that the document was located at
+    ``http://old-domain/foo/bar.html`` (used to resolve relative
+    links), and that you want to change every occurrence of
+    ``http://old-domain/`` to ``http://new-domain``
+
+    If base_href is not given, links are compared against old_href
+    exactly as they appear in the document.
+    """
+    # This catches the case of http://foo, which is equivalent to
+    # http://foo/ :
+    _domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
+
+    def __init__(self, old_href, new_href, base_href=None):
+        self.old_href = old_href
+        self.new_href = new_href
+        self.base_href = base_href
+
+    def __call__(self, href):
+        """
+        Returns the relocated form of href, or href unchanged when it
+        does not point below old_href.
+        """
+        if self.base_href is not None:
+            real_href = urlparse.urljoin(self.base_href, href)
+        else:
+            # Nothing to resolve against; use the link as written
+            real_href = href
+        if self._domain_no_slash_re.search(real_href):
+            real_href += '/'
+        if not real_href.startswith(self.old_href):
+            # A link somewhere else entirely
+            return href
+        return self.new_href + real_href[len(self.old_href):]
Added: lxml/branch/html/src/lxml/html/tests/__init__.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/__init__.py Tue May 29 17:05:51 2007
@@ -0,0 +1 @@
+#
Added: lxml/branch/html/src/lxml/html/tests/test_basic.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.py Tue May 29 17:05:51 2007
@@ -0,0 +1,12 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+from lxml.html import HTML
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_basic.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/branch/html/src/lxml/html/tests/test_basic.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Tue May 29 17:05:51 2007
@@ -0,0 +1,42 @@
+lxml.html adds a find_class method to elements::
+
+ >>> from lxml.html import HTML, tostring
+ >>> from lxml.html.clean import clean, clean_html
+ >>> from lxml.html import usedoctest
+ >>> h = HTML('''
+ ... <html><head></head>
+ ... <body>
+ ... <a class="vcard
+ ... fn url" href="foobar">P1</a>
+ ... <a class="not-fn vcard" href="baz">P2</a>
+ ... </body></html>''')
+ >>> print tostring(h)
+ <html>
+ <head></head>
+ <body>
+ <a class="vcard
+ fn url" href="foobar">P1</a>
+ <a class="not-fn vcard" href="baz">P2</a>
+ </body>
+ </html>
+ >>> print [e.text for e in h.find_class('fn')]
+ ['P1']
+ >>> print [e.text for e in h.find_class('vcard')]
+ ['P1', 'P2']
+
+Also added is a find_rel_links method, which you can use to search for links
+like ``<a rel="$something">``:
+
+ >>> h = HTML('''
+ ... <a href="1">test 1</a>
+ ... <a href="2" rel="tag">item 2</a>
+ ... <a href="3" rel="tagging">item 3</a>
+ ... <a href="4" rel="TAG">item 4</a>''')
+ >>> print [e.attrib['href'] for e in h.find_rel_links('tag')]
+ ['2']
+ >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')]
+ []
+
+FIXME: actually ``find_rel_links('tag')`` should have returned ['2', '4']
+
+
Added: lxml/branch/html/src/lxml/html/tests/test_clean.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.py Tue May 29 17:05:51 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_clean.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Tue May 29 17:05:51 2007
@@ -0,0 +1,94 @@
+>>> from lxml.html import HTML, tostring
+>>> from lxml.html.clean import clean, clean_html
+>>> from lxml.html import usedoctest
+>>> doc = '''<html>
+... <head>
+... <script type="text/javascript" src="evil-site"></script>
+... <link rel="alternate" type="text/rss" src="evil-rss">
+... </head>
+... <body onload="evil_function()">
+... <!-- I am interpreted for EVIL! -->
+... <a href="javascript:evil_function()">a link</a>
+... <a href="#" onclick="evil_function()">another link</a>
+... <p onclick="evil_function()">a paragraph</p>
+... <div style="display: none">secret EVIL!</div>
+... <object> of EVIL! </object>
+... <iframe src="evil-site"></iframe>
+... <form action="evil-site">
+... Password: <input type="password" name="password">
+... </form>
+... <blink>annoying EVIL!</blink>
+... <a href="evil-site">spam spam SPAM!</a>
+... </body>
+... </html>'''
+>>> print doc
+<html>
+ <head>
+ <script type="text/javascript" src="evil-site"></script>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ </head>
+ <body onload="evil_function()">
+ <!-- I am interpreted for EVIL! -->
+ <a href="javascript:evil_function()">a link</a>
+ <a href="#" onclick="evil_function()">another link</a>
+ <p onclick="evil_function()">a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ <object> of EVIL! </object>
+ <iframe src="evil-site"></iframe>
+ <form action="evil-site">
+ Password: <input type="password" name="password">
+ </form>
+ <blink>annoying EVIL!</blink>
+ <a href="evil-site">spam spam SPAM!</a>
+ </body>
+</html>
+>>> print tostring(HTML(doc))
+<html>
+ <head>
+ <script type="text/javascript" src="evil-site"></script>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ </head>
+ <body onload="evil_function()">
+ <!-- I am interpreted for EVIL! -->
+ <a href="javascript:evil_function()">a link</a>
+ <a href="#" onclick="evil_function()">another link</a>
+ <p onclick="evil_function()">a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ <object> of EVIL! </object>
+ <iframe src="evil-site"></iframe>
+ <form action="evil-site">
+ Password: <input type="password" name="password">
+ </form>
+ <blink>annoying EVIL!</blink>
+ <a href="evil-site">spam spam SPAM!</a>
+ </body>
+</html>
+>>> print clean_html(doc)
+<html>
+ <head>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ Password:
+ <blink>annoying EVIL!</blink>
+ <a href="evil-site">spam spam SPAM!</a>
+ </body>
+</html>
+>>> print clean_html(doc, style=True, links=True, add_nofollow=True)
+<html>
+ <head>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ Password:
+ <blink>annoying EVIL!</blink>
+ <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
+ </body>
+</html>
Added: lxml/branch/html/src/lxml/html/tests/test_htmldiff.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_htmldiff.py Tue May 29 17:05:51 2007
@@ -0,0 +1,13 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+from lxml.html import htmldiff
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'),
+ doctest.DocTestSuite(htmldiff)])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Tue May 29 17:05:51 2007
@@ -0,0 +1,248 @@
+htmldiff does HTML comparisons. These are word-based comparisons.
+
+First, a handy function for normalizing whitespace and doing word wrapping::
+
+ >>> import re, textwrap
+ >>> def pwrapped(text):
+ ... text = re.sub(r'[ \n\t\r]+', ' ', text)
+ ... text = textwrap.fill(text)
+ ... print text
+ >>> def pdiff(text1, text2):
+ ... pwrapped(htmldiff(text1, text2))
+
+Example::
+
+ >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate
+ >>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
+ >>> html2 = '''<p>This is some test textual writing with some changed stuff
+ ... and some same stuff</p>'''
+ >>> pdiff(html1, html2)
+ <p>This is some test <ins>textual writing with some changed
+ stuff</ins> <del>text with some changes</del> and some same stuff</p>
+
+Style tags are largely ignored in terms of differences, though markup is not eliminated::
+
+ >>> html1 = '<p>Hi <i>you guys</i></p>'
+ >>> html2 = '<p>Hi <i>you</i> guys</p>'
+ >>> pdiff(html1, html2)
+ <p>Hi <i>you</i> guys</p>
+ >>> pdiff('text', '<p>text</p>')
+ <p>text</p>
+ >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
+ <i>Hi <ins>guy</ins> <del>guys</del> </i> !!
+ >>> pdiff('H<i>i</i>', 'Hi')
+ <ins>Hi</ins> <del>H<i>i</i></del>
+ >>> pdiff('<i>A B</i> C', '<i>A</i> C')
+ <i>A <del>B</del> </i> C
+ >>> pdiff('<i>A B</i> C', '<i>B</i> C')
+ <i> <del>A</del> B</i> C
+ >>> pdiff('<p></p>', '<p></p>')
+ <p></p>
+ >>> pdiff('<p>Hi</p>', '<p>Bye</p>')
+ <p><ins>Bye</ins></p> <p><del>Hi</del></p>
+ >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
+ <p> <ins>Bye</ins> <del>Hi</del> Guy</p>
+ >>> pdiff('<p>Hey there</p>', '')
+ <ins></ins> <p><del>Hey there</del></p>
+
+Whitespace is ignored, as it's not meaningful in HTML::
+
+ >>> pdiff('<div>Hi\n\nguys</div>', '<div>Hi guy</div>')
+ <div>Hi <ins>guy</ins> <del>guys</del> </div>
+
+Movement between paragraphs is ignored, as tag-based changes are generally ignored::
+ >>>
+ >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
+ <p>Hello World</p>
+
+As a special case, changing the href of a link is displayed, and
+images are treated like words:
+
+ >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
+ <a href="http://google.com">search <ins>Link: http://google.com</ins>
+ <del>Link: http://yahoo.com</del> </a>
+ >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
+ <p>Print this <del><img src="print.gif"></del> </p>
+ >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
+ <a href="http://yahoo.com">search</a>
+
+The sixteen combinations::
+
+First "insert start" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>D B C</b')
+ <b> <ins>D</ins> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>D A C</b>')
+ <b> <ins>D</ins> A <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>D A B</b>')
+ <b> <ins>D</ins> A B <del>C</del> </b>
+ >>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
+ <b> <ins>D</ins> A B C</b>
+
+Next, "insert middle" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>D B C</b>')
+ <b> <ins>D</ins> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A D C</b>')
+ <b>A <ins>D</ins> <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A D B</b>')
+ <b>A <ins>D</ins> B <del>C</del> </b>
+
+This one case hits the threshold of our insensitive matching:
+
+ >>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
+ <b> <ins>A D</ins> <del>A</del> B C</b>
+
+
+Then "insert end" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>B C D</b>')
+ <b> <del>A</del> B C <ins>D</ins> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A C D</b>')
+ <b>A <del>B</del> C <ins>D</ins> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A B D</b>')
+ <b>A B <ins>D</ins> <del>C</del> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
+ <b>A B C <ins>D</ins> </b>
+
+Then no insert (del start/middle/end):
+
+ >>> pdiff('<b>A B C</b>', '<b>B C</b>')
+ <b> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A C</b>')
+ <b>A <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A B</b>')
+ <b>A B <del>C</del> </b>
+
+ >>> pdiff('<b>A B</b> C', '<b>A B</b>')
+ <b>A B</b> <del>C</del>
+ >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
+ <b>A B</b> <del><b>C</b></del>
+ >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
+ A <p><del><b>hey there</b> <i>how are you?</i></del></p>
+
+Testing a larger document, to make sure there are not weird
+unnecessary parallels found:
+
+ >>> pdiff('''
+ ... <p>This is a test document with many words in it that goes on
+ ... for a while and doesn't have anything do to with the next
+ ... document that we match this against</p>''', '''
+ ... <p>This is another document with few similarities to the preceding
+ ... one, but enough that it may have overlap that could turn into
+ ... a confusing series of deletes and inserts.
+ ... </p>''')
+ <p><ins>This is another document with few similarities to the
+ preceding one, but enough that it may have overlap that could turn
+ into a confusing series of deletes and inserts. </ins></p>
+ <p><del>This is a test document with many words in it that goes on for
+ a while and doesn't have anything do to with the next document that we
+ match this against</del></p>
+
+
+
+Annotation of content can also be done, where every bit of content is
+marked up with information about where it came from.
+
+First, some setup; note that html_annotate is called with a sequence
+of documents and the annotation associated with that document. We'll
+just use indexes, but you could use author or timestamp information.
+
+ >>> def markup(text, annotation):
+ ... return '<span version="%s">%s</span>' % (annotation, text)
+ >>> def panno(*docs):
+ ... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
+ ... markup=markup))
+
+Now, a sequence of documents:
+
+ >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
+ <span version="1">Hi</span> <span version="0">world</span>
+ >>> panno('A similar document', 'A similar document',
+ ... 'A similar document here')
+ <span version="0">A similar document</span> <span
+ version="2">here</span>
+ >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
+ <p><span version="0">P1 para</span></p><p><span version="1">P3
+ foo</span></p>
+ >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
+ <span version="0">Hello</span><p><span version="0">There</span> <span
+ version="1">Town</span></p>
+ >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
+ <p><span version="0">Hello</span></p><span version="0">There</span>
+ <span version="1">Town</span>
+ >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
+ <p><span version="0">Hello</span></p><p><span version="0">There</span>
+ <span version="1">Town</span></p>
+ >>> panno('<p>Hi <img src="/foo"> You</p>',
+ ... '<p>Hi You</p>',
+ ... '<p>Hi You <img src="/bar"></p>')
+ <p><span version="0">Hi</span> <span version="1">You</span> <span
+ version="2"><img src="/bar"></span></p>
+ >>> panno('<p><a href="/foo">Hey</a></p>',
+ ... '<p><a href="/bar">Hey</a></p>')
+ <p><a href="/bar"><span version="0">Hey</span></a></p>
+ >>> panno('<p><a href="/foo">Hey You</a></p>',
+ ... '<p><a href="/foo">Hey Guy</a></p>')
+ <p><a href="/foo"><span version="0">Hey</span> <span
+ version="1">Guy</span></a></p>
+
+
+
+Here's a test of a utility function!:
+
+ >>> from lxml.html.htmldiff import _merge_element_contents
+ >>> from lxml import etree
+ >>> doc = '''<html><body><div>
+ ... <div id="c1">a b <span id="d1">content</span> c d</div>
+ ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div>
+ ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div>
+ ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div>
+ ... </div></body></html>'''
+ >>> doc = etree.HTML(doc)
+ >>> def show_result(id):
+ ... el = doc.xpath("//*[@id='d%s']" % id)[0]
+ ... _merge_element_contents(el)
+ ... container = doc.xpath("//*[@id='c%s']" % id)[0]
+ ... print etree.tostring(container).strip()
+ >>> show_result(1)
+ <div id="c1">a b content c d</div>
+ >>> show_result(2)
+ <div id="c2">content <b>and more</b> stuff trailing</div>
+ >>> show_result(3)
+ <div id="c3"><b>hi</b><i>content</i></div>
+ >>> show_result(4)
+ <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div>
+
+More utility:
+
+ >>> from lxml.html.htmldiff import fixup_ins_del_tags
+ >>> def pfixup(text):
+ ... print fixup_ins_del_tags(text).strip()
+ >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
+ <p><ins>some text <b>and more text</b> and more</ins></p>
+ >>> pfixup('<p><ins>Hi!</ins> you</p>')
+ <p><ins>Hi!</ins> you</p>
+ >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
+ <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
+ >>> pfixup('''
+ ... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
+ <table><tr>
+ <td><ins>One table</ins></td>
+ <td><ins>More stuff</ins></td>
+ </tr></table>
+
+
+Testing split_unbalanced:
+
+ >>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
+ ([], ['<a href="blah">', 'hey', '</a>'], [])
+ >>> split_unbalanced(['<a href="blah">', 'hey'])
+ (['<a href="blah">'], ['hey'], [])
+ >>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
+ ([], ['Hey', 'You'], ['</i>', '</b>'])
+ >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
+ ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
+ >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
+ (['<b>'], ['So', 'Hi', 'There'], ['</i>'])
+
Added: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.py Tue May 29 17:05:51 2007
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([doctest.DocFileSuite('test_rewritelinks.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
Added: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Tue May 29 17:05:51 2007
@@ -0,0 +1,79 @@
+These are tests of lxml.html.rewritelinks::
+
+ >>> from lxml.html.rewritelinks import *
+
+In all these examples we'll be using ``http://old`` for the old
+(to-be-replaced) URL and ``https://new`` for the new URL (note the
+scheme change). Out of laziness we'll define some keywords we use
+with all these transformations::
+
+ >>> relocate_href = Relocator(
+ ... base_href='http://old/base/path.html',
+ ... old_href='http://old/',
+ ... new_href='https://new/')
+
+Now let's look at simple href rewriting.
+
+Normal rewrite::
+
+ >>> relocate_href('http://old/bar')
+ 'https://new/bar'
+
+Note that the trailing / doesn't matter in this one case (since
+``http://old`` and ``http://old/`` are entirely equivalent)::
+
+ >>> relocate_href('http://old')
+ 'https://new/'
+
+The trailing / does matter in other cases::
+
+ >>> Relocator(
+ ... base_href='',
+ ... old_href='http://old-test/foo/',
+ ... new_href='https://new',
+ ... )('http://old-test/foo')
+ 'http://old-test/foo'
+ >>> Relocator(
+ ... base_href='',
+ ... old_href='http://old-test/foo/',
+ ... new_href='https://new',
+ ... )('http://old-test/foo/')
+ 'https://new'
+
+Rewriting a link that doesn't match old_href is a no-op::
+
+ >>> relocate_href('http://foo/bar')
+ 'http://foo/bar'
+
+Relative links are handled::
+
+ >>> relocate_href('index.html')
+ 'https://new/base/index.html'
+
+Now for content. First, to make it easier on us, we need to trim the
+normalized HTML we get from these functions::
+
+ >>> import re
+ >>> def pr_html(html):
+ ... html = re.sub(r'</?(?:html|head|body)>', '', html)
+ ... html = re.sub(r'<meta.*?>', '', html)
+ ... print html.strip()
+
+Some basics::
+
+ >>> pr_html(rewrite_links_html(
+ ... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
+ <a href="https://new/blah/blah.html">link</a>
+ >>> pr_html(rewrite_links_html(
+ ... '<script src="http://old/foo.js"></script>', relocate_href))
+ <script src="https://new/foo.js"></script>
+ >>> pr_html(rewrite_links_html(
+ ... '<link href="foo.css">', relocate_href))
+ <link href="https://new/base/foo.css">
+ >>> pr_html(rewrite_links_html('''\
+ ... <base href="http://blah/stuff/index.html">
+ ... <link href="foo.css">
+ ... <a href="http://old/bar.html">x</a>\
+ ... ''', relocate_href))
+ <link href="http://blah/stuff/foo.css">
+ <a href="https://new/bar.html">x</a>
Added: lxml/branch/html/src/lxml/html/usedoctest.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/usedoctest.py Tue May 29 17:05:51 2007
@@ -0,0 +1,3 @@
+from lxml import doctestcompare
+
+doctestcompare.temp_install(html=True)
Added: lxml/branch/html/src/lxml/usedoctest.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/usedoctest.py Tue May 29 17:05:51 2007
@@ -0,0 +1,3 @@
+from lxml import doctestcompare
+
+doctestcompare.temp_install()
More information about the lxml-checkins
mailing list