[Lxml-checkins] r43951 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Thu May 31 23:55:43 CEST 2007
Author: ianb
Date: Thu May 31 23:55:43 2007
New Revision: 43951
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/rewritelinks.py
lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
Log:
Added iter_links; added methods for each of the functions; added functions for each of the methods. Added some more tests. Consolidation of the functions will happen in a following commit
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Thu May 31 23:55:43 2007
@@ -96,6 +96,125 @@
"""
return self.xpath("string()")
+ ########################################
+ ## Link functions
+ ########################################
+
+ def make_links_absolute(self, base_href, resolve_base_href=True):
+ """
+ Make all links in the document absolute, given the
+ ``base_href`` for the document (the full URL where the
+ document came from).
+
+ If ``resolve_base_href`` is true, then any ``<base href>``
+ tags in the document are used *and* removed from the document.
+ If it is false then any such tag is ignored.
+ """
+ # Imported inside the method rather than at module level:
+ # lxml.html.rewritelinks itself imports names from lxml.html, so
+ # a top-level import here would presumably be circular.
+ from lxml.html.rewritelinks import make_links_absolute
+ make_links_absolute(self, base_href, resolve_base_href=resolve_base_href)
+
+ def resolve_base_href(self):
+ """
+ Find any ``<base href>`` tag in the document, and apply its
+ values to all links found in the document. Also remove the
+ tag once it has been applied.
+ """
+ # Local import: the name imported here shadows this method's own
+ # name inside the body only, and keeps lxml.html.rewritelinks
+ # (which imports from lxml.html) out of the module-level imports.
+ from lxml.html.rewritelinks import resolve_base_href
+ resolve_base_href(self)
+
+ def iter_links(self, in_order=True):
+ """
+ Iterate over all the links in the document, yielding
+ ``(element, attribute, link, pos)``.
+
+ The ``element`` contains the link. ``attribute`` is a string
+ like ``'href'`` or ``'src'``. It may be None, which means
+ that the link is in the body of the element. The only time
+ this occurs is with ``<style>`` tags that contain links like
+ ``url(...)`` or ``@import``. ``link`` is the actual link, like
+ ``'http://codespeak.net'``. ``pos`` is the offset at which the
+ link starts inside the attribute value or element text (0 for
+ ordinary link attributes).
+
+ Note: ``in_order`` is passed straight through to
+ ``lxml.html.rewritelinks.iter_links``; presumably a true value
+ (the default) requests document order -- TODO confirm.
+ """
+ from lxml.html.rewritelinks import iter_links
+ return iter_links(self, in_order=in_order)
+
+ def rewrite_links(self, link_repl_func, resolve_base_href=True,
+ base_href=None):
+ """
+ Rewrite all the links in the document. For each link
+ ``link_repl_func(link)`` will be called, and the return value
+ will replace the old link.
+
+ Note that links may not be absolute (unless you first called
+ ``make_links_absolute()``), and may be internal (e.g.,
+ ``'#anchor'``). They can also be values like
+ ``'mailto:email'`` or ``'javascript:expr'``.
+
+ If you give ``base_href`` then all links passed to
+ ``link_repl_func()`` will be absolute.
+
+ ``resolve_base_href`` is handed to ``make_links_absolute()`` when
+ ``base_href`` is given; otherwise it is passed on to the
+ rewriting function as ``remove_base_tags``.
+ """
+ from lxml.html.rewritelinks import rewrite_links
+ if base_href is not None:
+ # FIXME: this can be done in one pass with a wrapper
+ # around link_repl_func
+ self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
+ # make_links_absolute() has already been given
+ # resolve_base_href, so do not request base-tag handling again
+ # in the rewrite pass below.
+ resolve_base_href = False
+ rewrite_links(self, link_repl_func, remove_base_tags=resolve_base_href)
+
+class _MethodFunc(object):
+ # Callable that exposes the ``source_class`` method called ``name``
+ # as a plain function whose first argument is the document.
+ #
+ # The document may be an already-parsed object or a string; strings
+ # are parsed with parse_element() when ``fragment`` is true and with
+ # HTML() otherwise.
+ def __init__(self, name, fragment=False, source_class=HtmlMixin):
+ self.name = name
+ self.fragment = fragment
+ # Reuse the wrapped method's docstring for the function form.
+ self.__doc__ = getattr(source_class, self.name).__doc__
+ def __call__(self, doc, *args, **kw):
+ # A ``fragment`` keyword given at call time overrides the value
+ # from the constructor and is not forwarded to the method.
+ if 'fragment' in kw:
+ fragment = kw.pop('fragment')
+ else:
+ fragment = self.fragment
+ if isinstance(doc, basestring):
+ if fragment:
+ doc = parse_element(doc)
+ else:
+ doc = HTML(doc)
+ meth = getattr(doc, self.name)
+ result = meth(*args, **kw)
+ if result is None:
+ # The method returned nothing, so hand back the serialized
+ # document instead (this happens whether ``doc`` arrived as a
+ # string or as an already-parsed object).
+ return tostring(doc)
+ else:
+ return result
+
+# Module-level function forms of the HtmlMixin methods of the same
+# name.  Each takes the document (parsed, or as a string) as its first
+# argument; see _MethodFunc.__call__ for how the result is returned.
+find_rel_links = _MethodFunc('find_rel_links')
+find_class = _MethodFunc('find_class')
+make_links_absolute = _MethodFunc('make_links_absolute')
+resolve_base_href = _MethodFunc('resolve_base_href')
+iter_links = _MethodFunc('iter_links')
+rewrite_links = _MethodFunc('rewrite_links')
+
+class _SubmoduleFunc(object):
+    # Lazy stand-in for the callable ``name`` living in ``module`` (a
+    # dotted module path).  The module is imported only on the first
+    # call; after that the resolved object is cached in ``self.obj``.
+    #
+    # ``doc`` is used as the placeholder docstring until the real
+    # object has been resolved; it defaults to 'See <module>.<name>'.
+    def __init__(self, module, name, doc=None):
+        self.module = module
+        self.name = name
+        # Filled in by the first __call__.
+        self.obj = None
+        if doc is None:
+            doc = 'See %s.%s' % (module, name)
+        self.__doc__ = doc
+    def __call__(self, *args, **kw):
+        if self.obj is None:
+            import sys
+            __import__(self.module)
+            # __import__ on a dotted name returns the top-level
+            # package, so fetch the submodule from sys.modules.
+            # sys.modules is a dict: it has to be indexed -- calling
+            # it raises TypeError.
+            mod = sys.modules[self.module]
+            self.obj = getattr(mod, self.name)
+            # Replace the placeholder docstring with the real one.
+            self.__doc__ = self.obj.__doc__
+        return self.obj(*args, **kw)
+
+# FIXME: Damn module names conflict with the function names :(
+#clean = _SubmoduleFunc('lxml.html.clean', 'clean')
+#clean_html = _SubmoduleFunc('lxml.html.clean', 'clean_html')
+#htmldiff = _SubmoduleFunc('lxml.html.htmldiff', 'htmldiff')
+#html_annotate = _SubmoduleFunc('lxml.html.htmldiff', 'html_annotate')
+
class HtmlComment(etree.CommentBase, HtmlMixin):
pass
Modified: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- lxml/branch/html/src/lxml/html/rewritelinks.py (original)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py Thu May 31 23:55:43 2007
@@ -4,6 +4,7 @@
from lxml.html import tostring, HTML
+from lxml.html import defs
import urlparse
import re
@@ -11,10 +12,10 @@
'rewrite_links', 'rewrite_links_html',
'Relocator']
-def make_links_absolute(doc, base_href):
+def make_links_absolute(doc, base_href, resolve_base_href=True):
+    # Rewrite every link in ``doc`` to the result of joining it onto
+    # ``base_href``.  ``resolve_base_href`` is forwarded to
+    # rewrite_links() as ``remove_base_tags``.
     def link_repl(href):
         return urlparse.urljoin(base_href, href)
-    rewrite_links(doc, link_repl_func)
+    # Pass the local helper defined above; ``link_repl_func`` is not a
+    # name defined in this function and would raise NameError.
+    rewrite_links(doc, link_repl, remove_base_tags=resolve_base_href)
def make_links_absolute_html(html, base_href):
doc = HTML(html)
@@ -88,6 +89,30 @@
for el in doc.xpath("//*[contains(@style, 'url(')]"):
el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
+def iter_links(doc):
+ """
+ Yield (element, attribute, link, pos), where attribute may be None
+ (indicating the link is in the text). ``pos`` is the position
+ where the link occurs; often 0, but sometimes something else in
+ the case of links in stylesheets or style tags.
+
+ Note: <base href> is *not* taken into account in any way. The
+ link you get is exactly the link in the document.
+ """
+ # NOTE(review): HtmlMixin.iter_links() in lxml/html/__init__.py calls
+ # this function with an ``in_order`` keyword, which this signature
+ # does not accept -- confirm which side is meant to change.
+ link_attrs = defs.link_attrs
+ for el in doc.iterdescendants():
+ # Plain link-carrying attributes: the whole value is the link,
+ # so the position is always 0.
+ for attrib in link_attrs:
+ if attrib in el.attrib:
+ yield (el, attrib, el.attrib[attrib], 0)
+ # Links inside the text of a <style> element: all url(...)
+ # matches are yielded first, then all @import matches, each with
+ # the offset of the link inside the element text.
+ if el.tag == 'style' and el.text:
+ for match in CSS_URL_PAT.finditer(el.text):
+ yield (el, None, match.group(1), match.start(1))
+ for match in CSS_IMPORT_PAT.finditer(el.text):
+ yield (el, None, match.group(1), match.start(1))
+ # url(...) references inside a style="" attribute, reported with
+ # 'style' as the attribute and the offset inside its value.
+ if 'style' in el.attrib:
+ for match in CSS_URL_PAT.finditer(el.attrib['style']):
+ yield (el, 'style', match.group(1), match.start(1))
+
class Relocator(object):
"""
This helper can be used to move all links in a document from one
@@ -120,3 +145,4 @@
# A link somewhere else entirely
return href
return self.new_href + real_href[len(self.old_href):]
+
Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Thu May 31 23:55:43 2007
@@ -1,20 +1,18 @@
These are tests of relocateresponse::
- >>> from lxml.html.rewritelinks import *
+ >>> from lxml.html.rewritelinks import Relocator
In all these examples we'll be using ``http://old`` for the old
(to-be-replaced) URL and ``https://new`` for the new URL (note the
-scheme change). Out of laziness we'll define some keywords we use
-with all these transformations::
+scheme change). To test the rewriting we'll use this handy rewriter
+that rewrites everything from one base to another base::
>>> relocate_href = Relocator(
... base_href='http://old/base/path.html',
... old_href='http://old/',
... new_href='https://new/')
-Now lets look at simple href rewriting.
-
-Normal rewrite::
+Now let's look at simple href rewriting. Normal rewrite::
>>> relocate_href('http://old/bar')
'https://new/bar'
@@ -53,27 +51,138 @@
Now for content. First, to make it easier on us, we need to trim the
normalized HTML we get from these functions::
- >>> import re
- >>> def pr_html(html):
- ... html = re.sub(r'</?(?:html|head|body)>', '', html)
- ... html = re.sub(r'<meta.*?>', '', html)
- ... print html.strip()
-
Some basics::
- >>> pr_html(rewrite_links_html(
- ... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
+ >>> from lxml.html import usedoctest, parse_element, tostring
+ >>> from lxml.html import rewrite_links
+ >>> print rewrite_links(
+ ... '<a href="http://old/blah/blah.html">link</a>', relocate_href)
<a href="https://new/blah/blah.html">link</a>
- >>> pr_html(rewrite_links_html(
- ... '<script src="http://old/foo.js"></script>', relocate_href))
+ >>> print rewrite_links(
+ ... '<script src="http://old/foo.js"></script>', relocate_href)
<script src="https://new/foo.js"></script>
- >>> pr_html(rewrite_links_html(
- ... '<link href="foo.css">', relocate_href))
+ >>> print rewrite_links(
+ ... '<link href="foo.css">', relocate_href)
<link href="https://new/base/foo.css">
- >>> pr_html(rewrite_links_html('''\
+ >>> print rewrite_links('''\
... <base href="http://blah/stuff/index.html">
... <link href="foo.css">
... <a href="http://old/bar.html">x</a>\
- ... ''', relocate_href))
+ ... ''', relocate_href)
<link href="http://blah/stuff/foo.css">
<a href="https://new/bar.html">x</a>
+
+Links in CSS are also handled::
+
+ >>> print rewrite_links('''
+ ... <style>
+ ... body {background-image: url(http://old/image.gif)};
+ ... @import "http://old/other-style.css";
+ ... </style>''', relocate_href)
+ <html><head><style>
+ body {background-image: url(https://new/image.gif)};
+ @import "https://new/other-style.css";
+ </style></head></html>
+
+Those links in style attributes are also rewritten::
+
+ >>> print rewrite_links('''
+ ... <div style="background-image: url(http://old/image.gif)">text</div>
+ ... ''', relocate_href)
+ <div style="background-image: url(https://new/image.gif)">text</div>
+
+The ``<base href>`` tag is also respected (but also removed)::
+
+ >>> print rewrite_links('''
+ ... <html><head>
+ ... <base href="http://old/">
+ ... </head>
+ ... <body>
+ ... <a href="foo.html">link</a>
+ ... </body></html>''', relocate_href)
+ <html>
+ <head></head>
+ <body>
+ <a href="https://new/foo.html">link</a>
+ </body>
+ </html>
+
+The ``iter_links`` method (and function) gives you all the links in
+the document, along with the element and attribute the link comes
+from. This makes it fairly easy to see what resources the document
+references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
+is something embedded). It returns a generator of ``(element, attrib,
+link, pos)``, which is awkward to test here, so we'll make a printer::
+
+ >>> from lxml.html import iter_links
+ >>> def print_iter(seq):
+ ... for element, attrib, link, pos in seq:
+ ... if pos:
+ ... extra = '@%s' % pos
+ ... else:
+ ... extra = ''
+ ... print '%s %s="%s"%s' % (element.tag, attrib, link, extra)
+ >>> print_iter(iter_links('''
+ ... <html>
+ ... <head>
+ ... <link rel="stylesheet" href="style.css">
+ ... <style type="text/css">
+ ... body {
+ ... background-image: url(/bg.gif);
+ ... }
+ ... @import "/other-styles.css";
+ ... </style>
+ ... <script src="/js-funcs.js"></script>
+ ... </head>
+ ... <body>
+ ... <table>
+ ... <tr><td><ul>
+ ... <li><a href="/test.html">Test stuff</a></li>
+ ... <li><a href="/other.html">Other stuff</a></li>
+ ... </td></tr>
+ ... <td style="background-image: url(/td-bg.png)">
+ ... <img src="/logo.gif">
+ ... Hi world!
+ ... </td></tr>
+ ... </table>
+ ... </body></html>'''))
+ link href="style.css"
+ style None="/bg.gif"@40
+ style None="/other-styles.css"@69
+ script src="/js-funcs.js"
+ a href="/test.html"
+ a href="/other.html"
+ td style="/td-bg.png"@22
+ img src="/logo.gif"
+ >>> print_iter(iter_links('''
+ ... <html>
+ ... <head>
+ ... <link rel="stylesheet" href="style.css">
+ ... <style type="text/css">
+ ... body {
+ ... background-image: url(/bg.gif);
+ ... }
+ ... @import "/other-styles.css";
+ ... </style>
+ ... <script src="/js-funcs.js"></script>
+ ... </head>
+ ... <body>
+ ... <table>
+ ... <tr><td><ul>
+ ... <li><a href="/test.html">Test stuff</a></li>
+ ... <li><a href="/other.html">Other stuff</a></li>
+ ... </td></tr>
+ ... <td style="background-image: url(/td-bg.png)">
+ ... <img src="/logo.gif">
+ ... Hi world!
+ ... </td></tr>
+ ... </table>
+ ... </body></html>''', False))
+ link href="style.css"
+ a href="/test.html"
+ a href="/other.html"
+ script src="/js-funcs.js"
+ img src="/logo.gif"
+ style None="/bg.gif"@40
+ style None="/other-styles.css"@69
+ td style="/td-bg.png"@22
More information about the lxml-checkins
mailing list