[Lxml-checkins] r43951 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Thu May 31 23:55:43 CEST 2007


Author: ianb
Date: Thu May 31 23:55:43 2007
New Revision: 43951

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/rewritelinks.py
   lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
Log:
Added iter_links; added methods for each of the functions; added functions for each of the methods.  Added some more tests.  Consolidation of the functions will happen in a following commit

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Thu May 31 23:55:43 2007
@@ -96,6 +96,125 @@
         """
         return self.xpath("string()")
 
+    ########################################
+    ## Link functions
+    ########################################
+
+    def make_links_absolute(self, base_href, resolve_base_href=True):
+        """
+        Make all links in the document absolute, given the
+        ``base_href`` for the document (the full URL where the
+        document came from).
+
+        If ``resolve_base_href`` is true, then any ``<base href>``
+        tags in the document are used *and* removed from the document.
+        If it is false then any such tag is ignored.
+        """
+        from lxml.html.rewritelinks import make_links_absolute
+        make_links_absolute(self, base_href, resolve_base_href=resolve_base_href)
+
+    def resolve_base_href(self):
+        """
+        Find any ``<base href>`` tag in the document, and apply its
+        values to all links found in the document.  Also remove the
+        tag once it has been applied.
+        """
+        from lxml.html.rewritelinks import resolve_base_href
+        resolve_base_href(self)
+
+    def iter_links(self, in_order=True):
+        """
+        Iterate over all the links in the document, yielding
+        ``(element, attribute, link, pos)``.
+
+        The ``element`` contains the link.  ``attribute`` is a string
+        like ``'href'`` or ``'src'``.  It may be None, which means
+        that the link is in the body of the element.  The only time
+        this occurs is with ``<style>`` tags that contain links like
+        ``url(...)``.  ``link`` is the actual link, like
+        ``'http://codespeak.net'``.  ``pos`` is where it occurs (often 0).
+
+        Note: links are in document order only if ``in_order`` is true.
+        """
+        from lxml.html.rewritelinks import iter_links
+        return iter_links(self, in_order=in_order)
+
+    def rewrite_links(self, link_repl_func, resolve_base_href=True,
+                      base_href=None):
+        """
+        Rewrite all the links in the document.  For each link
+        ``link_repl_func(link)`` will be called, and the return value
+        will replace the old link.
+
+        Note that links may not be absolute (unless you first called
+        ``make_links_absolute()``), and may be internal (e.g.,
+        ``'#anchor'``).  They can also be values like
+        ``'mailto:email'`` or ``'javascript:expr'``.
+
+        If you give ``base_href`` then all links passed to
+        ``link_repl_func()`` will be absolute.
+        """
+        from lxml.html.rewritelinks import rewrite_links
+        if base_href is not None:
+            # FIXME: this can be done in one pass with a wrapper
+            # around link_repl_func
+            self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
+            resolve_base_href = False
+        rewrite_links(self, link_repl_func, remove_base_tags=resolve_base_href)
+
+class _MethodFunc(object):
+    def __init__(self, name, fragment=False, source_class=HtmlMixin):
+        self.name = name
+        self.fragment = fragment
+        self.__doc__ = getattr(source_class, self.name).__doc__
+    def __call__(self, doc, *args, **kw):
+        if 'fragment' in kw:
+            fragment = kw.pop('fragment')
+        else:
+            fragment = self.fragment
+        if isinstance(doc, basestring):
+            if fragment:
+                doc = parse_element(doc)
+            else:
+                doc = HTML(doc)
+        meth = getattr(doc, self.name)
+        result = meth(*args, **kw)
+        if result is None:
+            # Then serialize and return
+            return tostring(doc)
+        else:
+            return result
+
+find_rel_links = _MethodFunc('find_rel_links')
+find_class = _MethodFunc('find_class')
+make_links_absolute = _MethodFunc('make_links_absolute')
+resolve_base_href = _MethodFunc('resolve_base_href')
+iter_links = _MethodFunc('iter_links')
+rewrite_links = _MethodFunc('rewrite_links')
+
+class _SubmoduleFunc(object):
+    def __init__(self, module, name, doc=None):
+        self.module = module
+        self.name = name
+        self.obj = None
+        if doc is None:
+            doc = 'See %s.%s' % (module, name)
+        self.__doc__ = doc
+    def __call__(self, *args, **kw):
+        if self.obj is None:
+            import sys
+            __import__(self.module)
+            mod = sys.modules(self.module)
+            self.obj = getattr(mod, self.name)
+            self.__doc__ = self.obj.__doc__
+        return self.obj(*args, **kw)
+
+# FIXME: Damn module names conflict with the function names :(
+#clean = _SubmoduleFunc('lxml.html.clean', 'clean')
+#clean_html = _SubmoduleFunc('lxml.html.clean', 'clean_html')
+#htmldiff = _SubmoduleFunc('lxml.html.htmldiff', 'htmldiff')
+#html_annotate = _SubmoduleFunc('lxml.html.htmldiff', 'html_annotate')
+
 class HtmlComment(etree.CommentBase, HtmlMixin):
     pass
 

Modified: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- lxml/branch/html/src/lxml/html/rewritelinks.py	(original)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py	Thu May 31 23:55:43 2007
@@ -4,6 +4,7 @@
 
 
 from lxml.html import tostring, HTML
+from lxml.html import defs
 import urlparse
 import re
 
@@ -11,10 +12,10 @@
            'rewrite_links', 'rewrite_links_html',
            'Relocator']
 
-def make_links_absolute(doc, base_href):
+def make_links_absolute(doc, base_href, resolve_base_href=True):
     def link_repl(href):
         return urlparse.urljoin(base_href, href)
-    rewrite_links(doc, link_repl_func)
+    rewrite_links(doc, link_repl_func, remove_base_tags=resolve_base_href)
 
 def make_links_absolute_html(html, base_href):
     doc = HTML(html)
@@ -88,6 +89,30 @@
     for el in doc.xpath("//*[contains(@style, 'url(')]"):
         el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
 
+def iter_links(doc):
+    """
+    Yield (element, attribute, link, pos), where attribute may be None
+    (indicating the link is in the text).  ``pos`` is the position
+    where the link occurs; often 0, but sometimes something else in
+    the case of links in stylesheets or style tags.
+
+    Note: <base href> is *not* taken into account in any way.  The
+    link you get is exactly the link in the document.
+    """
+    link_attrs = defs.link_attrs
+    for el in doc.iterdescendants():
+        for attrib in link_attrs:
+            if attrib in el.attrib:
+                yield (el, attrib, el.attrib[attrib], 0)
+        if el.tag == 'style' and el.text:
+            for match in CSS_URL_PAT.finditer(el.text):
+                yield (el, None, match.group(1), match.start(1))
+            for match in CSS_IMPORT_PAT.finditer(el.text):
+                yield (el, None, match.group(1), match.start(1))
+        if 'style' in el.attrib:
+            for match in CSS_URL_PAT.finditer(el.attrib['style']):
+                yield (el, 'style', match.group(1), match.start(1))
+            
 class Relocator(object):
     """
     This helper can be used to move all links in a document from one
@@ -120,3 +145,4 @@
             # A link somewhere else entirely
             return href
         return self.new_href + real_href[len(self.old_href):]
+    

Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	Thu May 31 23:55:43 2007
@@ -1,20 +1,18 @@
 These are tests of relocateresponse::
 
-    >>> from lxml.html.rewritelinks import *
+    >>> from lxml.html.rewritelinks import Relocator
 
 In all these examples we'll be using ``http://old`` for the old
 (to-be-replaced) URL and ``https://new`` for the new URL (note the
-scheme change).  Out of laziness we'll define some keywords we use
-with all these transformations::
+scheme change).  To test the rewriting we'll use this handy rewriter
+that rewrites everything from one base to another base::
 
     >>> relocate_href = Relocator(
     ...     base_href='http://old/base/path.html',
     ...     old_href='http://old/',
     ...     new_href='https://new/')
 
-Now lets look at simple href rewriting.
-
-Normal rewrite::
+Now let's look at simple href rewriting.  Normal rewrite::
 
     >>> relocate_href('http://old/bar')
     'https://new/bar'
@@ -53,27 +51,138 @@
 Now for content.  First, to make it easier on us, we need to trim the
 normalized HTML we get from these functions::
 
-    >>> import re
-    >>> def pr_html(html):
-    ...     html = re.sub(r'</?(?:html|head|body)>', '', html)
-    ...     html = re.sub(r'<meta.*?>', '', html)
-    ...     print html.strip()
-
 Some basics::
 
-    >>> pr_html(rewrite_links_html(
-    ...     '<a href="http://old/blah/blah.html">link</a>', relocate_href))
+    >>> from lxml.html import usedoctest, parse_element, tostring
+    >>> from lxml.html import rewrite_links
+    >>> print rewrite_links(
+    ...     '<a href="http://old/blah/blah.html">link</a>', relocate_href)
     <a href="https://new/blah/blah.html">link</a>
-    >>> pr_html(rewrite_links_html(
-    ...     '<script src="http://old/foo.js"></script>', relocate_href))
+    >>> print rewrite_links(
+    ...     '<script src="http://old/foo.js"></script>', relocate_href)
     <script src="https://new/foo.js"></script>
-    >>> pr_html(rewrite_links_html(
-    ...     '<link href="foo.css">', relocate_href))
+    >>> print rewrite_links(
+    ...     '<link href="foo.css">', relocate_href)
     <link href="https://new/base/foo.css">
-    >>> pr_html(rewrite_links_html('''\
+    >>> print rewrite_links('''\
     ... <base href="http://blah/stuff/index.html">
     ... <link href="foo.css">
     ... <a href="http://old/bar.html">x</a>\
-    ... ''', relocate_href))
+    ... ''', relocate_href)
     <link href="http://blah/stuff/foo.css">
     <a href="https://new/bar.html">x</a>
+
+Links in CSS are also handled::
+
+    >>> print rewrite_links('''
+    ... <style>
+    ...   body {background-image: url(http://old/image.gif)};
+    ...   @import "http://old/other-style.css";
+    ... </style>''', relocate_href)
+    <html><head><style>
+      body {background-image: url(https://new/image.gif)};
+      @import "https://new/other-style.css";
+    </style></head></html>
+
+Those links in style attributes are also rewritten::
+
+    >>> print rewrite_links('''
+    ... <div style="background-image: url(http://old/image.gif)">text</div>
+    ... ''', relocate_href)
+    <div style="background-image: url(https://new/image.gif)">text</div>
+
+The ``<base href>`` tag is also respected (but also removed)::
+
+    >>> print rewrite_links('''
+    ... <html><head>
+    ...  <base href="http://old/">
+    ... </head>
+    ... <body>
+    ...  <a href="foo.html">link</a>
+    ... </body></html>''', relocate_href)
+    <html>
+     <head></head>
+     <body>
+      <a href="https://new/foo.html">link</a>
+     </body>
+    </html>
+
+The ``iter_links`` method (and function) gives you all the links in
+the document, along with the element and attribute the link comes
+from.  This makes it fairly easy to see what resources the document
+references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
+is something embedded).  It returns a generator of ``(element, attrib,
+link, pos)``, which is awkward to test here, so we'll make a printer::
+
+    >>> from lxml.html import iter_links
+    >>> def print_iter(seq):
+    ...     for element, attrib, link, pos in seq:
+    ...         if pos:
+    ...             extra = '@%s' % pos
+    ...         else:
+    ...             extra = ''
+    ...         print '%s %s="%s"%s' % (element.tag, attrib, link, extra)
+    >>> print_iter(iter_links('''
+    ... <html>
+    ...  <head>
+    ...   <link rel="stylesheet" href="style.css">
+    ...   <style type="text/css">
+    ...     body {
+    ...       background-image: url(/bg.gif);
+    ...     }
+    ...     @import "/other-styles.css";
+    ...   </style>
+    ...   <script src="/js-funcs.js"></script>
+    ...  </head>
+    ...  <body>
+    ...   <table>
+    ...    <tr><td><ul>
+    ...     <li><a href="/test.html">Test stuff</a></li>
+    ...     <li><a href="/other.html">Other stuff</a></li>
+    ...    </td></tr>
+    ...    <td style="background-image: url(/td-bg.png)">
+    ...      <img src="/logo.gif">
+    ...      Hi world!
+    ...    </td></tr>
+    ...   </table>
+    ...  </body></html>'''))
+    link href="style.css"
+    style None="/bg.gif"@40
+    style None="/other-styles.css"@69
+    script src="/js-funcs.js"
+    a href="/test.html"
+    a href="/other.html"
+    td style="/td-bg.png"@22
+    img src="/logo.gif"
+    >>> print_iter(iter_links('''
+    ... <html>
+    ...  <head>
+    ...   <link rel="stylesheet" href="style.css">
+    ...   <style type="text/css">
+    ...     body {
+    ...       background-image: url(/bg.gif);
+    ...     }
+    ...     @import "/other-styles.css";
+    ...   </style>
+    ...   <script src="/js-funcs.js"></script>
+    ...  </head>
+    ...  <body>
+    ...   <table>
+    ...    <tr><td><ul>
+    ...     <li><a href="/test.html">Test stuff</a></li>
+    ...     <li><a href="/other.html">Other stuff</a></li>
+    ...    </td></tr>
+    ...    <td style="background-image: url(/td-bg.png)">
+    ...      <img src="/logo.gif">
+    ...      Hi world!
+    ...    </td></tr>
+    ...   </table>
+    ...  </body></html>''', False))
+    link href="style.css"
+    a href="/test.html"
+    a href="/other.html"
+    script src="/js-funcs.js"
+    img src="/logo.gif"
+    style None="/bg.gif"@40
+    style None="/other-styles.css"@69
+    td style="/td-bg.png"@22


More information about the lxml-checkins mailing list