[Lxml-checkins] r43955 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 06:25:27 CEST 2007


Author: ianb
Date: Fri Jun  1 06:25:27 2007
New Revision: 43955

Removed:
   lxml/branch/html/src/lxml/html/rewritelinks.py
Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
Log:
Move all the link functions directly into __init__; change all rewriting to use iter_links

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Fri Jun  1 06:25:27 2007
@@ -1,13 +1,19 @@
 import threading
 import re
+import urlparse
 from lxml import etree
+from lxml.html import defs
 
-__all__ = ['HTML', 'tostring', 'Element']
+__all__ = ['HTML', 'tostring', 'Element', 'defs',
+           'find_rel_links', 'find_class', 'make_links_absolute',
+           'resolve_base_href', 'iter_links', 'rewrite_links']
 
 _rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]")
 #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
 _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_css_url_re = re.compile(r'url\((.*?)\)', re.I)
+_css_import_re = re.compile(r'@import "(.*?)"')
 
 class HtmlMixin(object):
 
@@ -110,8 +116,11 @@
         tags in the document are used *and* removed from the document.
         If it is false then any such tag is ignored.
         """
-        from lxml.html.rewritelinks import make_links_absolute
-        make_links_absolute(self, base_href, resolve_base_href=resolve_base_href)
+        if resolve_base_href:
+            self.resolve_base_href()
+        def link_repl(href):
+            return urlparse.urljoin(base_href, href)
+        self.rewrite_links(link_repl)
 
     def resolve_base_href(self):
         """
@@ -119,25 +128,38 @@
         values to all links found in the document.  Also remove the
         tag once it has been applied.
         """
-        from lxml.html.rewritelinks import resolve_base_href
-        resolve_base_href(self)
-
-    def iter_links(self, in_order=True):
-        """
-        Iterate over all the links in the document, yielding
-        ``(element, attribute, link)``.
-
-        The ``element`` contains the link.  ``attribute`` is a string
-        like ``'href'`` or ``'src'``.  It may be None, which means
-        that the link is in the body of the element.  The only type
-        this occurs is with ``<style>`` tags that contain links like
-        ``url(...)``.  ``link`` is the actual link, like
-        ``'http://codespeak.net'``
-
-        Note: links are not returned in document order.
-        """
-        from lxml.html.rewritelinks import iter_links
-        return iter_links(self, in_order=in_order)
+        base_href = None
+        basetags = self.xpath('//base[@href]')
+        for b in basetags:
+            base_href = b.attrib['href']
+            b.drop_element()
+        if not base_href:
+            return
+        self.make_links_absolute(base_href, resolve_base_href=False)
+        
+    def iter_links(self):
+        """
+        Yield (element, attribute, link, pos), where attribute may be None
+        (indicating the link is in the text).  ``pos`` is the position
+        where the link occurs; often 0, but sometimes something else in
+        the case of links in stylesheets or style tags.
+
+        Note: <base href> is *not* taken into account in any way.  The
+        link you get is exactly the link in the document.
+        """
+        link_attrs = defs.link_attrs
+        for el in self.iterdescendants():
+            for attrib in link_attrs:
+                if attrib in el.attrib:
+                    yield (el, attrib, el.attrib[attrib], 0)
+            if el.tag == 'style' and el.text:
+                for match in _css_url_re.finditer(el.text):
+                    yield (el, None, match.group(1), match.start(1))
+                for match in _css_import_re.finditer(el.text):
+                    yield (el, None, match.group(1), match.start(1))
+            if 'style' in el.attrib:
+                for match in _css_url_re.finditer(el.attrib['style']):
+                    yield (el, 'style', match.group(1), match.start(1))
 
     def rewrite_links(self, link_repl_func, resolve_base_href=True,
                       base_href=None):
@@ -152,15 +174,30 @@
         ``'mailto:email'`` or ``'javascript:expr'``.
 
         If you give ``base_href`` then all links passed to
-        ``link_repl_func()`` will be absolute.
+        ``link_repl_func()`` will take that into account.
         """
-        from lxml.html.rewritelinks import rewrite_links
         if base_href is not None:
             # FIXME: this can be done in one pass with a wrapper
             # around link_repl_func
             self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
-            resolve_base_href = False
-        rewrite_links(self, link_repl_func, remove_base_tags=resolve_base_href)
+        elif resolve_base_href:
+            self.resolve_base_href()
+        for el, attrib, link, pos in self.iter_links():
+            new_link = link_repl_func(link)
+            if new_link == link:
+                continue
+            if attrib is None:
+                new = el.text[:pos] + new_link + el.text[pos+len(link):]
+                el.text = new
+            else:
+                cur = el.attrib[attrib]
+                if not pos and len(cur) == len(link):
+                    # Most common case
+                    el.attrib[attrib] = new_link
+                else:
+                    new = cur[:pos] + new_link + cur[pos+len(link):]
+                    el.attrib[attrib] = new
+                    
 
 class _MethodFunc(object):
     def __init__(self, name, fragment=False, source_class=HtmlMixin):

Deleted: /lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- /lxml/branch/html/src/lxml/html/rewritelinks.py	Fri Jun  1 06:25:27 2007
+++ (empty file)
@@ -1,148 +0,0 @@
-"""
-utilities for manipulating html links 
-"""
-
-
-from lxml.html import tostring, HTML
-from lxml.html import defs
-import urlparse
-import re
-
-__all__ = ['make_links_absolute', 'make_links_absolute_html',
-           'rewrite_links', 'rewrite_links_html',
-           'Relocator']
-
-def make_links_absolute(doc, base_href, resolve_base_href=True):
-    def link_repl(href):
-        return urlparse.urljoin(base_href, href)
-    rewrite_links(doc, link_repl_func, remove_base_tags=resolve_base_href)
-
-def make_links_absolute_html(html, base_href):
-    doc = HTML(html)
-    make_links_absolute(doc, base_href)
-    return tostring(doc)
-
-def rewrite_links_html(html, link_repl_func, remove_base_tags=True):
-    """
-    rewrite_links(), but work on text and returns text
-    """
-    doc = HTML(html)
-    rewrite_links(doc, link_repl_func, remove_base_tags=remove_base_tags)
-    return tostring(doc)
-
-def rewrite_links(doc, link_repl_func,
-                remove_base_tags=True):
-    """
-    Takes a given document (already parsed by lxml) and modifies it
-    in-place.  Every link is passed through link_repl_func, and the
-    output of that function replaces the link.
-    """
-    if remove_base_tags:
-        resolve_base_href(doc)
-
-    # FIXME: should use defs.link_attrs
-    for attrib in 'href', 'src':
-        els = doc.xpath('//*[@%s]' % attrib)
-        for el in els:
-            el.attrib[attrib] = link_repl_func(el.attrib[attrib])
-
-    rewrite_css_links(doc, link_repl_func)
-    rewrite_style_links(doc, link_repl_func)
-
-def resolve_base_href(doc):
-    """
-    removes all html <base href=""> tags 
-    from the document given. 
-    """
-    base_href = None
-    basetags = doc.xpath('//base[@href]')
-    for b in basetags:
-        base_href = b.attrib['href']
-        b.getparent().remove(b)
-    if base_href is None:
-        return
-    # Now that we have a base_href (blech) we have to fix up all the
-    # links in the document with this new information.
-    def link_repl(href):
-        return urlparse.urljoin(base_href, href)
-    rewrite_links(doc, link_repl, remove_base_tags=False)
-    
-CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
-CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"')
-def rewrite_css_links(doc, link_repl_func):
-    """
-    Fixes up any url(...) links in CSS style elements
-    """
-    def absuri(matchobj):
-        return 'url(%s)' % link_repl_func(matchobj.group(1))
-    def absimport(matchobj):
-        return '@import "%s"' % link_repl_func(matchobj.group(1))
-    els = doc.xpath('//head/style')
-    for el in els:
-        if el.text:
-            el.text = CSS_URL_PAT.sub(absuri, el.text)
-            el.text = CSS_IMPORT_PAT.sub(absimport, el.text)
-
-def rewrite_style_links(doc, link_repl_func):
-    def absuri(matchobj):
-        return 'url(%s)' % link_repl_func(matchobj.group(1))
-    for el in doc.xpath("//*[contains(@style, 'url(')]"):
-        el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
-
-def iter_links(doc):
-    """
-    Yield (element, attribute, link, pos), where attribute may be None
-    (indicating the link is in the text).  ``pos`` is the position
-    where the link occurs; often 0, but sometimes something else in
-    the case of links in stylesheets or style tags.
-
-    Note: <base href> is *not* taken into account in any way.  The
-    link you get is exactly the link in the document.
-    """
-    link_attrs = defs.link_attrs
-    for el in doc.iterdescendants():
-        for attrib in link_attrs:
-            if attrib in el.attrib:
-                yield (el, attrib, el.attrib[attrib], 0)
-        if el.tag == 'style' and el.text:
-            for match in CSS_URL_PAT.finditer(el.text):
-                yield (el, None, match.group(1), match.start(1))
-            for match in CSS_IMPORT_PAT.finditer(el.text):
-                yield (el, None, match.group(1), match.start(1))
-        if 'style' in el.attrib:
-            for match in CSS_URL_PAT.finditer(el.attrib['style']):
-                yield (el, 'style', match.group(1), match.start(1))
-            
-class Relocator(object):
-    """
-    This helper can be used to move all links in a document from one
-    location to another.  Typically you use this like::
-
-        rewrite_links_html(
-            html, Relocator('http://old-domain/', 'http://new-domain',
-                            base_href='http://old-domain/foo/bar.html'))
-
-    This means that the document was located at
-    ``http://old-domain/foo/bar.html`` (used to resolve relative
-    links), and that you want to change every occurance of
-    ``http://old-domain/`` to ``http://new-domain``
-    """
-    # This catches the case of http://foo, which is equivalent to
-    # http://foo/ :
-    _domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
-
-    def __init__(self, old_href, new_href, base_href=None):
-        self.old_href = old_href
-        self.new_href = new_href
-        self.base_href = base_href
-
-    def __call__(self, href):
-        if self.base_href is not None:
-            real_href = urlparse.urljoin(self.base_href, href)
-        if self._domain_no_slash_re.search(real_href):
-            real_href += '/'
-        if not real_href.startswith(self.old_href):
-            # A link somewhere else entirely
-            return href
-        return self.new_href + real_href[len(self.old_href):]
-    

Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt	Fri Jun  1 06:25:27 2007
@@ -154,35 +154,3 @@
     a href="/other.html"
     td style="/td-bg.png"@22
     img src="/logo.gif"
-    >>> print_iter(iter_links('''
-    ... <html>
-    ...  <head>
-    ...   <link rel="stylesheet" href="style.css">
-    ...   <style type="text/css">
-    ...     body {
-    ...       background-image: url(/bg.gif);
-    ...     }
-    ...     @import "/other-styles.css";
-    ...   </style>
-    ...   <script src="/js-funcs.js"></script>
-    ...  </head>
-    ...  <body>
-    ...   <table>
-    ...    <tr><td><ul>
-    ...     <li><a href="/test.html">Test stuff</a></li>
-    ...     <li><a href="/other.html">Other stuff</a></li>
-    ...    </td></tr>
-    ...    <td style="background-image: url(/td-bg.png)">
-    ...      <img src="/logo.gif">
-    ...      Hi world!
-    ...    </td></tr>
-    ...   </table>
-    ...  </body></html>''', False))
-    link href="style.css"
-    a href="/test.html"
-    a href="/other.html"
-    script src="/js-funcs.js"
-    img src="/logo.gif"
-    style None="/bg.gif"@40
-    style None="/other-styles.css"@69
-    td style="/td-bg.png"@22


More information about the lxml-checkins mailing list