[z3-checkins] r34687 - z3/deliverance/trunk/deliverance

ianb at codespeak.net ianb at codespeak.net
Thu Nov 16 19:18:27 CET 2006


Author: ianb
Date: Thu Nov 16 19:18:24 2006
New Revision: 34687

Added:
   z3/deliverance/trunk/deliverance/test_relocate.txt   (contents, props changed)
Modified:
   z3/deliverance/trunk/deliverance/fixuplinks.py
   z3/deliverance/trunk/deliverance/relocateresponse.py
Log:
Tests for link relocating, and some bug fixes: respect base href, and handle http://foo (with no trailing slash) as identical to http://foo/

Modified: z3/deliverance/trunk/deliverance/fixuplinks.py
==============================================================================
--- z3/deliverance/trunk/deliverance/fixuplinks.py	(original)
+++ z3/deliverance/trunk/deliverance/fixuplinks.py	Thu Nov 16 19:18:24 2006
@@ -4,6 +4,7 @@
 
 
 from htmlserialize import decodeAndParseHTML, tostring
+import urlparse
 import re
 
 def fixup_text_links(doc, link_repl_func, remove_base_tags=True):
@@ -22,7 +23,7 @@
     output of that function replaces the link.
     """
     if remove_base_tags:
-        remove_base_tags_from_document(doc)
+        resolve_base_tags_in_document(doc)
 
     for attrib in 'href', 'src':
         els = doc.xpath('//*[@%s]' % attrib)
@@ -31,14 +32,23 @@
 
     fixup_css_links(doc, link_repl_func)
 
-def remove_base_tags_from_document(doc):
+def resolve_base_tags_in_document(doc):
     """
     removes all html <base href=""> tags 
     from the document given. 
     """
+    base_href = None
     basetags = doc.xpath('//base[@href]')
     for b in basetags:
+        base_href = b.attrib['href']
         b.getparent().remove(b)
+    if base_href is None:
+        return
+    # Now that we have a base_href (blech) we have to fix up all the
+    # links in the document with this new information.
+    def link_repl(href):
+        return urlparse.urljoin(base_href, href)
+    fixup_links(doc, link_repl, remove_base_tags=False)
     
 CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
 def fixup_css_links(doc, link_repl_func):

Modified: z3/deliverance/trunk/deliverance/relocateresponse.py
==============================================================================
--- z3/deliverance/trunk/deliverance/relocateresponse.py	(original)
+++ z3/deliverance/trunk/deliverance/relocateresponse.py	Thu Nov 16 19:18:24 2006
@@ -2,10 +2,11 @@
 Takes a response (headers + content) and relocates it, changing domain
 names and paths.
 """
-import fixuplinks
 import urlparse
+import re
 from paste.request import construct_url
 from paste.response import header_value
+import fixuplinks
 
 def relocate_response(headers, content, base_href, old_href, new_href):
     """
@@ -29,8 +30,14 @@
         return relocate_href(href, base_href, old_href, new_href)
     return fixuplinks.fixup_text_links(content, sub_link)
 
+# This catches the case of http://foo, which is equivalent to
+# http://foo/ :
+_domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
+
 def relocate_href(href, base_href, old_href, new_href):
     real_href = urlparse.urljoin(base_href, href)
+    if _domain_no_slash_re.search(real_href):
+        real_href += '/'
     if not real_href.startswith(old_href):
         return href
     return new_href + real_href[len(old_href):]

Added: z3/deliverance/trunk/deliverance/test_relocate.txt
==============================================================================
--- (empty file)
+++ z3/deliverance/trunk/deliverance/test_relocate.txt	Thu Nov 16 19:18:24 2006
@@ -0,0 +1,92 @@
+These are tests of relocateresponse::
+
+    >>> from deliverance.relocateresponse import *
+
+In all these examples we'll be using ``http://old`` for the old
+(to-be-replaced) URL and ``https://new`` for the new URL (note the
+scheme change).  Out of laziness we'll define some keywords we use
+with all these transformations::
+
+    >>> kw = dict(base_href='http://old/base/path.html',
+    ...           old_href='http://old/',
+    ...           new_href='https://new/')
+
+Now let's look at simple href rewriting.
+
+Normal rewrite::
+
+    >>> relocate_href('http://old/bar', **kw)
+    'https://new/bar'
+
+Note that the trailing slash doesn't matter in this one case (since
+``http://old`` and ``http://old/`` are entirely equivalent)::
+
+    >>> relocate_href('http://old', **kw)
+    'https://new/'
+
+It does in other cases::
+
+    >>> relocate_href('http://old-test/foo',
+    ...               base_href='',
+    ...               old_href='http://old-test/foo/',
+    ...               new_href='https://new')
+    'http://old-test/foo'
+    >>> relocate_href('http://old-test/foo/',
+    ...               base_href='',
+    ...               old_href='http://old-test/foo/',
+    ...               new_href='https://new')
+    'https://new'
+
+Rewriting a link that doesn't match old_href is a no-op::
+
+    >>> relocate_href('http://foo/bar', **kw)
+    'http://foo/bar'
+
+Relative links are handled::
+
+    >>> relocate_href('index.html', **kw)
+    'https://new/base/index.html'
+
+Now we look at header rewriting.  Note that Location is rewritten, but
+other headers are not.  (Set-Cookie should also get some rewriting,
+but does not yet)::
+
+    >>> relocate_headers([('X-Unknown', 'http://old'),
+    ...                   ('Location', 'http://old/foo/bar')],
+    ...                  **kw)
+    [('X-Unknown', 'http://old'), ('Location', 'https://new/foo/bar')]
+
+But the location header won't be rewritten if it points to a
+third-party site::
+
+    >>> relocate_headers([('Location', 'http://foo/bar')],
+    ...                  **kw)
+    [('Location', 'http://foo/bar')]
+
+Now for content.  First, to make it easier on us, we need to trim the
+normalized HTML we get from these functions::
+
+    >>> import re
+    >>> def pr_html(html):
+    ...     html = re.sub(r'</?(?:html|head|body)>', '', html)
+    ...     html = re.sub(r'<meta.*?>', '', html)
+    ...     print html.strip()
+
+Some basics::
+
+    >>> pr_html(relocate_content(
+    ...     '<a href="http://old/blah/blah.html">link</a>', **kw))
+    <a href="https://new/blah/blah.html">link</a>
+    >>> pr_html(relocate_content(
+    ...     '<script src="http://old/foo.js"></script>', **kw))
+    <script src="https://new/foo.js"></script>
+    >>> pr_html(relocate_content(
+    ...     '<link href="foo.css">', **kw))
+    <link href="https://new/base/foo.css">
+    >>> pr_html(relocate_content('''\
+    ... <base href="http://blah/stuff/index.html">
+    ... <link href="foo.css">
+    ... <a href="http://old/bar.html">x</a>\
+    ... ''', **kw))
+    <link href="http://blah/stuff/foo.css">
+    <a href="https://new/bar.html">x</a>


More information about the z3-checkins mailing list