[z3-checkins] r34687 - z3/deliverance/trunk/deliverance
ianb at codespeak.net
ianb at codespeak.net
Thu Nov 16 19:18:27 CET 2006
Author: ianb
Date: Thu Nov 16 19:18:24 2006
New Revision: 34687
Added:
z3/deliverance/trunk/deliverance/test_relocate.txt (contents, props changed)
Modified:
z3/deliverance/trunk/deliverance/fixuplinks.py
z3/deliverance/trunk/deliverance/relocateresponse.py
Log:
Tests for link relocating, and some bug fixes: respect base href, and handle http://foo (with no trailing slash) as identical to http://foo/
Modified: z3/deliverance/trunk/deliverance/fixuplinks.py
==============================================================================
--- z3/deliverance/trunk/deliverance/fixuplinks.py (original)
+++ z3/deliverance/trunk/deliverance/fixuplinks.py Thu Nov 16 19:18:24 2006
@@ -4,6 +4,7 @@
from htmlserialize import decodeAndParseHTML, tostring
+import urlparse
import re
def fixup_text_links(doc, link_repl_func, remove_base_tags=True):
@@ -22,7 +23,7 @@
output of that function replaces the link.
"""
if remove_base_tags:
- remove_base_tags_from_document(doc)
+ resolve_base_tags_in_document(doc)
for attrib in 'href', 'src':
els = doc.xpath('//*[@%s]' % attrib)
@@ -31,14 +32,23 @@
fixup_css_links(doc, link_repl_func)
-def remove_base_tags_from_document(doc):
+def resolve_base_tags_in_document(doc):
"""
removes all html <base href=""> tags
from the document given.
"""
+ base_href = None
basetags = doc.xpath('//base[@href]')
for b in basetags:
+ base_href = b.attrib['href']
b.getparent().remove(b)
+ if base_href is None:
+ return
+ # Now that we have a base_href (blech) we have to fix up all the
+ # links in the document with this new information.
+ def link_repl(href):
+ return urlparse.urljoin(base_href, href)
+ fixup_links(doc, link_repl, remove_base_tags=False)
CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
def fixup_css_links(doc, link_repl_func):
Modified: z3/deliverance/trunk/deliverance/relocateresponse.py
==============================================================================
--- z3/deliverance/trunk/deliverance/relocateresponse.py (original)
+++ z3/deliverance/trunk/deliverance/relocateresponse.py Thu Nov 16 19:18:24 2006
@@ -2,10 +2,11 @@
Takes a response (headers + content) and relocates it, changing domain
names and paths.
"""
-import fixuplinks
import urlparse
+import re
from paste.request import construct_url
from paste.response import header_value
+import fixuplinks
def relocate_response(headers, content, base_href, old_href, new_href):
"""
@@ -29,8 +30,14 @@
return relocate_href(href, base_href, old_href, new_href)
return fixuplinks.fixup_text_links(content, sub_link)
+# This catches the case of http://foo, which is equivalent to
+# http://foo/ :
+_domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
+
def relocate_href(href, base_href, old_href, new_href):
real_href = urlparse.urljoin(base_href, href)
+ if _domain_no_slash_re.search(real_href):
+ real_href += '/'
if not real_href.startswith(old_href):
return href
return new_href + real_href[len(old_href):]
Added: z3/deliverance/trunk/deliverance/test_relocate.txt
==============================================================================
--- (empty file)
+++ z3/deliverance/trunk/deliverance/test_relocate.txt Thu Nov 16 19:18:24 2006
@@ -0,0 +1,92 @@
+These are tests of relocateresponse::
+
+ >>> from deliverance.relocateresponse import *
+
+In all these examples we'll be using ``http://old`` for the old
+(to-be-replaced) URL and ``https://new`` for the new URL (note the
+scheme change). Out of laziness we'll define some keywords we use
+with all these transformations::
+
+ >>> kw = dict(base_href='http://old/base/path.html',
+ ... old_href='http://old/',
+ ... new_href='https://new/')
+
+Now let's look at simple href rewriting.
+
+Normal rewrite::
+
+ >>> relocate_href('http://old/bar', **kw)
+ 'https://new/bar'
+
+Note that the trailing slash doesn't matter in this one case (since
+``http://old`` and ``http://old/`` are entirely equivalent)::
+
+ >>> relocate_href('http://old', **kw)
+ 'https://new/'
+
+The trailing slash does matter in other cases::
+
+ >>> relocate_href('http://old-test/foo',
+ ... base_href='',
+ ... old_href='http://old-test/foo/',
+ ... new_href='https://new')
+ 'http://old-test/foo'
+ >>> relocate_href('http://old-test/foo/',
+ ... base_href='',
+ ... old_href='http://old-test/foo/',
+ ... new_href='https://new')
+ 'https://new'
+
+Rewriting a link that doesn't match old_href is a no-op::
+
+ >>> relocate_href('http://foo/bar', **kw)
+ 'http://foo/bar'
+
+Relative links are handled::
+
+ >>> relocate_href('index.html', **kw)
+ 'https://new/base/index.html'
+
+Now we look at header rewriting. Note that Location is rewritten, but
+other headers are not. (Set-Cookie should also get some rewriting,
+but does not yet)::
+
+ >>> relocate_headers([('X-Unknown', 'http://old'),
+ ... ('Location', 'http://old/foo/bar')],
+ ... **kw)
+ [('X-Unknown', 'http://old'), ('Location', 'https://new/foo/bar')]
+
+But the location header won't be rewritten if it points to a
+third-party site::
+
+ >>> relocate_headers([('Location', 'http://foo/bar')],
+ ... **kw)
+ [('Location', 'http://foo/bar')]
+
+Now for content. First, to make it easier on us, we need to trim the
+normalized HTML we get from these functions::
+
+ >>> import re
+ >>> def pr_html(html):
+ ... html = re.sub(r'</?(?:html|head|body)>', '', html)
+ ... html = re.sub(r'<meta.*?>', '', html)
+ ... print html.strip()
+
+Some basics::
+
+ >>> pr_html(relocate_content(
+ ... '<a href="http://old/blah/blah.html">link</a>', **kw))
+ <a href="https://new/blah/blah.html">link</a>
+ >>> pr_html(relocate_content(
+ ... '<script src="http://old/foo.js"></script>', **kw))
+ <script src="https://new/foo.js"></script>
+ >>> pr_html(relocate_content(
+ ... '<link href="foo.css">', **kw))
+ <link href="https://new/base/foo.css">
+ >>> pr_html(relocate_content('''\
+ ... <base href="http://blah/stuff/index.html">
+ ... <link href="foo.css">
+ ... <a href="http://old/bar.html">x</a>\
+ ... ''', **kw))
+ <link href="http://blah/stuff/foo.css">
+ <a href="https://new/bar.html">x</a>
More information about the z3-checkins
mailing list