[wwwsearch-commits] r28123 - in wwwsearch/mechanize/trunk: . mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Fri Jun 2 23:20:30 CEST 2006
Author: jjlee
Date: Fri Jun 2 23:20:28 2006
New Revision: 28123
Added:
wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
wwwsearch/mechanize/trunk/test/test_rfc3986.doctest
Modified:
wwwsearch/mechanize/trunk/test.py
Log:
Add a nasty implementation of RFC 3986 URL-splitting / joining (not yet used in mechanize)
Added: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- (empty file)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py Fri Jun 2 23:20:28 2006
@@ -0,0 +1,187 @@
+"""RFC 3986 URI parsing and relative reference resolution / absolutization.
+
+(aka splitting and joining)
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
+
+import sys, re, posixpath
+
# Generic URI-reference pattern (the regular expression given in RFC 3986,
# appendix B).  Groups 2, 4, 5, 7 and 9 hold the bare scheme, authority, path,
# query and fragment; the other groups also capture the delimiters.
SPLIT_MATCH = re.compile(
    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match


def urlsplit(absolute_uri):
    """Return scheme, authority, path, query, fragment.

    The result is a 5-tuple of strings.  A component that does not appear in
    absolute_uri is reported as None, except the path, which is always a
    string (possibly empty).
    """
    parsed = SPLIT_MATCH(absolute_uri)
    if not parsed:
        return None
    # Pick out the delimiter-free groups in one call.
    return parsed.group(2, 4, 5, 7, 9)
+
def urlunsplit(parts):
    """Join a (scheme, authority, path, query, fragment) tuple into a string.

    A component that is None is left out together with its delimiter; an
    empty-string component still emits its delimiter.  The path is always
    written as-is.
    """
    scheme, authority, path, query, fragment = parts
    pieces = []
    if scheme is not None:
        pieces += [scheme, ":"]
    if authority is not None:
        pieces += ["//", authority]
    pieces.append(path)
    if query is not None:
        pieces += ["?", query]
    if fragment is not None:
        pieces += ["#", fragment]
    return "".join(pieces)
+
def urljoin(base_uri, uri_reference):
    """Resolve uri_reference against base_uri and return the result as a string.

    Both arguments are split with urlsplit(), combined with urljoin_parts(),
    and the resulting components are reassembled with urlunsplit().
    """
    base_parts = urlsplit(base_uri)
    reference_parts = urlsplit(uri_reference)
    target_parts = urljoin_parts(base_parts, reference_parts)
    return urlunsplit(target_parts)
+
+# oops, this doesn't do the same thing as the literal translation
+# from the RFC below
+## def urljoin_parts(base_parts, reference_parts):
+## scheme, authority, path, query, fragment = base_parts
+## rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+## # compute target URI path
+## if rpath == "":
+## tpath = path
+## else:
+## tpath = rpath
+## if not tpath.startswith("/"):
+## tpath = merge(authority, path, tpath)
+## tpath = posixpath.normpath(tpath)
+
+## if rscheme is not None:
+## return (rscheme, rauthority, tpath, rquery, rfragment)
+## elif rauthority is not None:
+## return (scheme, rauthority, tpath, rquery, rfragment)
+## elif rpath == "":
+## if rquery is not None:
+## tquery = rquery
+## else:
+## tquery = query
+## return (scheme, authority, tpath, tquery, rfragment)
+## else:
+## return (scheme, authority, tpath, rquery, rfragment)
+
def urljoin_parts(base_parts, reference_parts):
    """Resolve a split reference against a split base URI.

    Both arguments are (scheme, authority, path, query, fragment) tuples as
    returned by urlsplit(); the return value is a tuple of the same shape.
    The fragment of the result always comes from the reference.
    """
    scheme, authority, path, query, _base_fragment = base_parts
    rscheme, rauthority, rpath, rquery, rfragment = reference_parts

    # A reference whose scheme equals the base scheme is handled as if it had
    # no scheme at all, so it is resolved relative to the base.
    if rscheme is not None and rscheme != scheme:
        return (rscheme, rauthority, remove_dot_segments(rpath),
                rquery, rfragment)

    if rauthority is not None:
        return (scheme, rauthority, remove_dot_segments(rpath),
                rquery, rfragment)

    if rpath == "":
        # Empty reference path: keep the base path untouched, and fall back
        # to the base query only when the reference has no query of its own.
        tquery = query if rquery is None else rquery
        return (scheme, authority, path, tquery, rfragment)

    if rpath.startswith("/"):
        tpath = remove_dot_segments(rpath)
    else:
        tpath = remove_dot_segments(merge(authority, path, rpath))
    return (scheme, authority, tpath, rquery, rfragment)
+
+# um, something *vaguely* like this is what I want, but I have to generate
+# lots of test cases first, if only to understand what it is that
+# remove_dot_segments really does...
+## def remove_dot_segments(path):
+## if path == '':
+## return ''
+## comps = path.split('/')
+## new_comps = []
+## for comp in comps:
+## if comp in ['.', '']:
+## if not new_comps or new_comps[-1]:
+## new_comps.append('')
+## continue
+## if comp != '..':
+## new_comps.append(comp)
+## elif new_comps:
+## new_comps.pop()
+## return '/'.join(new_comps)
+
+
def remove_dot_segments(path):
    """Return path with its "." and ".." segments resolved.

    The input is consumed from the left, one prefix at a time, while an
    output list of kept segments is built up; the step letters in the
    comments below are the ones used by RFC 3986 section 5.2.4.  A ".."
    that would climb above the first segment is simply dropped, so e.g.
    "/.." and "/../g" yield "/" and "/g".
    """
    output = []
    while path:
        # A: a leading "../" or "./" is discarded.
        if path.startswith("../"):
            path = path[3:]
        elif path.startswith("./"):
            path = path[2:]
        # B: "/./" (or a trailing "/.") collapses to "/".
        elif path.startswith("/./"):
            path = path[2:]
        elif path == "/.":
            path = "/"
        # C: "/../" (or a trailing "/..") collapses to "/" and drops the
        # most recently kept segment, if there is one.
        elif path.startswith("/../"):
            path = path[3:]
            if output:
                output.pop()
        elif path == "/..":
            path = "/"
            # Guard against an empty output list: nothing to drop when the
            # ".." is already at the top of the path.
            if output:
                output.pop()
        # D: a path that is only "." or ".." disappears entirely.
        elif path in (".", ".."):
            path = ""
        # E: move the first segment (with its leading "/", if any) to the
        # output, up to but not including the next "/".
        else:
            end = path.find("/", 1)
            if end < 0:
                output.append(path)
                break
            output.append(path[:end])
            path = path[end:]
    return "".join(output)
+
def merge(base_authority, base_path, ref_path):
    """Combine a relative reference path with the base URI's path.

    With an empty base path the result is "/" + ref_path.  Otherwise the
    result is ref_path appended to everything in base_path up to and
    including its last "/" (or ref_path alone when base_path has no "/").
    base_authority is accepted but not consulted; see the note below.
    """
    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
    # doesn't even take base_authority as a parameter, despite the wording in
    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
    #if base_authority is not None and base_path == "":
    if base_path == "":
        return "/" + ref_path
    # rfind() gives -1 when there is no "/", so the slice below is then empty
    # and ref_path is returned on its own.
    keep = base_path.rfind("/") + 1
    return base_path[:keep] + ref_path
Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py (original)
+++ wwwsearch/mechanize/trunk/test.py Fri Jun 2 23:20:28 2006
@@ -108,15 +108,15 @@
## __builtin__.jjl = jjl
# XXX temporary stop-gap to run doctests
- assert os.path.isdir('test')
- sys.path.insert(0, 'test')
+ assert os.path.isdir("test")
+ sys.path.insert(0, "test")
# needed for recent doctest / linecache -- this is only for testing
# purposes, these don't get installed
# doctest.py revision 45701 and linecache.py revision 45940. Since
# linecache is used by Python itself, linecache.py is renamed
# linecache_copy.py, and this copy of doctest is modified (only) to use
# that renamed module.
- sys.path.insert(0, 'test-tools')
+ sys.path.insert(0, "test-tools")
import doctest
import mechanize
common_globs = {"mechanize": mechanize}
@@ -126,10 +126,12 @@
]:
globs.update(common_globs)
doctest.testfile(
- os.path.join('test', 'test_password_manager.doctest'),
- #os.path.join('test', 'test_scratch.doctest'),
+ os.path.join("test", "test_password_manager.doctest"),
+ #os.path.join("test", "test_scratch.doctest"),
globs=globs,
)
+
+ doctest.testfile(os.path.join("test", "test_rfc3986.doctest"))
from mechanize import _headersutil, _auth, _clientcookie, _pullparser
doctest.testmod(_headersutil)
doctest.testmod(_auth)
Added: wwwsearch/mechanize/trunk/test/test_rfc3986.doctest
==============================================================================
--- (empty file)
+++ wwwsearch/mechanize/trunk/test/test_rfc3986.doctest Fri Jun 2 23:20:28 2006
@@ -0,0 +1,156 @@
+>>> from mechanize._rfc3986 import urlsplit, urljoin, remove_dot_segments
+
+Some common cases
+
+>>> urlsplit("http://example.com/spam/eggs/spam.html?apples=pears&a=b#foo")
+('http', 'example.com', '/spam/eggs/spam.html', 'apples=pears&a=b', 'foo')
+>>> urlsplit("http://example.com/spam.html#foo")
+('http', 'example.com', '/spam.html', None, 'foo')
+>>> urlsplit("ftp://example.com/foo.gif")
+('ftp', 'example.com', '/foo.gif', None, None)
+>>> urlsplit('ftp://joe:password@example.com:port')
+('ftp', 'joe:password@example.com:port', '', None, None)
+>>> urlsplit("mailto:jjl at pobox.com")
+('mailto', None, 'jjl at pobox.com', None, None)
+
+The five path productions
+
+path-abempty:
+
+>>> urlsplit("http://www.example.com")
+('http', 'www.example.com', '', None, None)
+>>> urlsplit("http://www.example.com/foo")
+('http', 'www.example.com', '/foo', None, None)
+
+path-absolute:
+
+>>> urlsplit("a:/")
+('a', None, '/', None, None)
+>>> urlsplit("a:/b:/c/")
+('a', None, '/b:/c/', None, None)
+
+path-noscheme:
+
+>>> urlsplit("a:b/:c/")
+('a', None, 'b/:c/', None, None)
+
+path-rootless:
+
+>>> urlsplit("a:b:/c/")
+('a', None, 'b:/c/', None, None)
+
+path-empty:
+
+>>> urlsplit("quack:")
+('quack', None, '', None, None)
+
+
+>>> remove_dot_segments("/a/b/c/./../../g")
+'/a/g'
+>>> remove_dot_segments("mid/content=5/../6")
+'mid/6'
+>>> remove_dot_segments("/b/c/.")
+'/b/c/'
+>>> remove_dot_segments("/b/c/./.")
+'/b/c/'
+>>> remove_dot_segments(".")
+''
+>>> remove_dot_segments("/.")
+'/'
+>>> remove_dot_segments("./")
+''
+
+
+Examples from RFC 3986 section 5.4
+
+Normal Examples
+
+>>> base = "http://a/b/c/d;p?q"
+>>> def join(uri): return urljoin(base, uri)
+>>> join("g:h")
+'g:h'
+>>> join("g")
+'http://a/b/c/g'
+>>> join("./g")
+'http://a/b/c/g'
+>>> join("g/")
+'http://a/b/c/g/'
+>>> join("/g")
+'http://a/g'
+>>> join("//g")
+'http://g'
+>>> join("?y")
+'http://a/b/c/d;p?y'
+>>> join("g?y")
+'http://a/b/c/g?y'
+>>> join("#s")
+'http://a/b/c/d;p?q#s'
+>>> join("g#s")
+'http://a/b/c/g#s'
+>>> join("g?y#s")
+'http://a/b/c/g?y#s'
+>>> join(";x")
+'http://a/b/c/;x'
+>>> join("g;x")
+'http://a/b/c/g;x'
+>>> join("g;x?y#s")
+'http://a/b/c/g;x?y#s'
+>>> join("")
+'http://a/b/c/d;p?q'
+>>> join(".")
+'http://a/b/c/'
+>>> join("./")
+'http://a/b/c/'
+>>> join("..")
+'http://a/b/'
+>>> join("../")
+'http://a/b/'
+>>> join("../g")
+'http://a/b/g'
+>>> join("../..")
+'http://a/'
+>>> join("../../")
+'http://a/'
+>>> join("../../g")
+'http://a/g'
+
+Abnormal Examples
+
+>>> join("../../../g")
+'http://a/g'
+>>> join("../../../../g")
+'http://a/g'
+>>> join("/./g")
+'http://a/g'
+>>> join("/../g")
+'http://a/g'
+>>> join("g.")
+'http://a/b/c/g.'
+>>> join(".g")
+'http://a/b/c/.g'
+>>> join("g..")
+'http://a/b/c/g..'
+>>> join("..g")
+'http://a/b/c/..g'
+>>> join("./../g")
+'http://a/b/g'
+>>> join("./g/.")
+'http://a/b/c/g/'
+>>> join("g/./h")
+'http://a/b/c/g/h'
+>>> join("g/../h")
+'http://a/b/c/h'
+>>> join("g;x=1/./y")
+'http://a/b/c/g;x=1/y'
+>>> join("g;x=1/../y")
+'http://a/b/c/y'
+>>> join("g?y/./x")
+'http://a/b/c/g?y/./x'
+>>> join("g?y/../x")
+'http://a/b/c/g?y/../x'
+>>> join("g#s/./x")
+'http://a/b/c/g#s/./x'
+>>> join("g#s/../x")
+'http://a/b/c/g#s/../x'
+>>> join("http:g")
+'http://a/b/c/g'
More information about the wwwsearch-commits
mailing list