[wwwsearch-commits] r28123 - in wwwsearch/mechanize/trunk: . mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Fri Jun 2 23:20:30 CEST 2006


Author: jjlee
Date: Fri Jun  2 23:20:28 2006
New Revision: 28123

Added:
   wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
   wwwsearch/mechanize/trunk/test/test_rfc3986.doctest
Modified:
   wwwsearch/mechanize/trunk/test.py
Log:
Add a nasty implementation of RFC 3986 URL-splitting / joining (not yet used in mechanize)

Added: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- (empty file)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py	Fri Jun  2 23:20:28 2006
@@ -0,0 +1,243 @@
+"""RFC 3986 URI parsing and relative reference resolution / absolutization.
+
+(aka splitting and joining)
+
+Copyright 2006 John J. Lee <jjl at pobox.com>
+
+This code is free software; you can redistribute it and/or modify it under
+the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
+included with the distribution).
+
+"""
+
+# XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.
+
+import sys, re, posixpath
+
+# Generic URI-splitting regular expression (as given in RFC 3986 appendix B),
+# bound directly to its .match method.  Every part may match the empty
+# string, so a match is always found; groups()[1], [3], [4], [6] and [8] hold
+# the scheme, authority, path, query and fragment.
+SPLIT_MATCH = re.compile(
+    r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
+def urlsplit(absolute_uri):
+    """Return scheme, authority, path, query, fragment.
+
+    The result is a 5-tuple.  The path is always a string (possibly empty);
+    each other component is None when it is absent from absolute_uri, which
+    is distinct from present-but-empty ("").  Despite the parameter name,
+    relative references are split the same way (urljoin relies on this).
+
+    """
+    match = SPLIT_MATCH(absolute_uri)
+    if match:
+        g = match.groups()
+        return g[1], g[3], g[4], g[6], g[8]
+
+def urlunsplit(parts):
+    """Reassemble a URI string from a urlsplit()-style 5-tuple.
+
+    parts is (scheme, authority, path, query, fragment).  A component that
+    is None is left out together with its delimiter; an empty string still
+    gets its delimiter.  path is always appended, so it must be a string.
+
+    """
+    scheme, authority, path, query, fragment = parts
+    r = []
+    append = r.append
+    if scheme is not None:
+        append(scheme)
+        append(":")
+    if authority is not None:
+        append("//")
+        append(authority)
+    append(path)
+    if query is not None:
+        append("?")
+        append(query)
+    if fragment is not None:
+        append("#")
+        append(fragment)
+    return "".join(r)
+
+def urljoin(base_uri, uri_reference):
+    """Resolve uri_reference against base_uri and return the resulting URI.
+
+    Both strings are split with urlsplit(), combined by urljoin_parts() and
+    reassembled with urlunsplit().
+
+    """
+    return urlunsplit(urljoin_parts(urlsplit(base_uri),
+                                    urlsplit(uri_reference)))
+
+# oops, this doesn't do the same thing as the literal translation
+# from the RFC below
+## def urljoin_parts(base_parts, reference_parts):
+##     scheme, authority, path, query, fragment = base_parts
+##     rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+##     # compute target URI path
+##     if rpath == "":
+##         tpath = path
+##     else:
+##         tpath = rpath
+##         if not tpath.startswith("/"):
+##             tpath = merge(authority, path, tpath)
+##         tpath = posixpath.normpath(tpath)
+
+##     if rscheme is not None:
+##         return (rscheme, rauthority, tpath, rquery, rfragment)
+##     elif rauthority is not None:
+##         return (scheme, rauthority, tpath, rquery, rfragment)
+##     elif rpath == "":
+##         if rquery is not None:
+##             tquery = rquery
+##         else:
+##             tquery = query
+##         return (scheme, authority, tpath, tquery, rfragment)
+##     else:
+##         return (scheme, authority, tpath, rquery, rfragment)
+
+def urljoin_parts(base_parts, reference_parts):
+    """Resolve a split reference against a split base URI.
+
+    Both arguments and the result are 5-tuples as returned by urlsplit().
+    This follows the reference-transformation pseudocode of RFC 3986 section
+    5.2.2, with one leniency: a reference scheme equal to the base scheme is
+    ignored, so "http:g" against an "http" base resolves like plain "g".
+    The result's fragment is always the reference's fragment.
+
+    """
+    scheme, authority, path, query, fragment = base_parts
+    rscheme, rauthority, rpath, rquery, rfragment = reference_parts
+
+    # non-strict resolution: a scheme identical to the base's is dropped
+    if rscheme == scheme:
+        rscheme = None
+
+    if rscheme is not None:
+        tscheme, tauthority, tpath, tquery = (
+            rscheme, rauthority, remove_dot_segments(rpath), rquery)
+    else:
+        if rauthority is not None:
+            tauthority, tpath, tquery = (
+                rauthority, remove_dot_segments(rpath), rquery)
+        else:
+            if rpath == "":
+                # empty reference path: keep the base path, and keep the base
+                # query too unless the reference carries its own
+                tpath = path
+                if rquery is not None:
+                    tquery = rquery
+                else:
+                    tquery = query
+            else:
+                if rpath.startswith("/"):
+                    tpath = remove_dot_segments(rpath)
+                else:
+                    tpath = merge(authority, path, rpath)
+                    tpath = remove_dot_segments(tpath)
+                tquery = rquery
+            tauthority = authority
+        tscheme = scheme
+    tfragment = rfragment
+    return (tscheme, tauthority, tpath, tquery, tfragment)
+
+# um, something *vaguely* like this is what I want, but I have to generate
+# lots of test cases first, if only to understand what it is that
+# remove_dot_segments really does...
+## def remove_dot_segments(path):
+##     if path == '':
+##         return ''
+##     comps = path.split('/')
+##     new_comps = []
+##     for comp in comps:
+##         if comp in ['.', '']:
+##             if not new_comps or new_comps[-1]:
+##                 new_comps.append('')
+##             continue
+##         if comp != '..':
+##             new_comps.append(comp)
+##         elif new_comps:
+##             new_comps.pop()
+##     return '/'.join(new_comps)
+
+
+def remove_dot_segments(path):
+    """Return path with "." and ".." segments interpreted and removed.
+
+    This is the remove_dot_segments algorithm of RFC 3986 section 5.2.4:
+    path acts as the input buffer and r collects the output segments; the
+    comments A-E label the corresponding steps of the RFC.  A ".." that
+    would climb above the root is dropped, so "/.." yields "/".
+
+    """
+    r = []
+    while path:
+        # A: drop a leading "../" or "./"
+        if path.startswith("../"):
+            path = path[3:]
+            continue
+        if path.startswith("./"):
+            path = path[2:]
+            continue
+        # B: replace a leading "/./" or a final "/." with "/"
+        if path.startswith("/./"):
+            path = path[2:]
+            continue
+        if path == "/.":
+            path = "/"
+            continue
+        # C: replace a leading "/../" or a final "/.." with "/" and discard
+        # the last output segment, if there is one
+        if path.startswith("/../"):
+            path = path[3:]
+            if r:
+                r.pop()
+            continue
+        if path == "/..":
+            path = "/"
+            # guard as for "/../" above: with nothing output yet (e.g. the
+            # whole path is "/..") an unconditional pop raises IndexError
+            if r:
+                r.pop()
+            continue
+        # D: a path that is only "." or ".." is removed
+        if path == ".":
+            path = path[1:]
+            continue
+        if path == "..":
+            path = path[2:]
+            continue
+        # E: move the first segment (including any leading "/") to the output
+        start = 0
+        if path.startswith("/"):
+            start = 1
+        ii = path.find("/", start)
+        if ii < 0:
+            ii = None
+        r.append(path[:ii])
+        if ii is None:
+            break
+        path = path[ii:]
+    return "".join(r)
+
+def merge(base_authority, base_path, ref_path):
+    """Merge a relative ref_path with base_path (RFC 3986 section 5.2.3).
+
+    With an empty base_path the result is "/" + ref_path; otherwise ref_path
+    replaces everything after the last "/" of base_path (or all of it when
+    base_path has no "/").  base_authority is accepted but not consulted --
+    see the note below.
+
+    """
+    # XXXX Oddly, the sample Perl implementation of this by Roy Fielding
+    # doesn't even take base_authority as a parameter, despite the wording in
+    # the RFC suggesting otherwise.  Perhaps I'm missing some obvious identity.
+    #if base_authority is not None and base_path == "":
+    if base_path == "":
+        return "/" + ref_path
+    ii = base_path.rfind("/")
+    if ii >= 0:
+        return base_path[:ii+1] + ref_path
+    return ref_path

Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py	(original)
+++ wwwsearch/mechanize/trunk/test.py	Fri Jun  2 23:20:28 2006
@@ -108,15 +108,15 @@
 ##     __builtin__.jjl = jjl
 
     # XXX temporary stop-gap to run doctests
-    assert os.path.isdir('test')
-    sys.path.insert(0, 'test')
+    assert os.path.isdir("test")
+    sys.path.insert(0, "test")
     # needed for recent doctest / linecache -- this is only for testing
     # purposes, these don't get installed
     # doctest.py revision 45701 and linecache.py revision 45940.  Since
     # linecache is used by Python itself, linecache.py is renamed
     # linecache_copy.py, and this copy of doctest is modified (only) to use
     # that renamed module.
-    sys.path.insert(0, 'test-tools')
+    sys.path.insert(0, "test-tools")
     import doctest
     import mechanize
     common_globs = {"mechanize": mechanize}
@@ -126,10 +126,12 @@
         ]:
         globs.update(common_globs)
         doctest.testfile(
-            os.path.join('test', 'test_password_manager.doctest'),
-            #os.path.join('test', 'test_scratch.doctest'),
+            os.path.join("test", "test_password_manager.doctest"),
+            #os.path.join("test", "test_scratch.doctest"),
             globs=globs,
             )
+    
+    doctest.testfile(os.path.join("test", "test_rfc3986.doctest"))
     from mechanize import _headersutil, _auth, _clientcookie, _pullparser
     doctest.testmod(_headersutil)
     doctest.testmod(_auth)

Added: wwwsearch/mechanize/trunk/test/test_rfc3986.doctest
==============================================================================
--- (empty file)
+++ wwwsearch/mechanize/trunk/test/test_rfc3986.doctest	Fri Jun  2 23:20:28 2006
@@ -0,0 +1,156 @@
+>>> from mechanize._rfc3986 import urlsplit, urljoin, remove_dot_segments
+
+Some common cases
+
+>>> urlsplit("http://example.com/spam/eggs/spam.html?apples=pears&a=b#foo")
+('http', 'example.com', '/spam/eggs/spam.html', 'apples=pears&a=b', 'foo')
+>>> urlsplit("http://example.com/spam.html#foo")
+('http', 'example.com', '/spam.html', None, 'foo')
+>>> urlsplit("ftp://example.com/foo.gif")
+('ftp', 'example.com', '/foo.gif', None, None)
+>>> urlsplit('ftp://joe:password@example.com:port')
+('ftp', 'joe:password@example.com:port', '', None, None)
+>>> urlsplit("mailto:jjl at pobox.com")
+('mailto', None, 'jjl at pobox.com', None, None)
+
+The five path productions
+
+path-abempty:
+
+>>> urlsplit("http://www.example.com")
+('http', 'www.example.com', '', None, None)
+>>> urlsplit("http://www.example.com/foo")
+('http', 'www.example.com', '/foo', None, None)
+
+path-absolute:
+
+>>> urlsplit("a:/")
+('a', None, '/', None, None)
+>>> urlsplit("a:/b:/c/")
+('a', None, '/b:/c/', None, None)
+
+path-noscheme:
+
+>>> urlsplit("a:b/:c/")
+('a', None, 'b/:c/', None, None)
+
+path-rootless:
+
+>>> urlsplit("a:b:/c/")
+('a', None, 'b:/c/', None, None)
+
+path-empty:
+
+>>> urlsplit("quack:")
+('quack', None, '', None, None)
+
+
+>>> remove_dot_segments("/a/b/c/./../../g")
+'/a/g'
+>>> remove_dot_segments("mid/content=5/../6")
+'mid/6'
+>>> remove_dot_segments("/b/c/.")
+'/b/c/'
+>>> remove_dot_segments("/b/c/./.")
+'/b/c/'
+>>> remove_dot_segments(".")
+''
+>>> remove_dot_segments("/.")
+'/'
+>>> remove_dot_segments("./")
+''
+
+
+Examples from RFC 3986 section 5.4
+
+Normal Examples
+
+>>> base = "http://a/b/c/d;p?q"
+>>> def join(uri): return urljoin(base, uri)
+>>> join("g:h")
+'g:h'
+>>> join("g")
+'http://a/b/c/g'
+>>> join("./g")
+'http://a/b/c/g'
+>>> join("g/")
+'http://a/b/c/g/'
+>>> join("/g")
+'http://a/g'
+>>> join("//g")
+'http://g'
+>>> join("?y")
+'http://a/b/c/d;p?y'
+>>> join("g?y")
+'http://a/b/c/g?y'
+>>> join("#s")
+'http://a/b/c/d;p?q#s'
+>>> join("g#s")
+'http://a/b/c/g#s'
+>>> join("g?y#s")
+'http://a/b/c/g?y#s'
+>>> join(";x")
+'http://a/b/c/;x'
+>>> join("g;x")
+'http://a/b/c/g;x'
+>>> join("g;x?y#s")
+'http://a/b/c/g;x?y#s'
+>>> join("")
+'http://a/b/c/d;p?q'
+>>> join(".")
+'http://a/b/c/'
+>>> join("./")
+'http://a/b/c/'
+>>> join("..")
+'http://a/b/'
+>>> join("../")
+'http://a/b/'
+>>> join("../g")
+'http://a/b/g'
+>>> join("../..")
+'http://a/'
+>>> join("../../")
+'http://a/'
+>>> join("../../g")
+'http://a/g'
+
+Abnormal Examples
+
+>>> join("../../../g")
+'http://a/g'
+>>> join("../../../../g")
+'http://a/g'
+>>> join("/./g")
+'http://a/g'
+>>> join("/../g")
+'http://a/g'
+>>> join("g.")
+'http://a/b/c/g.'
+>>> join(".g")
+'http://a/b/c/.g'
+>>> join("g..")
+'http://a/b/c/g..'
+>>> join("..g")
+'http://a/b/c/..g'
+>>> join("./../g")
+'http://a/b/g'
+>>> join("./g/.")
+'http://a/b/c/g/'
+>>> join("g/./h")
+'http://a/b/c/g/h'
+>>> join("g/../h")
+'http://a/b/c/h'
+>>> join("g;x=1/./y")
+'http://a/b/c/g;x=1/y'
+>>> join("g;x=1/../y")
+'http://a/b/c/y'
+>>> join("g?y/./x")
+'http://a/b/c/g?y/./x'
+>>> join("g?y/../x")
+'http://a/b/c/g?y/../x'
+>>> join("g#s/./x")
+'http://a/b/c/g#s/./x'
+>>> join("g#s/../x")
+'http://a/b/c/g#s/../x'
+>>> join("http:g")
+'http://a/b/c/g'


More information about the wwwsearch-commits mailing list