[wwwsearch-commits] r36030 - in wwwsearch/mechanize/trunk: . mechanize

jjlee at codespeak.net jjlee at codespeak.net
Thu Dec 28 22:08:47 CET 2006


Author: jjlee
Date: Thu Dec 28 22:08:46 2006
New Revision: 36030

Modified:
   wwwsearch/mechanize/trunk/mechanize/_request.py
   wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
   wwwsearch/mechanize/trunk/test.py
Log:
Fix redirection to URIs that contain characters that are not allowed in URIs (riko.wichmann at gmx.de).  Log a warning about such URIs.

Modified: wwwsearch/mechanize/trunk/mechanize/_request.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_request.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_request.py	Thu Dec 28 22:08:46 2006
@@ -8,15 +8,27 @@
 
 """
 
-import urllib2
-import urllib
+import urllib2, urllib, logging
 
 from _clientcookie import request_host
+import _rfc3986
+
+warn = logging.getLogger("mechanize").warning
 
 
 class Request(urllib2.Request):
     def __init__(self, url, data=None, headers={},
                  origin_req_host=None, unverifiable=False, visit=None):
+        # In mechanize 0.2, the interpretation of a unicode url argument will
+        # change: A unicode url argument will be interpreted as an IRI, and a
+        # bytestring as a URI. For now, we accept unicode or bytestring.  We
+        # don't insist that the value is always a URI (specifically, must only
+        # contain characters which are legal), because that might break working
+        # code (who knows what bytes some servers want to see, especially with
+        # browser plugins for internationalised URIs).
+        if not _rfc3986.is_clean_uri(url):
+            warn("url argument is not a URI "
+                 "(contains illegal characters) %r" % url)
         urllib2.Request.__init__(self, url, data, headers)
         self.selector = None
         self.unredirected_hdrs = {}

Modified: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_rfc3986.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py	Thu Dec 28 22:08:46 2006
@@ -17,17 +17,18 @@
 ## def chr_range(a, b):
 ##     return "".join(map(chr, range(ord(a), ord(b)+1)))
 
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-##                       "abcdefghijklmnopqrstuvwxyz"
-##                       "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+##                         "abcdefghijklmnopqrstuvwxyz"
+##                         "0123456789"
+##                         "-_.~")
+## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
+## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
+# this re matches any character that's not in URI_CHARS
+BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
 
 
 def clean_url(url, encoding):
-    # percent-encode illegal URL characters
+    # percent-encode illegal URI characters
     # Trying to come up with test cases for this gave me a headache, revisit
     # when do switch to unicode.
     # Somebody else's comments (lost the attribution):
@@ -37,7 +38,28 @@
     if type(url) == type(""):
         url = url.decode(encoding, "replace")
     url = url.strip()
-    return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+    # for second param to urllib.quote(), we want URI_CHARS, minus the
+    # 'always_safe' characters that urllib.quote() never percent-encodes
+    return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
+
+def is_clean_uri(uri):
+    """
+    >>> is_clean_uri("ABC!")
+    True
+    >>> is_clean_uri(u"ABC!")
+    True
+    >>> is_clean_uri("ABC|")
+    False
+    >>> is_clean_uri(u"ABC|")
+    False
+    >>> is_clean_uri("http://example.com/0")
+    True
+    >>> is_clean_uri(u"http://example.com/0")
+    True
+    """
+    # note module re treats bytestrings as though they were decoded as latin-1
+    # so this function accepts both unicode and bytestrings
+    return not bool(BAD_URI_CHARS_RE.search(uri))
 
 
 SPLIT_MATCH = re.compile(
@@ -211,3 +233,7 @@
     if ii >= 0:
         return base_path[:ii+1] + ref_path
     return ref_path
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()

Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py	(original)
+++ wwwsearch/mechanize/trunk/test.py	Thu Dec 28 22:08:46 2006
@@ -22,10 +22,12 @@
 from unittest import defaultTestLoader, TextTestRunner, TestSuite, TestCase, \
      _TextTestResult
 
-level = logging.DEBUG
+#level = logging.DEBUG
 #level = logging.INFO
+#level = logging.WARNING
 #level = logging.NOTSET
 #logging.getLogger("mechanize").setLevel(level)
+#logging.getLogger("mechanize").addHandler(logging.StreamHandler(sys.stdout))
 
 
 class CgitbTextResult(_TextTestResult):
@@ -188,8 +190,9 @@
 
     # run doctests in docstrings
     from mechanize import _headersutil, _auth, _clientcookie, _pullparser, \
-         _http
+         _http, _rfc3986
     doctest.testmod(_headersutil)
+    doctest.testmod(_rfc3986)
     doctest.testmod(_auth)
     doctest.testmod(_clientcookie)
     doctest.testmod(_pullparser)


More information about the wwwsearch-commits mailing list