[wwwsearch-commits] r36030 - in wwwsearch/mechanize/trunk: . mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Thu Dec 28 22:08:47 CET 2006
Author: jjlee
Date: Thu Dec 28 22:08:46 2006
New Revision: 36030
Modified:
wwwsearch/mechanize/trunk/mechanize/_request.py
wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
wwwsearch/mechanize/trunk/test.py
Log:
Fix redirection to URIs that contain characters that are not allowed in URIs (riko.wichmann at gmx.de). Log a warning about such URIs
Modified: wwwsearch/mechanize/trunk/mechanize/_request.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_request.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_request.py Thu Dec 28 22:08:46 2006
@@ -8,15 +8,27 @@
"""
-import urllib2
-import urllib
+import urllib2, urllib, logging
from _clientcookie import request_host
+import _rfc3986
+
+warn = logging.getLogger("mechanize").warning
class Request(urllib2.Request):
def __init__(self, url, data=None, headers={},
origin_req_host=None, unverifiable=False, visit=None):
+ # In mechanize 0.2, the interpretation of a unicode url argument will
+ # change: A unicode url argument will be interpreted as an IRI, and a
+ # bytestring as a URI. For now, we accept unicode or bytestring. We
+ # don't insist that the value is always a URI (specifically, must only
+ # contain characters which are legal), because that might break working
+ # code (who knows what bytes some servers want to see, especially with
+ # browser plugins for internationalised URIs).
+ if not _rfc3986.is_clean_uri(url):
+ warn("url argument is not a URI "
+ "(contains illegal characters) %r" % url)
urllib2.Request.__init__(self, url, data, headers)
self.selector = None
self.unredirected_hdrs = {}
Modified: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_rfc3986.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py Thu Dec 28 22:08:46 2006
@@ -17,17 +17,18 @@
## def chr_range(a, b):
## return "".join(map(chr, range(ord(a), ord(b)+1)))
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-## "abcdefghijklmnopqrstuvwxyz"
-## "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+## UNRESERVED_URI_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+## "abcdefghijklmnopqrstuvwxyz"
+## "0123456789"
+## "-_.~")
+## RESERVED_URI_CHARS = "!*'();:@&=+$,/?#[]"
+## URI_CHARS = RESERVED_URI_CHARS+UNRESERVED_URI_CHARS+'%'
+# this re matches any character that's not in URI_CHARS
+BAD_URI_CHARS_RE = re.compile("[^A-Za-z0-9\-_.~!*'();:@&=+$,/?%#[\]]")
def clean_url(url, encoding):
- # percent-encode illegal URL characters
+ # percent-encode illegal URI characters
# Trying to come up with test cases for this gave me a headache, revisit
# when do switch to unicode.
# Somebody else's comments (lost the attribution):
@@ -37,7 +38,28 @@
if type(url) == type(""):
url = url.decode(encoding, "replace")
url = url.strip()
- return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+ # for second param to urllib.quote(), we want URI_CHARS, minus the
+ # 'always_safe' characters that urllib.quote() never percent-encodes
+ return urllib.quote(url.encode(encoding), "!*'();:@&=+$,/?%#[]~")
+
+def is_clean_uri(uri):
+ """
+ >>> is_clean_uri("ABC!")
+ True
+ >>> is_clean_uri(u"ABC!")
+ True
+ >>> is_clean_uri("ABC|")
+ False
+ >>> is_clean_uri(u"ABC|")
+ False
+ >>> is_clean_uri("http://example.com/0")
+ True
+ >>> is_clean_uri(u"http://example.com/0")
+ True
+ """
+ # note module re treats bytestrings as though they were decoded as latin-1
+ # so this function accepts both unicode and bytestrings
+ return not bool(BAD_URI_CHARS_RE.search(uri))
SPLIT_MATCH = re.compile(
@@ -211,3 +233,7 @@
if ii >= 0:
return base_path[:ii+1] + ref_path
return ref_path
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py (original)
+++ wwwsearch/mechanize/trunk/test.py Thu Dec 28 22:08:46 2006
@@ -22,10 +22,12 @@
from unittest import defaultTestLoader, TextTestRunner, TestSuite, TestCase, \
_TextTestResult
-level = logging.DEBUG
+#level = logging.DEBUG
#level = logging.INFO
+#level = logging.WARNING
#level = logging.NOTSET
#logging.getLogger("mechanize").setLevel(level)
+#logging.getLogger("mechanize").addHandler(logging.StreamHandler(sys.stdout))
class CgitbTextResult(_TextTestResult):
@@ -188,8 +190,9 @@
# run doctests in docstrings
from mechanize import _headersutil, _auth, _clientcookie, _pullparser, \
- _http
+ _http, _rfc3986
doctest.testmod(_headersutil)
+ doctest.testmod(_rfc3986)
doctest.testmod(_auth)
doctest.testmod(_clientcookie)
doctest.testmod(_pullparser)
More information about the wwwsearch-commits
mailing list