[wwwsearch-commits] r35823 - in wwwsearch/mechanize/trunk: mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Fri Dec 15 23:38:15 CET 2006


Author: jjlee
Date: Fri Dec 15 23:38:12 2006
New Revision: 35823

Modified:
   wwwsearch/mechanize/trunk/mechanize/_html.py
   wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
   wwwsearch/mechanize/trunk/test/test_browser.py
Log:
Move clean_url() to module _rfc3986 where it belongs

Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py	Fri Dec 15 23:38:12 2006
@@ -8,23 +8,12 @@
 
 """
 
-import re, copy, urllib, htmlentitydefs
+import re, copy, htmlentitydefs
 
 import _request
 from _headersutil import split_header_words, is_html as _is_html
 import _rfc3986
 
-## def chr_range(a, b):
-##     return "".join(map(chr, range(ord(a), ord(b)+1)))
-
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-##                       "abcdefghijklmnopqrstuvwxyz"
-##                       "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
-
 DEFAULT_ENCODING = "latin-1"
 
 
@@ -107,19 +96,6 @@
             self.base_url, self.url, self.text, self.tag, self.attrs)
 
 
-def clean_url(url, encoding):
-    # percent-encode illegal URL characters
-    # Trying to come up with test cases for this gave me a headache, revisit
-    # when do switch to unicode.
-    # Somebody else's comments (lost the attribution):
-##     - IE will return you the url in the encoding you send it
-##     - Mozilla/Firefox will send you latin-1 if there's no non latin-1
-##     characters in your link. It will send you utf-8 however if there are...
-    if type(url) == type(""):
-        url = url.decode(encoding, "replace")
-    url = url.strip()
-    return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
-
 class LinksFactory:
 
     def __init__(self,
@@ -175,7 +151,7 @@
                 # this.
                 continue
 
-            url = clean_url(url, encoding)
+            url = _rfc3986.clean_url(url, encoding)
             if tag == "a":
                 if token.type != "startendtag":
                     # hmm, this'd break if end tag is missing
@@ -377,7 +353,7 @@
                 url = attrs_dict.get(url_attr)
                 if not url:
                     continue
-                url = clean_url(url, encoding)
+                url = _rfc3986.clean_url(url, encoding)
                 text = link.firstText(lambda t: True)
                 if text is _beautifulsoup.Null:
                     # follow _pullparser's weird behaviour rigidly

Modified: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_rfc3986.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py	Fri Dec 15 23:38:12 2006
@@ -12,7 +12,33 @@
 
 # XXX Wow, this is ugly.  Overly-direct translation of the RFC ATM.
 
-import sys, re, posixpath
+import sys, re, posixpath, urllib
+
+## def chr_range(a, b):
+##     return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+##                       "abcdefghijklmnopqrstuvwxyz"
+##                       "-_.~")
+## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
+# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
+# 'safe'-by-default characters that urllib.quote never quotes
+URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+
+
+def clean_url(url, encoding):
+    # percent-encode illegal URL characters
+    # Trying to come up with test cases for this gave me a headache, revisit
+    # when we do switch to unicode.
+    # Somebody else's comments (lost the attribution):
+##     - IE will return you the url in the encoding you send it
+##     - Mozilla/Firefox will send you latin-1 if there's no non latin-1
+##     characters in your link. It will send you utf-8 however if there are...
+    if type(url) == type(""):
+        url = url.decode(encoding, "replace")
+    url = url.strip()
+    return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+
 
 SPLIT_MATCH = re.compile(
     r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match

Modified: wwwsearch/mechanize/trunk/test/test_browser.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_browser.py	(original)
+++ wwwsearch/mechanize/trunk/test/test_browser.py	Fri Dec 15 23:38:12 2006
@@ -449,7 +449,7 @@
     def _test_link_encoding(self, factory):
         import urllib
         import mechanize
-        from mechanize._html import clean_url
+        from mechanize._rfc3986 import clean_url
         url = "http://example.com/"
         for encoding in ["UTF-8", "latin-1"]:
             encoding_decl = "; charset=%s" % encoding


More information about the wwwsearch-commits mailing list