[wwwsearch-commits] r35823 - in wwwsearch/mechanize/trunk: mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Fri Dec 15 23:38:15 CET 2006
Author: jjlee
Date: Fri Dec 15 23:38:12 2006
New Revision: 35823
Modified:
wwwsearch/mechanize/trunk/mechanize/_html.py
wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
wwwsearch/mechanize/trunk/test/test_browser.py
Log:
Move clean_url() to module _rfc3986 where it belongs
Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py Fri Dec 15 23:38:12 2006
@@ -8,23 +8,12 @@
"""
-import re, copy, urllib, htmlentitydefs
+import re, copy, htmlentitydefs
import _request
from _headersutil import split_header_words, is_html as _is_html
import _rfc3986
-## def chr_range(a, b):
-## return "".join(map(chr, range(ord(a), ord(b)+1)))
-
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-## "abcdefghijklmnopqrstuvwxyz"
-## "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
-
DEFAULT_ENCODING = "latin-1"
@@ -107,19 +96,6 @@
self.base_url, self.url, self.text, self.tag, self.attrs)
-def clean_url(url, encoding):
- # percent-encode illegal URL characters
- # Trying to come up with test cases for this gave me a headache, revisit
- # when do switch to unicode.
- # Somebody else's comments (lost the attribution):
-## - IE will return you the url in the encoding you send it
-## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
-## characters in your link. It will send you utf-8 however if there are...
- if type(url) == type(""):
- url = url.decode(encoding, "replace")
- url = url.strip()
- return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
-
class LinksFactory:
def __init__(self,
@@ -175,7 +151,7 @@
# this.
continue
- url = clean_url(url, encoding)
+ url = _rfc3986.clean_url(url, encoding)
if tag == "a":
if token.type != "startendtag":
# hmm, this'd break if end tag is missing
@@ -377,7 +353,7 @@
url = attrs_dict.get(url_attr)
if not url:
continue
- url = clean_url(url, encoding)
+ url = _rfc3986.clean_url(url, encoding)
text = link.firstText(lambda t: True)
if text is _beautifulsoup.Null:
# follow _pullparser's weird behaviour rigidly
Modified: wwwsearch/mechanize/trunk/mechanize/_rfc3986.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_rfc3986.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_rfc3986.py Fri Dec 15 23:38:12 2006
@@ -12,7 +12,33 @@
# XXX Wow, this is ugly. Overly-direct translation of the RFC ATM.
-import sys, re, posixpath
+import sys, re, posixpath, urllib
+
+## def chr_range(a, b):
+## return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+## "abcdefghijklmnopqrstuvwxyz"
+## "-_.~")
+## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
+# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
+# 'safe'-by-default characters that urllib.urlquote never quotes
+URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+
+
+def clean_url(url, encoding):
+ # percent-encode illegal URL characters
+ # Trying to come up with test cases for this gave me a headache, revisit
+ # when do switch to unicode.
+ # Somebody else's comments (lost the attribution):
+## - IE will return you the url in the encoding you send it
+## - Mozilla/Firefox will send you latin-1 if there's no non latin-1
+## characters in your link. It will send you utf-8 however if there are...
+ if type(url) == type(""):
+ url = url.decode(encoding, "replace")
+ url = url.strip()
+ return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+
SPLIT_MATCH = re.compile(
r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?").match
Modified: wwwsearch/mechanize/trunk/test/test_browser.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_browser.py (original)
+++ wwwsearch/mechanize/trunk/test/test_browser.py Fri Dec 15 23:38:12 2006
@@ -449,7 +449,7 @@
def _test_link_encoding(self, factory):
import urllib
import mechanize
- from mechanize._html import clean_url
+ from mechanize._rfc3986 import clean_url
url = "http://example.com/"
for encoding in ["UTF-8", "latin-1"]:
encoding_decl = "; charset=%s" % encoding
More information about the wwwsearch-commits mailing list