[wwwsearch-commits] r33079 - wwwsearch/mechanize/trunk/mechanize

jjlee at codespeak.net jjlee at codespeak.net
Tue Oct 10 02:05:53 CEST 2006


Author: jjlee
Date: Tue Oct 10 02:05:44 2006
New Revision: 33079

Modified:
   wwwsearch/mechanize/trunk/mechanize/_clientcookie.py
   wwwsearch/mechanize/trunk/mechanize/_headersutil.py
   wwwsearch/mechanize/trunk/mechanize/_html.py
   wwwsearch/mechanize/trunk/mechanize/_http.py
   wwwsearch/mechanize/trunk/mechanize/_mechanize.py
   wwwsearch/mechanize/trunk/mechanize/_opener.py
Log:
Follow RFC 3986 for URL parsing, unparsing and joining: stop using the urlparse module and start using _rfc3986 instead. (_auth.py is not converted yet -- I'm probably abusing urlparse in there, so that requires more thought.)

Modified: wwwsearch/mechanize/trunk/mechanize/_clientcookie.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_clientcookie.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_clientcookie.py	Tue Oct 10 02:05:44 2006
@@ -32,7 +32,7 @@
 
 """
 
-import sys, re, urlparse, copy, time, struct, urllib, types, logging
+import sys, re, copy, time, struct, urllib, types, logging
 try:
     import threading
     _threading = threading; del threading
@@ -47,6 +47,7 @@
 
 from _headersutil import split_header_words, parse_ns_headers
 from _util import isstringlike
+import _rfc3986
 
 debug = logging.getLogger("mechanize.cookies").debug
 
@@ -156,8 +157,8 @@
 
     """
     url = request.get_full_url()
-    host = urlparse.urlparse(url)[1]
-    if host == "":
+    host = _rfc3986.urlsplit(url)[1]
+    if host is None:
         host = request.get_header("Host", "")
 
     # remove port, if present
@@ -178,15 +179,10 @@
 def request_path(request):
     """request-URI, as defined by RFC 2965."""
     url = request.get_full_url()
-    #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
-    #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
-    path, parameters, query, frag = urlparse.urlparse(url)[2:]
-    if parameters:
-        path = "%s;%s" % (path, parameters)
+    path, query, frag = _rfc3986.urlsplit(url)[2:]
     path = escape_path(path)
-    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
+    req_path = _rfc3986.urlunsplit((None, None, path, query, frag))
     if not req_path.startswith("/"):
-        # fix bad RFC 2396 absoluteURI
         req_path = "/"+req_path
     return req_path
 

Modified: wwwsearch/mechanize/trunk/mechanize/_headersutil.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_headersutil.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_headersutil.py	Tue Oct 10 02:05:44 2006
@@ -9,12 +9,13 @@
 
 """
 
-import os, re, urlparse
+import os, re
 from types import StringType
 from types import UnicodeType
 STRING_TYPES = StringType, UnicodeType
 
 from _util import http2time
+import _rfc3986
 
 def is_html(ct_headers, url, allow_xhtml=False):
     """
@@ -24,7 +25,7 @@
     """
     if not ct_headers:
         # guess
-        ext = os.path.splitext(urlparse.urlparse(url)[2])[1]
+        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
         html_exts = [".htm", ".html"]
         if allow_xhtml:
             html_exts += [".xhtml"]

Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py	Tue Oct 10 02:05:44 2006
@@ -9,17 +9,10 @@
 """
 
 import re, copy, urllib, htmlentitydefs
-from urlparse import urljoin
 
 import _request
 from _headersutil import split_header_words, is_html as _is_html
-
-## # XXXX miserable hack
-## def urljoin(base, url):
-##     if url.startswith("?"):
-##         return base+url
-##     else:
-##         return urlparse.urljoin(base, url)
+import _rfc3986
 
 ## def chr_range(a, b):
 ##     return "".join(map(chr, range(ord(a), ord(b)+1)))
@@ -99,7 +92,7 @@
     def __init__(self, base_url, url, text, tag, attrs):
         assert None not in [url, tag, attrs]
         self.base_url = base_url
-        self.absolute_url = urljoin(base_url, url)
+        self.absolute_url = _rfc3986.urljoin(base_url, url)
         self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
     def __cmp__(self, other):
         try:
@@ -233,6 +226,9 @@
             request_class=self.request_class,
             backwards_compat=self.backwards_compat,
             encoding=encoding,
+            _urljoin=_rfc3986.urljoin,
+            _urlparse=_rfc3986.urlsplit,
+            _urlunparse=_rfc3986.urlunsplit,
             )
 
 class TitleFactory:

Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py	Tue Oct 10 02:05:44 2006
@@ -12,7 +12,7 @@
 
 """
 
-import copy, time, tempfile, htmlentitydefs, re, logging, socket, urlparse, \
+import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
        urllib2, urllib, httplib, sgmllib
 from urllib2 import URLError, HTTPError, BaseHandler
 from cStringIO import StringIO
@@ -23,6 +23,7 @@
 from _html import unescape, unescape_charref
 from _headersutil import is_html
 from _clientcookie import CookieJar, request_host
+import _rfc3986
 
 debug = logging.getLogger("mechanize.cookies").debug
 
@@ -98,7 +99,7 @@
             newurl = headers.getheaders('uri')[0]
         else:
             return
-        newurl = urlparse.urljoin(req.get_full_url(), newurl)
+        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
 
         # XXX Probably want to forget about the state of the current
         # request, although that might interact poorly with other

Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py	Tue Oct 10 02:05:44 2006
@@ -9,13 +9,14 @@
 
 """
 
-import urllib2, urlparse, sys, copy, re
+import urllib2, sys, copy, re
 
 from _useragent import UserAgent
 from _html import DefaultFactory
 from _response import response_seek_wrapper, closeable_response
 import _upgrade
 import _request
+import _rfc3986
 
 __version__ = (0, 1, 3, None, None)  # 0.1.3
 
@@ -158,14 +159,13 @@
             url.get_full_url
         except AttributeError:
             # string URL -- convert to absolute URL if required
-            scheme, netloc = urlparse.urlparse(url)[:2]
-            if not scheme:
+            scheme, authority = _rfc3986.urlsplit(url)[:2]
+            if scheme is None:
                 # relative URL
-                assert not netloc, "malformed URL"
                 if self._response is None:
                     raise BrowserStateError(
-                        "can't fetch relative URL: not viewing any document")
-                url = urlparse.urljoin(self._response.geturl(), url)
+                        "can't fetch relative reference: not viewing any document")
+                url = _rfc3986.urljoin(self._response.geturl(), url)
 
         request = self._request(url, data, visit)
         visit = request.visit
@@ -432,9 +432,9 @@
             original_scheme in ["http", "https"] and
             not (original_scheme == "https" and scheme != "https")):
             # strip URL fragment (RFC 2616 14.36)
-            parts = urlparse.urlparse(self.request.get_full_url())
-            parts = parts[:-1]+("",)
-            referer = urlparse.urlunparse(parts)
+            parts = _rfc3986.urlsplit(self.request.get_full_url())
+            parts = parts[:-1]+(None,)
+            referer = _rfc3986.urlunsplit(parts)
             request.add_unredirected_header("Referer", referer)
         return request
 

Modified: wwwsearch/mechanize/trunk/mechanize/_opener.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_opener.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_opener.py	Tue Oct 10 02:05:44 2006
@@ -9,7 +9,7 @@
 
 """
 
-import os, urllib2, bisect, urllib, urlparse, httplib, types, tempfile
+import os, urllib2, bisect, urllib, httplib, types, tempfile
 try:
     import threading as _threading
 except ImportError:
@@ -22,6 +22,7 @@
 
 import _http
 import _upgrade
+import _rfc3986
 from _util import isstringlike
 from _request import Request
 
@@ -241,7 +242,7 @@
         if filename:
             tfp = open(filename, 'wb')
         else:
-            path = urlparse.urlparse(fullurl)[2]
+            path = _rfc3986.urlsplit(fullurl)[2]
             suffix = os.path.splitext(path)[1]
             fd, filename = tempfile.mkstemp(suffix)
             self._tempfiles.append(filename)


More information about the wwwsearch-commits mailing list