[wwwsearch-commits] r33079 - wwwsearch/mechanize/trunk/mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Tue Oct 10 02:05:53 CEST 2006
Author: jjlee
Date: Tue Oct 10 02:05:44 2006
New Revision: 33079
Modified:
wwwsearch/mechanize/trunk/mechanize/_clientcookie.py
wwwsearch/mechanize/trunk/mechanize/_headersutil.py
wwwsearch/mechanize/trunk/mechanize/_html.py
wwwsearch/mechanize/trunk/mechanize/_http.py
wwwsearch/mechanize/trunk/mechanize/_mechanize.py
wwwsearch/mechanize/trunk/mechanize/_opener.py
Log:
Follow RFC 3986 for URL parsing, unparsing and joining: stop using the urlparse module and start using _rfc3986 instead. (_auth.py is not converted yet -- it is probably abusing urlparse, so changing it requires more thought.)
Modified: wwwsearch/mechanize/trunk/mechanize/_clientcookie.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_clientcookie.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_clientcookie.py Tue Oct 10 02:05:44 2006
@@ -32,7 +32,7 @@
"""
-import sys, re, urlparse, copy, time, struct, urllib, types, logging
+import sys, re, copy, time, struct, urllib, types, logging
try:
import threading
_threading = threading; del threading
@@ -47,6 +47,7 @@
from _headersutil import split_header_words, parse_ns_headers
from _util import isstringlike
+import _rfc3986
debug = logging.getLogger("mechanize.cookies").debug
@@ -156,8 +157,8 @@
"""
url = request.get_full_url()
- host = urlparse.urlparse(url)[1]
- if host == "":
+ host = _rfc3986.urlsplit(url)[1]
+ if host is None:
host = request.get_header("Host", "")
# remove port, if present
@@ -178,15 +179,10 @@
def request_path(request):
"""request-URI, as defined by RFC 2965."""
url = request.get_full_url()
- #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
- #req_path = escape_path(string.join(urlparse.urlparse(url)[2:], ""))
- path, parameters, query, frag = urlparse.urlparse(url)[2:]
- if parameters:
- path = "%s;%s" % (path, parameters)
+ path, query, frag = _rfc3986.urlsplit(url)[2:]
path = escape_path(path)
- req_path = urlparse.urlunparse(("", "", path, "", query, frag))
+ req_path = _rfc3986.urlunsplit((None, None, path, query, frag))
if not req_path.startswith("/"):
- # fix bad RFC 2396 absoluteURI
req_path = "/"+req_path
return req_path
Modified: wwwsearch/mechanize/trunk/mechanize/_headersutil.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_headersutil.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_headersutil.py Tue Oct 10 02:05:44 2006
@@ -9,12 +9,13 @@
"""
-import os, re, urlparse
+import os, re
from types import StringType
from types import UnicodeType
STRING_TYPES = StringType, UnicodeType
from _util import http2time
+import _rfc3986
def is_html(ct_headers, url, allow_xhtml=False):
"""
@@ -24,7 +25,7 @@
"""
if not ct_headers:
# guess
- ext = os.path.splitext(urlparse.urlparse(url)[2])[1]
+ ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
html_exts = [".htm", ".html"]
if allow_xhtml:
html_exts += [".xhtml"]
Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py Tue Oct 10 02:05:44 2006
@@ -9,17 +9,10 @@
"""
import re, copy, urllib, htmlentitydefs
-from urlparse import urljoin
import _request
from _headersutil import split_header_words, is_html as _is_html
-
-## # XXXX miserable hack
-## def urljoin(base, url):
-## if url.startswith("?"):
-## return base+url
-## else:
-## return urlparse.urljoin(base, url)
+import _rfc3986
## def chr_range(a, b):
## return "".join(map(chr, range(ord(a), ord(b)+1)))
@@ -99,7 +92,7 @@
def __init__(self, base_url, url, text, tag, attrs):
assert None not in [url, tag, attrs]
self.base_url = base_url
- self.absolute_url = urljoin(base_url, url)
+ self.absolute_url = _rfc3986.urljoin(base_url, url)
self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
def __cmp__(self, other):
try:
@@ -233,6 +226,9 @@
request_class=self.request_class,
backwards_compat=self.backwards_compat,
encoding=encoding,
+ _urljoin=_rfc3986.urljoin,
+ _urlparse=_rfc3986.urlsplit,
+ _urlunparse=_rfc3986.urlunsplit,
)
class TitleFactory:
Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py Tue Oct 10 02:05:44 2006
@@ -12,7 +12,7 @@
"""
-import copy, time, tempfile, htmlentitydefs, re, logging, socket, urlparse, \
+import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
urllib2, urllib, httplib, sgmllib
from urllib2 import URLError, HTTPError, BaseHandler
from cStringIO import StringIO
@@ -23,6 +23,7 @@
from _html import unescape, unescape_charref
from _headersutil import is_html
from _clientcookie import CookieJar, request_host
+import _rfc3986
debug = logging.getLogger("mechanize.cookies").debug
@@ -98,7 +99,7 @@
newurl = headers.getheaders('uri')[0]
else:
return
- newurl = urlparse.urljoin(req.get_full_url(), newurl)
+ newurl = _rfc3986.urljoin(req.get_full_url(), newurl)
# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Tue Oct 10 02:05:44 2006
@@ -9,13 +9,14 @@
"""
-import urllib2, urlparse, sys, copy, re
+import urllib2, sys, copy, re
from _useragent import UserAgent
from _html import DefaultFactory
from _response import response_seek_wrapper, closeable_response
import _upgrade
import _request
+import _rfc3986
__version__ = (0, 1, 3, None, None) # 0.1.3
@@ -158,14 +159,13 @@
url.get_full_url
except AttributeError:
# string URL -- convert to absolute URL if required
- scheme, netloc = urlparse.urlparse(url)[:2]
- if not scheme:
+ scheme, authority = _rfc3986.urlsplit(url)[:2]
+ if scheme is None:
# relative URL
- assert not netloc, "malformed URL"
if self._response is None:
raise BrowserStateError(
- "can't fetch relative URL: not viewing any document")
- url = urlparse.urljoin(self._response.geturl(), url)
+ "can't fetch relative reference: not viewing any document")
+ url = _rfc3986.urljoin(self._response.geturl(), url)
request = self._request(url, data, visit)
visit = request.visit
@@ -432,9 +432,9 @@
original_scheme in ["http", "https"] and
not (original_scheme == "https" and scheme != "https")):
# strip URL fragment (RFC 2616 14.36)
- parts = urlparse.urlparse(self.request.get_full_url())
- parts = parts[:-1]+("",)
- referer = urlparse.urlunparse(parts)
+ parts = _rfc3986.urlsplit(self.request.get_full_url())
+ parts = parts[:-1]+(None,)
+ referer = _rfc3986.urlunsplit(parts)
request.add_unredirected_header("Referer", referer)
return request
Modified: wwwsearch/mechanize/trunk/mechanize/_opener.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_opener.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_opener.py Tue Oct 10 02:05:44 2006
@@ -9,7 +9,7 @@
"""
-import os, urllib2, bisect, urllib, urlparse, httplib, types, tempfile
+import os, urllib2, bisect, urllib, httplib, types, tempfile
try:
import threading as _threading
except ImportError:
@@ -22,6 +22,7 @@
import _http
import _upgrade
+import _rfc3986
from _util import isstringlike
from _request import Request
@@ -241,7 +242,7 @@
if filename:
tfp = open(filename, 'wb')
else:
- path = urlparse.urlparse(fullurl)[2]
+ path = _rfc3986.urlsplit(fullurl)[2]
suffix = os.path.splitext(path)[1]
fd, filename = tempfile.mkstemp(suffix)
self._tempfiles.append(filename)
More information about the wwwsearch-commits mailing list