[wwwsearch-commits] r32608 - in wwwsearch/mechanize/trunk: . mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Sat Sep 23 17:52:59 CEST 2006
Author: jjlee
Date: Sat Sep 23 17:52:52 2006
New Revision: 32608
Modified:
wwwsearch/mechanize/trunk/functional_tests.py
wwwsearch/mechanize/trunk/mechanize/__init__.py
wwwsearch/mechanize/trunk/mechanize/_opener.py
wwwsearch/mechanize/trunk/mechanize/_response.py
wwwsearch/mechanize/trunk/mechanize/_urllib2.py
wwwsearch/mechanize/trunk/test/test_mechanize.py
Log:
Fix OpenerDirector.retrieve(), which was very broken (duncan.booth at suttoncourtenay.org.uk)
Modified: wwwsearch/mechanize/trunk/functional_tests.py
==============================================================================
--- wwwsearch/mechanize/trunk/functional_tests.py (original)
+++ wwwsearch/mechanize/trunk/functional_tests.py Sat Sep 23 17:52:52 2006
@@ -197,17 +197,34 @@
def test_urlretrieve(self):
url = "http://www.python.org/"
- verif = CallbackVerifier(self)
- fn, hdrs = urlretrieve(url, "python.html", verif.callback)
- try:
- f = open(fn)
+ test_filename = "python.html"
+ def check_retrieve(opener, filename, headers):
+ self.assertEqual(headers.get('Content-Type'), 'text/html')
+ f = open(filename)
data = f.read()
f.close()
+ opener.close()
+ from urllib import urlopen
+ r = urlopen(url)
+ self.assertEqual(data, r.read())
+ r.close()
+
+ opener = mechanize.build_opener()
+ verif = CallbackVerifier(self)
+ filename, headers = opener.retrieve(url, test_filename, verif.callback)
+ try:
+ self.assertEqual(filename, test_filename)
+ check_retrieve(opener, filename, headers)
+ self.assert_(os.path.isfile(filename))
finally:
- os.remove(fn)
- r = urlopen(url)
- self.assert_(data == r.read())
- r.close()
+ os.remove(filename)
+
+ opener = mechanize.build_opener()
+ verif = CallbackVerifier(self)
+ filename, headers = opener.retrieve(url, reporthook=verif.callback)
+ check_retrieve(opener, filename, headers)
+ # closing the opener removed the temporary file
+ self.failIf(os.path.isfile(filename))
## def test_cacheftp(self):
## from urllib2 import CacheFTPHandler, build_opener
@@ -226,8 +243,7 @@
self._count = 0
self._testcase = testcase
def callback(self, block_nr, block_size, total_size):
- if block_nr != self._count:
- self._testcase.fail()
+ self._testcase.assertEqual(block_nr, self._count)
self._count = self._count + 1
Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/__init__.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/__init__.py Sat Sep 23 17:52:52 2006
@@ -5,6 +5,7 @@
'Browser',
'BrowserStateError',
'CacheFTPHandler',
+ 'ContentTooShortError',
'Cookie',
'CookieJar',
'CookiePolicy',
@@ -97,6 +98,7 @@
from _urllib2 import *
# misc
+from _opener import ContentTooShortError, OpenerFactory, urlretrieve
from _util import http2time as str2time
from _response import response_seek_wrapper, make_response
from _http import HeadParser
Modified: wwwsearch/mechanize/trunk/mechanize/_opener.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_opener.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_opener.py Sat Sep 23 17:52:52 2006
@@ -9,7 +9,7 @@
"""
-import urllib2, bisect, urlparse, httplib, types
+import os, urllib2, bisect, urllib, urlparse, httplib, types, tempfile
try:
import threading as _threading
except ImportError:
@@ -26,6 +26,12 @@
from _request import Request
+class ContentTooShortError(urllib2.URLError):
+ def __init__(self, reason, result):
+ urllib2.URLError.__init__(self, reason)
+ self.result = result
+
+
class OpenerDirector(urllib2.OpenerDirector):
def __init__(self):
urllib2.OpenerDirector.__init__(self)
@@ -36,6 +42,7 @@
self._any_request = {}
self._any_response = {}
self._handler_index_valid = True
+ self._tempfiles = []
def add_handler(self, handler):
if handler in self.handlers:
@@ -198,52 +205,85 @@
args = (dict, 'default', 'http_error_default') + orig_args
return apply(self._call_chain, args)
+ BLOCK_SIZE = 1024*8
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
"""Returns (filename, headers).
For remote objects, the default filename will refer to a temporary
- file.
+ file. Temporary files are removed when the OpenerDirector.close()
+ method is called.
+
+ For file: URLs, at present the returned filename is None. This may
+ change in future.
+
+ If the actual number of bytes read is less than indicated by the
+ Content-Length header, raises ContentTooShortError (a URLError
+ subclass). The exception's .result attribute contains the (filename,
+ headers) that would have been returned.
"""
req = self._request(fullurl, data)
- type_ = req.get_type()
+ scheme = req.get_type()
fp = self.open(req)
headers = fp.info()
- if filename is None and type == 'file':
- return url2pathname(req.get_selector()), headers
+ if filename is None and scheme == 'file':
+ # XXX req.get_selector() seems broken here, return None,
+ # pending sanity :-/
+ return None, headers
+ #return urllib.url2pathname(req.get_selector()), headers
if filename:
tfp = open(filename, 'wb')
else:
- path = urlparse(fullurl)[2]
+ path = urlparse.urlparse(fullurl)[2]
suffix = os.path.splitext(path)[1]
- tfp = tempfile.TemporaryFile("wb", suffix=suffix)
+ fd, filename = tempfile.mkstemp(suffix)
+ self._tempfiles.append(filename)
+ tfp = os.fdopen(fd, 'wb')
+
result = filename, headers
- bs = 1024*8
+ bs = self.BLOCK_SIZE
size = -1
read = 0
- blocknum = 1
+ blocknum = 0
if reporthook:
- if headers.has_key("content-length"):
+ if "content-length" in headers:
size = int(headers["Content-Length"])
- reporthook(0, bs, size)
+ reporthook(blocknum, bs, size)
while 1:
block = fp.read(bs)
+ if block == "":
+ break
read += len(block)
+ tfp.write(block)
+ blocknum += 1
if reporthook:
reporthook(blocknum, bs, size)
- blocknum = blocknum + 1
- if not block:
- break
- tfp.write(block)
fp.close()
tfp.close()
del fp
del tfp
- if size>=0 and read<size:
- raise IOError("incomplete retrieval error",
- "got only %d bytes out of %d" % (read,size))
+
+ # raise exception if actual size does not match content-length header
+ if size >= 0 and read < size:
+ raise ContentTooShortError(
+ "retrieval incomplete: "
+ "got only %i out of %i bytes" % (read, size),
+ result
+ )
+
return result
+ def close(self):
+ urllib2.OpenerDirector.close(self)
+
+ if self._tempfiles:
+ for filename in self._tempfiles:
+ try:
+ os.unlink(filename)
+ except OSError:
+ pass
+ del self._tempfiles[:]
+
class OpenerFactory:
"""This class's interface is quite likely to change."""
Modified: wwwsearch/mechanize/trunk/mechanize/_response.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_response.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_response.py Sat Sep 23 17:52:52 2006
@@ -322,6 +322,10 @@
state["wrapped"] = new_wrapped
return state
+def test_response(data, headers,
+ url="http://example.com/", code=200, msg="OK"):
+ return make_response(data, headers, url, code, msg)
+
def make_response(data, headers, url, code, msg):
"""Convenient factory for objects implementing response interface.
@@ -332,12 +336,18 @@
msg: string response code message (e.g. "OK")
"""
+ mime_headers = make_headers(headers)
+ r = closeable_response(StringIO(data), mime_headers, url, code, msg)
+ return response_seek_wrapper(r)
+
+def make_headers(headers):
+ """
+ headers: sequence of (name, value) pairs
+ """
hdr_text = []
for name_value in headers:
hdr_text.append("%s: %s" % name_value)
- mime_headers = mimetools.Message(StringIO("\n".join(hdr_text)))
- r = closeable_response(StringIO(data), mime_headers, url, code, msg)
- return response_seek_wrapper(r)
+ return mimetools.Message(StringIO("\n".join(hdr_text)))
# Horrible, but needed, at least until fork urllib2. Even then, may want
Modified: wwwsearch/mechanize/trunk/mechanize/_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_urllib2.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_urllib2.py Sat Sep 23 17:52:52 2006
@@ -6,8 +6,7 @@
GopherError
# ...and from mechanize
from _opener import OpenerDirector, \
- build_opener, install_opener, urlopen, \
- OpenerFactory, urlretrieve
+ build_opener, install_opener, urlopen
from _auth import \
HTTPPasswordMgr, \
HTTPPasswordMgrWithDefaultRealm, \
Modified: wwwsearch/mechanize/trunk/test/test_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_mechanize.py (original)
+++ wwwsearch/mechanize/trunk/test/test_mechanize.py Sat Sep 23 17:52:52 2006
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-import sys, random
+import sys, os, random, math
from unittest import TestCase
import StringIO, re, UserDict, urllib2
@@ -15,6 +15,18 @@
FACTORY_CLASSES.append(mechanize.RobustFactory)
+def killfile(filename):
+ try:
+ os.remove(filename)
+ except OSError:
+ if os.name=='nt':
+ try:
+ os.chmod(arg, stat.S_IWRITE)
+ os.remove(arg)
+ except OSError:
+ pass
+
+
class RegressionTests(TestCase):
def test_close_base_tag(self):
@@ -183,6 +195,165 @@
default_schemes = []
+class OpenerTests(TestCase):
+
+ def test_retrieve(self):
+ # The .retrieve() method deals with a number of different cases. In
+ # each case, .read() should be called the expected number of times, the
+ # progress callback should be called as expected, and we should end up
+ # with a filename and some headers.
+
+ class Opener(mechanize.OpenerDirector):
+ def __init__(self, content_length=None):
+ mechanize.OpenerDirector.__init__(self)
+ self.calls = []
+ self.block_size = mechanize.OpenerDirector.BLOCK_SIZE
+ self.nr_blocks = 2.5
+ self.data = int((self.block_size/8)*self.nr_blocks)*"01234567"
+ self.total_size = len(self.data)
+ self._content_length = content_length
+ def open(self, fullurl, data=None):
+ from mechanize import _response
+ self.calls.append((fullurl, data))
+ headers = [("Foo", "Bar")]
+ if self._content_length is not None:
+ if self._content_length is True:
+ content_length = str(len(self.data))
+ else:
+ content_length = str(self._content_length)
+ headers.append(("content-length", content_length))
+ return _response.test_response(self.data, headers)
+
+ class CallbackVerifier:
+ def __init__(self, testcase, total_size, block_size):
+ self.count = 0
+ self._testcase = testcase
+ self._total_size = total_size
+ self._block_size = block_size
+ def callback(self, block_nr, block_size, total_size):
+ self._testcase.assertEqual(block_nr, self.count)
+ self._testcase.assertEqual(block_size, self._block_size)
+ self._testcase.assertEqual(total_size, self._total_size)
+ self.count += 1
+
+ # ensure we start without the test file present
+ tfn = "mechanize_test_73940ukewrl.txt"
+ killfile(tfn)
+
+ # case 1: filename supplied
+ op = Opener()
+ verif = CallbackVerifier(self, -1, op.block_size)
+ url = "http://example.com/"
+ try:
+ filename, headers = op.retrieve(
+ url, tfn, reporthook=verif.callback)
+ self.assertEqual(filename, tfn)
+ self.assertEqual(headers["foo"], 'Bar')
+ self.assertEqual(open(filename, "rb").read(), op.data)
+ self.assertEqual(len(op.calls), 1)
+ self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+ op.close()
+ # .close()ing the opener does NOT remove non-temporary files
+ self.assert_(os.path.isfile(filename))
+ finally:
+ killfile(filename)
+
+ # case 2: no filename supplied, use a temporary file
+ op = Opener(content_length=True)
+ # We asked the Opener to add a content-length header to the response
+ # this time. Verify the total size passed to the callback in that case
+ # is according to the content-length (rather than -1).
+ verif = CallbackVerifier(self, op.total_size, op.block_size)
+ url = "http://example.com/"
+ filename, headers = op.retrieve(url, reporthook=verif.callback)
+ self.assertNotEqual(filename, tfn) # (some temp filename instead)
+ self.assertEqual(headers["foo"], 'Bar')
+ self.assertEqual(open(filename, "rb").read(), op.data)
+ self.assertEqual(len(op.calls), 1)
+ # .close()ing the opener removes temporary files
+ self.assert_(os.path.exists(filename))
+ op.close()
+ self.failIf(os.path.exists(filename))
+ self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+
+ # case 3: "file:" URL with no filename supplied
+ # we DON'T create a temporary file, since there's a file there already
+ op = Opener()
+ verif = CallbackVerifier(self, -1, op.block_size)
+ tifn = "input_for_"+tfn
+ try:
+ f = open(tifn, 'wb')
+ try:
+ f.write(op.data)
+ finally:
+ f.close()
+ url = "file://" + tifn
+ filename, headers = op.retrieve(url, reporthook=verif.callback)
+ self.assertEqual(filename, None) # this may change
+ self.assertEqual(headers["foo"], 'Bar')
+ self.assertEqual(open(tifn, "rb").read(), op.data)
+ # no .read()s took place, since we already have the disk file,
+ # and we weren't asked to write it to another filename
+ self.assertEqual(verif.count, 0)
+ op.close()
+ # .close()ing the opener does NOT remove the file!
+ self.assert_(os.path.isfile(tifn))
+ finally:
+ killfile(tifn)
+
+ # case 4: "file:" URL and filename supplied
+ # we DO create a new file in this case
+ op = Opener()
+ verif = CallbackVerifier(self, -1, op.block_size)
+ tifn = "input_for_"+tfn
+ try:
+ f = open(tifn, 'wb')
+ try:
+ f.write(op.data)
+ finally:
+ f.close()
+ url = "file://" + tifn
+ try:
+ filename, headers = op.retrieve(
+ url, tfn, reporthook=verif.callback)
+ self.assertEqual(filename, tfn)
+ self.assertEqual(headers["foo"], 'Bar')
+ self.assertEqual(open(tifn, "rb").read(), op.data)
+ self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+ op.close()
+ # .close()ing the opener does NOT remove non-temporary files
+ self.assert_(os.path.isfile(tfn))
+ finally:
+ killfile(tfn)
+ finally:
+ killfile(tifn)
+
+ # Content-Length mismatch with real file length gives URLError
+ big = 1024*32
+ op = Opener(content_length=big)
+ verif = CallbackVerifier(self, big, op.block_size)
+ url = "http://example.com/"
+ try:
+ try:
+ op.retrieve(url, reporthook=verif.callback)
+ except mechanize.ContentTooShortError, exc:
+ filename, headers = exc.result
+ self.assertNotEqual(filename, tfn)
+ self.assertEqual(headers["foo"], 'Bar')
+ # We still read and wrote to disk everything available, despite
+ # the exception.
+ self.assertEqual(open(filename, "rb").read(), op.data)
+ self.assertEqual(len(op.calls), 1)
+ self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+ # cleanup should still take place
+ self.assert_(os.path.isfile(filename))
+ op.close()
+ self.failIf(os.path.isfile(filename))
+ else:
+ self.fail()
+ finally:
+ killfile(filename)
+
class BrowserTests(TestCase):
def test_referer(self):
b = TestBrowser()
More information about the wwwsearch-commits
mailing list