[wwwsearch-commits] r32608 - in wwwsearch/mechanize/trunk: . mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Sat Sep 23 17:52:59 CEST 2006


Author: jjlee
Date: Sat Sep 23 17:52:52 2006
New Revision: 32608

Modified:
   wwwsearch/mechanize/trunk/functional_tests.py
   wwwsearch/mechanize/trunk/mechanize/__init__.py
   wwwsearch/mechanize/trunk/mechanize/_opener.py
   wwwsearch/mechanize/trunk/mechanize/_response.py
   wwwsearch/mechanize/trunk/mechanize/_urllib2.py
   wwwsearch/mechanize/trunk/test/test_mechanize.py
Log:
Fix OpenerDirector.retrieve(), which was very broken (duncan.booth at suttoncourtenay.org.uk)

Modified: wwwsearch/mechanize/trunk/functional_tests.py
==============================================================================
--- wwwsearch/mechanize/trunk/functional_tests.py	(original)
+++ wwwsearch/mechanize/trunk/functional_tests.py	Sat Sep 23 17:52:52 2006
@@ -197,17 +197,34 @@
 
     def test_urlretrieve(self):
         url = "http://www.python.org/"
-        verif = CallbackVerifier(self)
-        fn, hdrs = urlretrieve(url, "python.html", verif.callback)
-        try:
-            f = open(fn)
+        test_filename = "python.html"
+        def check_retrieve(opener, filename, headers):
+            self.assertEqual(headers.get('Content-Type'), 'text/html')
+            f = open(filename)
             data = f.read()
             f.close()
+            opener.close()
+            from urllib import urlopen
+            r = urlopen(url)
+            self.assertEqual(data, r.read())
+            r.close()
+
+        opener = mechanize.build_opener()
+        verif = CallbackVerifier(self)
+        filename, headers = opener.retrieve(url, test_filename, verif.callback)
+        try:
+            self.assertEqual(filename, test_filename)
+            check_retrieve(opener, filename, headers)
+            self.assert_(os.path.isfile(filename))
         finally:
-            os.remove(fn)
-        r = urlopen(url)
-        self.assert_(data == r.read())
-        r.close()
+            os.remove(filename)
+
+        opener = mechanize.build_opener()
+        verif = CallbackVerifier(self)
+        filename, headers = opener.retrieve(url, reporthook=verif.callback)
+        check_retrieve(opener, filename, headers)
+        # closing the opener removed the temporary file
+        self.failIf(os.path.isfile(filename))
 
 ##     def test_cacheftp(self):
 ##         from urllib2 import CacheFTPHandler, build_opener
@@ -226,8 +243,7 @@
         self._count = 0
         self._testcase = testcase
     def callback(self, block_nr, block_size, total_size):
-        if block_nr != self._count:
-            self._testcase.fail()
+        self._testcase.assertEqual(block_nr, self._count)
         self._count = self._count + 1
 
 

Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/__init__.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/__init__.py	Sat Sep 23 17:52:52 2006
@@ -5,6 +5,7 @@
     'Browser',
     'BrowserStateError',
     'CacheFTPHandler',
+    'ContentTooShortError',
     'Cookie',
     'CookieJar',
     'CookiePolicy',
@@ -97,6 +98,7 @@
 from _urllib2 import *
 
 # misc
+from _opener import ContentTooShortError, OpenerFactory, urlretrieve
 from _util import http2time as str2time
 from _response import response_seek_wrapper, make_response
 from _http import HeadParser

Modified: wwwsearch/mechanize/trunk/mechanize/_opener.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_opener.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_opener.py	Sat Sep 23 17:52:52 2006
@@ -9,7 +9,7 @@
 
 """
 
-import urllib2, bisect, urlparse, httplib, types
+import os, urllib2, bisect, urllib, urlparse, httplib, types, tempfile
 try:
     import threading as _threading
 except ImportError:
@@ -26,6 +26,12 @@
 from _request import Request
 
 
+class ContentTooShortError(urllib2.URLError):
+    def __init__(self, reason, result):
+        urllib2.URLError.__init__(self, reason)
+        self.result = result
+
+
 class OpenerDirector(urllib2.OpenerDirector):
     def __init__(self):
         urllib2.OpenerDirector.__init__(self)
@@ -36,6 +42,7 @@
         self._any_request = {}
         self._any_response = {}
         self._handler_index_valid = True
+        self._tempfiles = []
 
     def add_handler(self, handler):
         if handler in self.handlers:
@@ -198,52 +205,85 @@
             args = (dict, 'default', 'http_error_default') + orig_args
             return apply(self._call_chain, args)
 
+    BLOCK_SIZE = 1024*8
     def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
         """Returns (filename, headers).
 
         For remote objects, the default filename will refer to a temporary
-        file.
+        file.  Temporary files are removed when the OpenerDirector.close()
+        method is called.
+
+        For file: URLs, at present the returned filename is None.  This may
+        change in future.
+
+        If the actual number of bytes read is less than indicated by the
+        Content-Length header, raises ContentTooShortError (a URLError
+        subclass).  The exception's .result attribute contains the (filename,
+        headers) that would have been returned.
 
         """
         req = self._request(fullurl, data)
-        type_ = req.get_type()
+        scheme = req.get_type()
         fp = self.open(req)
         headers = fp.info()
-        if filename is None and type == 'file':
-            return url2pathname(req.get_selector()), headers
+        if filename is None and scheme == 'file':
+            # XXX req.get_selector() seems broken here, return None,
+            #   pending sanity :-/
+            return None, headers
+            #return urllib.url2pathname(req.get_selector()), headers
         if filename:
             tfp = open(filename, 'wb')
         else:
-            path = urlparse(fullurl)[2]
+            path = urlparse.urlparse(fullurl)[2]
             suffix = os.path.splitext(path)[1]
-            tfp = tempfile.TemporaryFile("wb", suffix=suffix)
+            fd, filename = tempfile.mkstemp(suffix)
+            self._tempfiles.append(filename)
+            tfp = os.fdopen(fd, 'wb')
+
         result = filename, headers
-        bs = 1024*8
+        bs = self.BLOCK_SIZE
         size = -1
         read = 0
-        blocknum = 1
+        blocknum = 0
         if reporthook:
-            if headers.has_key("content-length"):
+            if "content-length" in headers:
                 size = int(headers["Content-Length"])
-            reporthook(0, bs, size)
+            reporthook(blocknum, bs, size)
         while 1:
             block = fp.read(bs)
+            if block == "":
+                break
             read += len(block)
+            tfp.write(block)
+            blocknum += 1
             if reporthook:
                 reporthook(blocknum, bs, size)
-            blocknum = blocknum + 1
-            if not block:
-                break
-            tfp.write(block)
         fp.close()
         tfp.close()
         del fp
         del tfp
-        if size>=0 and read<size:
-            raise IOError("incomplete retrieval error",
-                          "got only %d bytes out of %d" % (read,size))
+
+        # raise exception if actual size does not match content-length header
+        if size >= 0 and read < size:
+            raise ContentTooShortError(
+                "retrieval incomplete: "
+                "got only %i out of %i bytes" % (read, size),
+                result
+                )
+
         return result
 
+    def close(self):
+        urllib2.OpenerDirector.close(self)
+
+        if self._tempfiles:
+            for filename in self._tempfiles:
+                try:
+                    os.unlink(filename)
+                except OSError:
+                    pass
+            del self._tempfiles[:]
+
 
 class OpenerFactory:
     """This class's interface is quite likely to change."""

Modified: wwwsearch/mechanize/trunk/mechanize/_response.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_response.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_response.py	Sat Sep 23 17:52:52 2006
@@ -322,6 +322,10 @@
         state["wrapped"] = new_wrapped
         return state
 
+def test_response(data, headers,
+                  url="http://example.com/", code=200, msg="OK"):
+    return make_response(data, headers, url, code, msg)
+
 def make_response(data, headers, url, code, msg):
     """Convenient factory for objects implementing response interface.
 
@@ -332,12 +336,18 @@
     msg: string response code message (e.g. "OK")
 
     """
+    mime_headers = make_headers(headers)
+    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
+    return response_seek_wrapper(r)
+
+def make_headers(headers):
+    """
+    headers: sequence of (name, value) pairs
+    """
     hdr_text = []
     for name_value in headers:
         hdr_text.append("%s: %s" % name_value)
-    mime_headers = mimetools.Message(StringIO("\n".join(hdr_text)))
-    r = closeable_response(StringIO(data), mime_headers, url, code, msg)
-    return response_seek_wrapper(r)
+    return mimetools.Message(StringIO("\n".join(hdr_text)))
 
 
 # Horrible, but needed, at least until fork urllib2.  Even then, may want

Modified: wwwsearch/mechanize/trunk/mechanize/_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_urllib2.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_urllib2.py	Sat Sep 23 17:52:52 2006
@@ -6,8 +6,7 @@
      GopherError
 # ...and from mechanize
 from _opener import OpenerDirector, \
-     build_opener, install_opener, urlopen, \
-     OpenerFactory, urlretrieve
+     build_opener, install_opener, urlopen
 from _auth import \
      HTTPPasswordMgr, \
      HTTPPasswordMgrWithDefaultRealm, \

Modified: wwwsearch/mechanize/trunk/test/test_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_mechanize.py	(original)
+++ wwwsearch/mechanize/trunk/test/test_mechanize.py	Sat Sep 23 17:52:52 2006
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import sys, random
+import sys, os, random, math
 from unittest import TestCase
 import StringIO, re, UserDict, urllib2
 
@@ -15,6 +15,18 @@
     FACTORY_CLASSES.append(mechanize.RobustFactory)
 
 
+def killfile(filename):
+    try:
+        os.remove(filename)
+    except OSError:
+        if os.name=='nt':
+            try:
+                os.chmod(arg, stat.S_IWRITE)
+                os.remove(arg)
+            except OSError:
+                pass
+
+
 class RegressionTests(TestCase):
 
     def test_close_base_tag(self):
@@ -183,6 +195,165 @@
     default_schemes = []
 
 
+class OpenerTests(TestCase):
+
+    def test_retrieve(self):
+        # The .retrieve() method deals with a number of different cases.  In
+        # each case, .read() should be called the expected number of times, the
+        # progress callback should be called as expected, and we should end up
+        # with a filename and some headers.
+
+        class Opener(mechanize.OpenerDirector):
+            def __init__(self, content_length=None):
+                mechanize.OpenerDirector.__init__(self)
+                self.calls = []
+                self.block_size = mechanize.OpenerDirector.BLOCK_SIZE
+                self.nr_blocks = 2.5
+                self.data = int((self.block_size/8)*self.nr_blocks)*"01234567"
+                self.total_size = len(self.data)
+                self._content_length = content_length
+            def open(self, fullurl, data=None):
+                from mechanize import _response
+                self.calls.append((fullurl, data))
+                headers = [("Foo", "Bar")]
+                if self._content_length is not None:
+                    if self._content_length is True:
+                        content_length = str(len(self.data))
+                    else:
+                        content_length = str(self._content_length)
+                    headers.append(("content-length", content_length))
+                return _response.test_response(self.data, headers)
+
+        class CallbackVerifier:
+            def __init__(self, testcase, total_size, block_size):
+                self.count = 0
+                self._testcase = testcase
+                self._total_size = total_size
+                self._block_size = block_size
+            def callback(self, block_nr, block_size, total_size):
+                self._testcase.assertEqual(block_nr, self.count)
+                self._testcase.assertEqual(block_size, self._block_size)
+                self._testcase.assertEqual(total_size, self._total_size)
+                self.count += 1
+
+        # ensure we start without the test file present
+        tfn = "mechanize_test_73940ukewrl.txt"
+        killfile(tfn)
+
+        # case 1: filename supplied
+        op = Opener()
+        verif = CallbackVerifier(self, -1, op.block_size)
+        url = "http://example.com/"
+        try:
+            filename, headers = op.retrieve(
+                url, tfn, reporthook=verif.callback)
+            self.assertEqual(filename, tfn)
+            self.assertEqual(headers["foo"], 'Bar')
+            self.assertEqual(open(filename, "rb").read(), op.data)
+            self.assertEqual(len(op.calls), 1)
+            self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+            op.close()
+            # .close()ing the opener does NOT remove non-temporary files
+            self.assert_(os.path.isfile(filename))
+        finally:
+            killfile(filename)
+
+        # case 2: no filename supplied, use a temporary file
+        op = Opener(content_length=True)
+        # We asked the Opener to add a content-length header to the response
+        # this time.  Verify that the total size passed to the callback in that
+        # case matches the content-length (rather than -1).
+        verif = CallbackVerifier(self, op.total_size, op.block_size)
+        url = "http://example.com/"
+        filename, headers = op.retrieve(url, reporthook=verif.callback)
+        self.assertNotEqual(filename, tfn)  # (some temp filename instead)
+        self.assertEqual(headers["foo"], 'Bar')
+        self.assertEqual(open(filename, "rb").read(), op.data)
+        self.assertEqual(len(op.calls), 1)
+        # .close()ing the opener removes temporary files
+        self.assert_(os.path.exists(filename))
+        op.close()
+        self.failIf(os.path.exists(filename))
+        self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+
+        # case 3: "file:" URL with no filename supplied
+        # we DON'T create a temporary file, since there's a file there already
+        op = Opener()
+        verif = CallbackVerifier(self, -1, op.block_size)
+        tifn = "input_for_"+tfn
+        try:
+            f = open(tifn, 'wb')
+            try:
+                f.write(op.data)
+            finally:
+                f.close()
+            url = "file://" + tifn
+            filename, headers = op.retrieve(url, reporthook=verif.callback)
+            self.assertEqual(filename, None)  # this may change
+            self.assertEqual(headers["foo"], 'Bar')
+            self.assertEqual(open(tifn, "rb").read(), op.data)
+            # no .read()s took place, since we already have the disk file,
+            # and we weren't asked to write it to another filename
+            self.assertEqual(verif.count, 0)
+            op.close()
+            # .close()ing the opener does NOT remove the file!
+            self.assert_(os.path.isfile(tifn))
+        finally:
+            killfile(tifn)
+
+        # case 4: "file:" URL and filename supplied
+        # we DO create a new file in this case
+        op = Opener()
+        verif = CallbackVerifier(self, -1, op.block_size)
+        tifn = "input_for_"+tfn
+        try:
+            f = open(tifn, 'wb')
+            try:
+                f.write(op.data)
+            finally:
+                f.close()
+            url = "file://" + tifn
+            try:
+                filename, headers = op.retrieve(
+                    url, tfn, reporthook=verif.callback)
+                self.assertEqual(filename, tfn)
+                self.assertEqual(headers["foo"], 'Bar')
+                self.assertEqual(open(tifn, "rb").read(), op.data)
+                self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+                op.close()
+                # .close()ing the opener does NOT remove non-temporary files
+                self.assert_(os.path.isfile(tfn))
+            finally:
+                killfile(tfn)
+        finally:
+            killfile(tifn)
+
+        # Content-Length mismatch with real file length gives URLError
+        big = 1024*32
+        op = Opener(content_length=big)
+        verif = CallbackVerifier(self, big, op.block_size)
+        url = "http://example.com/"
+        try:
+            try:
+                op.retrieve(url, reporthook=verif.callback)
+            except mechanize.ContentTooShortError, exc:
+                filename, headers = exc.result
+                self.assertNotEqual(filename, tfn)
+                self.assertEqual(headers["foo"], 'Bar')
+                # We still read and wrote to disk everything available, despite
+                # the exception.
+                self.assertEqual(open(filename, "rb").read(), op.data)
+                self.assertEqual(len(op.calls), 1)
+                self.assertEqual(verif.count, math.ceil(op.nr_blocks) + 1)
+                # cleanup should still take place
+                self.assert_(os.path.isfile(filename))
+                op.close()
+                self.failIf(os.path.isfile(filename))
+            else:
+                self.fail()
+        finally:
+            killfile(filename)
+
 class BrowserTests(TestCase):
     def test_referer(self):
         b = TestBrowser()


More information about the wwwsearch-commits mailing list