[wwwsearch-commits] r43803 - in wwwsearch/mechanize/trunk: mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Mon May 28 17:22:10 CEST 2007


Author: jjlee
Date: Mon May 28 17:22:09 2007
New Revision: 43803

Modified:
   wwwsearch/mechanize/trunk/mechanize/_http.py
   wwwsearch/mechanize/trunk/test/test_urllib2.py
Log:
A redirected robots.txt fetch no longer triggers another attempted robots.txt fetch to check whether the redirection itself is allowed.  This needs revisiting after the stable release.


Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py	Mon May 28 17:22:09 2007
@@ -79,12 +79,14 @@
             # the same.
             # XXX really refresh redirections should be visiting; tricky to
             #  fix, so this will wait until post-stable release
-            return Request(newurl,
-                           headers=req.headers,
-                           origin_req_host=req.get_origin_req_host(),
-                           unverifiable=True,
-                           visit=False,
-                           )
+            new = Request(newurl,
+                          headers=req.headers,
+                          origin_req_host=req.get_origin_req_host(),
+                          unverifiable=True,
+                          visit=False,
+                          )
+            new._origin_req = getattr(req, "_origin_req", req)
+            return new
         else:
             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
 
@@ -412,6 +414,15 @@
                 return request
 
             host = request.get_host()
+
+            # robots.txt requests don't need to be allowed by robots.txt :-)
+            origin_req = getattr(request, "_origin_req", None)
+            if (origin_req is not None and
+                origin_req.get_selector() == "/robots.txt" and
+                origin_req.get_host() == host
+                ):
+                return request
+
             if host != self._host:
                 self.rfp = self.rfp_class()
                 try:

Modified: wwwsearch/mechanize/trunk/test/test_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_urllib2.py	(original)
+++ wwwsearch/mechanize/trunk/test/test_urllib2.py	Mon May 28 17:22:09 2007
@@ -806,6 +806,41 @@
         h.http_request(req)
         self.assert_(rfpc.calls == [])
 
+    def test_redirected_robots_txt(self):
+        # redirected robots.txt fetch shouldn't result in another attempted
+        # robots.txt fetch to check the redirection is allowed!
+        import mechanize
+        from mechanize import build_opener, HTTPHandler, \
+             HTTPDefaultErrorHandler, HTTPRedirectHandler, \
+             HTTPRobotRulesProcessor
+
+        class MockHTTPHandler(mechanize.BaseHandler):
+            def __init__(self):
+                self.requests = []
+            def http_open(self, req):
+                import mimetools, httplib, copy
+                from StringIO import StringIO
+                self.requests.append(copy.deepcopy(req))
+                if req.get_full_url() == "http://example.com/robots.txt":
+                    hdr = "Location: http://example.com/en/robots.txt\r\n\r\n"
+                    msg = mimetools.Message(StringIO(hdr))
+                    return self.parent.error(
+                        "http", req, test_response(), 302, "Blah", msg)
+                else:
+                    return test_response("Allow: *", [], req.get_full_url())
+
+        hh = MockHTTPHandler()
+        hdeh = HTTPDefaultErrorHandler()
+        hrh = HTTPRedirectHandler()
+        rh = HTTPRobotRulesProcessor()
+        o = build_test_opener(hh, hdeh, hrh, rh)
+        o.open("http://example.com/")
+        self.assertEqual([req.get_full_url() for req in hh.requests],
+                         ["http://example.com/robots.txt",
+                          "http://example.com/en/robots.txt",
+                          "http://example.com/",
+                          ])
+
     def test_cookies(self):
         cj = MockCookieJar()
         h = HTTPCookieProcessor(cj)


More information about the wwwsearch-commits mailing list