[wwwsearch-commits] r43803 - in wwwsearch/mechanize/trunk: mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Mon May 28 17:22:10 CEST 2007
Author: jjlee
Date: Mon May 28 17:22:09 2007
New Revision: 43803
Modified:
wwwsearch/mechanize/trunk/mechanize/_http.py
wwwsearch/mechanize/trunk/test/test_urllib2.py
Log:
A redirected robots.txt fetch no longer triggers another robots.txt fetch to check whether the redirection itself is allowed. This needs revisiting after the stable release.
Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py Mon May 28 17:22:09 2007
@@ -79,12 +79,14 @@
# the same.
# XXX really refresh redirections should be visiting; tricky to
# fix, so this will wait until post-stable release
- return Request(newurl,
- headers=req.headers,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True,
- visit=False,
- )
+ new = Request(newurl,
+ headers=req.headers,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True,
+ visit=False,
+ )
+ new._origin_req = getattr(req, "_origin_req", req)
+ return new
else:
raise HTTPError(req.get_full_url(), code, msg, headers, fp)
@@ -412,6 +414,15 @@
return request
host = request.get_host()
+
+ # robots.txt requests don't need to be allowed by robots.txt :-)
+ origin_req = getattr(request, "_origin_req", None)
+ if (origin_req is not None and
+ origin_req.get_selector() == "/robots.txt" and
+ origin_req.get_host() == host
+ ):
+ return request
+
if host != self._host:
self.rfp = self.rfp_class()
try:
Modified: wwwsearch/mechanize/trunk/test/test_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_urllib2.py (original)
+++ wwwsearch/mechanize/trunk/test/test_urllib2.py Mon May 28 17:22:09 2007
@@ -806,6 +806,41 @@
h.http_request(req)
self.assert_(rfpc.calls == [])
+ def test_redirected_robots_txt(self):
+ # redirected robots.txt fetch shouldn't result in another attempted
+ # robots.txt fetch to check the redirection is allowed!
+ import mechanize
+ from mechanize import build_opener, HTTPHandler, \
+ HTTPDefaultErrorHandler, HTTPRedirectHandler, \
+ HTTPRobotRulesProcessor
+
+ class MockHTTPHandler(mechanize.BaseHandler):
+ def __init__(self):
+ self.requests = []
+ def http_open(self, req):
+ import mimetools, httplib, copy
+ from StringIO import StringIO
+ self.requests.append(copy.deepcopy(req))
+ if req.get_full_url() == "http://example.com/robots.txt":
+ hdr = "Location: http://example.com/en/robots.txt\r\n\r\n"
+ msg = mimetools.Message(StringIO(hdr))
+ return self.parent.error(
+ "http", req, test_response(), 302, "Blah", msg)
+ else:
+ return test_response("Allow: *", [], req.get_full_url())
+
+ hh = MockHTTPHandler()
+ hdeh = HTTPDefaultErrorHandler()
+ hrh = HTTPRedirectHandler()
+ rh = HTTPRobotRulesProcessor()
+ o = build_test_opener(hh, hdeh, hrh, rh)
+ o.open("http://example.com/")
+ self.assertEqual([req.get_full_url() for req in hh.requests],
+ ["http://example.com/robots.txt",
+ "http://example.com/en/robots.txt",
+ "http://example.com/",
+ ])
+
def test_cookies(self):
cj = MockCookieJar()
h = HTTPCookieProcessor(cj)
More information about the wwwsearch-commits mailing list