[wwwsearch-commits] r31244 - in wwwsearch/mechanize/trunk: . mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Thu Aug 10 21:54:59 CEST 2006
Author: jjlee
Date: Thu Aug 10 21:54:58 2006
New Revision: 31244
Modified:
wwwsearch/mechanize/trunk/functional_tests.py
wwwsearch/mechanize/trunk/mechanize/_http.py
wwwsearch/mechanize/trunk/test/test_urllib2.py
Log:
Use mechanize to open robots.txt; don't consult the RobotFileParser instance about non-HTTP URLs
Modified: wwwsearch/mechanize/trunk/functional_tests.py
==============================================================================
--- wwwsearch/mechanize/trunk/functional_tests.py (original)
+++ wwwsearch/mechanize/trunk/functional_tests.py Thu Aug 10 21:54:58 2006
@@ -18,9 +18,10 @@
#from mechanize import CreateBSDDBCookieJar
-## logger = logging.getLogger("mechanize")
-## logger.addHandler(logging.StreamHandler())
-## logger.setLevel(logging.DEBUG)
+import logging
+logger = logging.getLogger("mechanize")
+logger.addHandler(logging.StreamHandler())
+logger.setLevel(logging.DEBUG)
def sanepathname2url(path):
@@ -184,6 +185,16 @@
o.close()
install_opener(None)
+ def test_robots(self):
+ plain_opener = mechanize.build_opener(mechanize.HTTPRobotRulesProcessor)
+ browser = mechanize.Browser()
+ for opener in plain_opener, browser:
+ r = opener.open("http://wwwsearch.sourceforge.net/robots")
+ self.assertEqual(r.code, 200)
+ self.assertRaises(
+ mechanize.RobotExclusionError,
+ opener.open, "http://wwwsearch.sourceforge.net/norobots")
+
def test_urlretrieve(self):
url = "http://www.python.org/"
verif = CallbackVerifier(self)
Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py Thu Aug 10 21:54:58 2006
@@ -332,6 +332,46 @@
except ImportError:
pass
else:
+ class MechanizeRobotFileParser(robotparser.RobotFileParser):
+
+ def __init__(self, url='', opener=None):
+ import _opener
+ robotparser.RobotFileParser.__init__(self, url)
+ self._opener = opener
+
+ def set_opener(self, opener=None):
+ if opener is None:
+ opener = _opener.OpenerDirector()
+ self._opener = opener
+
+ def read(self):
+ """Reads the robots.txt URL and feeds it to the parser."""
+ if self._opener is None:
+ self.set_opener()
+ try:
+ f = self._opener.open(self.url)
+ except HTTPError, f:
+ pass
+ except (IOError, socket.error, OSError), exc:
+ robotparser._debug("ignoring error opening %r: %s" %
+ (self.url, exc))
+ return
+ lines = []
+ line = f.readline()
+ while line:
+ lines.append(line.strip())
+ line = f.readline()
+ status = f.code
+ if status == 401 or status == 403:
+ self.disallow_all = True
+ robotparser._debug("disallow all")
+ elif status >= 400:
+ self.allow_all = True
+ robotparser._debug("allow all")
+ elif status == 200 and lines:
+ robotparser._debug("parse lines")
+ self.parse(lines)
+
class RobotExclusionError(urllib2.HTTPError):
def __init__(self, request, *args):
apply(urllib2.HTTPError.__init__, (self,)+args)
@@ -349,16 +389,29 @@
else:
http_response_class = HTTPMessage
- def __init__(self, rfp_class=robotparser.RobotFileParser):
+ def __init__(self, rfp_class=MechanizeRobotFileParser):
self.rfp_class = rfp_class
self.rfp = None
self._host = None
def http_request(self, request):
- host = request.get_host()
scheme = request.get_type()
+ if scheme not in ["http", "https"]:
+ # robots exclusion only applies to HTTP
+ return request
+
+ if request.get_selector() == "/robots.txt":
+ # /robots.txt is always OK to fetch
+ return request
+
+ host = request.get_host()
if host != self._host:
self.rfp = self.rfp_class()
+ try:
+ self.rfp.set_opener(self.parent)
+ except AttributeError:
+ debug("%r instance does not support set_opener" %
+ self.rfp.__class__)
self.rfp.set_url(scheme+"://"+host+"/robots.txt")
self.rfp.read()
self._host = host
Modified: wwwsearch/mechanize/trunk/test/test_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_urllib2.py (original)
+++ wwwsearch/mechanize/trunk/test/test_urllib2.py Thu Aug 10 21:54:58 2006
@@ -394,6 +394,8 @@
return self
def set_url(self, url):
self.calls.append(("set_url", url))
+ def set_opener(self, opener):
+ self.calls.append(("set_opener", opener))
def read(self):
self.calls.append("read")
def can_fetch(self, ua, url):
@@ -669,8 +671,10 @@
return # skip test
else:
from mechanize import HTTPRobotRulesProcessor
+ opener = OpenerDirector()
rfpc = MockRobotFileParserClass()
h = HTTPRobotRulesProcessor(rfpc)
+ opener.add_handler(h)
url = "http://example.com:80/foo/bar.html"
req = Request(url)
@@ -679,6 +683,7 @@
h.http_request(req)
self.assert_(rfpc.calls == [
"__call__",
+ ("set_opener", opener),
("set_url", "http://example.com:80/robots.txt"),
"read",
("can_fetch", "", url),
@@ -718,6 +723,7 @@
h.http_request(req)
self.assert_(rfpc.calls == [
"__call__",
+ ("set_opener", opener),
("set_url", "http://example.com/robots.txt"),
"read",
("can_fetch", "", url),
@@ -729,10 +735,17 @@
h.http_request(req)
self.assert_(rfpc.calls == [
"__call__",
+ ("set_opener", opener),
("set_url", "https://example.org/robots.txt"),
"read",
("can_fetch", "", url),
])
+ # non-HTTP URL -> ignore robots.txt
+ rfpc.clear()
+ url = "ftp://example.com/"
+ req = Request(url)
+ h.http_request(req)
+ self.assert_(rfpc.calls == [])
def test_cookies(self):
cj = MockCookieJar()
More information about the wwwsearch-commits
mailing list