[wwwsearch-commits] r31244 - in wwwsearch/mechanize/trunk: . mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Thu Aug 10 21:54:59 CEST 2006


Author: jjlee
Date: Thu Aug 10 21:54:58 2006
New Revision: 31244

Modified:
   wwwsearch/mechanize/trunk/functional_tests.py
   wwwsearch/mechanize/trunk/mechanize/_http.py
   wwwsearch/mechanize/trunk/test/test_urllib2.py
Log:
Use mechanize to open robots.txt; don't consult the RobotFileParser instance about non-HTTP URLs

Modified: wwwsearch/mechanize/trunk/functional_tests.py
==============================================================================
--- wwwsearch/mechanize/trunk/functional_tests.py	(original)
+++ wwwsearch/mechanize/trunk/functional_tests.py	Thu Aug 10 21:54:58 2006
@@ -18,9 +18,10 @@
 
 #from mechanize import CreateBSDDBCookieJar
 
-## logger = logging.getLogger("mechanize")
-## logger.addHandler(logging.StreamHandler())
-## logger.setLevel(logging.DEBUG)
+import logging
+logger = logging.getLogger("mechanize")
+logger.addHandler(logging.StreamHandler())
+logger.setLevel(logging.DEBUG)
 
 
 def sanepathname2url(path):
@@ -184,6 +185,16 @@
             o.close()
             install_opener(None)
 
+    def test_robots(self):
+        plain_opener = mechanize.build_opener(mechanize.HTTPRobotRulesProcessor)
+        browser = mechanize.Browser()
+        for opener in plain_opener, browser:
+            r = opener.open("http://wwwsearch.sourceforge.net/robots")
+            self.assertEqual(r.code, 200)
+            self.assertRaises(
+                mechanize.RobotExclusionError,
+                opener.open, "http://wwwsearch.sourceforge.net/norobots")
+
     def test_urlretrieve(self):
         url = "http://www.python.org/"
         verif = CallbackVerifier(self)

Modified: wwwsearch/mechanize/trunk/mechanize/_http.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_http.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_http.py	Thu Aug 10 21:54:58 2006
@@ -332,6 +332,46 @@
 except ImportError:
     pass
 else:
+    class MechanizeRobotFileParser(robotparser.RobotFileParser):
+
+        def __init__(self, url='', opener=None):
+            import _opener
+            robotparser.RobotFileParser.__init__(self, url)
+            self._opener = opener
+
+        def set_opener(self, opener=None):
+            if opener is None:
+                opener = _opener.OpenerDirector()
+            self._opener = opener
+
+        def read(self):
+            """Reads the robots.txt URL and feeds it to the parser."""
+            if self._opener is None:
+                self.set_opener()
+            try:
+                f = self._opener.open(self.url)
+            except HTTPError, f:
+                pass
+            except (IOError, socket.error, OSError), exc:
+                robotparser._debug("ignoring error opening %r: %s" %
+                                   (self.url, exc))
+                return
+            lines = []
+            line = f.readline()
+            while line:
+                lines.append(line.strip())
+                line = f.readline()
+            status = f.code
+            if status == 401 or status == 403:
+                self.disallow_all = True
+                robotparser._debug("disallow all")
+            elif status >= 400:
+                self.allow_all = True
+                robotparser._debug("allow all")
+            elif status == 200 and lines:
+                robotparser._debug("parse lines")
+                self.parse(lines)
+
     class RobotExclusionError(urllib2.HTTPError):
         def __init__(self, request, *args):
             apply(urllib2.HTTPError.__init__, (self,)+args)
@@ -349,16 +389,29 @@
         else:
             http_response_class = HTTPMessage
 
-        def __init__(self, rfp_class=robotparser.RobotFileParser):
+        def __init__(self, rfp_class=MechanizeRobotFileParser):
             self.rfp_class = rfp_class
             self.rfp = None
             self._host = None
 
         def http_request(self, request):
-            host = request.get_host()
             scheme = request.get_type()
+            if scheme not in ["http", "https"]:
+                # robots exclusion only applies to HTTP
+                return request
+
+            if request.get_selector() == "/robots.txt":
+                # /robots.txt is always OK to fetch
+                return request
+
+            host = request.get_host()
             if host != self._host:
                 self.rfp = self.rfp_class()
+                try:
+                    self.rfp.set_opener(self.parent)
+                except AttributeError:
+                    debug("%r instance does not support set_opener" %
+                          self.rfp.__class__)
                 self.rfp.set_url(scheme+"://"+host+"/robots.txt")
                 self.rfp.read()
                 self._host = host

Modified: wwwsearch/mechanize/trunk/test/test_urllib2.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_urllib2.py	(original)
+++ wwwsearch/mechanize/trunk/test/test_urllib2.py	Thu Aug 10 21:54:58 2006
@@ -394,6 +394,8 @@
         return self
     def set_url(self, url):
         self.calls.append(("set_url", url))
+    def set_opener(self, opener):
+        self.calls.append(("set_opener", opener))
     def read(self):
         self.calls.append("read")
     def can_fetch(self, ua, url):
@@ -669,8 +671,10 @@
             return  # skip test
         else:
             from mechanize import HTTPRobotRulesProcessor
+        opener = OpenerDirector()
         rfpc = MockRobotFileParserClass()
         h = HTTPRobotRulesProcessor(rfpc)
+        opener.add_handler(h)
 
         url = "http://example.com:80/foo/bar.html"
         req = Request(url)
@@ -679,6 +683,7 @@
         h.http_request(req)
         self.assert_(rfpc.calls == [
             "__call__",
+            ("set_opener", opener),
             ("set_url", "http://example.com:80/robots.txt"),
             "read",
             ("can_fetch", "", url),
@@ -718,6 +723,7 @@
         h.http_request(req)
         self.assert_(rfpc.calls == [
             "__call__",
+            ("set_opener", opener),
             ("set_url", "http://example.com/robots.txt"),
             "read",
             ("can_fetch", "", url),
@@ -729,10 +735,17 @@
         h.http_request(req)
         self.assert_(rfpc.calls == [
             "__call__",
+            ("set_opener", opener),
             ("set_url", "https://example.org/robots.txt"),
             "read",
             ("can_fetch", "", url),
             ])
+        # non-HTTP URL -> ignore robots.txt
+        rfpc.clear()
+        url = "ftp://example.com/"
+        req = Request(url)
+        h.http_request(req)
+        self.assert_(rfpc.calls == [])
 
     def test_cookies(self):
         cj = MockCookieJar()


More information about the wwwsearch-commits mailing list