[wwwsearch-commits] r19181 - in wwwsearch/mechanize/trunk: . mechanize

jjlee at codespeak.net jjlee at codespeak.net
Sun Oct 30 16:54:19 CET 2005


Author: jjlee
Date: Sun Oct 30 16:54:18 2005
New Revision: 19181

Modified:
   wwwsearch/mechanize/trunk/mechanize/_mechanize.py
   wwwsearch/mechanize/trunk/test.py
Log:
Fix .viewing_html(); clarify some comments and an exception message

Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py	Sun Oct 30 16:54:18 2005
@@ -286,6 +286,8 @@
             tag = token.data
             name = attrs.get("name")
             text = None
+            # XXX need to sort out quoting
+            #url = urllib.quote_plus(attrs.get(self.urltags[tag]))
             url = attrs.get(self.urltags[tag])
             if tag == "a":
                 if token.type != "startendtag":
@@ -330,9 +332,18 @@
         """Return whether the current response contains HTML data."""
         if self._response is None:
             raise BrowserStateError("not viewing any document")
-        ct = self._response.info().getheaders("content-type")
-        return ct and (ct[0].startswith("text/html") or
-                       ct[0].startswith("text/xhtml"))
+        ct_hdrs = self._response.info().getheaders("content-type")
+        if not ct_hdrs:
+            # guess
+            url = self._response.geturl()
+            return (url.endswith('.htm') or url.endswith('.html') or
+                    url.endswith('.xhtml'))
+        # use first header
+        ct = split_header_words(ct_hdrs)[0][0][0]
+        return ct in [
+            "text/html", "text/xhtml", "text/xml",
+            "application/xml", "application/xhtml+xml",
+            ]
 
     def title(self):
         """Return title, or None if there is no title element in the document.
@@ -357,7 +368,7 @@
     def select_form(self, name=None, predicate=None, nr=None):
         """Select an HTML form for input.
 
-        This is like giving a form the "input focus" in a browser.
+        This is a bit like giving a form the "input focus" in a browser.
 
         If a form is selected, the object supports the HTMLForm interface, so
         you can call methods like .set_value(), .set(), and .click().
@@ -524,9 +535,11 @@
         if form is not None:
             try: return getattr(form, name)
             except AttributeError: pass
-        raise AttributeError("%s instance has no attribute %s "
-                             "(perhaps you forgot to .select_form()?" %
-                             (self.__class__, name))
+
+        msg = "%s instance has no attribute %s " % (self.__class__, name)
+        if form is None:
+            msg += "(perhaps you forgot to .select_form()?)"
+        raise AttributeError(msg)
 
 #---------------------------------------------------
 # Private methods.
@@ -601,9 +614,10 @@
         return self.default_encoding
 
     def _parse_html(self, response):
+        # this is now lazy, so we just reset the various attributes that
+        # result from parsing
         self.form = None
         self._title = None
         if not self.viewing_html():
-            # nothing to see here
             return
         self._forms = self._links = None

Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py	(original)
+++ wwwsearch/mechanize/trunk/test.py	Sun Oct 30 16:54:18 2005
@@ -180,6 +180,46 @@
         self.assert_(b.back(2) is r5)
         self.assertRaises(mechanize.BrowserStateError, b.back, 2)
 
+    def test_viewing_html(self):
+        # XXX not testing multiple Content-Type headers
+        import mechanize
+        url = "http://example.com/"
+
+        for ct, isHtml in [
+            (None, False),
+            ("text/plain", False),
+            ("text/html", True),
+            ("text/xhtml", True),
+            ("text/xml", True),
+            ("application/xml", True),
+            ("application/xhtml+xml", True),
+            ("text/html; charset=blah", True),
+            (" text/xml ; charset=ook ", True),
+            ]:
+            b = TestBrowser()
+            hdrs = {}
+            if ct is not None:
+                hdrs["Content-Type"] = ct
+            b.add_handler(MockHandler([("http_open",
+                                        MockResponse(url, "", hdrs))]))
+            r = b.open(url)
+            self.assertEqual(b.viewing_html(), isHtml)
+
+        for ext, isHtml in [
+            (".htm", True),
+            (".html", True),
+            (".xhtml", True),
+            (".txt", False),
+            (".xml", False),  # XXX is this sensible?
+            ("", False),
+            ]:
+            b = TestBrowser()
+            url = "http://example.com/foo"+ext
+            b.add_handler(MockHandler(
+                [("http_open", MockResponse(url, "", {}))]))
+            r = b.open(url)
+            self.assertEqual(b.viewing_html(), isHtml)
+
     def test_empty(self):
         import mechanize
         url = "http://example.com/"


More information about the wwwsearch-commits mailing list