[wwwsearch-commits] r19185 - wwwsearch/ClientCookie/trunk/ClientCookie

jjlee at codespeak.net jjlee at codespeak.net
Sun Oct 30 17:06:24 CET 2005


Author: jjlee
Date: Sun Oct 30 17:06:23 2005
New Revision: 19185

Modified:
   wwwsearch/ClientCookie/trunk/ClientCookie/_HeadersUtil.py
   wwwsearch/ClientCookie/trunk/ClientCookie/_urllib2_support.py
Log:
Move is_html() code here from mechanize

Modified: wwwsearch/ClientCookie/trunk/ClientCookie/_HeadersUtil.py
==============================================================================
--- wwwsearch/ClientCookie/trunk/ClientCookie/_HeadersUtil.py	(original)
+++ wwwsearch/ClientCookie/trunk/ClientCookie/_HeadersUtil.py	Sun Oct 30 17:06:23 2005
@@ -24,6 +24,23 @@
     True = 1
     False = 0
 
+def is_html(ct_headers, url):
+    """
+    ct_headers: Sequence of Content-Type headers
+    url: Response URL
+
+    """
+    if not ct_headers:
+        # guess
+        return (url.endswith('.htm') or url.endswith('.html') or
+                url.endswith('.xhtml'))
+    # use first header
+    ct = split_header_words(ct_headers)[0][0][0]
+    return ct in [
+        "text/html", "text/xhtml", "text/xml",
+        "application/xml", "application/xhtml+xml",
+        ]
+
 def unmatched(match):
     """Return unmatched part of re.Match object."""
     start, end = match.span(0)

Modified: wwwsearch/ClientCookie/trunk/ClientCookie/_urllib2_support.py
==============================================================================
--- wwwsearch/ClientCookie/trunk/ClientCookie/_urllib2_support.py	(original)
+++ wwwsearch/ClientCookie/trunk/ClientCookie/_urllib2_support.py	Sun Oct 30 17:06:23 2005
@@ -16,6 +16,7 @@
 import ClientCookie
 from _ClientCookie import CookieJar, request_host
 from _Util import isstringlike, startswith, getheaders
+from _HeadersUtil import is_html
 from _Debug import getLogger
 
 try: True
@@ -306,10 +307,9 @@
             if not hasattr(response, "seek"):
                 response = response_seek_wrapper(response)
             headers = response.info()
-            ct = getheaders(response.info(), "content-type")
-            html = ct and (ct[0].startswith("text/html") or
-                           ct[0].startswith("text/xhtml"))
-            if html:
+            url = response.geturl()
+            ct_hdrs = getheaders(response.info(), "content-type")
+            if is_html(ct_hdrs, url):
                 try:
                     try:
                         html_headers = parse_head(response, self.head_parser_class())


More information about the wwwsearch-commits mailing list