From jjlee at codespeak.net Sat Jul 7 18:28:58 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 7 Jul 2007 18:28:58 +0200 (CEST) Subject: [wwwsearch-commits] r44825 - wwwsearch/mechanize/trunk/test-tools Message-ID: <20070707162858.7626481C1@code0.codespeak.net> Author: jjlee Date: Sat Jul 7 18:28:56 2007 New Revision: 44825 Modified: wwwsearch/mechanize/trunk/test-tools/testprogram.py Log: * Use SO_REUSEADDR for test server. * Raise exception if local server fails to start. Modified: wwwsearch/mechanize/trunk/test-tools/testprogram.py ============================================================================== --- wwwsearch/mechanize/trunk/test-tools/testprogram.py (original) +++ wwwsearch/mechanize/trunk/test-tools/testprogram.py Sat Jul 7 18:28:56 2007 @@ -42,6 +42,7 @@ import socket def connect(): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) sock.settimeout(1.0) try: sock.connect(('127.0.0.1', self.port)) @@ -70,6 +71,8 @@ timeout = min(timeout, hard_limit) else: break + else: + raise def kill_windows(handle, report_hook): try: From jjlee at codespeak.net Sat Jul 7 18:39:31 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 7 Jul 2007 18:39:31 +0200 (CEST) Subject: [wwwsearch-commits] r44827 - in wwwsearch/mechanize/trunk: mechanize test Message-ID: <20070707163931.8E4F981C6@code0.codespeak.net> Author: jjlee Date: Sat Jul 7 18:39:31 2007 New Revision: 44827 Modified: wwwsearch/mechanize/trunk/mechanize/_http.py wwwsearch/mechanize/trunk/test/test_urllib2.py Log: * Log skipped Refreshes * Add some more Refresh tests Modified: wwwsearch/mechanize/trunk/mechanize/_http.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_http.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_http.py Sat Jul 7 18:39:31 2007 @@ -540,6 +540,7 @@ def __init__(self, max_time=0, honor_time=True): 
self.max_time = max_time self.honor_time = honor_time + self._sleep = time.sleep def http_response(self, request, response): code, msg, hdrs = response.code, response.msg, response.info() @@ -551,16 +552,19 @@ except ValueError: debug("bad Refresh header: %r" % refresh) return response + if newurl is None: newurl = response.geturl() if (self.max_time is None) or (pause <= self.max_time): if pause > 1E-3 and self.honor_time: - time.sleep(pause) + self._sleep(pause) hdrs["location"] = newurl # hardcoded http is NOT a bug response = self.parent.error( "http", request, response, "refresh", msg, hdrs) + else: + debug("Refresh header ignored: %r" % refresh) return response Modified: wwwsearch/mechanize/trunk/test/test_urllib2.py ============================================================================== --- wwwsearch/mechanize/trunk/test/test_urllib2.py (original) +++ wwwsearch/mechanize/trunk/test/test_urllib2.py Sat Jul 7 18:39:31 2007 @@ -919,6 +919,48 @@ self.assertEqual(o.proto, "http") self.assertEqual(o.args, (req, r, "refresh", "OK", headers)) + def test_refresh_honor_time(self): + class SleepTester: + def __init__(self, test, seconds): + self._test = test + if seconds is 0: + seconds = None # don't expect a sleep for 0 seconds + self._expected = seconds + self._got = None + def sleep(self, seconds): + self._got = seconds + def verify(self): + self._test.assertEqual(self._expected, self._got) + class Opener: + called = False + def error(self, *args, **kwds): + self.called = True + def test(rp, header, refresh_after): + expect_refresh = refresh_after is not None + opener = Opener() + rp.parent = opener + st = SleepTester(self, refresh_after) + rp._sleep = st.sleep + rp.http_response(Request("http://example.com"), + test_response(headers=[("Refresh", header)]), + ) + self.assertEqual(expect_refresh, opener.called) + st.verify() + + # by default, only zero-time refreshes are honoured + test(HTTPRefreshProcessor(), "0", 0) + test(HTTPRefreshProcessor(), "2", None) 
+ + # if requested, more than zero seconds are allowed + test(HTTPRefreshProcessor(max_time=None), "2", 2) + test(HTTPRefreshProcessor(max_time=30), "2", 2) + + # no sleep if we don't "honor_time" + test(HTTPRefreshProcessor(max_time=30, honor_time=False), "2", 0) + + # request for too-long wait before refreshing --> no refresh occurs + test(HTTPRefreshProcessor(max_time=30), "60", None) + def test_redirect(self): from_url = "http://example.com/a.html" to_url = "http://example.com/b.html" From jjlee at codespeak.net Sat Jul 7 18:47:34 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 7 Jul 2007 18:47:34 +0200 (CEST) Subject: [wwwsearch-commits] r44828 - wwwsearch/mechanize/trunk/mechanize Message-ID: <20070707164734.BA82781C7@code0.codespeak.net> Author: jjlee Date: Sat Jul 7 18:47:34 2007 New Revision: 44828 Modified: wwwsearch/mechanize/trunk/mechanize/_useragent.py Log: Change default mechanize.UserAgent (hence mechanize.Browser) Refresh behaviour: * Don't follow Refreshes > 30 seconds * honor_time is now False by default This is a backwards-incompatible change. The old default behaviour was confusing and rarely useful. 
You can get behaviour the same as the old defaults like so: browser.set_handle_refresh(True, max_time=None, honor_time=True) Modified: wwwsearch/mechanize/trunk/mechanize/_useragent.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_useragent.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_useragent.py Sat Jul 7 18:47:34 2007 @@ -229,7 +229,7 @@ def set_handle_redirect(self, handle): """Set whether to handle HTTP 30x redirections.""" self._set_handler("_redirect", handle) - def set_handle_refresh(self, handle, max_time=None, honor_time=True): + def set_handle_refresh(self, handle, max_time=30.0, honor_time=False): """Set whether to handle HTTP Refresh headers.""" self._set_handler("_refresh", handle, constructor_kwds= {"max_time": max_time, "honor_time": honor_time}) From jjlee at codespeak.net Sat Jul 7 19:09:27 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 7 Jul 2007 19:09:27 +0200 (CEST) Subject: [wwwsearch-commits] r44829 - in wwwsearch/mechanize/trunk: . 
test-tools Message-ID: <20070707170927.C645A81CA@code0.codespeak.net> Author: jjlee Date: Sat Jul 7 19:09:27 2007 New Revision: 44829 Modified: wwwsearch/mechanize/trunk/functional_tests.py wwwsearch/mechanize/trunk/test-tools/cookietest.cgi wwwsearch/mechanize/trunk/test-tools/testprogram.py wwwsearch/mechanize/trunk/test-tools/twisted-localserver.py Log: * Add a functional test for Refresh * Update docstrings / comments re local functional testing server Modified: wwwsearch/mechanize/trunk/functional_tests.py ============================================================================== --- wwwsearch/mechanize/trunk/functional_tests.py (original) +++ wwwsearch/mechanize/trunk/functional_tests.py Sat Jul 7 19:09:27 2007 @@ -4,7 +4,7 @@ # thanks Moof (aka Giles Antonio Radford) for some of these -import os, sys +import os, sys, urllib from unittest import TestCase import mechanize @@ -15,13 +15,6 @@ HTTPRedirectDebugProcessor, HTTPResponseDebugProcessor from mechanize._rfc3986 import urljoin -# XXX -# document twisted.web2 install (I forgot how I did it -- reinstall!) -# implement remaining stuff used by functional_tests.py -# in twisted-localserver.py: -# - 302 followed by 404 response -# - helper cgi script for cookies &c. 
- #from cookielib import CookieJar #from urllib2 import build_opener, install_opener, urlopen #from urllib2 import HTTPCookieProcessor, HTTPHandler @@ -90,6 +83,26 @@ self.assertEqual(r.code, 200) self.assert_("GeneralFAQ.html" in r.read(2048)) + def test_refresh(self): + def refresh_request(seconds): + uri = urljoin(self.uri, "/cgi-bin/cookietest.cgi") + val = urllib.quote_plus('%d; url="%s"' % (seconds, self.uri)) + return uri + ("?refresh=%s" % val) + r = self.browser.open(refresh_request(5)) + self.assertEqual(r.geturl(), self.uri) + # Refresh with pause > 30 seconds is ignored by default (these long + # refreshes tend to be there only because the website owner wants you + # to see the latest news, or whatever -- they're not essential to the + # operation of the site, and not really useful or appropriate when + # scraping). + refresh_uri = refresh_request(60) + r = self.browser.open(refresh_uri) + self.assertEqual(r.geturl(), refresh_uri) + # allow long refreshes (note we don't actually wait 60 seconds by default) + self.browser.set_handle_refresh(True, max_time=None) + r = self.browser.open(refresh_request(60)) + self.assertEqual(r.geturl(), self.uri) + def test_file_url(self): url = "file://%s" % sanepathname2url( os.path.abspath('functional_tests.py')) @@ -409,9 +422,8 @@ - start a local Twisted HTTP server and run the functional tests against that, rather than against SourceForge (quicker!) - Note not all the functional tests use the local server yet - -- some currently always access the internet regardless of - this option and the --uri option. + If this option doesn't work on Windows/Mac, somebody please + tell me about it, or I'll never find out... 
""" prog = testprogram.TestProgram( ["functional_tests"], Modified: wwwsearch/mechanize/trunk/test-tools/cookietest.cgi ============================================================================== --- wwwsearch/mechanize/trunk/test-tools/cookietest.cgi (original) +++ wwwsearch/mechanize/trunk/test-tools/cookietest.cgi Sat Jul 7 19:09:27 2007 @@ -3,9 +3,12 @@ # This is used by functional_tests.py +#import cgitb; cgitb.enable() + print "Content-Type: text/html" print "Set-Cookie: foo=bar\n" -import sys, os, string, cgi, Cookie +import sys, os, string, cgi, Cookie, urllib +from xml.sax import saxutils from types import ListType @@ -13,8 +16,18 @@ cookie = Cookie.SimpleCookie() cookieHdr = os.environ.get("HTTP_COOKIE", "") cookie.load(cookieHdr) -if not cookie.has_key("foo"): +form = cgi.FieldStorage() +refresh_value = None +if form.has_key("refresh"): + refresh = form["refresh"] + if not isinstance(refresh, ListType): + refresh_value = refresh.value +if refresh_value is not None: + print '' % ( + saxutils.quoteattr(urllib.unquote_plus(refresh_value))) +elif not cookie.has_key("foo"): print '' + print "" print "

Received cookies:

" print "
"
@@ -26,7 +39,6 @@
 print "
"
 print cgi.escape(os.environ.get("HTTP_REFERER", ""))
 print "
" -form = cgi.FieldStorage() print "

Received parameters:

" print "
"
 for k in form.keys():

Modified: wwwsearch/mechanize/trunk/test-tools/testprogram.py
==============================================================================
--- wwwsearch/mechanize/trunk/test-tools/testprogram.py	(original)
+++ wwwsearch/mechanize/trunk/test-tools/testprogram.py	Sat Jul  7 19:09:27 2007
@@ -210,7 +210,7 @@
 """
     def __init__(self, moduleNames, localServerProcess, defaultTest=None,
                  argv=None, testRunner=None, testLoader=defaultTestLoader,
-                 defaultUri="http://wwwsearch.sf.net/",
+                 defaultUri="http://wwwsearch.sourceforge.net/",
                  usageExamples=USAGE_EXAMPLES,
                  ):
         self.modules = []

Modified: wwwsearch/mechanize/trunk/test-tools/twisted-localserver.py
==============================================================================
--- wwwsearch/mechanize/trunk/test-tools/twisted-localserver.py	(original)
+++ wwwsearch/mechanize/trunk/test-tools/twisted-localserver.py	Sat Jul  7 19:09:27 2007
@@ -9,9 +9,9 @@
 python test-tools/twisted-localserver.py 8042
 python functional_tests.py --uri=http://localhost:8042/
 
-You need Twisted XXX version to run it:
+You need twisted.web2 to run it.  On ubuntu feisty, you can install it like so:
 
-XXX installation instructions
+sudo apt-get install python-twisted-web2
 """
 
 import sys, re

From jjlee at codespeak.net  Sat Jul  7 20:11:16 2007
From: jjlee at codespeak.net (jjlee at codespeak.net)
Date: Sat,  7 Jul 2007 20:11:16 +0200 (CEST)
Subject: [wwwsearch-commits] r44830 - wwwsearch/release_scripts
Message-ID: <20070707181116.C880681CD@code0.codespeak.net>

Author: jjlee
Date: Sat Jul  7 20:11:15 2007
New Revision: 44830

Modified:
   wwwsearch/release_scripts/mrelease.py
   wwwsearch/release_scripts/release.py
Log:
* Use subprocess.call() instead of os.system()
* Exit if an external command fails
* Add --no-version-check and --no-date-check arguments for testing source release build process


Modified: wwwsearch/release_scripts/mrelease.py
==============================================================================
--- wwwsearch/release_scripts/mrelease.py	(original)
+++ wwwsearch/release_scripts/mrelease.py	Sat Jul  7 20:11:15 2007
@@ -67,7 +67,9 @@
     if tag:
         sdist.tag(src, clean=options.clean)
     if rel:
-        sdist.build(update=options.update, clean=options.clean)
+        sdist.build(update=options.update, clean=options.clean,
+                    check_versions=options.check_versions,
+                    check_dates=options.check_dates)
     if upload:
         sdist.upload_to_sourceforge()
 

Modified: wwwsearch/release_scripts/release.py
==============================================================================
--- wwwsearch/release_scripts/release.py	(original)
+++ wwwsearch/release_scripts/release.py	Sat Jul  7 20:11:15 2007
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import sys, os, re, tempfile, time, stat, posixpath, shutil
+import sys, os, re, tempfile, time, stat, posixpath, shutil, subprocess
 
 import logging
 logger = logging.getLogger("wwwsearch.release")
@@ -193,13 +193,22 @@
         finally:
             chdir(cwd, self.pretend)
 
-    def build(self, update=True, clean=False):
+    def build(self, update=True, clean=False,
+              check_versions=True,
+              check_dates=True,
+              ):
         self.prepare(update, clean)
         if self.pretend:
             print "(would check versions here)"
         else:
-            bad_versions = self.check_versions()
-            bad_dates = self.check_dates()
+            if check_versions:
+                bad_versions = self.check_versions()
+            else:
+                bad_versions = []
+            if check_dates:
+                bad_dates = self.check_dates()
+            else:
+                bad_dates = []
             if bad_versions:
                 raise BuildError(
                     "version doesn't match in %s" % " ".join(bad_versions))
@@ -280,10 +289,17 @@
     def build_files(self):
         return self.build(self.files)
 
-def system(cmd, pretend=False):
+def system(cmd, pretend=False, stdout=None):
     print cmd
     if not pretend:
-        os.system(cmd)
+        args = cmd.split()
+        assert ">" not in args, "shell redirect in command: "+cmd
+        try:
+            r = subprocess.call(args, stdout=stdout)
+        except OSError, exc:
+            raise RuntimeError("%s while executing: %s" % (exc, args))
+        if r != 0:
+            raise RuntimeError("%d exit status from: %s" % (r, args))
 
 def rename(src, dest, pretend=False):
     print "renaming %s --> %s" % (src, dest)
@@ -337,7 +353,8 @@
     if defines:
         def_text = " %s " % (" ".join(["-D%s" % define for define in defines]))
     def cmd(fn):
-        return system("em.py %s%s > %s" % (filename, def_text, fn), pretend)
+        system("em.py %s%s" % (filename, def_text),
+               pretend, stdout=open(fn, "w"))
     out_fn = wrap_command(
         cmd,
         os.path.dirname(filename),
@@ -347,8 +364,11 @@
     return out_fn
 
 def lynx_dump(filename, pretend=False):
+    def cmd(fn):
+        return system("lynx -dump %s" % filename,
+                      pretend, stdout=open(fn, "w"))
     out_fn = wrap_command(
-        lambda fn: system("lynx -dump %s > %s" % (filename, fn), pretend),
+        cmd,
         os.path.dirname(filename),
         os.path.splitext(os.path.basename(filename))[0]+".txt",
         pretend=pretend,
@@ -384,6 +404,14 @@
                       action="store_false", dest="update", default=True,
                       help="Leave svn working copy unchanged "
                            " (do not update or checkout)")
+    parser.add_option("--no-version-check",
+                      action="store_false", dest="check_versions", default=True,
+                      help="Don't check the version strings that appear in "
+                           "various files for correctness.")
+    parser.add_option("--no-date-check",
+                      action="store_false", dest="check_dates", default=True,
+                      help="Don't check the date strings that appear in "
+                           "various files for correctness.")
 
     options, args = parser.parse_args()
     tag = False

From jjlee at codespeak.net  Sun Jul 15 00:51:32 2007
From: jjlee at codespeak.net (jjlee at codespeak.net)
Date: Sun, 15 Jul 2007 00:51:32 +0200 (CEST)
Subject: [wwwsearch-commits] r45095 - wwwsearch/mechanize/trunk/test
Message-ID: <20070714225132.1B77F8179@code0.codespeak.net>

Author: jjlee
Date: Sun Jul 15 00:51:31 2007
New Revision: 45095

Modified:
   wwwsearch/mechanize/trunk/test/test_browser.py
Log:
Make test_browser.BrowserTests.test_empty() run with all factory classes

Modified: wwwsearch/mechanize/trunk/test/test_browser.py
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_browser.py	(original)
+++ wwwsearch/mechanize/trunk/test/test_browser.py	Sun Jul 15 00:51:31 2007
@@ -334,10 +334,14 @@
                 self.assertEqual(b.viewing_html(), expect)
 
     def test_empty(self):
+        for factory_class in FACTORY_CLASSES:
+            self._test_empty(factory_class())
+
+    def _test_empty(self, factory):
         import mechanize
         url = "http://example.com/"
 
-        b = TestBrowser()
+        b = TestBrowser(factory=factory)
 
         self.assert_(b.response() is None)
 

From jjlee at codespeak.net  Sun Jul 15 00:56:01 2007
From: jjlee at codespeak.net (jjlee at codespeak.net)
Date: Sun, 15 Jul 2007 00:56:01 +0200 (CEST)
Subject: [wwwsearch-commits] r45096 - in wwwsearch/mechanize/trunk:
	mechanize test
Message-ID: <20070714225601.D1B018179@code0.codespeak.net>

Author: jjlee
Date: Sun Jul 15 00:56:01 2007
New Revision: 45096

Modified:
   wwwsearch/mechanize/trunk/mechanize/_html.py
   wwwsearch/mechanize/trunk/mechanize/_pullparser.py
   wwwsearch/mechanize/trunk/test/test_html.doctest
Log:
Make title parsing follow Firefox behaviour wrt child elements (previously the behaviour differed between Factory and RobustFactory).

Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py	Sun Jul 15 00:56:01 2007
@@ -17,6 +17,8 @@
 
 DEFAULT_ENCODING = "latin-1"
 
+COMPRESS_RE = re.compile(r"\s+")
+
 
 # the base classe is purely for backwards compatibility
 class ParseError(ClientForm.ParseError): pass
@@ -235,6 +237,30 @@
         self._response = response
         self._encoding = encoding
 
+    def _get_title_text(self, parser):
+        text = []
+        tok = None
+        while 1:
+            try:
+                tok = parser.get_token()
+            except NoMoreTokensError:
+                break
+            if tok.type == "data":
+                text.append(str(tok))
+            elif tok.type == "entityref":
+                t = unescape("&%s;" % tok.data,
+                             parser._entitydefs, parser.encoding)
+                text.append(t)
+            elif tok.type == "charref":
+                t = unescape_charref(tok.data, parser.encoding)
+                text.append(t)
+            elif tok.type in ["starttag", "endtag", "startendtag"]:
+                tag_name = tok.data
+                if tok.type == "endtag" and tag_name == "title":
+                    break
+                text.append(str(tok))
+        return COMPRESS_RE.sub(" ", "".join(text).strip())
+
     def title(self):
         import _pullparser
         p = _pullparser.TolerantPullParser(
@@ -245,7 +271,7 @@
             except _pullparser.NoMoreTokensError:
                 return None
             else:
-                return p.get_text()
+                return self._get_title_text(p)
         except sgmllib.SGMLParseError, exc:
             raise ParseError(exc)
 
@@ -328,7 +354,7 @@
 
 class RobustLinksFactory:
 
-    compress_re = re.compile(r"\s+")
+    compress_re = COMPRESS_RE
 
     def __init__(self,
                  link_parser_class=None,
@@ -418,7 +444,8 @@
         if title == _beautifulsoup.Null:
             return None
         else:
-            return title.firstText(lambda t: True)
+            inner_html = "".join([str(node) for node in title.contents])
+            return COMPRESS_RE.sub(" ", inner_html.strip())
 
 
 class Factory:

Modified: wwwsearch/mechanize/trunk/mechanize/_pullparser.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_pullparser.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_pullparser.py	Sun Jul 15 00:56:01 2007
@@ -35,6 +35,7 @@
 
 import re, htmlentitydefs
 import sgmllib, HTMLParser
+from xml.sax import saxutils
 
 from _html import unescape, unescape_charref
 
@@ -85,6 +86,60 @@
         args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
         return self.__class__.__name__+"(%s)" % args
 
+    def __str__(self):
+        """
+        >>> print Token("starttag", "br")
+        <br>
+        >>> print Token("starttag", "a",
+        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
+        <a href="http://www.python.org/" alt='"foo"'>
+        >>> print Token("startendtag", "br")
+        <br />
+        >>> print Token("startendtag", "br", [("spam", "eggs")])
+        <br spam="eggs" />
+        >>> print Token("endtag", "p")
+        </p>
+ >>> print Token("charref", "38") + & + >>> print Token("entityref", "amp") + & + >>> print Token("data", "foo\\nbar") + foo + bar + >>> print Token("comment", "Life is a bowl\\nof cherries.") + + >>> print Token("decl", "decl") + + >>> print Token("pi", "pi") + + """ + if self.attrs is not None: + attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for + k, v in self.attrs]) + else: + attrs = "" + if self.type == "starttag": + return "<%s%s>" % (self.data, attrs) + elif self.type == "startendtag": + return "<%s%s />" % (self.data, attrs) + elif self.type == "endtag": + return "" % self.data + elif self.type == "charref": + return "&#%s;" % self.data + elif self.type == "entityref": + return "&%s;" % self.data + elif self.type == "data": + return self.data + elif self.type == "comment": + return "" % self.data + elif self.type == "decl": + return "" % self.data + elif self.type == "pi": + return "" % self.data + assert False + + def iter_until_exception(fn, exception, *args, **kwds): while 1: try: Modified: wwwsearch/mechanize/trunk/test/test_html.doctest ============================================================================== --- wwwsearch/mechanize/trunk/test/test_html.doctest (original) +++ wwwsearch/mechanize/trunk/test/test_html.doctest Sun Jul 15 00:56:01 2007 @@ -213,3 +213,41 @@ None >>> print get_first_link_text_sgmllib(html) None + + +Title parsing. We follow Firefox's behaviour with regard to child +elements (haven't tested IE). + +>>> def get_title_bs(html): +... factory = RobustTitleFactory() +... soup = MechanizeBs("utf-8", html) +... factory.set_soup(soup, "utf-8") +... return factory.title() + +>>> def get_title_sgmllib(html): +... factory = TitleFactory() +... response = test_html_response(html) +... factory.set_response(response, "utf-8") +... return factory.title() + +>>> html = ("""\ +... +... Title +...

Blah.

+... """) +>>> get_title_bs(html) +'Title' +>>> get_title_sgmllib(html) +'Title' + +>>> html = ("""\ +... +... Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> +... tle && +... +...

Blah.

+... """) +>>> get_title_bs(html) +'Ti tle &&' +>>> get_title_sgmllib(html) +'Ti tle &&' From jjlee at codespeak.net Fri Jul 27 20:26:06 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Fri, 27 Jul 2007 20:26:06 +0200 (CEST) Subject: [wwwsearch-commits] r45412 - wwwsearch/mechanize/trunk/mechanize Message-ID: <20070727182606.87D9B8341@code0.codespeak.net> Author: jjlee Date: Fri Jul 27 20:26:06 2007 New Revision: 45412 Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py Log: Remove some out-of-date docstring text. Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Fri Jul 27 20:26:06 2007 @@ -163,13 +163,7 @@ self.follow_link = self.find_link = None def set_handle_referer(self, handle): - """Set whether to add Referer header to each request. - - This base class does not implement this feature (so don't turn this on - if you're using this base class directly), but the subclass - mechanize.Browser does. - - """ + """Set whether to add Referer header to each request.""" self._set_handler("_referer", handle) self._handle_referer = bool(handle) From jjlee at codespeak.net Fri Jul 27 20:29:01 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Fri, 27 Jul 2007 20:29:01 +0200 (CEST) Subject: [wwwsearch-commits] r45413 - wwwsearch/mechanize/trunk/mechanize Message-ID: <20070727182901.3C8D082E2@code0.codespeak.net> Author: jjlee Date: Fri Jul 27 20:29:00 2007 New Revision: 45413 Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py Log: Docstring grammar fix. 
Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Fri Jul 27 20:29:00 2007 @@ -190,8 +190,8 @@ def open_novisit(self, url, data=None): """Open a URL without visiting it. - The browser state (including .request, .response(), history, forms and - links) are all left unchanged by calling this function. + Browser state (including request, response, history, forms and links) + is left unchanged by calling this function. The interface is the same as for .open(). From jjlee at codespeak.net Sat Jul 28 20:28:56 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 28 Jul 2007 20:28:56 +0200 (CEST) Subject: [wwwsearch-commits] r45426 - wwwsearch/mechanize/trunk/mechanize Message-ID: <20070728182856.732288065@code0.codespeak.net> Author: jjlee Date: Sat Jul 28 20:28:55 2007 New Revision: 45426 Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py Log: Update .title() docstring to reflect recent change to behaviour (treatment of tags). Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Sat Jul 28 20:28:55 2007 @@ -451,10 +451,10 @@ return self._factory.encoding def title(self): - """Return title, or None if there is no title element in the document. + r"""Return title, or None if there is no title element in the document. - Tags are stripped or textified as described in docs for - PullParser.get_text() method of pullparser module. + Treatment of any tag children of attempts to follow Firefox and IE + (currently, tags are preserved). 
""" if not self.viewing_html(): From jjlee at codespeak.net Sun Sep 2 22:01:44 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sun, 2 Sep 2007 22:01:44 +0200 (CEST) Subject: [wwwsearch-commits] r46246 - wwwsearch/mechanize/trunk/mechanize Message-ID: <20070902200144.7C0DF81E1@code0.codespeak.net> Author: jjlee Date: Sun Sep 2 22:01:42 2007 New Revision: 46246 Modified: wwwsearch/mechanize/trunk/mechanize/_clientcookie.py wwwsearch/mechanize/trunk/mechanize/_http.py wwwsearch/mechanize/trunk/mechanize/_mechanize.py wwwsearch/mechanize/trunk/mechanize/_msiecookiejar.py wwwsearch/mechanize/trunk/mechanize/_pullparser.py wwwsearch/mechanize/trunk/mechanize/_useragent.py wwwsearch/mechanize/trunk/mechanize/_util.py Log: Fix some too-long lines. Modified: wwwsearch/mechanize/trunk/mechanize/_clientcookie.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_clientcookie.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_clientcookie.py Sun Sep 2 22:01:42 2007 @@ -465,8 +465,8 @@ Note that domain_return_ok is called for every *cookie* domain, not just for the *request* domain. For example, the function might be - called with both ".acme.com" and "www.acme.com" if the request domain is - "www.acme.com". The same goes for path_return_ok. + called with both ".acme.com" and "www.acme.com" if the request domain + is "www.acme.com". The same goes for path_return_ok. For argument documentation, see the docstring for return_ok. @@ -818,7 +818,8 @@ # done by domain_return_ok. 
debug(" - checking cookie %s", cookie) - for n in "version", "verifiability", "secure", "expires", "port", "domain": + for n in ("version", "verifiability", "secure", "expires", "port", + "domain"): fn_name = "return_ok_"+n fn = getattr(self, fn_name) if not fn(cookie, request): Modified: wwwsearch/mechanize/trunk/mechanize/_http.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_http.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_http.py Sun Sep 2 22:01:42 2007 @@ -300,7 +300,8 @@ if is_html(ct_hdrs, url, self._allow_xhtml): try: try: - html_headers = parse_head(response, self.head_parser_class()) + html_headers = parse_head(response, + self.head_parser_class()) finally: response.seek(0) except (HTMLParser.HTMLParseError, Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Sun Sep 2 22:01:42 2007 @@ -425,8 +425,8 @@ """Return the global form object, or None if the factory implementation did not supply one. - The "global" form object contains all controls that are not descendants of - any FORM element. + The "global" form object contains all controls that are not descendants + of any FORM element. The returned form object implements the ClientForm.HTMLForm interface. 
Modified: wwwsearch/mechanize/trunk/mechanize/_msiecookiejar.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_msiecookiejar.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_msiecookiejar.py Sun Sep 2 22:01:42 2007 @@ -114,8 +114,9 @@ domain = m.group(1) path = m.group(2) - cookies.append({"KEY": key, "VALUE": value, "DOMAIN": domain, - "PATH": path, "FLAGS": flags, "HIXP": hi_expire, + cookies.append({"KEY": key, "VALUE": value, + "DOMAIN": domain, "PATH": path, + "FLAGS": flags, "HIXP": hi_expire, "LOXP": lo_expire, "HICREATE": hi_create, "LOCREATE": lo_create}) finally: Modified: wwwsearch/mechanize/trunk/mechanize/_pullparser.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_pullparser.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_pullparser.py Sun Sep 2 22:01:42 2007 @@ -205,7 +205,8 @@ return iter_until_exception(self.get_tag, NoMoreTokensError, *names) def tokens(self, *tokentypes): - return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes) + return iter_until_exception(self.get_token, NoMoreTokensError, + *tokentypes) def next(self): try: Modified: wwwsearch/mechanize/trunk/mechanize/_useragent.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_useragent.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_useragent.py Sun Sep 2 22:01:42 2007 @@ -306,7 +306,8 @@ if obj is not None: newhandler = handler_class(obj) else: - newhandler = handler_class(*constructor_args, **constructor_kwds) + newhandler = handler_class( + *constructor_args, **constructor_kwds) else: newhandler = None self._replace_handler(name, newhandler) Modified: wwwsearch/mechanize/trunk/mechanize/_util.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_util.py (original) 
+++ wwwsearch/mechanize/trunk/mechanize/_util.py Sun Sep 2 22:01:42 2007 @@ -156,7 +156,8 @@ return t -strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") +strict_re = re.compile(r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " + r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") wkday_re = re.compile( r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) loose_http_re = re.compile( From wwwsearch-commits at codespeak.net Thu Sep 20 22:28:13 2007 From: wwwsearch-commits at codespeak.net (Viagra.com Inc) Date: Thu, 20 Sep 2007 22:28:13 +0200 (CEST) Subject: [wwwsearch-commits] Lovers package at discount price! Message-ID: <20070920112945.3458.qmail@host30-221-dynamic.11-79-r.retail.telecomitalia.it> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/wwwsearch-commits/attachments/20070920/dc49e7f1/attachment.htm From wwwsearch-commits at codespeak.net Wed Sep 26 18:44:14 2007 From: wwwsearch-commits at codespeak.net (Viagra.com Inc) Date: Wed, 26 Sep 2007 18:44:14 +0200 (CEST) Subject: [wwwsearch-commits] September 70% OFF Message-ID: <20070926074620.13974.qmail@AOrleans-258-1-116-100.w90-21.abo.wanadoo.fr> An HTML attachment was scrubbed... URL: http://codespeak.net/pipermail/wwwsearch-commits/attachments/20070926/96df9694/attachment.htm