[wwwsearch-commits] r26198 - in
wwwsearch/mechanize/branch/mechanize-0.1.0-devel: . mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Sun Apr 23 19:11:45 CEST 2006
Author: jjlee
Date: Sun Apr 23 19:11:43 2006
New Revision: 26198
Modified:
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/README.html.in
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_mechanize.py
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/test.py
Log:
Interface changes and cleanup for beta release (not finished yet: want to use a single BeautifulSoup instance for RobustFactory): changed the Factory interface to make it easier to avoid re-parsing (principally: added a .set_response() method and made the factory methods take no args); moved some constructor args from Browser to Factory; the Browser.encoding method (which was new anyway) now takes no args; simplified the links code and removed .get_links_iter(); .forms() and .links() now both return iterators (in fact, generators), not sequences (not really an interface change: these were always documented to return iterables).
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/README.html.in
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/README.html.in (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/README.html.in Sun Apr 23 19:11:43 2006
@@ -159,16 +159,14 @@
<h3>Specific to mechanize</h3>
<ul>
- <li>Apply Titus' patch to move stuff into separate file and change
- Factory interface.
- <li>Kill off <code>.get_links_iter()</code>.
+ <li>Make encoding_finder public, I guess.
+ <li>Test BeautifulSoup support better / fix encoding issue.
<li>Support Mark Pilgrim's universal encoding detector?
<li>Add another History implementation or two and finalise interface.
<li>History cache expiration.
<li>Investigate possible leak (see Balazs Ree's list posting).
<li>Add <code>Browser.form_as_string()</code> and
<code>Browser.__str__()</code> methods.
- <li>Test BeautifulSoup support better / fix encoding issue.
<li>Add two-way links between BeautifulSoup & ClientForm object models.
<li>Add basic proxy support. I hope somebody else does this!
</ul>
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py Sun Apr 23 19:11:43 2006
@@ -1,9 +1,10 @@
from __future__ import generators
-import re, urllib, htmlentitydefs
+import re, copy, urllib, htmlentitydefs
from urlparse import urljoin
import ClientCookie
+from ClientCookie._HeadersUtil import split_header_words, is_html as _is_html
## # XXXX miserable hack
## def urljoin(base, url):
@@ -23,6 +24,41 @@
# 'safe'-by-default characters that urllib.urlquote never quotes
URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+DEFAULT_ENCODING = "latin-1"
+
+class CachingGeneratorFunction(object):
+ """Caching wrapper around a no-arguments iterable."""
+ def __init__(self, iterable):
+ self._iterable = iterable
+ self._cache = []
+ def __call__(self):
+ cache = self._cache
+ for item in cache:
+ yield item
+ for item in self._iterable:
+ cache.append(item)
+ yield item
+
+def encoding_finder(default_encoding):
+ def encoding(response):
+ # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
+ # headers may be in the response. HTTP-EQUIV headers come last,
+ # so try in order from first to last.
+ for ct in response.info().getheaders("content-type"):
+ for k, v in split_header_words([ct])[0]:
+ if k == "charset":
+ return v
+ return default_encoding
+ return encoding
+
+def make_is_html(allow_xhtml):
+ def is_html(response, encoding):
+ ct_hdrs = response.info().getheaders("content-type")
+ url = response.geturl()
+ # XXX encoding
+ return _is_html(ct_hdrs, url, allow_xhtml)
+ return is_html
+
# idea for this argument-processing trick is from Peter Otten
class Args:
def __init__(self, args_map):
@@ -38,7 +74,7 @@
form_parser_class=None,
request_class=None,
backwards_compat=False,
- encoding="latin-1", # deprecated
+ encoding=DEFAULT_ENCODING, # deprecated
):
return Args(locals())
@@ -138,7 +174,7 @@
form_parser_class=None,
request_class=None,
backwards_compat=False,
- encoding="latin-1", # deprecated
+ encoding=DEFAULT_ENCODING, # deprecated
):
import ClientForm
self.select_default = select_default
@@ -347,20 +383,48 @@
class Factory:
"""Factory for forms, links, etc.
- The interface of this class may expand in future.
+ This interface may expand in future.
+
+ Public methods:
+
+ set_request_class(request_class)
+ set_response(response)
+ forms()
+ links()
+
+ Public attributes:
+
+ encoding: string specifying the encoding of response if it contains a text
+ document (this value is left unspecified for documents that do not have
+ an encoding, e.g. an image file)
+ is_html: true if response contains an HTML document (XHTML may be
+ regarded as HTML too)
+ title: page title, or None if no title or not HTML
"""
- def __init__(self, forms_factory, links_factory, get_title):
+ def __init__(self, forms_factory, links_factory, get_title,
+ get_encoding=encoding_finder(DEFAULT_ENCODING),
+ is_html_p=make_is_html(allow_xhtml=False),
+ ):
"""
- Pass keyword
- arguments only.
+ Pass keyword arguments only.
+
+ default_encoding: character encoding to use if encoding cannot be
+ determined (or guessed) from the response. You should turn on
+ HTTP-EQUIV handling if you want the best chance of getting this right
+ without resorting to this default. The default value of this
+ parameter (currently latin-1) may change in future.
"""
self._forms_factory = forms_factory
self._links_factory = links_factory
self._get_title = get_title
+ self._get_encoding = get_encoding
+ self._is_html_p = is_html_p
+
+ self.set_response(None)
def set_request_class(self, request_class):
"""Set urllib2.Request class.
@@ -371,30 +435,73 @@
"""
self._forms_factory.request_class = request_class
- def forms(self, response, encoding):
+ def set_response(self, response):
+ """Set response.
+
+ The response must implement the same interface as objects returned by
+ urllib2.urlopen().
+
+ """
+ self._response = response
+ self._forms_genf = self._links_genf = None
+ for name in ["encoding", "is_html", "title"]:
+ try:
+ delattr(self, name)
+ except AttributeError:
+ pass
+
+ def __getattr__(self, name):
+ if name not in ["encoding", "is_html", "title"]:
+ return getattr(self.__class__, name)
+
+ try:
+ if name == "encoding":
+ self.encoding = self._get_encoding(self._response)
+ return self.encoding
+ elif name == "is_html":
+ self.is_html = self._is_html_p(self._response, self.encoding)
+ return self.is_html
+ elif name == "title":
+ if self.is_html:
+ self.title = self._get_title(self._response, self.encoding)
+ else:
+ self.title = None
+ return self.title
+ finally:
+ self._response.seek(0)
+
+ def forms(self):
"""Return iterable over ClientForm.HTMLForm-like objects."""
- return self._forms_factory.parse_response(response, encoding)
+ if self._forms_genf is None:
+ forms_gen = self._forms_factory.parse_response(
+ copy.copy(self._response), self.encoding)
+ self._forms_genf = CachingGeneratorFunction(forms_gen)
+ return self._forms_genf()
- def links(self, response, encoding):
+ def links(self):
"""Return iterable over mechanize.Link-like objects."""
- return self._links_factory.links(response, response.geturl(), encoding)
-
- def title(self, response, encoding):
- """Return page title."""
- return self._get_title(response, encoding)
+ if self._links_genf is None:
+ links_gen = self._links_factory.links(
+ copy.copy(self._response), self._response.geturl(), self.encoding)
+ self._links_genf = CachingGeneratorFunction(links_gen)
+ return self._links_genf()
class DefaultFactory(Factory):
- def __init__(self):
- Factory.__init__(self,
- forms_factory=FormsFactory(),
- links_factory=LinksFactory(),
- get_title=pp_get_title,
- )
+ def __init__(self, i_want_broken_xhtml_support=False):
+ Factory.__init__(
+ self,
+ forms_factory=FormsFactory(),
+ links_factory=LinksFactory(),
+ get_title=pp_get_title,
+ is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ )
class RobustFactory(Factory):
- def __init__(self):
- Factory.__init__(self,
- forms_factory=RobustFormsFactory(),
- links_factory=RobustLinksFactory(),
- get_title=bs_get_title,
- )
+ def __init__(self, i_want_broken_xhtml_support=False):
+ Factory.__init__(
+ self,
+ forms_factory=RobustFormsFactory(),
+ links_factory=RobustLinksFactory(),
+ get_title=bs_get_title,
+ is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
+ )
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_mechanize.py (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_mechanize.py Sun Apr 23 19:11:43 2006
@@ -10,17 +10,19 @@
"""
# XXXX
+# spaces in URLs
+# clean_url(): test Moz behaviour against Apache rather than File->Open!
# test referer bugs (frags and don't add in redirect unless orig req had Referer)
# XXX
# The stuff on web page's todo list.
# Moof's emails about response object, .back(), etc.
+from __future__ import generators
+
import urllib2, urlparse, sys, copy
import ClientCookie
-from ClientCookie._Util import response_seek_wrapper
-from ClientCookie._HeadersUtil import split_header_words, is_html
from _useragent import UserAgent
from _html import DefaultFactory
@@ -76,66 +78,47 @@
request: current request (ClientCookie.Request or urllib2.Request)
form: currently selected form (see .select_form())
- default_encoding: character encoding used if no encoding is found in the
- response (you should turn on HTTP-EQUIV handling if you want the best
- chance of getting this right without resorting to this default)
"""
- def __init__(self, default_encoding="latin-1",
+ def __init__(self,
factory=None,
history=None,
request_class=None,
- i_want_broken_xhtml_support=False,
- forms_factory=None, # deprecated
- links_factory=None, # deprecated
- get_title=None, # deprecated
):
"""
Only named arguments should be passed to this constructor.
- default_encoding: See class docs.
+ factory: object implementing the mechanize.Factory interface.
+ history: object implementing the mechanize.History interface. Note this
+ interface is still experimental and may change in future.
request_class: Request class to use. Defaults to ClientCookie.Request
by default for Pythons older than 2.4, urllib2.Request otherwise.
- factory: mechanize.Factory
+
+ The Factory and History objects passed in are 'owned' by the Browser,
+ so they should not be shared across Browsers. In particular,
+ factory.set_response() should not be called except by the owning
+ Browser itself.
Note that the supplied factory's request_class is overridden by this
constructor, to ensure only one Request class is used.
-
- Deprecated arguments:
-
- forms_factory: Object supporting the mechanize.FormsFactory interface.
- links_factory: Object supporting the mechanize.LinksFactory interface.
- get_title: callable taking a response object and an encoding string,
- and returning the page title.
-
"""
- self.default_encoding = default_encoding
- self._allow_xhtml = i_want_broken_xhtml_support
if history is None:
history = History()
self._history = history
self.request = self._response = None
self.form = None
- self._forms = None
- self._links = None
- self._title = None
if request_class is None:
if not hasattr(urllib2.Request, "add_unredirected_header"):
request_class = ClientCookie.Request
else:
- request_class = urllib2.Request # Python 2.4
+ request_class = urllib2.Request # Python >= 2.4
if factory is None:
- if (forms_factory is None and
- links_factory is None and
- get_title is None):
- factory = DefaultFactory()
- else:
- factory = Factory(forms_factory, links_factory, get_title)
+ factory = DefaultFactory()
factory.set_request_class(request_class)
self._factory = factory
self.request_class = request_class
@@ -149,7 +132,6 @@
if self._history is not None:
self._history.close()
self._history = None
- self._forms = self._title = self._links = None
self.request = self._response = None
def open(self, url, data=None):
@@ -180,10 +162,10 @@
success = True
try:
- self._response = UserAgent.open(self, self.request, data)
+ response = UserAgent.open(self, self.request, data)
except urllib2.HTTPError, error:
success = False
- self._response = error
+ response = error
## except (IOError, socket.error, OSError), error:
## # Yes, urllib2 really does raise all these :-((
## # See test_urllib2.py in stdlib and in ClientCookie for examples
@@ -196,9 +178,7 @@
## # Python core, a fix would need some backwards-compat. hack to be
## # acceptable.
## raise
- if not hasattr(self._response, "seek"):
- self._response = response_seek_wrapper(self._response)
- self._parse_html(self._response)
+ self.set_response(response)
if not success:
raise error
return copy.copy(self._response)
@@ -214,10 +194,11 @@
def set_response(self, response):
"""Replace current response with response."""
+ self.form = None
if not hasattr(response, "seek"):
- response = response_seek_wrapper(self._response)
+ response = ClientCookie.response_seek_wrapper(response)
self._response = response
- self._parse_html(self._response)
+ self._factory.set_response(self._response)
def geturl(self):
"""Get URL of current document."""
@@ -241,9 +222,9 @@
"""
if self._response is not None:
self._response.close()
- self.request, self._response = self._history.back(n, self._response)
- self._parse_html(self._response)
- return self._response
+ self.request, response = self._history.back(n, self._response)
+ self.set_response(response)
+ return response
def clear_history(self):
self._history.clear()
@@ -252,31 +233,11 @@
"""Return iterable over links (mechanize.Link objects)."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
+ links = self._factory.links()
if kwds:
- return self._find_links(False, **kwds)
- if self._links is None:
- try:
- self._links = list(self.get_links_iter())
- finally:
- self._response.seek(0)
- return self._links
-
- def get_links_iter(self):
- """Return an iterator that provides links of the document.
-
- This method is provided in addition to .links() to allow lazy iteration
- over links, while still keeping .links() safe against somebody
- .seek()ing on a response "behind your back". When response objects are
- fixed to have independent seek positions, this method will be
- deprecated in favour of .links().
-
- """
- if not self.viewing_html():
- raise BrowserStateError("not viewing HTML")
- base_url = self._response.geturl()
- self._response.seek(0)
- return self._factory.links(
- self._response, self.encoding(self._response))
+ return self._filter_links(links, **kwds)
+ else:
+ return links
def forms(self):
"""Return iterable over forms.
@@ -286,33 +247,19 @@
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
- if self._forms is None:
- response = self._response
- response.seek(0)
- try:
- self._forms = self._factory.forms(
- response, self.encoding(self._response))
- finally:
- response.seek(0)
- return self._forms
+ return self._factory.forms()
def viewing_html(self):
"""Return whether the current response contains HTML data."""
if self._response is None:
raise BrowserStateError("not viewing any document")
- ct_hdrs = self._response.info().getheaders("content-type")
- url = self._response.geturl()
- return is_html(ct_hdrs, url, self._allow_xhtml)
-
- def encoding(self, response):
- # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
- # headers may be in the response. HTTP-EQUIV headers come last,
- # so try in order from first to last.
- for ct in response.info().getheaders("content-type"):
- for k, v in split_header_words([ct])[0]:
- if k == "charset":
- return v
- return self.default_encoding
+ return self._factory.is_html
+
+ def encoding(self):
+ """"""
+ if self._response is None:
+ raise BrowserStateError("not viewing any document")
+ return self._factory.encoding
def title(self):
"""Return title, or None if there is no title element in the document.
@@ -323,10 +270,7 @@
"""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
- if self._title is None:
- self._title = self._factory.title(
- self._response, self.encoding(self._response))
- return self._title
+ return self._factory.title
def select_form(self, name=None, predicate=None, nr=None):
"""Select an HTML form for input.
@@ -489,7 +433,10 @@
nr: matches the nth link that matches all other criteria (default 0)
"""
- return self._find_links(True, **kwds)
+ try:
+ return self._filter_links(self._factory.links(), **kwds).next()
+ except StopIteration:
+ raise LinkNotFoundError()
def __getattr__(self, name):
# pass through ClientForm / DOMForm methods and attributes
@@ -503,7 +450,7 @@
#---------------------------------------------------
# Private methods.
- def _find_links(self, single,
+ def _filter_links(self, links,
text=None, text_regex=None,
name=None, name_regex=None,
url=None, url_regex=None,
@@ -517,19 +464,7 @@
found_links = []
orig_nr = nr
- # An optimization, so that if we look for a single link we do not have
- # to necessarily parse the entire file.
- if self._links is None and single:
- all_links = self.get_links_iter()
- else:
- if self._links is None:
- try:
- self._links = list(self.get_links_iter())
- finally:
- self._response.seek(0)
- all_links = self._links
-
- for link in all_links:
+ for link in links:
if url is not None and url != link.url:
continue
if url_regex is not None and not url_regex.search(link.url):
@@ -553,18 +488,5 @@
if nr:
nr -= 1
continue
- if single:
- return link
- else:
- found_links.append(link)
- nr = orig_nr
- if not found_links:
- raise LinkNotFoundError()
- return found_links
-
- def _parse_html(self, response):
- # this is now lazy, so we just reset the various attributes that
- # result from parsing
- self.form = None
- self._title = None
- self._forms = self._links = None
+ yield link
+ nr = orig_nr
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/test.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/test.py (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/test.py Sun Apr 23 19:11:43 2006
@@ -17,6 +17,46 @@
FACTORY_CLASSES.append(mechanize.RobustFactory)
+class CachingGeneratorFunctionTests(TestCase):
+
+ def _get_simple_cgenf(self, log):
+ from mechanize._html import CachingGeneratorFunction
+ todo = []
+ for ii in range(2):
+ def work(ii=ii):
+ log.append(ii)
+ return ii
+ todo.append(work)
+ def genf():
+ for a in todo:
+ yield a()
+ return CachingGeneratorFunction(genf())
+
+ def test_cache(self):
+ log = []
+ cgenf = self._get_simple_cgenf(log)
+ for repeat in range(2):
+ for ii, jj in zip(cgenf(), range(2)):
+ self.assertEqual(ii, jj)
+ self.assertEqual(log, range(2)) # work only done once
+
+ def test_interleaved(self):
+ log = []
+ cgenf = self._get_simple_cgenf(log)
+ cgen = cgenf()
+ self.assertEqual(cgen.next(), 0)
+ self.assertEqual(log, [0])
+ cgen2 = cgenf()
+ self.assertEqual(cgen2.next(), 0)
+ self.assertEqual(log, [0])
+ self.assertEqual(cgen.next(), 1)
+ self.assertEqual(log, [0, 1])
+ self.assertEqual(cgen2.next(), 1)
+ self.assertEqual(log, [0, 1])
+ self.assertRaises(StopIteration, cgen.next)
+ self.assertRaises(StopIteration, cgen2.next)
+
+
class UnescapeTests(TestCase):
def test_unescape_charref(self):
@@ -199,11 +239,12 @@
import mechanize
from StringIO import StringIO
import urllib, mimetools
- # always take first encoding, since that's the one
+ # always take first encoding, since that's the one from the real HTTP
+ # headers, rather than from HTTP-EQUIV
b = mechanize.Browser()
- for s, ct in [("", b.default_encoding),
+ for s, ct in [("", mechanize._html.DEFAULT_ENCODING),
- ("Foo: Bar\r\n\r\n", b.default_encoding),
+ ("Foo: Bar\r\n\r\n", mechanize._html.DEFAULT_ENCODING),
("Content-Type: text/html; charset=UTF-8\r\n\r\n",
"UTF-8"),
@@ -214,7 +255,8 @@
]:
msg = mimetools.Message(StringIO(s))
r = urllib.addinfourl(StringIO(""), msg, "http://www.example.com/")
- self.assertEqual(b.encoding(r), ct)
+ b.set_response(r)
+ self.assertEqual(b.encoding(), ct)
def test_history(self):
import mechanize
@@ -281,7 +323,8 @@
("text/html; charset=blah", True),
(" text/html ; charset=ook ", True),
]:
- b = TestBrowser(i_want_broken_xhtml_support=allow_xhtml)
+ b = TestBrowser(mechanize.DefaultFactory(
+ i_want_broken_xhtml_support=allow_xhtml))
hdrs = {}
if ct is not None:
hdrs["Content-Type"] = ct
@@ -303,7 +346,8 @@
(".xml", False),
("", False),
]:
- b = TestBrowser(i_want_broken_xhtml_support=allow_xhtml)
+ b = TestBrowser(mechanize.DefaultFactory(
+ i_want_broken_xhtml_support=allow_xhtml))
url = "http://example.com/foo"+ext
b.add_handler(MockHandler(
[("http_open", MockResponse(url, "", {}))]))
@@ -378,7 +422,7 @@
b.add_handler(MockHandler([("http_open", r)]))
r = b.open(url)
- forms = b.forms()
+ forms = list(b.forms())
self.assertEqual(len(forms), 2)
for got, expect in zip([f.name for f in forms], [
"form1", "form2"]):
@@ -489,7 +533,7 @@
Link(url, "foo", None, "iframe",
[("src", "foo")]),
]
- links = b.links()
+ links = list(b.links())
self.assertEqual(len(links), len(exp_links))
for got, expect in zip(links, exp_links):
self.assertEqual(got, expect)
@@ -579,6 +623,7 @@
class ResponseTests(TestCase):
def test_set_response(self):
+ import copy
from ClientCookie import response_seek_wrapper
br = TestBrowser()
@@ -591,7 +636,8 @@
r = br.open(url)
self.assertEqual(r.read(), html)
r.seek(0)
- self.assertEqual(br.links()[0].url, "spam")
+ self.assertEqual(copy.copy(r).read(), html)
+ self.assertEqual(list(br.links())[0].url, "spam")
newhtml = """<html><body><a href="eggs">click me</a></body></html>"""
@@ -600,11 +646,12 @@
self.assertEqual(br.response().read(), html)
br.response().set_data(newhtml)
self.assertEqual(br.response().read(), html)
- self.assertEqual(br.links()[0].url, "spam")
+ self.assertEqual(list(br.links())[0].url, "spam")
+ r.seek(0)
br.set_response(r)
self.assertEqual(br.response().read(), newhtml)
- self.assertEqual(br.links()[0].url, "eggs")
+ self.assertEqual(list(br.links())[0].url, "eggs")
class UserAgentTests(TestCase):
More information about the wwwsearch-commits
mailing list