[wwwsearch-commits] r19201 - wwwsearch/mechanize/trunk/mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Sun Oct 30 20:33:50 CET 2005
Author: jjlee
Date: Sun Oct 30 20:33:49 2005
New Revision: 19201
Modified:
wwwsearch/mechanize/trunk/mechanize/_mechanize.py
Log:
Remove pullparser import dependency; Remove Browser.urltags (now an argument to LinksFactory); Document Browser constructor as taking keyword args only (and change positional arg spec)
Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Sun Oct 30 20:33:49 2005
@@ -23,7 +23,6 @@
import ClientCookie
from ClientCookie._Util import response_seek_wrapper
from ClientCookie._HeadersUtil import split_header_words, is_html
-import pullparser
# serves me right for not using a version tuple...
VERSION_RE = re.compile(r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<bugfix>\d+)"
r"(?P<state>[ab])?(?:-pre)?(?P<pre>\d+)?$")
@@ -35,8 +34,6 @@
("major", "minor", "bugfix", "state", "pre")])
assert map(int, parse_version(ClientCookie.VERSION)[:3]) >= [1, 0, 2], \
"ClientCookie 1.0.2 or newer is required"
-assert pullparser.__version__[:3] >= (0, 0, 4), \
- "pullparser 0.0.4b or newer is required"
from _useragent import UserAgent
@@ -65,6 +62,67 @@
self.base_url, self.url, self.text, self.tag, self.attrs)
+class LinksFactory:
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import pullparser
+ assert pullparser.__version__[:3] >= (0, 0, 4), \
+ "pullparser 0.0.4b or newer is required"
+ if link_parser_class is None:
+ link_parser_class = pullparser.TolerantPullParser
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+
+ def get_links_iter(self, fh, base_url, encoding=None):
+ import pullparser
+ p = pullparser.TolerantPullParser(fh, encoding=encoding)
+
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.data == "base":
+ base_url = dict(token.attrs).get("href")
+ continue
+ if token.type == "endtag":
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX need to sort out quoting
+ #url = urllib.quote_plus(attrs.get(self.urltags[tag]))
+ url = attrs.get(self.urltags[tag])
+ if tag == "a":
+ if token.type != "startendtag":
+ # XXX hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
+ # This is a hack from WWW::Mechanize to get some really basic
+ # JavaScript working, which I'm not yet convinced is a good
+ # idea.
+## onClick = attrs["onclick"]
+## m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
+## if onClick and m:
+## url = m.group(1)
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so ignore
+ # this.
+ continue
+
+ yield Link(base_url, url, text, tag, token.attrs)
+
class FormsFactory:
"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
@@ -136,21 +194,19 @@
getting this right without resorting to this default)
"""
- urltags = {
- "a": "href",
- "area": "href",
- "frame": "src",
- "iframe": "src",
- }
def __init__(self, default_encoding="latin-1",
forms_factory=None,
+ links_factory=None,
request_class=None,
):
"""
+ Only named arguments should be passed to this constructor.
+
default_encoding: See class docs.
forms_factory: Object supporting the mechanize.FormsFactory interface.
+ links_factory: Object supporting the mechanize.LinksFactory interface.
request_class: Request class to use. Defaults to ClientCookie.Request
by default for Pythons older than 2.4, urllib2.Request otherwise.
@@ -177,6 +233,9 @@
forms_factory = FormsFactory()
self._forms_factory = forms_factory
forms_factory.request_class = request_class
+ if links_factory is None:
+ links_factory = LinksFactory()
+ self._links_factory = links_factory
UserAgent.__init__(self) # do this last to avoid __getattr__ problems
@@ -272,45 +331,10 @@
"""Return an iterator that provides links of the document."""
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
- base = self._response.geturl()
+ base_url = self._response.geturl()
self._response.seek(0)
- p = pullparser.TolerantPullParser(
- self._response, encoding=self._encoding(self._response))
-
- for token in p.tags(*(self.urltags.keys()+["base"])):
- if token.data == "base":
- base = dict(token.attrs).get("href")
- continue
- if token.type == "endtag":
- continue
- attrs = dict(token.attrs)
- tag = token.data
- name = attrs.get("name")
- text = None
- # XXX need to sort out quoting
- #url = urllib.quote_plus(attrs.get(self.urltags[tag]))
- url = attrs.get(self.urltags[tag])
- if tag == "a":
- if token.type != "startendtag":
- # XXX hmm, this'd break if end tag is missing
- text = p.get_compressed_text(("endtag", tag))
- # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
- #text = p.get_compressed_text()
- # This is a hack from WWW::Mechanize to get some really basic
- # JavaScript working, which I'm not yet convinced is a good
- # idea.
-## onClick = attrs["onclick"]
-## m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
-## if onClick and m:
-## url = m.group(1)
- if not url:
- # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
- # For our purposes a link is something with a URL, so ignore
- # this.
- continue
-
- yield Link(base, url, text, tag, token.attrs)
-
+ return self._links_factory.get_links_iter(
+ self._response, base_url, self._encoding(self._response))
def forms(self):
"""Return iterable over forms.
@@ -344,6 +368,7 @@
PullParser.get_text() method of pullparser module.
"""
+ import pullparser
if not self.viewing_html():
raise BrowserStateError("not viewing HTML")
if self._title is None:
More information about the wwwsearch-commits mailing list