[wwwsearch-commits] r26069 - in wwwsearch/mechanize/trunk: . mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Fri Apr 21 00:02:26 CEST 2006
Author: jjlee
Date: Fri Apr 21 00:02:25 2006
New Revision: 26069
Added:
wwwsearch/mechanize/trunk/mechanize/_html.py
Modified:
wwwsearch/mechanize/trunk/mechanize/__init__.py
wwwsearch/mechanize/trunk/mechanize/_mechanize.py
wwwsearch/mechanize/trunk/test.py
Log:
Shift HTML stuff into new module
Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/__init__.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/__init__.py Fri Apr 21 00:02:25 2006
@@ -1,7 +1,9 @@
from _useragent import UserAgent
-from _mechanize import Browser, Link, \
- Factory, DefaultFactory, RobustFactory, \
- FormsFactory, LinksFactory, pp_get_title, \
- RobustFormsFactory, RobustLinksFactory, bs_get_title, \
+from _mechanize import Browser, \
BrowserStateError, LinkNotFoundError, FormNotFoundError, \
__version__
+
+from _html import Link, \
+ Factory, DefaultFactory, RobustFactory, \
+ FormsFactory, LinksFactory, pp_get_title, \
+ RobustFormsFactory, RobustLinksFactory, bs_get_title
Added: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- (empty file)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py Fri Apr 21 00:02:25 2006
@@ -0,0 +1,400 @@
+from __future__ import generators
+
+import re, urllib, htmlentitydefs
+from urlparse import urljoin
+
+import ClientCookie
+
+## # XXXX miserable hack
+## def urljoin(base, url):
+## if url.startswith("?"):
+## return base+url
+## else:
+## return urlparse.urljoin(base, url)
+
+## def chr_range(a, b):
+## return "".join(map(chr, range(ord(a), ord(b)+1)))
+
+## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+## "abcdefghijklmnopqrstuvwxyz"
+## "-_.~")
+## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
+# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
+# 'safe'-by-default characters that urllib.urlquote never quotes
+URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
+
+# idea for this argument-processing trick is from Peter Otten
+class Args:
+ def __init__(self, args_map):
+ self.dictionary = dict(args_map)
+ def __getattr__(self, key):
+ try:
+ return self.dictionary[key]
+ except KeyError:
+ return getattr(self.__class__, key)
+
+def form_parser_args(
+ select_default=False,
+ form_parser_class=None,
+ request_class=None,
+ backwards_compat=False,
+ encoding="latin-1", # deprecated
+ ):
+ return Args(locals())
+
+
+class Link:
+ def __init__(self, base_url, url, text, tag, attrs):
+ assert None not in [url, tag, attrs]
+ self.base_url = base_url
+ self.absolute_url = urljoin(base_url, url)
+ self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
+ def __cmp__(self, other):
+ try:
+ for name in "url", "text", "tag", "attrs":
+ if getattr(self, name) != getattr(other, name):
+ return -1
+ except AttributeError:
+ return -1
+ return 0
+ def __repr__(self):
+ return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
+ self.base_url, self.url, self.text, self.tag, self.attrs)
+
+
+def clean_url(url, encoding):
+ # percent-encode illegal URL characters
+ if type(url) == type(""):
+ url = url.decode(encoding, "replace")
+ return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
+
+class LinksFactory:
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import pullparser
+ if link_parser_class is None:
+ link_parser_class = pullparser.TolerantPullParser
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+
+ def links(self, fh, base_url, encoding=None):
+ """Return an iterator that provides links of the document."""
+ import pullparser
+ p = self.link_parser_class(fh, encoding=encoding)
+
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.data == "base":
+ base_url = dict(token.attrs).get("href")
+ continue
+ if token.type == "endtag":
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX use attr_encoding for ref'd doc if that doc does not provide
+ # one by other means
+ #attr_encoding = attrs.get("charset")
+ url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so ignore
+ # this.
+ continue
+
+ url = clean_url(url, encoding)
+ if tag == "a":
+ if token.type != "startendtag":
+ # hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
+
+ yield Link(base_url, url, text, tag, token.attrs)
+
+class FormsFactory:
+
+ """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
+
+ For constructor argument docs, see ClientForm.ParseResponse
+ argument docs.
+
+ """
+
+ def __init__(self,
+ select_default=False,
+ form_parser_class=None,
+ request_class=None,
+ backwards_compat=False,
+ encoding="latin-1", # deprecated
+ ):
+ import ClientForm
+ self.select_default = select_default
+ if form_parser_class is None:
+ form_parser_class = ClientForm.FormParser
+ self.form_parser_class = form_parser_class
+ if request_class is None:
+ request_class = ClientCookie.Request
+ self.request_class = request_class
+ self.backwards_compat = backwards_compat
+ self.encoding = encoding
+
+ def parse_response(self, response, encoding=None):
+ import ClientForm
+ if encoding is None:
+ encoding = self.encoding
+ return ClientForm.ParseResponse(
+ response,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ backwards_compat=self.backwards_compat,
+ encoding=encoding,
+ )
+
+ def parse_file(self, file_obj, base_url, encoding=None):
+ import ClientForm
+ if encoding is None:
+ encoding = self.encoding
+ return ClientForm.ParseFile(
+ file_obj,
+ base_url,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ backwards_compat=self.backwards_compat,
+ encoding=encoding,
+ )
+
+def pp_get_title(response, encoding):
+ import pullparser
+ p = pullparser.TolerantPullParser(response, encoding=encoding)
+ try:
+ p.get_tag("title")
+ except pullparser.NoMoreTokensError:
+ return None
+ else:
+ return p.get_text()
+
+
+def unescape(data, entities, encoding):
+ if data is None or "&" not in data:
+ return data
+
+ def replace_entities(match):
+ ent = match.group()
+ if ent[1] == "#":
+ return unescape_charref(ent[2:-1], encoding)
+
+ repl = entities.get(ent[1:-1])
+ if repl is not None:
+ repl = unichr(repl)
+ if type(repl) != type(""):
+ try:
+ repl = repl.encode(encoding)
+ except UnicodeError:
+ repl = ent
+ else:
+ repl = ent
+ return repl
+
+ return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
+
+def unescape_charref(data, encoding):
+ name, base = data, 10
+ if name.startswith("x"):
+ name, base= name[1:], 16
+ uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
+ return repl
+
+def get_entitydefs():
+ try:
+ htmlentitydefs.name2codepoint
+ except AttributeError:
+ entitydefs = {}
+ for name, char in htmlentitydefs.entitydefs.items():
+ uc = char.decode("latin-1")
+ if uc.startswith("&#") and uc.endswith(";"):
+ uc = unescape_charref(uc[2:-1], None)
+ codepoint = ord(uc)
+ entitydefs[name] = codepoint
+ else:
+ entitydefs = htmlentitydefs.name2codepoint
+ return entitydefs
+
+
+try:
+ import BeautifulSoup
+except ImportError:
+ pass
+else:
+ import sgmllib
+ # monkeypatch to fix http://www.python.org/sf/803422 :-(
+ sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
+ class MechanizeBs(BeautifulSoup.BeautifulSoup):
+ _entitydefs = get_entitydefs()
+ def __init__(self, encoding, text=None, avoidParserProblems=True,
+ initialTextIsEverything=True):
+ self._encoding = encoding
+ BeautifulSoup.BeautifulSoup.__init__(
+ self, text, avoidParserProblems, initialTextIsEverything)
+
+ def handle_charref(self, ref):
+ t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def handle_entityref(self, ref):
+ t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
+ self.handle_data(t)
+ def unescape_attrs(self, attrs):
+ escaped_attrs = []
+ for key, val in attrs:
+ val = unescape(val, self._entitydefs, self._encoding)
+ escaped_attrs.append((key, val))
+ return escaped_attrs
+
+class RobustLinksFactory:
+
+ compress_re = re.compile(r"\s+")
+
+ def __init__(self,
+ link_parser_class=None,
+ link_class=Link,
+ urltags=None,
+ ):
+ import BeautifulSoup
+ if link_parser_class is None:
+ link_parser_class = MechanizeBs
+ self.link_parser_class = link_parser_class
+ self.link_class = link_class
+ if urltags is None:
+ urltags = {
+ "a": "href",
+ "area": "href",
+ "frame": "src",
+ "iframe": "src",
+ }
+ self.urltags = urltags
+
+ def links(self, fh, base_url, encoding=None):
+ import BeautifulSoup
+ data = fh.read()
+ bs = self.link_parser_class(encoding, data)
+ gen = bs.recursiveChildGenerator()
+ for ch in bs.recursiveChildGenerator():
+ if (isinstance(ch, BeautifulSoup.Tag) and
+ ch.name in self.urltags.keys()+["base"]):
+ link = ch
+ attrs = bs.unescape_attrs(link.attrs)
+ attrs_dict = dict(attrs)
+ if link.name == "base":
+ base_url = attrs_dict.get("href")
+ continue
+ url_attr = self.urltags[link.name]
+ url = attrs_dict.get(url_attr)
+ if not url:
+ continue
+ url = clean_url(url, encoding)
+ text = link.firstText(lambda t: True)
+ if text is BeautifulSoup.Null:
+ # follow pullparser's weird behaviour rigidly
+ if link.name == "a":
+ text = ""
+ else:
+ text = None
+ else:
+ text = self.compress_re.sub(" ", text.strip())
+ yield Link(base_url, url, text, link.name, attrs)
+
+
+class RobustFormsFactory(FormsFactory):
+ def __init__(self, *args, **kwds):
+ import ClientForm
+ args = form_parser_args(*args, **kwds)
+ if args.form_parser_class is None:
+ args.form_parser_class = ClientForm.RobustFormParser
+ FormsFactory.__init__(self, **args.dictionary)
+
+def bs_get_title(response, encoding):
+ import BeautifulSoup
+ # XXXX encoding
+ bs = BeautifulSoup.BeautifulSoup(response.read())
+ title = bs.first("title")
+ if title == BeautifulSoup.Null:
+ return None
+ else:
+ return title.firstText(lambda t: True)
+
+
+class Factory:
+ """Factory for forms, links, etc.
+
+ The interface of this class may expand in future.
+
+ """
+
+ def __init__(self, forms_factory, links_factory, get_title):
+ """
+
+ Pass keyword
+ arguments only.
+
+ """
+ self._forms_factory = forms_factory
+ self._links_factory = links_factory
+ self._get_title = get_title
+
+ def set_request_class(self, request_class):
+ """Set urllib2.Request class.
+
+ ClientForm.HTMLForm instances returned by .forms() will return
+ instances of this class when .click()ed.
+
+ """
+ self._forms_factory.request_class = request_class
+
+ def forms(self, response, encoding):
+ """Return iterable over ClientForm.HTMLForm-like objects."""
+ return self._forms_factory.parse_response(response, encoding)
+
+ def links(self, response, encoding):
+ """Return iterable over mechanize.Link-like objects."""
+ return self._links_factory.links(response, response.geturl(), encoding)
+
+ def title(self, response, encoding):
+ """Return page title."""
+ return self._get_title(response, encoding)
+
+class DefaultFactory(Factory):
+ def __init__(self):
+ Factory.__init__(self,
+ forms_factory=FormsFactory(),
+ links_factory=LinksFactory(),
+ get_title=pp_get_title,
+ )
+
+class RobustFactory(Factory):
+ def __init__(self):
+ Factory.__init__(self,
+ forms_factory=RobustFormsFactory(),
+ links_factory=RobustLinksFactory(),
+ get_title=bs_get_title,
+ )
Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Fri Apr 21 00:02:25 2006
@@ -16,16 +16,14 @@
# The stuff on web page's todo list.
# Moof's emails about response object, .back(), etc.
-from __future__ import generators
-
-import urllib2, socket, urlparse, urllib, re, sys, htmlentitydefs, copy
-from urlparse import urljoin
+import urllib2, urlparse, sys, copy
import ClientCookie
from ClientCookie._Util import response_seek_wrapper
from ClientCookie._HeadersUtil import split_header_words, is_html
from _useragent import UserAgent
+from _html import DefaultFactory
__version__ = (0, 0, 12, "a", None) # 0.0.12a
@@ -33,400 +31,6 @@
class LinkNotFoundError(Exception): pass
class FormNotFoundError(Exception): pass
-## def chr_range(a, b):
-## return "".join(map(chr, range(ord(a), ord(b)+1)))
-
-## RESERVED_URL_CHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-## "abcdefghijklmnopqrstuvwxyz"
-## "-_.~")
-## UNRESERVED_URL_CHARS = "!*'();:@&=+$,/?%#[]"
-# we want (RESERVED_URL_CHARS+UNRESERVED_URL_CHARS), minus those
-# 'safe'-by-default characters that urllib.urlquote never quotes
-URLQUOTE_SAFE_URL_CHARS = "!*'();:@&=+$,/?%#[]~"
-
-## # XXXX miserable hack
-## def urljoin(base, url):
-## if url.startswith("?"):
-## return base+url
-## else:
-## return urlparse.urljoin(base, url)
-
-# idea for this argument-processing trick is from Peter Otten
-class Args:
- def __init__(self, args_map):
- self.dictionary = dict(args_map)
- def __getattr__(self, key):
- try:
- return self.dictionary[key]
- except KeyError:
- return getattr(self.__class__, key)
-
-def form_parser_args(
- select_default=False,
- form_parser_class=None,
- request_class=None,
- backwards_compat=False,
- encoding="latin-1", # deprecated
- ):
- return Args(locals())
-
-
-class Link:
- def __init__(self, base_url, url, text, tag, attrs):
- assert None not in [url, tag, attrs]
- self.base_url = base_url
- self.absolute_url = urljoin(base_url, url)
- self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
- def __cmp__(self, other):
- try:
- for name in "url", "text", "tag", "attrs":
- if getattr(self, name) != getattr(other, name):
- return -1
- except AttributeError:
- return -1
- return 0
- def __repr__(self):
- return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
- self.base_url, self.url, self.text, self.tag, self.attrs)
-
-
-def clean_url(url, encoding):
- # percent-encode illegal URL characters
- if type(url) == type(""):
- url = url.decode(encoding, "replace")
- return urllib.quote(url.encode(encoding), URLQUOTE_SAFE_URL_CHARS)
-
-class LinksFactory:
-
- def __init__(self,
- link_parser_class=None,
- link_class=Link,
- urltags=None,
- ):
- import pullparser
- if link_parser_class is None:
- link_parser_class = pullparser.TolerantPullParser
- self.link_parser_class = link_parser_class
- self.link_class = link_class
- if urltags is None:
- urltags = {
- "a": "href",
- "area": "href",
- "frame": "src",
- "iframe": "src",
- }
- self.urltags = urltags
-
- def links(self, fh, base_url, encoding=None):
- """Return an iterator that provides links of the document."""
- import pullparser
- p = self.link_parser_class(fh, encoding=encoding)
-
- for token in p.tags(*(self.urltags.keys()+["base"])):
- if token.data == "base":
- base_url = dict(token.attrs).get("href")
- continue
- if token.type == "endtag":
- continue
- attrs = dict(token.attrs)
- tag = token.data
- name = attrs.get("name")
- text = None
- # XXX use attr_encoding for ref'd doc if that doc does not provide
- # one by other means
- #attr_encoding = attrs.get("charset")
- url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
- if not url:
- # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
- # For our purposes a link is something with a URL, so ignore
- # this.
- continue
-
- url = clean_url(url, encoding)
- if tag == "a":
- if token.type != "startendtag":
- # hmm, this'd break if end tag is missing
- text = p.get_compressed_text(("endtag", tag))
- # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
- #text = p.get_compressed_text()
-
- yield Link(base_url, url, text, tag, token.attrs)
-
-class FormsFactory:
-
- """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
-
- For constructor argument docs, see ClientForm.ParseResponse
- argument docs.
-
- """
-
- def __init__(self,
- select_default=False,
- form_parser_class=None,
- request_class=None,
- backwards_compat=False,
- encoding="latin-1", # deprecated
- ):
- import ClientForm
- self.select_default = select_default
- if form_parser_class is None:
- form_parser_class = ClientForm.FormParser
- self.form_parser_class = form_parser_class
- if request_class is None:
- request_class = ClientCookie.Request
- self.request_class = request_class
- self.backwards_compat = backwards_compat
- self.encoding = encoding
-
- def parse_response(self, response, encoding=None):
- import ClientForm
- if encoding is None:
- encoding = self.encoding
- return ClientForm.ParseResponse(
- response,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- encoding=encoding,
- )
-
- def parse_file(self, file_obj, base_url, encoding=None):
- import ClientForm
- if encoding is None:
- encoding = self.encoding
- return ClientForm.ParseFile(
- file_obj,
- base_url,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- encoding=encoding,
- )
-
-def pp_get_title(response, encoding):
- import pullparser
- p = pullparser.TolerantPullParser(response, encoding=encoding)
- try:
- p.get_tag("title")
- except pullparser.NoMoreTokensError:
- return None
- else:
- return p.get_text()
-
-
-def unescape(data, entities, encoding):
- if data is None or "&" not in data:
- return data
-
- def replace_entities(match):
- ent = match.group()
- if ent[1] == "#":
- return unescape_charref(ent[2:-1], encoding)
-
- repl = entities.get(ent[1:-1])
- if repl is not None:
- repl = unichr(repl)
- if type(repl) != type(""):
- try:
- repl = repl.encode(encoding)
- except UnicodeError:
- repl = ent
- else:
- repl = ent
- return repl
-
- return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
-
-def unescape_charref(data, encoding):
- name, base = data, 10
- if name.startswith("x"):
- name, base= name[1:], 16
- uc = unichr(int(name, base))
- if encoding is None:
- return uc
- else:
- try:
- repl = uc.encode(encoding)
- except UnicodeError:
- repl = "&#%s;" % data
- return repl
-
-def get_entitydefs():
- try:
- htmlentitydefs.name2codepoint
- except AttributeError:
- entitydefs = {}
- for name, char in htmlentitydefs.entitydefs.items():
- uc = char.decode("latin-1")
- if uc.startswith("&#") and uc.endswith(";"):
- uc = unescape_charref(uc[2:-1], None)
- codepoint = ord(uc)
- entitydefs[name] = codepoint
- else:
- entitydefs = htmlentitydefs.name2codepoint
- return entitydefs
-
-
-try:
- import BeautifulSoup
-except ImportError:
- pass
-else:
- import sgmllib
- # monkeypatch to fix http://www.python.org/sf/803422 :-(
- sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
- class MechanizeBs(BeautifulSoup.BeautifulSoup):
- _entitydefs = get_entitydefs()
- def __init__(self, encoding, text=None, avoidParserProblems=True,
- initialTextIsEverything=True):
- self._encoding = encoding
- BeautifulSoup.BeautifulSoup.__init__(
- self, text, avoidParserProblems, initialTextIsEverything)
-
- def handle_charref(self, ref):
- t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
- self.handle_data(t)
- def handle_entityref(self, ref):
- t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
- self.handle_data(t)
- def unescape_attrs(self, attrs):
- escaped_attrs = []
- for key, val in attrs:
- val = unescape(val, self._entitydefs, self._encoding)
- escaped_attrs.append((key, val))
- return escaped_attrs
-
-class RobustLinksFactory:
-
- compress_re = re.compile(r"\s+")
-
- def __init__(self,
- link_parser_class=None,
- link_class=Link,
- urltags=None,
- ):
- import BeautifulSoup
- if link_parser_class is None:
- link_parser_class = MechanizeBs
- self.link_parser_class = link_parser_class
- self.link_class = link_class
- if urltags is None:
- urltags = {
- "a": "href",
- "area": "href",
- "frame": "src",
- "iframe": "src",
- }
- self.urltags = urltags
-
- def links(self, fh, base_url, encoding=None):
- import BeautifulSoup
- data = fh.read()
- bs = self.link_parser_class(encoding, data)
- gen = bs.recursiveChildGenerator()
- for ch in bs.recursiveChildGenerator():
- if (isinstance(ch, BeautifulSoup.Tag) and
- ch.name in self.urltags.keys()+["base"]):
- link = ch
- attrs = bs.unescape_attrs(link.attrs)
- attrs_dict = dict(attrs)
- if link.name == "base":
- base_url = attrs_dict.get("href")
- continue
- url_attr = self.urltags[link.name]
- url = attrs_dict.get(url_attr)
- if not url:
- continue
- url = clean_url(url, encoding)
- text = link.firstText(lambda t: True)
- if text is BeautifulSoup.Null:
- # follow pullparser's weird behaviour rigidly
- if link.name == "a":
- text = ""
- else:
- text = None
- else:
- text = self.compress_re.sub(" ", text.strip())
- yield Link(base_url, url, text, link.name, attrs)
-
-
-class RobustFormsFactory(FormsFactory):
- def __init__(self, *args, **kwds):
- import ClientForm
- args = form_parser_args(*args, **kwds)
- if args.form_parser_class is None:
- args.form_parser_class = ClientForm.RobustFormParser
- FormsFactory.__init__(self, **args.dictionary)
-
-def bs_get_title(response, encoding):
- import BeautifulSoup
- # XXXX encoding
- bs = BeautifulSoup.BeautifulSoup(response.read())
- title = bs.first("title")
- if title == BeautifulSoup.Null:
- return None
- else:
- return title.firstText(lambda t: True)
-
-
-class Factory:
- """Factory for forms, links, etc.
-
- The interface of this class may expand in future.
-
- """
-
- def __init__(self, forms_factory, links_factory, get_title):
- """
-
- Pass keyword
- arguments only.
-
- """
- self._forms_factory = forms_factory
- self._links_factory = links_factory
- self._get_title = get_title
-
- def set_request_class(self, request_class):
- """Set urllib2.Request class.
-
- ClientForm.HTMLForm instances returned by .forms() will return
- instances of this class when .click()ed.
-
- """
- self._forms_factory.request_class = request_class
-
- def forms(self, response, encoding):
- """Return iterable over ClientForm.HTMLForm-like objects."""
- return self._forms_factory.parse_response(response, encoding)
-
- def links(self, response, encoding):
- """Return iterable over mechanize.Link-like objects."""
- return self._links_factory.links(response, response.geturl(), encoding)
-
- def title(self, response, encoding):
- """Return page title."""
- return self._get_title(response, encoding)
-
-class DefaultFactory(Factory):
- def __init__(self):
- Factory.__init__(self,
- forms_factory=FormsFactory(),
- links_factory=LinksFactory(),
- get_title=pp_get_title,
- )
-
-class RobustFactory(Factory):
- def __init__(self):
- Factory.__init__(self,
- forms_factory=RobustFormsFactory(),
- links_factory=RobustLinksFactory(),
- get_title=bs_get_title,
- )
-
class History:
"""
Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py (original)
+++ wwwsearch/mechanize/trunk/test.py Fri Apr 21 00:02:25 2006
@@ -20,7 +20,7 @@
class UnescapeTests(TestCase):
def test_unescape_charref(self):
- from mechanize._mechanize import unescape_charref, get_entitydefs
+ from mechanize._html import unescape_charref, get_entitydefs
mdash_utf8 = u"\u2014".encode("utf-8")
for ref, codepoint, utf8, latin1 in [
("38", 38, u"&".encode("utf-8"), "&"),
@@ -32,7 +32,7 @@
self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
def test_get_entitydefs(self):
- from mechanize._mechanize import get_entitydefs
+ from mechanize._html import get_entitydefs
ed = get_entitydefs()
for name, codepoint in [
("amp", ord(u"&")),
@@ -45,7 +45,7 @@
def test_unescape(self):
import htmlentitydefs
- from mechanize._mechanize import unescape, get_entitydefs
+ from mechanize._html import unescape, get_entitydefs
data = "&amp; &lt; &mdash; &#8212; &#x2014;"
mdash_utf8 = u"\u2014".encode("utf-8")
ue = unescape(data, get_entitydefs(), "utf-8")
@@ -407,7 +407,7 @@
def _test_link_encoding(self, factory):
import urllib
import mechanize
- from mechanize._mechanize import clean_url
+ from mechanize._html import clean_url
url = "http://example.com/"
for encoding in ["UTF-8", "latin-1"]:
encoding_decl = "; charset=%s" % encoding
More information about the wwwsearch-commits
mailing list