[wwwsearch-commits] r26625 -
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Mon May 1 00:44:58 CEST 2006
Author: jjlee
Date: Mon May 1 00:44:56 2006
New Revision: 26625
Modified:
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py
wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
Log:
More factory interface breakage, mostly to separate details of data source from factory methods. It now does a little bit less re-parsing in the BeautifulSoup case. Probably I should get rid of the separate forms, links, title factories too, to simplify it a bit...
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py Mon May 1 00:44:56 2006
@@ -5,5 +5,5 @@
from _html import Link, \
Factory, DefaultFactory, RobustFactory, \
- FormsFactory, LinksFactory, pp_get_title, \
- RobustFormsFactory, RobustLinksFactory, bs_get_title
+ FormsFactory, LinksFactory, TitleFactory, \
+ RobustFormsFactory, RobustLinksFactory, RobustTitleFactory
Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py (original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py Mon May 1 00:44:56 2006
@@ -74,7 +74,6 @@
form_parser_class=None,
request_class=None,
backwards_compat=False,
- encoding=DEFAULT_ENCODING, # deprecated
):
return Args(locals())
@@ -124,11 +123,21 @@
"iframe": "src",
}
self.urltags = urltags
+ self._response = None
+ self._encoding = None
- def links(self, fh, base_url, encoding=None):
+ def set_response(self, response, base_url, encoding):
+ self._response = response
+ self._encoding = encoding
+ self._base_url = base_url
+
+ def links(self):
"""Return an iterator that provides links of the document."""
import pullparser
- p = self.link_parser_class(fh, encoding=encoding)
+ response = self._response
+ encoding = self._encoding
+ base_url = self._base_url
+ p = self.link_parser_class(response, encoding=encoding)
for token in p.tags(*(self.urltags.keys()+["base"])):
if token.data == "base":
@@ -174,7 +183,6 @@
form_parser_class=None,
request_class=None,
backwards_compat=False,
- encoding=DEFAULT_ENCODING, # deprecated
):
import ClientForm
self.select_default = select_default
@@ -185,14 +193,18 @@
request_class = ClientCookie.Request
self.request_class = request_class
self.backwards_compat = backwards_compat
+ self._response = None
+ self.encoding = None
+
+ def set_response(self, response, encoding):
+ self._response = response
self.encoding = encoding
- def parse_response(self, response, encoding=None):
+ def forms(self):
import ClientForm
- if encoding is None:
- encoding = self.encoding
+ encoding = self.encoding
return ClientForm.ParseResponse(
- response,
+ self._response,
select_default=self.select_default,
form_parser_class=self.form_parser_class,
request_class=self.request_class,
@@ -200,29 +212,24 @@
encoding=encoding,
)
- def parse_file(self, file_obj, base_url, encoding=None):
- import ClientForm
- if encoding is None:
- encoding = self.encoding
- return ClientForm.ParseFile(
- file_obj,
- base_url,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- backwards_compat=self.backwards_compat,
- encoding=encoding,
- )
+class TitleFactory:
+ def __init__(self):
+ self._response = self._encoding = None
-def pp_get_title(response, encoding):
- import pullparser
- p = pullparser.TolerantPullParser(response, encoding=encoding)
- try:
- p.get_tag("title")
- except pullparser.NoMoreTokensError:
- return None
- else:
- return p.get_text()
+ def set_response(self, response, encoding):
+ self._response = response
+ self._encoding = encoding
+
+ def title(self):
+ import pullparser
+ p = pullparser.TolerantPullParser(
+ self._response, encoding=self._encoding)
+ try:
+ p.get_tag("title")
+ except pullparser.NoMoreTokensError:
+ return None
+ else:
+ return p.get_text()
def unescape(data, entities, encoding):
@@ -288,6 +295,13 @@
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class MechanizeBs(BeautifulSoup.BeautifulSoup):
_entitydefs = get_entitydefs()
+ # don't want the magic Microsoft-char workaround
+ PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+ lambda(x):x.group(1) + ' />'),
+ (re.compile('<!\s+([^<>]*)>'),
+ lambda(x):'<!' + x.group(1) + '>')
+ ]
+
def __init__(self, encoding, text=None, avoidParserProblems=True,
initialTextIsEverything=True):
self._encoding = encoding
@@ -329,11 +343,20 @@
"iframe": "src",
}
self.urltags = urltags
+ self._bs = None
+ self._encoding = None
+ self._base_url = None
+
+ def set_soup(self, soup, base_url, encoding):
+ self._bs = soup
+ self._base_url = base_url
+ self._encoding = encoding
- def links(self, fh, base_url, encoding=None):
+ def links(self):
import BeautifulSoup
- data = fh.read()
- bs = self.link_parser_class(encoding, data)
+ bs = self._bs
+ base_url = self._base_url
+ encoding = self._encoding
gen = bs.recursiveChildGenerator()
for ch in bs.recursiveChildGenerator():
if (isinstance(ch, BeautifulSoup.Tag) and
@@ -342,7 +365,7 @@
attrs = bs.unescape_attrs(link.attrs)
attrs_dict = dict(attrs)
if link.name == "base":
- base_url = attrs_dict.get("href")
+ base_url = attrs_dict.get("href").encode(encoding)
continue
url_attr = self.urltags[link.name]
url = attrs_dict.get(url_attr)
@@ -358,7 +381,9 @@
text = None
else:
text = self.compress_re.sub(" ", text.strip())
- yield Link(base_url, url, text, link.name, attrs)
+ text = text.encode(encoding)
+ linkname = link.name.encode(encoding)
+ yield Link(base_url, url, text, linkname, attrs)
class RobustFormsFactory(FormsFactory):
@@ -369,15 +394,26 @@
args.form_parser_class = ClientForm.RobustFormParser
FormsFactory.__init__(self, **args.dictionary)
-def bs_get_title(response, encoding):
- import BeautifulSoup
- # XXXX encoding
- bs = BeautifulSoup.BeautifulSoup(response.read())
- title = bs.first("title")
- if title == BeautifulSoup.Null:
- return None
- else:
- return title.firstText(lambda t: True)
+ def set_response(self, response, encoding):
+ self._response = response
+ self.encoding = encoding
+
+
+class RobustTitleFactory:
+    """Title extractor that works on an already-parsed BeautifulSoup tree.
+
+    The tree and its encoding are handed over with set_soup(); title()
+    then looks up the first <title> element in that tree.
+
+    """
+    def __init__(self):
+        self._bs = self._encoding = None
+
+    def set_soup(self, soup, encoding):
+        """Remember the parsed document and its encoding for title()."""
+        self._bs = soup
+        self._encoding = encoding
+
+    def title(self):
+        """Return the first text node of the first <title> tag, or None.
+
+        None is returned when the tree has no <title> element.
+
+        """
+        # Takes self, not a soup argument: Factory calls this as
+        # self._title_factory.title() and the body reads self._bs.
+        import BeautifulSoup
+        title = self._bs.first("title")
+        if title == BeautifulSoup.Null:
+            return None
+        else:
+            return title.firstText(lambda t: True)
class Factory:
@@ -403,7 +439,7 @@
"""
- def __init__(self, forms_factory, links_factory, get_title,
+ def __init__(self, forms_factory, links_factory, title_factory,
get_encoding=encoding_finder(DEFAULT_ENCODING),
is_html_p=make_is_html(allow_xhtml=False),
):
@@ -420,7 +456,7 @@
"""
self._forms_factory = forms_factory
self._links_factory = links_factory
- self._get_title = get_title
+ self._title_factory = title_factory
self._get_encoding = get_encoding
self._is_html_p = is_html_p
@@ -444,6 +480,7 @@
"""
self._response = response
self._forms_genf = self._links_genf = None
+ self._get_title = None
for name in ["encoding", "is_html", "title"]:
try:
delattr(self, name)
@@ -463,7 +500,7 @@
return self.is_html
elif name == "title":
if self.is_html:
- self.title = self._get_title(self._response, self.encoding)
+ self.title = self._title_factory.title()
else:
self.title = None
return self.title
@@ -473,35 +510,60 @@
def forms(self):
"""Return iterable over ClientForm.HTMLForm-like objects."""
if self._forms_genf is None:
- forms_gen = self._forms_factory.parse_response(
- copy.copy(self._response), self.encoding)
- self._forms_genf = CachingGeneratorFunction(forms_gen)
+ self._forms_genf = CachingGeneratorFunction(
+ self._forms_factory.forms())
return self._forms_genf()
def links(self):
"""Return iterable over mechanize.Link-like objects."""
if self._links_genf is None:
- links_gen = self._links_factory.links(
- copy.copy(self._response), self._response.geturl(), self.encoding)
- self._links_genf = CachingGeneratorFunction(links_gen)
+ self._links_genf = CachingGeneratorFunction(
+ self._links_factory.links())
return self._links_genf()
class DefaultFactory(Factory):
+ """Based on sgmllib."""
def __init__(self, i_want_broken_xhtml_support=False):
Factory.__init__(
self,
forms_factory=FormsFactory(),
links_factory=LinksFactory(),
- get_title=pp_get_title,
+ title_factory=TitleFactory(),
is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
)
+ def set_response(self, response):
+ Factory.set_response(self, response)
+ if response is not None:
+ self._forms_factory.set_response(
+ copy.copy(response), self.encoding)
+ self._links_factory.set_response(
+ copy.copy(response), self._response.geturl(), self.encoding)
+ self._title_factory.set_response(
+ copy.copy(response), self.encoding)
+
class RobustFactory(Factory):
+ """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
+ DefaultFactory.
+
+ """
def __init__(self, i_want_broken_xhtml_support=False):
Factory.__init__(
self,
forms_factory=RobustFormsFactory(),
links_factory=RobustLinksFactory(),
- get_title=bs_get_title,
+ title_factory=RobustTitleFactory(),
is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
)
+ self._bs = None
+
+    def set_response(self, response):
+        """Parse response once and hand the result to the sub-factories.
+
+        response: the response to parse, or None.  When it is None only
+        Factory.set_response() runs and the sub-factories are left alone.
+
+        The links and title factories share a single soup; the forms
+        factory still receives the response object itself.
+
+        """
+        import BeautifulSoup
+        Factory.set_response(self, response)
+        if response is not None:
+            data = response.read()
+            # Bind the tree to a local as well as self._bs: the calls below
+            # pass `soup`, which was previously never assigned.
+            # NOTE(review): MechanizeBs replaces self.link_parser_class,
+            # which RobustFactory.__init__ never sets -- confirm it matches
+            # the parser class RobustLinksFactory uses by default.
+            soup = self._bs = MechanizeBs(self.encoding, data)
+            self._forms_factory.set_response(response, self.encoding)
+            self._links_factory.set_soup(
+                soup, response.geturl(), self.encoding)
+            self._title_factory.set_soup(soup, self.encoding)
More information about the wwwsearch-commits
mailing list