[wwwsearch-commits] r36175 - in wwwsearch/mechanize/trunk: mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Sat Jan 6 00:05:49 CET 2007
Author: jjlee
Date: Sat Jan 6 00:05:30 2007
New Revision: 36175
Modified:
wwwsearch/mechanize/trunk/mechanize/__init__.py
wwwsearch/mechanize/trunk/mechanize/_html.py
wwwsearch/mechanize/trunk/test/test_forms.doctest
wwwsearch/mechanize/trunk/test/test_html.doctest
Log:
Add mechanize.ParseError class, document it as part of the mechanize.Factory interface, and raise it from all Factory implementations
Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/__init__.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/__init__.py Sat Jan 6 00:05:30 2007
@@ -50,6 +50,7 @@
'MozillaCookieJar',
'OpenerDirector',
'OpenerFactory',
+ 'ParseError',
'ProxyBasicAuthHandler',
'ProxyDigestAuthHandler',
'ProxyHandler',
@@ -89,6 +90,7 @@
# configurable URL-opener interface
from _useragent import UserAgentBase, UserAgent
from _html import \
+ ParseError, \
Link, \
Factory, DefaultFactory, RobustFactory, \
FormsFactory, LinksFactory, TitleFactory, \
Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py Sat Jan 6 00:05:30 2007
@@ -9,6 +9,7 @@
"""
import re, copy, htmlentitydefs
+import sgmllib, HTMLParser, ClientForm
import _request
from _headersutil import split_header_words, is_html as _is_html
@@ -17,6 +18,10 @@
DEFAULT_ENCODING = "latin-1"
+# the base class is purely for backwards compatibility
+class ParseError(ClientForm.ParseError): pass
+
+
class CachingGeneratorFunction(object):
"""Caching wrapper around a no-arguments iterable."""
@@ -131,37 +136,41 @@
base_url = self._base_url
p = self.link_parser_class(response, encoding=encoding)
- for token in p.tags(*(self.urltags.keys()+["base"])):
- if token.type == "endtag":
- continue
- if token.data == "base":
- base_href = dict(token.attrs).get("href")
- if base_href is not None:
- base_url = base_href
- continue
- attrs = dict(token.attrs)
- tag = token.data
- name = attrs.get("name")
- text = None
- # XXX use attr_encoding for ref'd doc if that doc does not provide
- # one by other means
- #attr_encoding = attrs.get("charset")
- url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
- if not url:
- # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
- # For our purposes a link is something with a URL, so ignore
- # this.
- continue
-
- url = _rfc3986.clean_url(url, encoding)
- if tag == "a":
- if token.type != "startendtag":
- # hmm, this'd break if end tag is missing
- text = p.get_compressed_text(("endtag", tag))
- # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
- #text = p.get_compressed_text()
+ try:
+ for token in p.tags(*(self.urltags.keys()+["base"])):
+ if token.type == "endtag":
+ continue
+ if token.data == "base":
+ base_href = dict(token.attrs).get("href")
+ if base_href is not None:
+ base_url = base_href
+ continue
+ attrs = dict(token.attrs)
+ tag = token.data
+ name = attrs.get("name")
+ text = None
+ # XXX use attr_encoding for ref'd doc if that doc does not
+ # provide one by other means
+ #attr_encoding = attrs.get("charset")
+ url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
+ if not url:
+ # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+ # For our purposes a link is something with a URL, so
+ # ignore this.
+ continue
- yield Link(base_url, url, text, tag, token.attrs)
+ url = _rfc3986.clean_url(url, encoding)
+ if tag == "a":
+ if token.type != "startendtag":
+ # hmm, this'd break if end tag is missing
+ text = p.get_compressed_text(("endtag", tag))
+ # but this doesn't work for eg.
+ # <a href="blah"><b>Andy</b></a>
+ #text = p.get_compressed_text()
+
+ yield Link(base_url, url, text, tag, token.attrs)
+ except sgmllib.SGMLParseError, exc:
+ raise ParseError(exc)
class FormsFactory:
@@ -202,16 +211,19 @@
def forms(self):
import ClientForm
encoding = self.encoding
- forms = ClientForm.ParseResponseEx(
- self._response,
- select_default=self.select_default,
- form_parser_class=self.form_parser_class,
- request_class=self.request_class,
- encoding=encoding,
- _urljoin=_rfc3986.urljoin,
- _urlparse=_rfc3986.urlsplit,
- _urlunparse=_rfc3986.urlunsplit,
- )
+ try:
+ forms = ClientForm.ParseResponseEx(
+ self._response,
+ select_default=self.select_default,
+ form_parser_class=self.form_parser_class,
+ request_class=self.request_class,
+ encoding=encoding,
+ _urljoin=_rfc3986.urljoin,
+ _urlparse=_rfc3986.urlsplit,
+ _urlunparse=_rfc3986.urlunsplit,
+ )
+ except ClientForm.ParseError, exc:
+ raise ParseError(exc)
self.global_form = forms[0]
return forms[1:]
@@ -228,11 +240,14 @@
p = _pullparser.TolerantPullParser(
self._response, encoding=self._encoding)
try:
- p.get_tag("title")
- except _pullparser.NoMoreTokensError:
- return None
- else:
- return p.get_text()
+ try:
+ p.get_tag("title")
+ except _pullparser.NoMoreTokensError:
+ return None
+ else:
+ return p.get_text()
+ except sgmllib.SGMLParseError, exc:
+ raise ParseError(exc)
def unescape(data, entities, encoding):
@@ -420,6 +435,8 @@
Public attributes:
+ Note that accessing these attributes may raise ParseError.
+
encoding: string specifying the encoding of response if it contains a text
document (this value is left unspecified for documents that do not have
an encoding, e.g. an image file)
@@ -505,7 +522,10 @@
return self.global_form
def forms(self):
- """Return iterable over ClientForm.HTMLForm-like objects."""
+ """Return iterable over ClientForm.HTMLForm-like objects.
+
+ Raises mechanize.ParseError on failure.
+ """
# this implementation sets .global_form as a side-effect, for benefit
# of __getattr__ impl
if self._forms_genf is None:
@@ -520,7 +540,10 @@
return self._forms_genf()
def links(self):
- """Return iterable over mechanize.Link-like objects."""
+ """Return iterable over mechanize.Link-like objects.
+
+ Raises mechanize.ParseError on failure.
+ """
if self._links_genf is None:
try:
self._links_genf = CachingGeneratorFunction(
Modified: wwwsearch/mechanize/trunk/test/test_forms.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_forms.doctest (original)
+++ wwwsearch/mechanize/trunk/test/test_forms.doctest Sat Jan 6 00:05:30 2007
@@ -41,7 +41,7 @@
already been .read(). Fixed by calling Factory.set_response() on
error.
->>> import mechanize, sgmllib
+>>> import mechanize
>>> br = mechanize.Browser()
>>> r = mechanize._response.test_html_response("""\
... <form>
@@ -52,8 +52,8 @@
>>> br.set_response(r)
>>> try:
... br.select_form(nr=0)
-... except sgmllib.SGMLParseError:
+... except mechanize.ParseError:
... pass
>>> br.select_form(nr=0) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
-SGMLParseError: expected name token
+ParseError: expected name token
Modified: wwwsearch/mechanize/trunk/test/test_html.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_html.doctest (original)
+++ wwwsearch/mechanize/trunk/test/test_html.doctest Sat Jan 6 00:05:30 2007
@@ -1,3 +1,122 @@
+>>> import mechanize
+>>> from mechanize._response import test_html_response
+>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
+... MechanizeBs, \
+... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory
+
+mechanize.ParseError should be raised on parsing erroneous HTML.
+
+For backwards compatibility, mechanize.ParseError derives from
+exception classes that mechanize used to raise, prior to version
+0.1.6.
+
+>>> import sgmllib
+>>> import HTMLParser
+>>> import ClientForm
+>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
+True
+>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
+True
+>>> issubclass(mechanize.ParseError, ClientForm.ParseError)
+True
+
+>>> def create_response(error=True):
+... extra = ""
+... if error:
+... extra = "<!!!>"
+... html = """\
+... <html>
+... <head>
+... <title>Title</title>
+... %s
+... </head>
+... <body>
+... <p>Hello world
+... </body>
+... </html>
+... """ % extra
+... return test_html_response(html)
+
+>>> f = LinksFactory()
+>>> f.set_response(create_response(), "http://example.com", "latin-1")
+>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> f = FormsFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> f = TitleFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+
+Accessing attributes on Factory may also raise ParseError
+
+>>> def factory_getattr(attr_name):
+... fact = mechanize.DefaultFactory()
+... fact.set_response(create_response())
+... getattr(fact, attr_name)
+>>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+
+BeautifulSoup ParseErrors:
+
+XXX If I could come up with examples that break links and forms
+parsing, I'd uncomment these!
+
+>>> def create_soup(html):
+... r = test_html_response(html)
+... return MechanizeBs("latin-1", r.read())
+
+#>>> f = RobustLinksFactory()
+#>>> html = """\
+#... <a href="a">
+#... <frame src="b">
+#... <a href="c">
+#... <iframe src="d">
+#... </a>
+#... </area>
+#... </frame>
+#... """
+#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
+#>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL
+#Traceback (most recent call last):
+#ParseError:
+
+>>> html = """\
+... <table>
+... <tr><td>
+... <input name='broken'>
+... </td>
+... </form>
+... </tr>
+... </form>
+... """
+>>> f = RobustFormsFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+#>>> f = RobustTitleFactory()
+#>>> f.set_soup(create_soup(""), "latin-1")
+#>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL
+#Traceback (most recent call last):
+#ParseError:
+
+
+
+Utility class for caching forms etc.
+
>>> from mechanize._html import CachingGeneratorFunction
>>> i = [1]
More information about the wwwsearch-commits
mailing list