From jjlee at codespeak.net Fri Jan 5 23:12:36 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Fri, 5 Jan 2007 23:12:36 +0100 (CET) Subject: [wwwsearch-commits] r36173 - wwwsearch/ClientForm/trunk Message-ID: <20070105221236.EEFF810079@code0.codespeak.net> Author: jjlee Date: Fri Jan 5 23:12:34 2007 New Revision: 36173 Modified: wwwsearch/ClientForm/trunk/ClientForm.py Log: For backwards compatibility, make ParseError derive from the exceptions that used to be raised during parsing Modified: wwwsearch/ClientForm/trunk/ClientForm.py ============================================================================== --- wwwsearch/ClientForm/trunk/ClientForm.py (original) +++ wwwsearch/ClientForm/trunk/ClientForm.py Fri Jan 5 23:12:34 2007 @@ -103,6 +103,19 @@ htmlentitydefs, re, random from cStringIO import StringIO +import sgmllib +# monkeypatch to fix http://www.python.org/sf/803422 :-( +sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") + +# HTMLParser.HTMLParser is recent, so live without it if it's not available +# (also, sgmllib.SGMLParser is much more tolerant of bad HTML) +try: + import HTMLParser +except ImportError: + HAVE_MODULE_HTMLPARSER = False +else: + HAVE_MODULE_HTMLPARSER = True + try: import warnings except ImportError: @@ -426,8 +439,16 @@ class ItemCountError(ValueError): pass - -class ParseError(Exception): pass +# for backwards compatibility, ParseError derives from exceptions that were +# raised by versions of ClientForm <= 0.2.5 +if HAVE_MODULE_HTMLPARSER: + class ParseError(sgmllib.SGMLParser, + HTMLParser.HTMLParser, + ): + pass +else: + class ParseError(sgmllib.SGMLParser): + pass class _AbstractFormParser: @@ -748,11 +769,7 @@ def unknown_charref(self, ref): self.handle_data("&#%s;" % ref) -# HTMLParser.HTMLParser is recent, so live without it if it's not available -# (also, htmllib.HTMLParser is much more tolerant of bad HTML) -try: - import HTMLParser -except ImportError: +if not HAVE_MODULE_HTMLPARSER: class 
XHTMLCompatibleFormParser: def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING): raise ValueError("HTMLParser could not be imported") @@ -807,9 +824,6 @@ def unescape_attrs_if_required(self, attrs): return attrs # ditto -import sgmllib -# monkeypatch to fix http://www.python.org/sf/803422 :-( -sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]") class _AbstractSgmllibParser(_AbstractFormParser): From jjlee at codespeak.net Fri Jan 5 23:51:53 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Fri, 5 Jan 2007 23:51:53 +0100 (CET) Subject: [wwwsearch-commits] r36174 - wwwsearch/ClientForm/trunk Message-ID: <20070105225153.194C610079@code0.codespeak.net> Author: jjlee Date: Fri Jan 5 23:51:52 2007 New Revision: 36174 Modified: wwwsearch/ClientForm/trunk/ClientForm.py wwwsearch/ClientForm/trunk/test.py Log: Derive ParseError from the right class Modified: wwwsearch/ClientForm/trunk/ClientForm.py ============================================================================== --- wwwsearch/ClientForm/trunk/ClientForm.py (original) +++ wwwsearch/ClientForm/trunk/ClientForm.py Fri Jan 5 23:51:52 2007 @@ -442,8 +442,8 @@ # for backwards compatibility, ParseError derives from exceptions that were # raised by versions of ClientForm <= 0.2.5 if HAVE_MODULE_HTMLPARSER: - class ParseError(sgmllib.SGMLParser, - HTMLParser.HTMLParser, + class ParseError(sgmllib.SGMLParseError, + HTMLParser.HTMLParseError, ): pass else: Modified: wwwsearch/ClientForm/trunk/test.py ============================================================================== --- wwwsearch/ClientForm/trunk/test.py (original) +++ wwwsearch/ClientForm/trunk/test.py Fri Jan 5 23:51:52 2007 @@ -226,12 +226,14 @@ def test_failing_parse(self): # XXX couldn't provoke an error from BeautifulSoup (!), so this has not # been tested with RobuststFormParser + import sgmllib f = StringIO("") base_uri = "http://localhost/" self.assertRaises( ClientForm.ParseError, ClientForm.ParseFile, f, base_uri, 
backwards_compat=False, ) + self.assert_(issubclass(ClientForm.ParseError, sgmllib.SGMLParseError)) def test_unknown_control(self): f = StringIO( From jjlee at codespeak.net Sat Jan 6 00:05:49 2007 From: jjlee at codespeak.net (jjlee at codespeak.net) Date: Sat, 6 Jan 2007 00:05:49 +0100 (CET) Subject: [wwwsearch-commits] r36175 - in wwwsearch/mechanize/trunk: mechanize test Message-ID: <20070105230549.5DE6010079@code0.codespeak.net> Author: jjlee Date: Sat Jan 6 00:05:30 2007 New Revision: 36175 Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py wwwsearch/mechanize/trunk/mechanize/_html.py wwwsearch/mechanize/trunk/test/test_forms.doctest wwwsearch/mechanize/trunk/test/test_html.doctest Log: Add mechanize.ParseError class, document it as part of the mechanize.Factory interface, and raise it from all Factory implementations Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/__init__.py (original) +++ wwwsearch/mechanize/trunk/mechanize/__init__.py Sat Jan 6 00:05:30 2007 @@ -50,6 +50,7 @@ 'MozillaCookieJar', 'OpenerDirector', 'OpenerFactory', + 'ParseError', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', @@ -89,6 +90,7 @@ # configurable URL-opener interface from _useragent import UserAgentBase, UserAgent from _html import \ + ParseError, \ Link, \ Factory, DefaultFactory, RobustFactory, \ FormsFactory, LinksFactory, TitleFactory, \ Modified: wwwsearch/mechanize/trunk/mechanize/_html.py ============================================================================== --- wwwsearch/mechanize/trunk/mechanize/_html.py (original) +++ wwwsearch/mechanize/trunk/mechanize/_html.py Sat Jan 6 00:05:30 2007 @@ -9,6 +9,7 @@ """ import re, copy, htmlentitydefs +import sgmllib, HTMLParser, ClientForm import _request from _headersutil import split_header_words, is_html as _is_html @@ -17,6 +18,10 @@ DEFAULT_ENCODING = "latin-1" +# 
the base class is purely for backwards compatibility +class ParseError(ClientForm.ParseError): pass + + class CachingGeneratorFunction(object): """Caching wrapper around a no-arguments iterable.""" @@ -131,37 +136,41 @@ base_url = self._base_url p = self.link_parser_class(response, encoding=encoding) - for token in p.tags(*(self.urltags.keys()+["base"])): - if token.type == "endtag": - continue - if token.data == "base": - base_href = dict(token.attrs).get("href") - if base_href is not None: - base_url = base_href - continue - attrs = dict(token.attrs) - tag = token.data - name = attrs.get("name") - text = None - # XXX use attr_encoding for ref'd doc if that doc does not provide - # one by other means - #attr_encoding = attrs.get("charset") - url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? - if not url: - # Probably an link or . - # For our purposes a link is something with a URL, so ignore - # this. - continue - - url = _rfc3986.clean_url(url, encoding) - if tag == "a": - if token.type != "startendtag": - # hmm, this'd break if end tag is missing - text = p.get_compressed_text(("endtag", tag)) - # but this doesn't work for eg. Andy - #text = p.get_compressed_text() + try: + for token in p.tags(*(self.urltags.keys()+["base"])): + if token.type == "endtag": + continue + if token.data == "base": + base_href = dict(token.attrs).get("href") + if base_href is not None: + base_url = base_href + continue + attrs = dict(token.attrs) + tag = token.data + name = attrs.get("name") + text = None + # XXX use attr_encoding for ref'd doc if that doc does not + # provide one by other means + #attr_encoding = attrs.get("charset") + url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL? + if not url: + # Probably an link or . + # For our purposes a link is something with a URL, so + # ignore this. 
+ continue - yield Link(base_url, url, text, tag, token.attrs) + url = _rfc3986.clean_url(url, encoding) + if tag == "a": + if token.type != "startendtag": + # hmm, this'd break if end tag is missing + text = p.get_compressed_text(("endtag", tag)) + # but this doesn't work for eg. + # Andy + #text = p.get_compressed_text() + + yield Link(base_url, url, text, tag, token.attrs) + except sgmllib.SGMLParseError, exc: + raise ParseError(exc) class FormsFactory: @@ -202,16 +211,19 @@ def forms(self): import ClientForm encoding = self.encoding - forms = ClientForm.ParseResponseEx( - self._response, - select_default=self.select_default, - form_parser_class=self.form_parser_class, - request_class=self.request_class, - encoding=encoding, - _urljoin=_rfc3986.urljoin, - _urlparse=_rfc3986.urlsplit, - _urlunparse=_rfc3986.urlunsplit, - ) + try: + forms = ClientForm.ParseResponseEx( + self._response, + select_default=self.select_default, + form_parser_class=self.form_parser_class, + request_class=self.request_class, + encoding=encoding, + _urljoin=_rfc3986.urljoin, + _urlparse=_rfc3986.urlsplit, + _urlunparse=_rfc3986.urlunsplit, + ) + except ClientForm.ParseError, exc: + raise ParseError(exc) self.global_form = forms[0] return forms[1:] @@ -228,11 +240,14 @@ p = _pullparser.TolerantPullParser( self._response, encoding=self._encoding) try: - p.get_tag("title") - except _pullparser.NoMoreTokensError: - return None - else: - return p.get_text() + try: + p.get_tag("title") + except _pullparser.NoMoreTokensError: + return None + else: + return p.get_text() + except sgmllib.SGMLParseError, exc: + raise ParseError(exc) def unescape(data, entities, encoding): @@ -420,6 +435,8 @@ Public attributes: + Note that accessing these attributes may raise ParseError. + encoding: string specifying the encoding of response if it contains a text document (this value is left unspecified for documents that do not have an encoding, e.g. 
an image file) @@ -505,7 +522,10 @@ return self.global_form def forms(self): - """Return iterable over ClientForm.HTMLForm-like objects.""" + """Return iterable over ClientForm.HTMLForm-like objects. + + Raises mechanize.ParseError on failure. + """ # this implementation sets .global_form as a side-effect, for benefit # of __getattr__ impl if self._forms_genf is None: @@ -520,7 +540,10 @@ return self._forms_genf() def links(self): - """Return iterable over mechanize.Link-like objects.""" + """Return iterable over mechanize.Link-like objects. + + Raises mechanize.ParseError on failure. + """ if self._links_genf is None: try: self._links_genf = CachingGeneratorFunction( Modified: wwwsearch/mechanize/trunk/test/test_forms.doctest ============================================================================== --- wwwsearch/mechanize/trunk/test/test_forms.doctest (original) +++ wwwsearch/mechanize/trunk/test/test_forms.doctest Sat Jan 6 00:05:30 2007 @@ -41,7 +41,7 @@ already been .read(). Fixed by calling Factory.set_response() on error. ->>> import mechanize, sgmllib +>>> import mechanize >>> br = mechanize.Browser() >>> r = mechanize._response.test_html_response("""\ ...
@@ -52,8 +52,8 @@ >>> br.set_response(r) >>> try: ... br.select_form(nr=0) -... except sgmllib.SGMLParseError: +... except mechanize.ParseError: ... pass >>> br.select_form(nr=0) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): -SGMLParseError: expected name token +ParseError: expected name token Modified: wwwsearch/mechanize/trunk/test/test_html.doctest ============================================================================== --- wwwsearch/mechanize/trunk/test/test_html.doctest (original) +++ wwwsearch/mechanize/trunk/test/test_html.doctest Sat Jan 6 00:05:30 2007 @@ -1,3 +1,122 @@ +>>> import mechanize +>>> from mechanize._response import test_html_response +>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \ +... MechanizeBs, \ +... RobustLinksFactory, RobustFormsFactory, RobustTitleFactory + +mechanize.ParseError should be raised on parsing erroneous HTML. + +For backwards compatibility, mechanize.ParseError derives from +exception classes that mechanize used to raise, prior to version +0.1.6. + +>>> import sgmllib +>>> import HTMLParser +>>> import ClientForm +>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError) +True +>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError) +True +>>> issubclass(mechanize.ParseError, ClientForm.ParseError) +True + +>>> def create_response(error=True): +... extra = "" +... if error: +... extra = "" +... html = """\ +... +... +... Title +... %s +... +... +...

Hello world +... +... +... """ % extra +... return test_html_response(html) + +>>> f = LinksFactory() +>>> f.set_response(create_response(), "http://example.com", "latin-1") +>>> list(f.links()) # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> f = FormsFactory() +>>> f.set_response(create_response(), "latin-1") +>>> list(f.forms()) # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> f = TitleFactory() +>>> f.set_response(create_response(), "latin-1") +>>> f.title() # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: + + +Accessing attributes on Factory may also raise ParseError + +>>> def factory_getattr(attr_name): +... fact = mechanize.DefaultFactory() +... fact.set_response(create_response()) +... getattr(fact, attr_name) +>>> factory_getattr("title") # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: +>>> factory_getattr("global_form") # doctest: +IGNORE_EXCEPTION_DETAIL +Traceback (most recent call last): +ParseError: + + +BeautifulSoup ParseErrors: + +XXX If I could come up with examples that break links and forms +parsing, I'd uncomment these! + +>>> def create_soup(html): +... r = test_html_response(html) +... return MechanizeBs("latin-1", r.read()) + +#>>> f = RobustLinksFactory() +#>>> html = """\ +#... +#... +#... +#...