[wwwsearch-commits] r36175 - in wwwsearch/mechanize/trunk: mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Sat Jan 6 00:05:49 CET 2007


Author: jjlee
Date: Sat Jan  6 00:05:30 2007
New Revision: 36175

Modified:
   wwwsearch/mechanize/trunk/mechanize/__init__.py
   wwwsearch/mechanize/trunk/mechanize/_html.py
   wwwsearch/mechanize/trunk/test/test_forms.doctest
   wwwsearch/mechanize/trunk/test/test_html.doctest
Log:
Add mechanize.ParseError class, document it as part of the mechanize.Factory interface, and raise it from all Factory implementations

Modified: wwwsearch/mechanize/trunk/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/__init__.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/__init__.py	Sat Jan  6 00:05:30 2007
@@ -50,6 +50,7 @@
     'MozillaCookieJar',
     'OpenerDirector',
     'OpenerFactory',
+    'ParseError',
     'ProxyBasicAuthHandler',
     'ProxyDigestAuthHandler',
     'ProxyHandler',
@@ -89,6 +90,7 @@
 # configurable URL-opener interface
 from _useragent import UserAgentBase, UserAgent
 from _html import \
+     ParseError, \
      Link, \
      Factory, DefaultFactory, RobustFactory, \
      FormsFactory, LinksFactory, TitleFactory, \

Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py	Sat Jan  6 00:05:30 2007
@@ -9,6 +9,7 @@
 """
 
 import re, copy, htmlentitydefs
+import sgmllib, HTMLParser, ClientForm
 
 import _request
 from _headersutil import split_header_words, is_html as _is_html
@@ -17,6 +18,10 @@
 DEFAULT_ENCODING = "latin-1"
 
 
+# the base class is purely for backwards compatibility
+class ParseError(ClientForm.ParseError): pass
+
+
 class CachingGeneratorFunction(object):
     """Caching wrapper around a no-arguments iterable."""
 
@@ -131,37 +136,41 @@
         base_url = self._base_url
         p = self.link_parser_class(response, encoding=encoding)
 
-        for token in p.tags(*(self.urltags.keys()+["base"])):
-            if token.type == "endtag":
-                continue
-            if token.data == "base":
-                base_href = dict(token.attrs).get("href")
-                if base_href is not None:
-                    base_url = base_href
-                continue
-            attrs = dict(token.attrs)
-            tag = token.data
-            name = attrs.get("name")
-            text = None
-            # XXX use attr_encoding for ref'd doc if that doc does not provide
-            #  one by other means
-            #attr_encoding = attrs.get("charset")
-            url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
-            if not url:
-                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
-                # For our purposes a link is something with a URL, so ignore
-                # this.
-                continue
-
-            url = _rfc3986.clean_url(url, encoding)
-            if tag == "a":
-                if token.type != "startendtag":
-                    # hmm, this'd break if end tag is missing
-                    text = p.get_compressed_text(("endtag", tag))
-                # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
-                #text = p.get_compressed_text()
+        try:
+            for token in p.tags(*(self.urltags.keys()+["base"])):
+                if token.type == "endtag":
+                    continue
+                if token.data == "base":
+                    base_href = dict(token.attrs).get("href")
+                    if base_href is not None:
+                        base_url = base_href
+                    continue
+                attrs = dict(token.attrs)
+                tag = token.data
+                name = attrs.get("name")
+                text = None
+                # XXX use attr_encoding for ref'd doc if that doc does not
+                #  provide one by other means
+                #attr_encoding = attrs.get("charset")
+                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
+                if not url:
+                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+                    # For our purposes a link is something with a URL, so
+                    # ignore this.
+                    continue
 
-            yield Link(base_url, url, text, tag, token.attrs)
+                url = _rfc3986.clean_url(url, encoding)
+                if tag == "a":
+                    if token.type != "startendtag":
+                        # hmm, this'd break if end tag is missing
+                        text = p.get_compressed_text(("endtag", tag))
+                    # but this doesn't work for eg.
+                    # <a href="blah"><b>Andy</b></a>
+                    #text = p.get_compressed_text()
+
+                yield Link(base_url, url, text, tag, token.attrs)
+        except sgmllib.SGMLParseError, exc:
+            raise ParseError(exc)
 
 class FormsFactory:
 
@@ -202,16 +211,19 @@
     def forms(self):
         import ClientForm
         encoding = self.encoding
-        forms = ClientForm.ParseResponseEx(
-            self._response,
-            select_default=self.select_default,
-            form_parser_class=self.form_parser_class,
-            request_class=self.request_class,
-            encoding=encoding,
-            _urljoin=_rfc3986.urljoin,
-            _urlparse=_rfc3986.urlsplit,
-            _urlunparse=_rfc3986.urlunsplit,
-            )
+        try:
+            forms = ClientForm.ParseResponseEx(
+                self._response,
+                select_default=self.select_default,
+                form_parser_class=self.form_parser_class,
+                request_class=self.request_class,
+                encoding=encoding,
+                _urljoin=_rfc3986.urljoin,
+                _urlparse=_rfc3986.urlsplit,
+                _urlunparse=_rfc3986.urlunsplit,
+                )
+        except ClientForm.ParseError, exc:
+            raise ParseError(exc)
         self.global_form = forms[0]
         return forms[1:]
 
@@ -228,11 +240,14 @@
         p = _pullparser.TolerantPullParser(
             self._response, encoding=self._encoding)
         try:
-            p.get_tag("title")
-        except _pullparser.NoMoreTokensError:
-            return None
-        else:
-            return p.get_text()
+            try:
+                p.get_tag("title")
+            except _pullparser.NoMoreTokensError:
+                return None
+            else:
+                return p.get_text()
+        except sgmllib.SGMLParseError, exc:
+            raise ParseError(exc)
 
 
 def unescape(data, entities, encoding):
@@ -420,6 +435,8 @@
 
     Public attributes:
 
+    Note that accessing these attributes may raise ParseError.
+
     encoding: string specifying the encoding of response if it contains a text
      document (this value is left unspecified for documents that do not have
      an encoding, e.g. an image file)
@@ -505,7 +522,10 @@
             return self.global_form
 
     def forms(self):
-        """Return iterable over ClientForm.HTMLForm-like objects."""
+        """Return iterable over ClientForm.HTMLForm-like objects.
+
+        Raises mechanize.ParseError on failure.
+        """
         # this implementation sets .global_form as a side-effect, for benefit
         # of __getattr__ impl
         if self._forms_genf is None:
@@ -520,7 +540,10 @@
         return self._forms_genf()
 
     def links(self):
-        """Return iterable over mechanize.Link-like objects."""
+        """Return iterable over mechanize.Link-like objects.
+
+        Raises mechanize.ParseError on failure.
+        """
         if self._links_genf is None:
             try:
                 self._links_genf = CachingGeneratorFunction(

Modified: wwwsearch/mechanize/trunk/test/test_forms.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_forms.doctest	(original)
+++ wwwsearch/mechanize/trunk/test/test_forms.doctest	Sat Jan  6 00:05:30 2007
@@ -41,7 +41,7 @@
 already been .read().  Fixed by calling Factory.set_response() on
 error.
 
->>> import mechanize, sgmllib
+>>> import mechanize
 >>> br = mechanize.Browser()
 >>> r = mechanize._response.test_html_response("""\
 ... <form>
@@ -52,8 +52,8 @@
 >>> br.set_response(r)
 >>> try:
 ...     br.select_form(nr=0)
-... except sgmllib.SGMLParseError:
+... except mechanize.ParseError:
 ...     pass
 >>> br.select_form(nr=0)  # doctest: +IGNORE_EXCEPTION_DETAIL
 Traceback (most recent call last):
-SGMLParseError: expected name token
+ParseError: expected name token

Modified: wwwsearch/mechanize/trunk/test/test_html.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_html.doctest	(original)
+++ wwwsearch/mechanize/trunk/test/test_html.doctest	Sat Jan  6 00:05:30 2007
@@ -1,3 +1,122 @@
+>>> import mechanize
+>>> from mechanize._response import test_html_response
+>>> from mechanize._html import LinksFactory, FormsFactory, TitleFactory, \
+... MechanizeBs, \
+... RobustLinksFactory,  RobustFormsFactory, RobustTitleFactory
+
+mechanize.ParseError should be raised on parsing erroneous HTML.
+
+For backwards compatibility, mechanize.ParseError derives from
+exception classes that mechanize used to raise, prior to version
+0.1.6.
+
+>>> import sgmllib
+>>> import HTMLParser
+>>> import ClientForm
+>>> issubclass(mechanize.ParseError, sgmllib.SGMLParseError)
+True
+>>> issubclass(mechanize.ParseError, HTMLParser.HTMLParseError)
+True
+>>> issubclass(mechanize.ParseError, ClientForm.ParseError)
+True
+
+>>> def create_response(error=True):
+...     extra = ""
+...     if error:
+...         extra = "<!!!>"
+...     html = """\
+... <html>
+... <head>
+...     <title>Title</title>
+...     %s
+... </head>
+... <body>
+...     <p>Hello world
+... </body>
+... </html>
+... """ % extra
+...     return test_html_response(html)
+
+>>> f = LinksFactory()
+>>> f.set_response(create_response(), "http://example.com", "latin-1")
+>>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> f = FormsFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> f = TitleFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+
+Accessing attributes on Factory may also raise ParseError
+
+>>> def factory_getattr(attr_name):
+...    fact = mechanize.DefaultFactory()
+...    fact.set_response(create_response())
+...    getattr(fact, attr_name)
+>>> factory_getattr("title")  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+>>> factory_getattr("global_form")  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+
+BeautifulSoup ParseErrors:
+
+XXX If I could come up with examples that break links and forms
+parsing, I'd uncomment these!
+
+>>> def create_soup(html):
+...     r = test_html_response(html)
+...     return MechanizeBs("latin-1", r.read())
+
+#>>> f = RobustLinksFactory()
+#>>> html = """\
+#... <a href="a">
+#... <frame src="b">
+#... <a href="c">
+#... <iframe src="d">
+#... </a>
+#... </area>
+#... </frame>
+#... """
+#>>> f.set_soup(create_soup(html), "http://example.com", "latin-1")
+#>>> list(f.links())  # doctest: +IGNORE_EXCEPTION_DETAIL
+#Traceback (most recent call last):
+#ParseError:
+
+>>> html = """\
+... <table>
+... <tr><td>
+... <input name='broken'>
+... </td>
+... </form>
+... </tr>
+... </form>
+... """
+>>> f = RobustFormsFactory()
+>>> f.set_response(create_response(), "latin-1")
+>>> list(f.forms())  # doctest: +IGNORE_EXCEPTION_DETAIL
+Traceback (most recent call last):
+ParseError:
+
+#>>> f = RobustTitleFactory()
+#>>> f.set_soup(create_soup(""), "latin-1")
+#>>> f.title()  # doctest: +IGNORE_EXCEPTION_DETAIL
+#Traceback (most recent call last):
+#ParseError:
+
+
+
+Utility class for caching forms etc.
+
 >>> from mechanize._html import CachingGeneratorFunction
 
 >>> i = [1]


More information about the wwwsearch-commits mailing list