[wwwsearch-commits] r35772 - wwwsearch/ClientForm/trunk

jjlee at codespeak.net jjlee at codespeak.net
Fri Dec 15 02:24:50 CET 2006


Author: jjlee
Date: Fri Dec 15 02:24:43 2006
New Revision: 35772

Modified:
   wwwsearch/ClientForm/trunk/ClientForm.py
   wwwsearch/ClientForm/trunk/test.py
Log:
Allow controls to appear outside of forms (the HTML spec allows this).  This involved adding new functions ParseFileEx and ParseResponseEx, which return a list that's always one longer than the return value of their counterparts ParseFile and ParseResponse.  The new first element in the list of forms is an HTMLForm representing the collection of all controls that lie outside of any FORM element.

Modified: wwwsearch/ClientForm/trunk/ClientForm.py
==============================================================================
--- wwwsearch/ClientForm/trunk/ClientForm.py	(original)
+++ wwwsearch/ClientForm/trunk/ClientForm.py	Fri Dec 15 02:24:43 2006
@@ -27,6 +27,7 @@
 """
 
 # XXX
+# add an __all__
 # Remove parser testing hack
 # safeUrl()-ize action
 # Add url attribute to ParseError
@@ -449,6 +450,13 @@
         self._option = None
         self._textarea = None
 
+        # forms[0] will contain all controls that are outside of any form
+        # self._global_form is an alias for self.forms[0]
+        self._global_form = None
+        self.start_form([])
+        self.end_form()
+        self._current_form = self._global_form = self.forms[0]
+
     def do_base(self, attrs):
         debug("%s", attrs)
         for key, value in attrs:
@@ -459,12 +467,12 @@
         debug("")
         if self._current_label is not None:
             self.end_label()
-        if self._current_form is not None:
+        if self._current_form is not self._global_form:
             self.end_form()
 
     def start_form(self, attrs):
         debug("%s", attrs)
-        if self._current_form is not None:
+        if self._current_form is not self._global_form:
             raise ParseError("nested FORMs")
         name = None
         action = None
@@ -488,15 +496,13 @@
         debug("")
         if self._current_label is not None:
             self.end_label()
-        if self._current_form is None:
+        if self._current_form is self._global_form:
             raise ParseError("end of FORM before start")
         self.forms.append(self._current_form)
-        self._current_form = None
+        self._current_form = self._global_form
 
     def start_select(self, attrs):
         debug("%s", attrs)
-        if self._current_form is None:
-            raise ParseError("start of SELECT before start of FORM")
         if self._select is not None:
             raise ParseError("nested SELECTs")
         if self._textarea is not None:
@@ -512,8 +518,8 @@
 
     def end_select(self):
         debug("")
-        if self._current_form is None:
-            raise ParseError("end of SELECT before start of FORM")
+        if self._current_form is self._global_form:
+            return
         if self._select is None:
             raise ParseError("end of SELECT before start")
 
@@ -580,8 +586,6 @@
 
     def start_textarea(self, attrs):
         debug("%s", attrs)
-        if self._current_form is None:
-            raise ParseError("start of TEXTAREA before start of FORM")
         if self._textarea is not None:
             raise ParseError("nested TEXTAREAs")
         if self._select is not None:
@@ -595,8 +599,8 @@
 
     def end_textarea(self):
         debug("")
-        if self._current_form is None:
-            raise ParseError("end of TEXTAREA before start of FORM")
+        if self._current_form is self._global_form:
+            return
         if self._textarea is None:
             raise ParseError("end of TEXTAREA before start")
         controls = self._current_form[2]
@@ -675,8 +679,6 @@
 
     def do_button(self, attrs):
         debug("%s", attrs)
-        if self._current_form is None:
-            raise ParseError("start of BUTTON before start of FORM")
         d = {}
         d["type"] = "submit"  # default
         for key, val in attrs:
@@ -695,8 +697,6 @@
 
     def do_input(self, attrs):
         debug("%s", attrs)
-        if self._current_form is None:
-            raise ParseError("start of INPUT before start of FORM")
         d = {}
         d["type"] = "text"  # default
         for key, val in attrs:
@@ -710,8 +710,6 @@
 
     def do_isindex(self, attrs):
         debug("%s", attrs)
-        if self._current_form is None:
-            raise ParseError("start of ISINDEX before start of FORM")
         d = {}
         for key, val in attrs:
             d[key] = val
@@ -886,19 +884,76 @@
 #FormParser = XHTMLCompatibleFormParser  # testing hack
 #FormParser = RobustFormParser  # testing hack
 
-def ParseResponse(response, select_default=False,
-                  ignore_errors=False,  # ignored!
-                  form_parser_class=FormParser,
-                  request_class=urllib2.Request,
-                  entitydefs=None,
-                  backwards_compat=True,
-                  encoding=DEFAULT_ENCODING,
-
-                  # private
-                  _urljoin=urlparse.urljoin,
-                  _urlparse=urlparse.urlparse,
-                  _urlunparse=urlparse.urlunparse,
-                  ):
+
+def ParseResponseEx(response,
+                    select_default=False,
+                    form_parser_class=FormParser,
+                    request_class=urllib2.Request,
+                    entitydefs=None,
+                    encoding=DEFAULT_ENCODING,
+
+                    # private
+                    _urljoin=urlparse.urljoin,
+                    _urlparse=urlparse.urlparse,
+                    _urlunparse=urlparse.urlunparse,
+                    ):
+    """Identical to ParseResponse, except that:
+
+    1. The returned list contains an extra item.  The first form in the list
+    contains all controls not contained in any FORM element.
+
+    2. The arguments ignore_errors and backwards_compat have been removed.
+
+    3. Backwards-compatibility mode (backwards_compat=True) is not available.
+    """
+    return _ParseFileEx(response, response.geturl(),
+                        select_default,
+                        False,
+                        form_parser_class,
+                        request_class,
+                        entitydefs,
+                        False,
+                        encoding,
+                        _urljoin=_urljoin,
+                        _urlparse=_urlparse,
+                        _urlunparse=_urlunparse,
+                        )
+
+def ParseFileEx(file, base_uri,
+                select_default=False,
+                form_parser_class=FormParser,
+                request_class=urllib2.Request,
+                entitydefs=None,
+                encoding=DEFAULT_ENCODING,
+
+                # private
+                _urljoin=urlparse.urljoin,
+                _urlparse=urlparse.urlparse,
+                _urlunparse=urlparse.urlunparse,
+                ):
+    """Identical to ParseFile, except that:
+
+    1. The returned list contains an extra item.  The first form in the list
+    contains all controls not contained in any FORM element.
+
+    2. The arguments ignore_errors and backwards_compat have been removed.
+
+    3. Backwards-compatibility mode (backwards_compat=True) is not available.
+    """
+    return _ParseFileEx(file, base_uri,
+                        select_default,
+                        False,
+                        form_parser_class,
+                        request_class,
+                        entitydefs,
+                        False,
+                        encoding,
+                        _urljoin=_urljoin,
+                        _urlparse=_urlparse,
+                        _urlunparse=_urlunparse,
+                        )
+
+def ParseResponse(response, *args, **kwds):
     """Parse HTTP response and return a list of HTMLForm instances.
 
     The return value of urllib2.urlopen can be conveniently passed to this
@@ -958,33 +1013,9 @@
     own risk: there is no well-defined interface.
 
     """
-    return ParseFile(response, response.geturl(), select_default,
-                     False,
-                     form_parser_class,
-                     request_class,
-                     entitydefs,
-                     backwards_compat,
-                     encoding,
-                     _urljoin=_urljoin,
-                     _urlparse=_urlparse,
-                     _urlunparse=_urlunparse,
-                     )
-
-def ParseFile(file, base_uri, select_default=False,
-              ignore_errors=False,  # ignored!
-              form_parser_class=FormParser,
-              request_class=urllib2.Request,
-              entitydefs=None,
-              backwards_compat=True,
-              encoding=DEFAULT_ENCODING,
-
-              # these private arguments ars here as a hack to allow mechanize
-              # to follow RFC 3986.  ClientForm should do the same really --
-              # perhaps it's time to merge ClientForm with mechanize...
-              _urljoin=urlparse.urljoin,
-              _urlparse=urlparse.urlparse,
-              _urlunparse=urlparse.urlunparse,
-              ):
+    return _ParseFileEx(response, response.geturl(), *args, **kwds)[1:]
+
+def ParseFile(file, base_uri, *args, **kwds):
     """Parse HTML and return a list of HTMLForm instances.
 
     ClientForm.ParseError is raised on parse errors.
@@ -998,6 +1029,20 @@
     For the other arguments and further details, see ParseResponse.__doc__.
 
     """
+    return _ParseFileEx(file, base_uri, *args, **kwds)[1:]
+
+def _ParseFileEx(file, base_uri,
+                 select_default=False,
+                 ignore_errors=False,
+                 form_parser_class=FormParser,
+                 request_class=urllib2.Request,
+                 entitydefs=None,
+                 backwards_compat=True,
+                 encoding=DEFAULT_ENCODING,
+                 _urljoin=urlparse.urljoin,
+                 _urlparse=urlparse.urlparse,
+                 _urlunparse=urlparse.urlunparse,
+                 ):
     if backwards_compat:
         deprecation("operating in backwards-compatibility mode")
     fp = form_parser_class(entitydefs, encoding)

Modified: wwwsearch/ClientForm/trunk/test.py
==============================================================================
--- wwwsearch/ClientForm/trunk/test.py	(original)
+++ wwwsearch/ClientForm/trunk/test.py	Fri Dec 15 02:24:43 2006
@@ -212,6 +212,15 @@
     except AttributeError:
         return req.headers.items()
 
+class MockResponse:
+    def __init__(self, f, url):
+        self._file = f
+        self._url = url
+    def geturl(self):
+        return self._url
+    def __getattr__(self, name):
+        return getattr(self._file, name)
+
 class ParseTests(TestCase):
     def test_unknown_control(self):
         f = StringIO(
@@ -226,6 +235,84 @@
         for ctl in form.controls:
             self.assert_(isinstance(ctl, ClientForm.TextControl))
 
+    def test_ParseFileEx(self):
+        # empty "outer form" (where the "outer form" is the form consisting of
+        # all controls outside of any form)
+        f = StringIO(
+"""<form action="abc">
+<input type="text"></input>
+</form>
+""")
+        base_uri = "http://localhost/"
+        forms = ClientForm.ParseFileEx(f, base_uri)
+        outer = forms[0]
+        self.assertEqual(len(forms), 2)
+        self.assertEqual(outer.controls, [])
+        self.assertEqual(outer.name, None)
+        self.assertEqual(outer.action, base_uri)
+        self.assertEqual(outer.method, "GET")
+        self.assertEqual(outer.enctype, "application/x-www-form-urlencoded")
+        self.assertEqual(outer.attrs, {})
+
+        # non-empty outer form
+        f = StringIO(
+"""
+<input type="text" name="a"></input>
+<form action="abc">
+  <input type="text" name="b"></input>
+</form>
+<input type="text" name="c"></input>
+<form action="abc">
+  <input type="text" name="d"></input>
+</form>
+<input type="text" name="e"></input>
+""")
+        base_uri = "http://localhost/"
+        forms = ClientForm.ParseFileEx(f, base_uri)
+        outer = forms[0]
+        self.assertEqual(len(forms), 3)
+        self.assertEqual([c.name for c in outer.controls], ["a", "c", "e"])
+        self.assertEqual(outer.name, None)
+        self.assertEqual(outer.action, base_uri)
+        self.assertEqual(outer.method, "GET")
+        self.assertEqual(outer.enctype, "application/x-www-form-urlencoded")
+        self.assertEqual(outer.attrs, {})
+
+    def test_ParseResponse(self):
+        url = "http://example.com/"
+        r = MockResponse(
+            StringIO("""\
+<input type="text" name="outer"></input>
+<form action="abc"><input type="text" name="inner"></input></form>
+"""),
+            url,
+            )
+
+        forms = ClientForm.ParseResponse(r)
+        self.assertEqual(len(forms), 1)
+        form = forms[0]
+        self.assertEqual(form.action, url+"abc")
+        self.assertEqual(form.controls[0].name, "inner")
+
+    def test_ParseResponseEx(self):
+        url = "http://example.com/"
+        r = MockResponse(
+            StringIO("""\
+<input type="text" name="outer"></input>
+<form action="abc"><input type="text" name="inner"></input></form>
+"""),
+            url,
+            )
+
+        forms = ClientForm.ParseResponseEx(r)
+        self.assertEqual(len(forms), 2)
+        outer = forms[0]
+        inner = forms[1]
+        self.assertEqual(inner.action, url+"abc")
+        self.assertEqual(outer.action, url)
+        self.assertEqual(outer.controls[0].name, "outer")
+        self.assertEqual(inner.controls[0].name, "inner")
+
     def test_parse_error(self):
         f = StringIO(
 """<form action="abc">


More information about the wwwsearch-commits mailing list