[wwwsearch-commits] r21641 - wwwsearch/ClientForm/trunk

jjlee at codespeak.net jjlee at codespeak.net
Mon Jan 2 19:43:40 CET 2006


Author: jjlee
Date: Mon Jan  2 19:43:38 2006
New Revision: 21641

Modified:
   wwwsearch/ClientForm/trunk/ClientForm.py
   wwwsearch/ClientForm/trunk/test.py
Log:
Apply pullparser / mechanize encoding fixes to ClientForm (a few ClientForm-specific character reference fixes were also required); Python backwards-compatibility fixes

Modified: wwwsearch/ClientForm/trunk/ClientForm.py
==============================================================================
--- wwwsearch/ClientForm/trunk/ClientForm.py	(original)
+++ wwwsearch/ClientForm/trunk/ClientForm.py	Mon Jan  2 19:43:38 2006
@@ -27,6 +27,11 @@
 """
 
 # XXX
+# remove unescape_attr method
+# remove parser testing hack
+# safeUrl-ize action
+# Really need to merge CC, CF, pp and mechanize as soon as mechanize
+#  goes to beta...
 # Add some more functional tests
 #  Especially single and multiple file upload on the internet.
 #  Does file upload work when name is missing?  Sourceforge tracker form
@@ -172,16 +177,58 @@
 def unescape(data, entities, encoding='latin-1'):
     if data is None or '&' not in data:
         return data
+
     def replace_entities(match, entities=entities, encoding=encoding):
         ent = match.group()
+        if ent[1] == "#":
+            return unescape_charref(ent[2:-1], encoding)
+
         repl = entities.get(ent)
         if repl is not None:
             if type(repl) != type(""):
-                repl = repl.encode(encoding)
+                try:
+                    repl = repl.encode(encoding)
+                except UnicodeError:
+                    repl = ent
         else:
             repl = ent
+
+        return repl
+
+    return re.sub(r'&#?\S+?;', replace_entities, data)
+
+def unescape_charref(data, encoding):
+    name, base = data, 10
+    if name.startswith("x"):
+        name, base= name[1:], 16
+    uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
         return repl
-    return re.sub(r'&\S+?;', replace_entities, data)
+
+def get_entitydefs():
+    import htmlentitydefs
+    from codecs import latin_1_decode
+    entitydefs = {}
+    try:
+        htmlentitydefs.name2codepoint
+    except AttributeError:
+        entitydefs = {}
+        for name, char in htmlentitydefs.entitydefs.items():
+            uc = latin_1_decode(char)[0]
+            if uc.startswith("&#") and uc.endswith(";"):
+                uc = unescape_charref(uc[2:-1], None)
+            entitydefs["&%s;" % name] = uc
+    else:
+        for name, codepoint in htmlentitydefs.name2codepoint.items():
+            entitydefs["&%s;" % name] = unichr(codepoint)
+    return entitydefs
+
 
 def issequence(x):
     try:
@@ -690,6 +737,9 @@
             self.unknown_entityref(name)
             return
 
+    def handle_charref(self, name):
+        self.handle_data(unescape_charref(name, self._encoding))
+
     def unescape_attr(self, name):
         return unescape(name, self._entitydefs, self._encoding)
 
@@ -715,7 +765,7 @@
     import HTMLParser
 except ImportError:
     class XHTMLCompatibleFormParser:
-        def __init__(self, entitydefs=None):
+        def __init__(self, entitydefs=None, encoding='latin-1'):
             raise ValueError("HTMLParser could not be imported")
 else:
     class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
@@ -752,18 +802,6 @@
             else:
                 method()
 
-        # taken from sgmllib, with changes
-        def handle_charref(self, name):
-            try:
-                n = int(name)
-            except ValueError:
-                self.unknown_charref(name)
-                return
-            if not 0 <= n <= 255:
-                self.unknown_charref(name)
-                return
-            self.handle_data(chr(n))
-
         def unescape(self, name):
             # Use the entitydefs passed into constructor, not
             # HTMLParser.HTMLParser's entitydefs.
@@ -775,6 +813,8 @@
             return attrs  # ditto
 
 import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile('&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]')
 class _AbstractSgmllibParser(_AbstractFormParser):
     def do_option(self, attrs):
         _AbstractFormParser._start_option(self, attrs)
@@ -791,6 +831,8 @@
         _AbstractFormParser.__init__(self, entitydefs, encoding)
 
 try:
+    if sys.version_info[:2] < (2, 2):
+        raise ImportError  # BeautifulSoup uses generators
     import BeautifulSoup
 except ImportError:
     pass
@@ -820,12 +862,6 @@
 #FormParser = XHTMLCompatibleFormParser  # testing hack
 #FormParser = RobustFormParser  # testing hack
 
-def get_entitydefs():
-    entitydefs = {}
-    for name, codepoint in htmlentitydefs.name2codepoint.items():
-        entitydefs["&%s;" % name] = unichr(codepoint)
-    return entitydefs
-
 def ParseResponse(response, select_default=False,
                   ignore_errors=False,  # ignored!
                   form_parser_class=FormParser,

Modified: wwwsearch/ClientForm/trunk/test.py
==============================================================================
--- wwwsearch/ClientForm/trunk/test.py	(original)
+++ wwwsearch/ClientForm/trunk/test.py	Mon Jan  2 19:43:38 2006
@@ -71,7 +71,39 @@
         raise ClientForm.ControlNotFoundError
 
 class UnescapeTests(TestCase):
-    def test_unescape(self):
+
+    def test_unescape_charref(self):
+        from ClientForm import unescape_charref, get_entitydefs
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        for ref, codepoint, utf8, latin1 in [
+            ("38", 38, u"&".encode("utf-8"), "&"),
+            ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+            ("8212", 8212, mdash_utf8, "&#8212;"),
+            ]:
+            self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+            self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+            self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+    def test_get_entitydefs(self):
+        from ClientForm import get_entitydefs
+        ed = get_entitydefs()
+        for name, char in [
+            ("&amp;", u"&"),
+            ("&lt;", u"<"),
+            ("&gt;", u">"),
+            ("&mdash;", u"\u2014"),
+            ("&spades;", u"\u2660"),
+            ]:
+            self.assertEqual(ed[name], char)
+
+    def test_unescape1(self):
+        from ClientForm import unescape, get_entitydefs
+        data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        ue = unescape(data, get_entitydefs(), "utf-8")
+        self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
+    def test_unescape2(self):
         from ClientForm import unescape, get_entitydefs
         self.assertEqual(unescape("Donald Duck &amp; Co",
                                   {"&amp;": "&"}), "Donald Duck & Co")
@@ -85,8 +117,29 @@
             unescape("&amp;foo;",
                      {"&amp;": "&", "&foo;": "splat"}), "&foo;")
         self.assertEqual(unescape("&amp;", {}), "&amp;")
-        self.assertEqual(unescape("&#x06aa;&mdash;", get_entitydefs(), "utf8"),
-                         "&#x06aa;"+(u"\u2014".encode('utf8')))
+
+        for encoding, expected in [
+            ("utf-8", u"&\u06aa\u2014\u2014".encode("utf-8")),
+            ("latin-1", "&&#x06aa;&#x2014;&mdash;")]:
+            self.assertEqual(
+                expected,
+                unescape("&amp;&#x06aa;&#x2014;&mdash;", get_entitydefs(), encoding))
+
+    def test_unescape_parsing(self):
+        file = StringIO(
+"""<form action="&amp;&mdash;&#x2014;&#8212;">
+<textarea name="name&amp;&mdash;&#x2014;&#8212;">val&amp;&mdash;&#x2014;&#8212;</textarea>
+</form>
+""")  #"
+        forms = ClientForm.ParseFile(file, "http://localhost/",
+                                     backwards_compat=False, encoding="utf-8")
+        form = forms[0]
+        test_string = "&"+(u"\u2014".encode('utf8')*3)
+        self.assertEqual(form.action, "http://localhost/"+test_string)
+        control = form.find_control(type="textarea", nr=0)
+        self.assertEqual(control.value, "val"+test_string)
+        self.assertEqual(control.name, "name"+test_string)
+
 
 class LWPFormTests(TestCase):
     """The original tests from libwww-perl 5.64."""


More information about the wwwsearch-commits mailing list