[wwwsearch-commits] r21640 - in wwwsearch/mechanize/trunk: . mechanize

jjlee at codespeak.net jjlee at codespeak.net
Mon Jan 2 19:37:52 CET 2006


Author: jjlee
Date: Mon Jan  2 19:37:51 2006
New Revision: 21640

Modified:
   wwwsearch/mechanize/trunk/mechanize/_mechanize.py
   wwwsearch/mechanize/trunk/test.py
Log:
More encoding bugs: leave unrepresentable charrefs untouched; Fix Python 2.2 compatibility

Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py	Mon Jan  2 19:37:51 2006
@@ -18,7 +18,7 @@
 
 from __future__ import generators
 
-import urllib2, socket, urlparse, urllib, re, sys
+import urllib2, socket, urlparse, urllib, re, sys, htmlentitydefs
 from urlparse import urljoin
 
 import ClientCookie
@@ -254,11 +254,29 @@
     if name.startswith("x"):
         name, base= name[1:], 16
     uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
+        return repl
+
+def get_entitydefs():
     try:
-        t = uc.encode(encoding)
-    except UnicodeError:
-        t = '&#%s;' % data
-    return t
+        htmlentitydefs.name2codepoint
+    except AttributeError:
+        entitydefs = {}
+        for name, char in htmlentitydefs.entitydefs.items():
+            uc = char.decode("latin-1")
+            if uc.startswith("&#") and uc.endswith(";"):
+                uc = unescape_charref(uc[2:-1], None)
+            codepoint = ord(uc)
+            entitydefs[name] = codepoint
+    else:
+        entitydefs = htmlentitydefs.name2codepoint
+    return entitydefs
 
 
 try:
@@ -270,7 +288,7 @@
     # monkeypatch to fix http://www.python.org/sf/803422 :-(
     sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
     class MechanizeBs(BeautifulSoup.BeautifulSoup):
-        from htmlentitydefs import name2codepoint as _entitydefs
+        _entitydefs = get_entitydefs()
         def __init__(self, encoding, text=None, avoidParserProblems=True,
                      initialTextIsEverything=True):
             self._encoding = encoding

Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py	(original)
+++ wwwsearch/mechanize/trunk/test.py	Mon Jan  2 19:37:51 2006
@@ -8,6 +8,41 @@
 import mechanize
 FACTORY_CLASSES = [mechanize.DefaultFactory, mechanize.RobustFactory]
 
+
+class UnescapeTests(TestCase):
+
+    def test_unescape_charref(self):
+        from mechanize._mechanize import unescape_charref, get_entitydefs
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        for ref, codepoint, utf8, latin1 in [
+            ("38", 38, u"&".encode("utf-8"), "&"),
+            ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+            ("8212", 8212, mdash_utf8, "&#8212;"),
+            ]:
+            self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+            self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+            self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+    def test_get_entitydefs(self):
+        from mechanize._mechanize import get_entitydefs
+        ed = get_entitydefs()
+        for name, codepoint in [
+            ("amp", ord(u"&")),
+            ("lt", ord(u"<")),
+            ("gt", ord(u">")),
+            ("mdash", 0x2014),
+            ("spades", 0x2660),
+            ]:
+            self.assertEqual(ed[name], codepoint)
+
+    def test_unescape(self):
+        from mechanize._mechanize import unescape, get_entitydefs
+        data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        ue = unescape(data, get_entitydefs(), "utf-8")
+        self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
+
 class MockMethod:
     def __init__(self, meth_name, action, handle):
         self.meth_name = meth_name


More information about the wwwsearch-commits mailing list