[wwwsearch-commits] r21639 - wwwsearch/pullparser/trunk

jjlee at codespeak.net jjlee at codespeak.net
Mon Jan 2 19:35:49 CET 2006


Author: jjlee
Date: Mon Jan  2 19:35:48 2006
New Revision: 21639

Modified:
   wwwsearch/pullparser/trunk/pullparser.py
   wwwsearch/pullparser/trunk/test.py
Log:
More encoding bugs: leave unrepresentable charrefs untouched; Fix Python 2.2 compatibility

Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py	(original)
+++ wwwsearch/pullparser/trunk/pullparser.py	Mon Jan  2 19:35:48 2006
@@ -128,13 +128,30 @@
     name, base = data, 10
     if name.startswith("x"):
         name, base= name[1:], 16
-    t = unichr(int(name, base)).encode(encoding)
-    return t
+    uc = unichr(int(name, base))
+    if encoding is None:
+        return uc
+    else:
+        try:
+            repl = uc.encode(encoding)
+        except UnicodeError:
+            repl = "&#%s;" % data
+        return repl
 
 def get_entitydefs():
     entitydefs = {}
-    for name, codepoint in htmlentitydefs.name2codepoint.items():
-        entitydefs["&%s;" % name] = unichr(codepoint)
+    try:
+        htmlentitydefs.name2codepoint
+    except AttributeError:
+        entitydefs = {}
+        for name, char in htmlentitydefs.entitydefs.items():
+            uc = char.decode("latin-1")
+            if uc.startswith("&#") and uc.endswith(";"):
+                uc = unescape_charref(uc[2:-1], None)
+            entitydefs["&%s;" % name] = uc
+    else:
+        for name, codepoint in htmlentitydefs.name2codepoint.items():
+            entitydefs["&%s;" % name] = unichr(codepoint)
     return entitydefs
 
 

Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py	(original)
+++ wwwsearch/pullparser/trunk/test.py	Mon Jan  2 19:35:48 2006
@@ -22,6 +22,30 @@
 
 class UnescapeTests(TestCase):
 
+    def test_unescape_charref(self):
+        from pullparser import unescape_charref, get_entitydefs
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        for ref, codepoint, utf8, latin1 in [
+            ("38", 38, u"&".encode("utf-8"), "&"),
+            ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+            ("8212", 8212, mdash_utf8, "&#8212;"),
+            ]:
+            self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+            self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+            self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+    def test_get_entitydefs(self):
+        from pullparser import get_entitydefs
+        ed = get_entitydefs()
+        for name, char in [
+            ("&amp;", u"&"),
+            ("&lt;", u"<"),
+            ("&gt;", u">"),
+            ("&mdash;", u"\u2014"),
+            ("&spades;", u"\u2660"),
+            ]:
+            self.assertEqual(ed[name], char)
+
     def test_unescape(self):
         import pullparser
         data = "&amp; &lt; &mdash; &#8212; &#x2014;"


More information about the wwwsearch-commits mailing list