[wwwsearch-commits] r18225 - wwwsearch/pullparser/trunk

jjlee at codespeak.net jjlee at codespeak.net
Thu Oct 6 21:51:00 CEST 2005


Author: jjlee
Date: Thu Oct  6 21:50:59 2005
New Revision: 18225

Modified:
   wwwsearch/pullparser/trunk/pullparser.py
   wwwsearch/pullparser/trunk/test.py
Log:
Fix handling of hexadecimal character references (e.g. &#x444;). TolerantPullParser is not fixed, because of an sgmllib bug (http://python.org/sf/803422). Report and patch from John Reese.

Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py	(original)
+++ wwwsearch/pullparser/trunk/pullparser.py	Thu Oct  6 21:50:59 2005
@@ -265,8 +265,10 @@
                     t = "&%s;" % name
                 text.append(t)
             elif tok.type == "charref":
-                name = tok.data
-                t = unichr(int(name)).encode(self.encoding)
+                name, base = tok.data, 10
+                if name.startswith('x'):
+                    name, base= name[1:], 16
+                t = unichr(int(name, base)).encode(self.encoding)
                 text.append(t)
             elif tok.type in ["starttag", "endtag", "startendtag"]:
                 tag_name = tok.data

Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py	(original)
+++ wwwsearch/pullparser/trunk/test.py	Thu Oct  6 21:50:59 2005
@@ -52,17 +52,22 @@
             self._test_encoding(pc, tolerant)
     def _test_encoding(self, parser_class, tolerant):
         from StringIO import StringIO
-        data = "<a>&#1092;</a>"
-
-        f = StringIO(data)
-        p = parser_class(f, encoding="KOI8-R")
-        p.get_tag("a")
-        self.assertEqual(p.get_text(), "\xc6")
-
-        f = StringIO(data)
-        p = parser_class(f, encoding="UTF-8")
-        p.get_tag("a")
-        self.assertEqual(p.get_text(), "\xd1\x84")
+        datas = ["<a>&#1092;</a>"]
+        if not tolerant:
+            # sgmllib (hence TolerantPullParser) is broken for hex charrefs:
+            # http://python.org/sf/803422
+            datas.append("<a>&#x444;</a>")
+
+        for data in datas:
+            f = StringIO(data)
+            p = parser_class(f, encoding="KOI8-R")
+            p.get_tag("a")
+            self.assertEqual(p.get_text(), "\xc6")
+
+            f = StringIO(data)
+            p = parser_class(f, encoding="UTF-8")
+            p.get_tag("a")
+            self.assertEqual(p.get_text(), "\xd1\x84")
 
 #        response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url())
     def test_get_token(self):


More information about the wwwsearch-commits mailing list