[wwwsearch-commits] r18225 - wwwsearch/pullparser/trunk
jjlee at codespeak.net
jjlee at codespeak.net
Thu Oct 6 21:51:00 CEST 2005
Author: jjlee
Date: Thu Oct 6 21:50:59 2005
New Revision: 18225
Modified:
wwwsearch/pullparser/trunk/pullparser.py
wwwsearch/pullparser/trunk/test.py
Log:
Fix hex char refs, though not for TolerantPullParser, due to sgmllib bug (report & patch from John Reese)
Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py (original)
+++ wwwsearch/pullparser/trunk/pullparser.py Thu Oct 6 21:50:59 2005
@@ -265,8 +265,10 @@
t = "&%s;" % name
text.append(t)
elif tok.type == "charref":
- name = tok.data
- t = unichr(int(name)).encode(self.encoding)
+ name, base = tok.data, 10
+ if name.startswith('x'):
+ name, base= name[1:], 16
+ t = unichr(int(name, base)).encode(self.encoding)
text.append(t)
elif tok.type in ["starttag", "endtag", "startendtag"]:
tag_name = tok.data
Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py (original)
+++ wwwsearch/pullparser/trunk/test.py Thu Oct 6 21:50:59 2005
@@ -52,17 +52,22 @@
self._test_encoding(pc, tolerant)
def _test_encoding(self, parser_class, tolerant):
from StringIO import StringIO
- data = "<a>&#1092;</a>"
-
- f = StringIO(data)
- p = parser_class(f, encoding="KOI8-R")
- p.get_tag("a")
- self.assertEqual(p.get_text(), "\xc6")
-
- f = StringIO(data)
- p = parser_class(f, encoding="UTF-8")
- p.get_tag("a")
- self.assertEqual(p.get_text(), "\xd1\x84")
+ datas = ["<a>&#1092;</a>"]
+ if not tolerant:
+ # sgmllib (hence TolerantPullParser) is broken for hex charrefs:
+ # http://python.org/sf/803422
+ datas.append("<a>&#x444;</a>")
+
+ for data in datas:
+ f = StringIO(data)
+ p = parser_class(f, encoding="KOI8-R")
+ p.get_tag("a")
+ self.assertEqual(p.get_text(), "\xc6")
+
+ f = StringIO(data)
+ p = parser_class(f, encoding="UTF-8")
+ p.get_tag("a")
+ self.assertEqual(p.get_text(), "\xd1\x84")
# response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url())
def test_get_token(self):
More information about the wwwsearch-commits
mailing list