[wwwsearch-commits] r21639 - wwwsearch/pullparser/trunk
jjlee at codespeak.net
jjlee at codespeak.net
Mon Jan 2 19:35:49 CET 2006
Author: jjlee
Date: Mon Jan 2 19:35:48 2006
New Revision: 21639
Modified:
wwwsearch/pullparser/trunk/pullparser.py
wwwsearch/pullparser/trunk/test.py
Log:
More encoding bugs: leave unrepresentable charrefs untouched; Fix Python 2.2 compatibility
Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py (original)
+++ wwwsearch/pullparser/trunk/pullparser.py Mon Jan 2 19:35:48 2006
@@ -128,13 +128,30 @@
name, base = data, 10
if name.startswith("x"):
name, base= name[1:], 16
- t = unichr(int(name, base)).encode(encoding)
- return t
+ uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
+ return repl
def get_entitydefs():
entitydefs = {}
- for name, codepoint in htmlentitydefs.name2codepoint.items():
- entitydefs["&%s;" % name] = unichr(codepoint)
+ try:
+ htmlentitydefs.name2codepoint
+ except AttributeError:
+ entitydefs = {}
+ for name, char in htmlentitydefs.entitydefs.items():
+ uc = char.decode("latin-1")
+ if uc.startswith("&#") and uc.endswith(";"):
+ uc = unescape_charref(uc[2:-1], None)
+ entitydefs["&%s;" % name] = uc
+ else:
+ for name, codepoint in htmlentitydefs.name2codepoint.items():
+ entitydefs["&%s;" % name] = unichr(codepoint)
return entitydefs
Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py (original)
+++ wwwsearch/pullparser/trunk/test.py Mon Jan 2 19:35:48 2006
@@ -22,6 +22,30 @@
class UnescapeTests(TestCase):
+ def test_unescape_charref(self):
+ from pullparser import unescape_charref, get_entitydefs
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ for ref, codepoint, utf8, latin1 in [
+ ("38", 38, u"&".encode("utf-8"), "&"),
+ ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+ ("8212", 8212, mdash_utf8, "&#8212;"),
+ ]:
+ self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+ self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+ self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+ def test_get_entitydefs(self):
+ from pullparser import get_entitydefs
+ ed = get_entitydefs()
+ for name, char in [
+ ("&amp;", u"&"),
+ ("&lt;", u"<"),
+ ("&gt;", u">"),
+ ("&mdash;", u"\u2014"),
+ ("&spades;", u"\u2660"),
+ ]:
+ self.assertEqual(ed[name], char)
+
def test_unescape(self):
import pullparser
data = "&amp; &lt; &mdash; &#8212; &#x2014;"
More information about the wwwsearch-commits
mailing list