[wwwsearch-commits] r21640 - in wwwsearch/mechanize/trunk: .
mechanize
jjlee at codespeak.net
jjlee at codespeak.net
Mon Jan 2 19:37:52 CET 2006
Author: jjlee
Date: Mon Jan 2 19:37:51 2006
New Revision: 21640
Modified:
wwwsearch/mechanize/trunk/mechanize/_mechanize.py
wwwsearch/mechanize/trunk/test.py
Log:
More encoding bugs: leave unrepresentable charrefs untouched; Fix Python 2.2 compatibility
Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py Mon Jan 2 19:37:51 2006
@@ -18,7 +18,7 @@
from __future__ import generators
-import urllib2, socket, urlparse, urllib, re, sys
+import urllib2, socket, urlparse, urllib, re, sys, htmlentitydefs
from urlparse import urljoin
import ClientCookie
@@ -254,11 +254,29 @@
if name.startswith("x"):
name, base= name[1:], 16
uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
+ return repl
+
+def get_entitydefs():
try:
- t = uc.encode(encoding)
- except UnicodeError:
- t = '&#%s;' % data
- return t
+ htmlentitydefs.name2codepoint
+ except AttributeError:
+ entitydefs = {}
+ for name, char in htmlentitydefs.entitydefs.items():
+ uc = char.decode("latin-1")
+ if uc.startswith("&#") and uc.endswith(";"):
+ uc = unescape_charref(uc[2:-1], None)
+ codepoint = ord(uc)
+ entitydefs[name] = codepoint
+ else:
+ entitydefs = htmlentitydefs.name2codepoint
+ return entitydefs
try:
@@ -270,7 +288,7 @@
# monkeypatch to fix http://www.python.org/sf/803422 :-(
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
class MechanizeBs(BeautifulSoup.BeautifulSoup):
- from htmlentitydefs import name2codepoint as _entitydefs
+ _entitydefs = get_entitydefs()
def __init__(self, encoding, text=None, avoidParserProblems=True,
initialTextIsEverything=True):
self._encoding = encoding
Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py (original)
+++ wwwsearch/mechanize/trunk/test.py Mon Jan 2 19:37:51 2006
@@ -8,6 +8,41 @@
import mechanize
FACTORY_CLASSES = [mechanize.DefaultFactory, mechanize.RobustFactory]
+
+class UnescapeTests(TestCase):
+
+ def test_unescape_charref(self):
+ from mechanize._mechanize import unescape_charref, get_entitydefs
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ for ref, codepoint, utf8, latin1 in [
+ ("38", 38, u"&".encode("utf-8"), "&"),
+ ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+ ("8212", 8212, mdash_utf8, "&#8212;"),
+ ]:
+ self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+ self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+ self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+ def test_get_entitydefs(self):
+ from mechanize._mechanize import get_entitydefs
+ ed = get_entitydefs()
+ for name, codepoint in [
+ ("amp", ord(u"&")),
+ ("lt", ord(u"<")),
+ ("gt", ord(u">")),
+ ("mdash", 0x2014),
+ ("spades", 0x2660),
+ ]:
+ self.assertEqual(ed[name], codepoint)
+
+ def test_unescape(self):
+ from mechanize._mechanize import unescape, get_entitydefs
+ data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ ue = unescape(data, get_entitydefs(), "utf-8")
+ self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
+
class MockMethod:
def __init__(self, meth_name, action, handle):
self.meth_name = meth_name
More information about the wwwsearch-commits
mailing list