[wwwsearch-commits] r21641 - wwwsearch/ClientForm/trunk
jjlee at codespeak.net
jjlee at codespeak.net
Mon Jan 2 19:43:40 CET 2006
Author: jjlee
Date: Mon Jan 2 19:43:38 2006
New Revision: 21641
Modified:
wwwsearch/ClientForm/trunk/ClientForm.py
wwwsearch/ClientForm/trunk/test.py
Log:
Apply pullparser / mechanize encoding fixes to ClientForm (A few ClientForm-specific character ref fixes were required also); Python backwards-compatibility fixes
Modified: wwwsearch/ClientForm/trunk/ClientForm.py
==============================================================================
--- wwwsearch/ClientForm/trunk/ClientForm.py (original)
+++ wwwsearch/ClientForm/trunk/ClientForm.py Mon Jan 2 19:43:38 2006
@@ -27,6 +27,11 @@
"""
# XXX
+# remove unescape_attr method
+# remove parser testing hack
+# safeUrl-ize action
+# Really need to merge CC, CF, pp and mechanize as soon as mechanize
+# goes to beta...
# Add some more functional tests
# Especially single and multiple file upload on the internet.
# Does file upload work when name is missing? Sourceforge tracker form
@@ -172,16 +177,58 @@
def unescape(data, entities, encoding='latin-1'):
if data is None or '&' not in data:
return data
+
def replace_entities(match, entities=entities, encoding=encoding):
ent = match.group()
+ if ent[1] == "#":
+ return unescape_charref(ent[2:-1], encoding)
+
repl = entities.get(ent)
if repl is not None:
if type(repl) != type(""):
- repl = repl.encode(encoding)
+ try:
+ repl = repl.encode(encoding)
+ except UnicodeError:
+ repl = ent
else:
repl = ent
+
+ return repl
+
+ return re.sub(r'&#?\S+?;', replace_entities, data)
+
+def unescape_charref(data, encoding):
+ name, base = data, 10
+ if name.startswith("x"):
+ name, base= name[1:], 16
+ uc = unichr(int(name, base))
+ if encoding is None:
+ return uc
+ else:
+ try:
+ repl = uc.encode(encoding)
+ except UnicodeError:
+ repl = "&#%s;" % data
return repl
- return re.sub(r'&\S+?;', replace_entities, data)
+
+def get_entitydefs():
+ import htmlentitydefs
+ from codecs import latin_1_decode
+ entitydefs = {}
+ try:
+ htmlentitydefs.name2codepoint
+ except AttributeError:
+ entitydefs = {}
+ for name, char in htmlentitydefs.entitydefs.items():
+ uc = latin_1_decode(char)[0]
+ if uc.startswith("&#") and uc.endswith(";"):
+ uc = unescape_charref(uc[2:-1], None)
+ entitydefs["&%s;" % name] = uc
+ else:
+ for name, codepoint in htmlentitydefs.name2codepoint.items():
+ entitydefs["&%s;" % name] = unichr(codepoint)
+ return entitydefs
+
def issequence(x):
try:
@@ -690,6 +737,9 @@
self.unknown_entityref(name)
return
+ def handle_charref(self, name):
+ self.handle_data(unescape_charref(name, self._encoding))
+
def unescape_attr(self, name):
return unescape(name, self._entitydefs, self._encoding)
@@ -715,7 +765,7 @@
import HTMLParser
except ImportError:
class XHTMLCompatibleFormParser:
- def __init__(self, entitydefs=None):
+ def __init__(self, entitydefs=None, encoding='latin-1'):
raise ValueError("HTMLParser could not be imported")
else:
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
@@ -752,18 +802,6 @@
else:
method()
- # taken from sgmllib, with changes
- def handle_charref(self, name):
- try:
- n = int(name)
- except ValueError:
- self.unknown_charref(name)
- return
- if not 0 <= n <= 255:
- self.unknown_charref(name)
- return
- self.handle_data(chr(n))
-
def unescape(self, name):
# Use the entitydefs passed into constructor, not
# HTMLParser.HTMLParser's entitydefs.
@@ -775,6 +813,8 @@
return attrs # ditto
import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile('&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]')
class _AbstractSgmllibParser(_AbstractFormParser):
def do_option(self, attrs):
_AbstractFormParser._start_option(self, attrs)
@@ -791,6 +831,8 @@
_AbstractFormParser.__init__(self, entitydefs, encoding)
try:
+ if sys.version_info[:2] < (2, 2):
+ raise ImportError # BeautifulSoup uses generators
import BeautifulSoup
except ImportError:
pass
@@ -820,12 +862,6 @@
#FormParser = XHTMLCompatibleFormParser # testing hack
#FormParser = RobustFormParser # testing hack
-def get_entitydefs():
- entitydefs = {}
- for name, codepoint in htmlentitydefs.name2codepoint.items():
- entitydefs["&%s;" % name] = unichr(codepoint)
- return entitydefs
-
def ParseResponse(response, select_default=False,
ignore_errors=False, # ignored!
form_parser_class=FormParser,
Modified: wwwsearch/ClientForm/trunk/test.py
==============================================================================
--- wwwsearch/ClientForm/trunk/test.py (original)
+++ wwwsearch/ClientForm/trunk/test.py Mon Jan 2 19:43:38 2006
@@ -71,7 +71,39 @@
raise ClientForm.ControlNotFoundError
class UnescapeTests(TestCase):
- def test_unescape(self):
+
+ def test_unescape_charref(self):
+ from ClientForm import unescape_charref, get_entitydefs
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ for ref, codepoint, utf8, latin1 in [
+ ("38", 38, u"&".encode("utf-8"), "&"),
+ ("x2014", 0x2014, mdash_utf8, "&#x2014;"),
+ ("8212", 8212, mdash_utf8, "&#8212;"),
+ ]:
+ self.assertEqual(unescape_charref(ref, None), unichr(codepoint))
+ self.assertEqual(unescape_charref(ref, 'latin-1'), latin1)
+ self.assertEqual(unescape_charref(ref, 'utf-8'), utf8)
+
+ def test_get_entitydefs(self):
+ from ClientForm import get_entitydefs
+ ed = get_entitydefs()
+ for name, char in [
+ ("&amp;", u"&"),
+ ("&lt;", u"<"),
+ ("&gt;", u">"),
+ ("&mdash;", u"\u2014"),
+ ("&spades;", u"\u2660"),
+ ]:
+ self.assertEqual(ed[name], char)
+
+ def test_unescape1(self):
+ from ClientForm import unescape, get_entitydefs
+ data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ ue = unescape(data, get_entitydefs(), "utf-8")
+ self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
+ def test_unescape2(self):
from ClientForm import unescape, get_entitydefs
self.assertEqual(unescape("Donald Duck &amp; Co",
{"&amp;": "&"}), "Donald Duck & Co")
@@ -85,8 +117,29 @@
unescape("&amp;foo;",
{"&amp;": "&", "&foo;": "splat"}), "&foo;")
self.assertEqual(unescape("&amp;", {}), "&amp;")
- self.assertEqual(unescape("&#x06aa;&mdash;", get_entitydefs(), "utf8"),
- "&#x06aa;"+(u"\u2014".encode('utf8')))
+
+ for encoding, expected in [
+ ("utf-8", u"&\u06aa\u2014\u2014".encode("utf-8")),
+ ("latin-1", "&&#x06aa;&mdash;&#x2014;")]:
+ self.assertEqual(
+ expected,
+ unescape("&amp;&#x06aa;&mdash;&#x2014;", get_entitydefs(), encoding))
+
+ def test_unescape_parsing(self):
+ file = StringIO(
+"""<form action="&amp;&mdash;&#x2014;&#8212;">
+<textarea name="name&amp;&mdash;&#x2014;&#8212;">val&amp;&mdash;&#x2014;&#8212;</textarea>
+</form>
+""") #"
+ forms = ClientForm.ParseFile(file, "http://localhost/",
+ backwards_compat=False, encoding="utf-8")
+ form = forms[0]
+ test_string = "&"+(u"\u2014".encode('utf8')*3)
+ self.assertEqual(form.action, "http://localhost/"+test_string)
+ control = form.find_control(type="textarea", nr=0)
+ self.assertEqual(control.value, "val"+test_string)
+ self.assertEqual(control.name, "name"+test_string)
+
class LWPFormTests(TestCase):
"""The original tests from libwww-perl 5.64."""
More information about the wwwsearch-commits
mailing list