[wwwsearch-commits] r21619 - wwwsearch/pullparser/trunk
jjlee at codespeak.net
jjlee at codespeak.net
Mon Jan 2 02:09:16 CET 2006
Author: jjlee
Date: Mon Jan 2 02:09:15 2006
New Revision: 21619
Modified:
wwwsearch/pullparser/trunk/pullparser.py
wwwsearch/pullparser/trunk/test.py
Log:
Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py (original)
+++ wwwsearch/pullparser/trunk/pullparser.py Mon Jan 2 02:09:15 2006
@@ -103,16 +103,32 @@
def unescape(data, entities, encoding):
if data is None or '&' not in data:
return data
+
def replace_entities(match):
ent = match.group()
+ if ent[1] == '#':
+ return unescape_charref(ent[2:-1], encoding)
+
repl = entities.get(ent)
if repl is not None:
if type(repl) != type(""):
- repl = repl.encode(encoding)
+ try:
+ repl = repl.encode(encoding)
+ except UnicodeError:
+ repl = ent
else:
repl = ent
+
return repl
- return re.sub(r'&\S+?;', replace_entities, data)
+
+ return re.sub(r'&#?\S+?;', replace_entities, data)
+
+def unescape_charref(data, encoding):
+ name, base = data, 10
+ if name.startswith('x'):
+ name, base= name[1:], 16
+ t = unichr(int(name, base)).encode(encoding)
+ return t
def get_entitydefs():
entitydefs = {}
@@ -133,8 +149,16 @@
to represent opening tags as text
encoding: encoding used to encode numeric character references by
.get_text() and .get_compressed_text() ("ascii" by default)
+
entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
- definitions (a sensible default is used)
+ definitions (a sensible default is used). This is used to unescape
+ entities in .get_text() (and .get_compressed_text()) and attribute
+ values. If the encoding can not represent the character, the entity
+ reference is left unescaped. Note that entity references (both
+ numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
+ unescaped in attribute values and the return value of .get_text(), but
+ not in data outside of tags. Instead, entity references outside of
+ tags are represented as tokens. This is a bit odd, it's true :-/
If the element name of an opening tag matches a key in the textify
mapping then that tag is converted to text. The corresponding value is
@@ -265,10 +289,7 @@
t = unescape('&%s;'%tok.data, self._entitydefs, self.encoding)
text.append(t)
elif tok.type == "charref":
- name, base = tok.data, 10
- if name.startswith('x'):
- name, base= name[1:], 16
- t = unichr(int(name, base)).encode(self.encoding)
+ t = unescape_charref(tok.data, self.encoding)
text.append(t)
elif tok.type in ["starttag", "endtag", "startendtag"]:
tag_name = tok.data
Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py (original)
+++ wwwsearch/pullparser/trunk/test.py Mon Jan 2 02:09:15 2006
@@ -20,6 +20,15 @@
p.unget_token(tok)
return tok
+class UnescapeTests(TestCase):
+
+ def test_unescape(self):
+ import pullparser
+ data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+ mdash_utf8 = u"\u2014".encode("utf-8")
+ ue = pullparser.unescape(data, pullparser.get_entitydefs(), "utf-8")
+ self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
class PullParserTests(TestCase):
from pullparser import PullParser, TolerantPullParser
PARSERS = [(PullParser, False), (TolerantPullParser, True)]
@@ -81,8 +90,12 @@
self.assertEqual(get_text(data, "KOI8-R"), "\xc6")
self.assertEqual(get_text(data, "UTF-8"), "\xd1\x84")
- self.assertEqual(get_text("<a>&mdash;</a>", "UTF-8"), u"\u2014".encode('utf8'))
- self.assertEqual(get_attr('<a name="&mdash;">blah</a>', "UTF-8", "a", "name"), u"\u2014".encode('utf8'))
+ self.assertEqual(get_text("<a>&mdash;</a>", "UTF-8"),
+ u"\u2014".encode('utf8'))
+ self.assertEqual(
+ get_attr('<a name="&mdash;">blah</a>', "UTF-8", "a", "name"),
+ u"\u2014".encode('utf8'))
+ self.assertEqual(get_text("<a>&mdash;</a>", "ascii"), "&mdash;")
# response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url())
def test_get_token(self):
@@ -111,7 +124,7 @@
self.assertEqual(p.get_token(), ("data", "\n", None))
self.assertEqual(p.get_token(), ("starttag", "p", []))
self.assertEqual(p.get_token(), ("data", "This is a data ", None))
- self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah &amp; a")]))
+ self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah & a")]))
self.assertEqual(p.get_token(), ("data", " ", None))
self.assertEqual(p.get_token(), ("entityref", "amp", None))
self.assertEqual(p.get_token(), ("data",
@@ -197,7 +210,7 @@
self.assertEqual(p.get_text(), "\n"); p.get_token()
self.assertEqual(p.get_text(), "\n"); p.get_token()
self.assertEqual(p.get_text(),
- "This is a data blah &amp; a[IMG]"); p.get_token()
+ "This is a data blah & a[IMG]"); p.get_token()
self.assertEqual(p.get_text(), " & that was an entityref "
"and this a is\na charref. "); p.get_token()
self.assertEqual(p.get_text(), ".\n\n\n\n"); p.get_token()
@@ -222,7 +235,7 @@
data, f = self.data_and_file()
p = parser_class(f)
self.assertEqual(p.get_text(endat=("endtag", "html")),
- u"\n\n\nTitle\n\n\nThis is a data blah &amp; a[IMG]"
+ u"\n\n\nTitle\n\n\nThis is a data blah & a[IMG]"
" & that was an entityref and this a is\na charref. ."
"\n\n\n\n\n\n")
f.close()
@@ -232,7 +245,7 @@
self.assertEqual(p.get_text(endat=("endtag", "title")),
"\n\n\nTitle")
self.assertEqual(p.get_text(endat=("starttag", "img")),
- "\n\n\nThis is a data blah &amp; a[IMG]")
+ "\n\n\nThis is a data blah & a[IMG]")
f.close()
# textify arg
@@ -248,7 +261,7 @@
data, f = self.data_and_file()
p = parser_class(f)
self.assertEqual(p.get_compressed_text(endat=("endtag", "html")),
- u"Title This is a data blah &amp; a[IMG]"
+ u"Title This is a data blah & a[IMG]"
" & that was an entityref and this a is a charref. .")
f.close()
More information about the wwwsearch-commits
mailing list