[wwwsearch-commits] r21619 - wwwsearch/pullparser/trunk

jjlee at codespeak.net jjlee at codespeak.net
Mon Jan 2 02:09:16 CET 2006


Author: jjlee
Date: Mon Jan  2 02:09:15 2006
New Revision: 21619

Modified:
   wwwsearch/pullparser/trunk/pullparser.py
   wwwsearch/pullparser/trunk/test.py
Log:


Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py	(original)
+++ wwwsearch/pullparser/trunk/pullparser.py	Mon Jan  2 02:09:15 2006
@@ -103,16 +103,32 @@
 def unescape(data, entities, encoding):
     if data is None or '&' not in data:
         return data
+
     def replace_entities(match):
         ent = match.group()
+        if ent[1] == '#':
+            return unescape_charref(ent[2:-1], encoding)
+
         repl = entities.get(ent)
         if repl is not None:
             if type(repl) != type(""):
-                repl = repl.encode(encoding)
+                try:
+                    repl = repl.encode(encoding)
+                except UnicodeError:
+                    repl = ent
         else:
             repl = ent
+
         return repl
-    return re.sub(r'&\S+?;', replace_entities, data)
+
+    return re.sub(r'&#?\S+?;', replace_entities, data)
+
+def unescape_charref(data, encoding):
+    name, base = data, 10
+    if name.startswith('x'):
+        name, base= name[1:], 16
+    t = unichr(int(name, base)).encode(encoding)
+    return t
 
 def get_entitydefs():
     entitydefs = {}
@@ -133,8 +149,16 @@
          to represent opening tags as text
         encoding: encoding used to encode numeric character references by
          .get_text() and .get_compressed_text() ("ascii" by default)
+
         entitydefs: mapping like {'&amp;': '&', ...} containing HTML entity
-         definitions (a sensible default is used)
+         definitions (a sensible default is used).  This is used to unescape
+         entities in .get_text() (and .get_compressed_text()) and attribute
+         values.  If the encoding can not represent the character, the entity
+         reference is left unescaped.  Note that entity references (both
+         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
+         unescaped in attribute values and the return value of .get_text(), but
+         not in data outside of tags.  Instead, entity references outside of
+         tags are represented as tokens.  This is a bit odd, it's true :-/
 
         If the element name of an opening tag matches a key in the textify
         mapping then that tag is converted to text.  The corresponding value is
@@ -265,10 +289,7 @@
                 t = unescape('&%s;'%tok.data, self._entitydefs, self.encoding)
                 text.append(t)
             elif tok.type == "charref":
-                name, base = tok.data, 10
-                if name.startswith('x'):
-                    name, base= name[1:], 16
-                t = unichr(int(name, base)).encode(self.encoding)
+                t = unescape_charref(tok.data, self.encoding)
                 text.append(t)
             elif tok.type in ["starttag", "endtag", "startendtag"]:
                 tag_name = tok.data

Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py	(original)
+++ wwwsearch/pullparser/trunk/test.py	Mon Jan  2 02:09:15 2006
@@ -20,6 +20,15 @@
     p.unget_token(tok)
     return tok
 
+class UnescapeTests(TestCase):
+
+    def test_unescape(self):
+        import pullparser
+        data = "&amp; &lt; &mdash; &#8212; &#x2014;"
+        mdash_utf8 = u"\u2014".encode("utf-8")
+        ue = pullparser.unescape(data, pullparser.get_entitydefs(), "utf-8")
+        self.assertEqual("& < %s %s %s" % ((mdash_utf8,)*3), ue)
+
 class PullParserTests(TestCase):
     from pullparser import PullParser, TolerantPullParser
     PARSERS = [(PullParser, False), (TolerantPullParser, True)]
@@ -81,8 +90,12 @@
             self.assertEqual(get_text(data, "KOI8-R"), "\xc6")
             self.assertEqual(get_text(data, "UTF-8"), "\xd1\x84")
 
-        self.assertEqual(get_text("<a>&mdash;</a>", "UTF-8"), u"\u2014".encode('utf8'))
-        self.assertEqual(get_attr('<a name="&mdash;">blah</a>', "UTF-8", "a", "name"), u"\u2014".encode('utf8'))
+        self.assertEqual(get_text("<a>&mdash;</a>", "UTF-8"),
+                         u"\u2014".encode('utf8'))
+        self.assertEqual(
+            get_attr('<a name="&mdash;">blah</a>', "UTF-8", "a", "name"),
+            u"\u2014".encode('utf8'))
+        self.assertEqual(get_text("<a>&mdash;</a>", "ascii"), "&mdash;")
 
 #        response = urllib.addinfourl(f, {"content-type": "text/html; charset=XXX"}, req.get_full_url())
     def test_get_token(self):
@@ -111,7 +124,7 @@
         self.assertEqual(p.get_token(), ("data", "\n", None))
         self.assertEqual(p.get_token(), ("starttag", "p", []))
         self.assertEqual(p.get_token(), ("data", "This is a data ", None))
-        self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah & &#097;")]))
+        self.assertEqual(p.get_token(), ("starttag", "img", [("alt", "blah & a")]))
         self.assertEqual(p.get_token(), ("data", " ", None))
         self.assertEqual(p.get_token(), ("entityref", "amp", None))
         self.assertEqual(p.get_token(), ("data",
@@ -197,7 +210,7 @@
         self.assertEqual(p.get_text(), "\n"); p.get_token()
         self.assertEqual(p.get_text(), "\n"); p.get_token()
         self.assertEqual(p.get_text(),
-                         "This is a data blah & &#097;[IMG]"); p.get_token()
+                         "This is a data blah & a[IMG]"); p.get_token()
         self.assertEqual(p.get_text(), " & that was an entityref "
                          "and this a is\na charref.  "); p.get_token()
         self.assertEqual(p.get_text(), ".\n\n\n\n"); p.get_token()
@@ -222,7 +235,7 @@
         data, f = self.data_and_file()
         p = parser_class(f)
         self.assertEqual(p.get_text(endat=("endtag", "html")),
-                     u"\n\n\nTitle\n\n\nThis is a data blah & &#097;[IMG]"
+                     u"\n\n\nTitle\n\n\nThis is a data blah & a[IMG]"
                      " & that was an entityref and this a is\na charref.  ."
                      "\n\n\n\n\n\n")
         f.close()
@@ -232,7 +245,7 @@
         self.assertEqual(p.get_text(endat=("endtag", "title")),
                          "\n\n\nTitle")
         self.assertEqual(p.get_text(endat=("starttag", "img")),
-                         "\n\n\nThis is a data blah & &#097;[IMG]")
+                         "\n\n\nThis is a data blah & a[IMG]")
         f.close()
 
         # textify arg
@@ -248,7 +261,7 @@
         data, f = self.data_and_file()
         p = parser_class(f)
         self.assertEqual(p.get_compressed_text(endat=("endtag", "html")),
-                         u"Title This is a data blah & &#097;[IMG]"
+                         u"Title This is a data blah & a[IMG]"
                          " & that was an entityref and this a is a charref. .")
         f.close()
 


More information about the wwwsearch-commits mailing list