[wwwsearch-commits] r21621 - wwwsearch/pullparser/trunk

jjlee at codespeak.net jjlee at codespeak.net
Mon Jan 2 02:43:39 CET 2006


Author: jjlee
Date: Mon Jan  2 02:43:38 2006
New Revision: 21621

Modified:
   wwwsearch/pullparser/trunk/pullparser.py
   wwwsearch/pullparser/trunk/test.py
Log:
TolerantPullParser: monkeypatch sgmllib.charref so that hex character references (e.g. &#x444;) are parsed; works around http://www.python.org/sf/803422 :-(

Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py	(original)
+++ wwwsearch/pullparser/trunk/pullparser.py	Mon Jan  2 02:43:38 2006
@@ -25,7 +25,7 @@
     print "Title: %s" % title
 
 
-Copyright 2003-2004 John J. Lee <jjl at pobox.com>
+Copyright 2003-2005 John J. Lee <jjl at pobox.com>
 Copyright 1998-2001 Gisle Aas (original libwww-perl code)
 
 This code is free software; you can redistribute it and/or modify it
@@ -361,6 +361,8 @@
         return self.unescape_attr(name)
 
 import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile('&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]')
 class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
     def __init__(self, *args, **kwds):
         sgmllib.SGMLParser.__init__(self)

Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py	(original)
+++ wwwsearch/pullparser/trunk/test.py	Mon Jan  2 02:43:38 2006
@@ -63,11 +63,7 @@
             self._test_encoding(pc, tolerant)
     def _test_encoding(self, parser_class, tolerant):
         from StringIO import StringIO
-        datas = ["<a>&#1092;</a>"]
-        if not tolerant:
-            # sgmllib (hence TolerantPullParser) is broken for hex charrefs:
-            # http://python.org/sf/803422
-            datas.append("<a>&#x444;</a>")
+        datas = ["<a>&#1092;</a>", "<a>&#x444;</a>"]
         def get_text(data, encoding):
             p = _get_parser(data, encoding)
             p.get_tag("a")


More information about the wwwsearch-commits mailing list