[wwwsearch-commits] r21621 - wwwsearch/pullparser/trunk
jjlee at codespeak.net
jjlee at codespeak.net
Mon Jan 2 02:43:39 CET 2006
Author: jjlee
Date: Mon Jan 2 02:43:38 2006
New Revision: 21621
Modified:
wwwsearch/pullparser/trunk/pullparser.py
wwwsearch/pullparser/trunk/test.py
Log:
TolerantPullParser: monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
Modified: wwwsearch/pullparser/trunk/pullparser.py
==============================================================================
--- wwwsearch/pullparser/trunk/pullparser.py (original)
+++ wwwsearch/pullparser/trunk/pullparser.py Mon Jan 2 02:43:38 2006
@@ -25,7 +25,7 @@
print "Title: %s" % title
-Copyright 2003-2004 John J. Lee <jjl at pobox.com>
+Copyright 2003-2005 John J. Lee <jjl at pobox.com>
Copyright 1998-2001 Gisle Aas (original libwww-perl code)
This code is free software; you can redistribute it and/or modify it
@@ -361,6 +361,8 @@
return self.unescape_attr(name)
import sgmllib
+# monkeypatch to fix http://www.python.org/sf/803422 :-(
+sgmllib.charref = re.compile('&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]')
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
def __init__(self, *args, **kwds):
sgmllib.SGMLParser.__init__(self)
Modified: wwwsearch/pullparser/trunk/test.py
==============================================================================
--- wwwsearch/pullparser/trunk/test.py (original)
+++ wwwsearch/pullparser/trunk/test.py Mon Jan 2 02:43:38 2006
@@ -63,11 +63,7 @@
self._test_encoding(pc, tolerant)
def _test_encoding(self, parser_class, tolerant):
from StringIO import StringIO
- datas = ["<a>&#1092;</a>"]
- if not tolerant:
- # sgmllib (hence TolerantPullParser) is broken for hex charrefs:
- # http://python.org/sf/803422
- datas.append("<a>&#x444;</a>")
+ datas = ["<a>&#1092;</a>", "<a>&#x444;</a>"]
def get_text(data, encoding):
p = _get_parser(data, encoding)
p.get_tag("a")
More information about the wwwsearch-commits
mailing list