[wwwsearch-commits] r45096 - in wwwsearch/mechanize/trunk: mechanize test
jjlee at codespeak.net
jjlee at codespeak.net
Sun Jul 15 00:56:01 CEST 2007
Author: jjlee
Date: Sun Jul 15 00:56:01 2007
New Revision: 45096
Modified:
wwwsearch/mechanize/trunk/mechanize/_html.py
wwwsearch/mechanize/trunk/mechanize/_pullparser.py
wwwsearch/mechanize/trunk/test/test_html.doctest
Log:
Make title parsing follow Firefox behaviour wrt child elements (previously the behaviour differed between Factory and RobustFactory).
Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py Sun Jul 15 00:56:01 2007
@@ -17,6 +17,8 @@
DEFAULT_ENCODING = "latin-1"
+COMPRESS_RE = re.compile(r"\s+")
+
# the base classe is purely for backwards compatibility
class ParseError(ClientForm.ParseError): pass
@@ -235,6 +237,30 @@
self._response = response
self._encoding = encoding
+ def _get_title_text(self, parser):
+ text = []
+ tok = None
+ while 1:
+ try:
+ tok = parser.get_token()
+ except NoMoreTokensError:
+ break
+ if tok.type == "data":
+ text.append(str(tok))
+ elif tok.type == "entityref":
+ t = unescape("&%s;" % tok.data,
+ parser._entitydefs, parser.encoding)
+ text.append(t)
+ elif tok.type == "charref":
+ t = unescape_charref(tok.data, parser.encoding)
+ text.append(t)
+ elif tok.type in ["starttag", "endtag", "startendtag"]:
+ tag_name = tok.data
+ if tok.type == "endtag" and tag_name == "title":
+ break
+ text.append(str(tok))
+ return COMPRESS_RE.sub(" ", "".join(text).strip())
+
def title(self):
import _pullparser
p = _pullparser.TolerantPullParser(
@@ -245,7 +271,7 @@
except _pullparser.NoMoreTokensError:
return None
else:
- return p.get_text()
+ return self._get_title_text(p)
except sgmllib.SGMLParseError, exc:
raise ParseError(exc)
@@ -328,7 +354,7 @@
class RobustLinksFactory:
- compress_re = re.compile(r"\s+")
+ compress_re = COMPRESS_RE
def __init__(self,
link_parser_class=None,
@@ -418,7 +444,8 @@
if title == _beautifulsoup.Null:
return None
else:
- return title.firstText(lambda t: True)
+ inner_html = "".join([str(node) for node in title.contents])
+ return COMPRESS_RE.sub(" ", inner_html.strip())
class Factory:
Modified: wwwsearch/mechanize/trunk/mechanize/_pullparser.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_pullparser.py (original)
+++ wwwsearch/mechanize/trunk/mechanize/_pullparser.py Sun Jul 15 00:56:01 2007
@@ -35,6 +35,7 @@
import re, htmlentitydefs
import sgmllib, HTMLParser
+from xml.sax import saxutils
from _html import unescape, unescape_charref
@@ -85,6 +86,60 @@
args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
return self.__class__.__name__+"(%s)" % args
+ def __str__(self):
+ """
+ >>> print Token("starttag", "br")
+ <br>
+ >>> print Token("starttag", "a",
+ ... [("href", "http://www.python.org/"), ("alt", '"foo"')])
+ <a href="http://www.python.org/" alt='"foo"'>
+ >>> print Token("startendtag", "br")
+ <br />
+ >>> print Token("startendtag", "br", [("spam", "eggs")])
+ <br spam="eggs" />
+ >>> print Token("endtag", "p")
+ </p>
+ >>> print Token("charref", "38")
+ &
+ >>> print Token("entityref", "amp")
+ &
+ >>> print Token("data", "foo\\nbar")
+ foo
+ bar
+ >>> print Token("comment", "Life is a bowl\\nof cherries.")
+ <!--Life is a bowl
+ of cherries.-->
+ >>> print Token("decl", "decl")
+ <!decl>
+ >>> print Token("pi", "pi")
+ <?pi>
+ """
+ if self.attrs is not None:
+ attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
+ k, v in self.attrs])
+ else:
+ attrs = ""
+ if self.type == "starttag":
+ return "<%s%s>" % (self.data, attrs)
+ elif self.type == "startendtag":
+ return "<%s%s />" % (self.data, attrs)
+ elif self.type == "endtag":
+ return "</%s>" % self.data
+ elif self.type == "charref":
+ return "&#%s;" % self.data
+ elif self.type == "entityref":
+ return "&%s;" % self.data
+ elif self.type == "data":
+ return self.data
+ elif self.type == "comment":
+ return "<!--%s-->" % self.data
+ elif self.type == "decl":
+ return "<!%s>" % self.data
+ elif self.type == "pi":
+ return "<?%s>" % self.data
+ assert False
+
+
def iter_until_exception(fn, exception, *args, **kwds):
while 1:
try:
Modified: wwwsearch/mechanize/trunk/test/test_html.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_html.doctest (original)
+++ wwwsearch/mechanize/trunk/test/test_html.doctest Sun Jul 15 00:56:01 2007
@@ -213,3 +213,41 @@
None
>>> print get_first_link_text_sgmllib(html)
None
+
+
+Title parsing. We follow Firefox's behaviour with regard to child
+elements (haven't tested IE).
+
+>>> def get_title_bs(html):
+... factory = RobustTitleFactory()
+... soup = MechanizeBs("utf-8", html)
+... factory.set_soup(soup, "utf-8")
+... return factory.title()
+
+>>> def get_title_sgmllib(html):
+... factory = TitleFactory()
+... response = test_html_response(html)
+... factory.set_response(response, "utf-8")
+... return factory.title()
+
+>>> html = ("""\
+... <html><head>
+... <title>Title</title>
+... </head><body><p>Blah.<p></body></html>
+... """)
+>>> get_title_bs(html)
+'Title'
+>>> get_title_sgmllib(html)
+'Title'
+
+>>> html = ("""\
+... <html><head>
+... <title> Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
+... tle &amp;&#38;
+... </title>
+... </head><body><p>Blah.<p></body></html>
+... """)
+>>> get_title_bs(html)
+'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
+>>> get_title_sgmllib(html)
+'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
More information about the wwwsearch-commits
mailing list