[wwwsearch-commits] r45096 - in wwwsearch/mechanize/trunk: mechanize test

jjlee at codespeak.net jjlee at codespeak.net
Sun Jul 15 00:56:01 CEST 2007


Author: jjlee
Date: Sun Jul 15 00:56:01 2007
New Revision: 45096

Modified:
   wwwsearch/mechanize/trunk/mechanize/_html.py
   wwwsearch/mechanize/trunk/mechanize/_pullparser.py
   wwwsearch/mechanize/trunk/test/test_html.doctest
Log:
Make title parsing follow Firefox behaviour wrt child elements (previously the behaviour differed between Factory and RobustFactory).

Modified: wwwsearch/mechanize/trunk/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_html.py	Sun Jul 15 00:56:01 2007
@@ -17,6 +17,8 @@
 
 DEFAULT_ENCODING = "latin-1"
 
+COMPRESS_RE = re.compile(r"\s+")
+
 
 # the base classe is purely for backwards compatibility
 class ParseError(ClientForm.ParseError): pass
@@ -235,6 +237,30 @@
         self._response = response
         self._encoding = encoding
 
+    def _get_title_text(self, parser):
+        """Return text up to </title>; child tags are kept as markup."""
+        import _pullparser  # local import: _pullparser itself imports _html
+        text = []
+        while 1:
+            try:
+                tok = parser.get_token()
+            except _pullparser.NoMoreTokensError:
+                break  # input ended without a closing </title>
+            if tok.type == "data":
+                text.append(str(tok))
+            elif tok.type == "entityref":
+                t = unescape("&%s;" % tok.data,
+                             parser._entitydefs, parser.encoding)
+                text.append(t)
+            elif tok.type == "charref":
+                text.append(unescape_charref(tok.data, parser.encoding))
+            elif tok.type in ["starttag", "endtag", "startendtag"]:
+                if tok.type == "endtag" and tok.data == "title":
+                    break
+                # child elements stay in the title as literal markup
+                text.append(str(tok))
+        return COMPRESS_RE.sub(" ", "".join(text).strip())
+
     def title(self):
         import _pullparser
         p = _pullparser.TolerantPullParser(
@@ -245,7 +271,7 @@
             except _pullparser.NoMoreTokensError:
                 return None
             else:
-                return p.get_text()
+                return self._get_title_text(p)
         except sgmllib.SGMLParseError, exc:
             raise ParseError(exc)
 
@@ -328,7 +354,7 @@
 
 class RobustLinksFactory:
 
-    compress_re = re.compile(r"\s+")
+    compress_re = COMPRESS_RE
 
     def __init__(self,
                  link_parser_class=None,
@@ -418,7 +444,8 @@
         if title == _beautifulsoup.Null:
             return None
         else:
-            return title.firstText(lambda t: True)
+            inner_html = "".join([str(node) for node in title.contents])
+            return COMPRESS_RE.sub(" ", inner_html.strip())
 
 
 class Factory:

Modified: wwwsearch/mechanize/trunk/mechanize/_pullparser.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_pullparser.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_pullparser.py	Sun Jul 15 00:56:01 2007
@@ -35,6 +35,7 @@
 
 import re, htmlentitydefs
 import sgmllib, HTMLParser
+from xml.sax import saxutils
 
 from _html import unescape, unescape_charref
 
@@ -85,6 +86,60 @@
         args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
         return self.__class__.__name__+"(%s)" % args
 
+    def __str__(self):
+        """Return the token serialised back to markup text, e.g.:
+        >>> print Token("starttag", "br")
+        <br>
+        >>> print Token("starttag", "a",
+        ...     [("href", "http://www.python.org/"), ("alt", '"foo"')])
+        <a href="http://www.python.org/" alt='"foo"'>
+        >>> print Token("startendtag", "br")
+        <br />
+        >>> print Token("startendtag", "br", [("spam", "eggs")])
+        <br spam="eggs" />
+        >>> print Token("endtag", "p")
+        </p>
+        >>> print Token("charref", "38")
+        &#38;
+        >>> print Token("entityref", "amp")
+        &amp;
+        >>> print Token("data", "foo\\nbar")
+        foo
+        bar
+        >>> print Token("comment", "Life is a bowl\\nof cherries.")
+        <!--Life is a bowl
+        of cherries.-->
+        >>> print Token("decl", "decl")
+        <!decl>
+        >>> print Token("pi", "pi")
+        <?pi>
+        """
+        if self.attrs is not None:
+            attrs = "".join([" %s=%s" % (k, saxutils.quoteattr(v)) for
+                             k, v in self.attrs])  # quoteattr quotes/escapes
+        else:
+            attrs = ""
+        if self.type == "starttag":
+            return "<%s%s>" % (self.data, attrs)
+        elif self.type == "startendtag":
+            return "<%s%s />" % (self.data, attrs)
+        elif self.type == "endtag":
+            return "</%s>" % self.data  # attrs are not emitted on end tags
+        elif self.type == "charref":
+            return "&#%s;" % self.data
+        elif self.type == "entityref":
+            return "&%s;" % self.data
+        elif self.type == "data":
+            return self.data  # returned as-is, no escaping applied
+        elif self.type == "comment":
+            return "<!--%s-->" % self.data
+        elif self.type == "decl":
+            return "<!%s>" % self.data
+        elif self.type == "pi":
+            return "<?%s>" % self.data
+        assert False  # reached only for a type not handled above
+
+
 def iter_until_exception(fn, exception, *args, **kwds):
     while 1:
         try:

Modified: wwwsearch/mechanize/trunk/test/test_html.doctest
==============================================================================
--- wwwsearch/mechanize/trunk/test/test_html.doctest	(original)
+++ wwwsearch/mechanize/trunk/test/test_html.doctest	Sun Jul 15 00:56:01 2007
@@ -213,3 +213,41 @@
 None
 >>> print get_first_link_text_sgmllib(html)
 None
+
+
+Title parsing.  We follow Firefox's behaviour with regard to child
+elements (haven't tested IE).
+
+>>> def get_title_bs(html):
+...     factory = RobustTitleFactory()
+...     soup = MechanizeBs("utf-8", html)
+...     factory.set_soup(soup, "utf-8")
+...     return factory.title()
+
+>>> def get_title_sgmllib(html):
+...     factory = TitleFactory()
+...     response = test_html_response(html)
+...     factory.set_response(response, "utf-8")
+...     return factory.title()
+
+>>> html = ("""\
+... <html><head>
+... <title>Title</title>
+... </head><body><p>Blah.<p></body></html>
+... """)
+>>> get_title_bs(html)
+'Title'
+>>> get_title_sgmllib(html)
+'Title'
+
+>>> html = ("""\
+... <html><head>
+... <title>  Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script>
+... tle &amp;&#38;
+... </title>
+... </head><body><p>Blah.<p></body></html>
+... """)
+>>> get_title_bs(html)
+'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'
+>>> get_title_sgmllib(html)
+'Ti<script type="text/strange">alert("this is valid HTML -- yuck!")</script> tle &&'


More information about the wwwsearch-commits mailing list