[wwwsearch-commits] r26625 - wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize

jjlee at codespeak.net jjlee at codespeak.net
Mon May 1 00:44:58 CEST 2006


Author: jjlee
Date: Mon May  1 00:44:56 2006
New Revision: 26625

Modified:
   wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py
   wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
Log:
More factory interface breakage, mostly to separate the details of the data source from the factory methods.  The code now does a little bit less re-parsing in the BeautifulSoup case.  I should probably get rid of the separate forms, links, and title factories too, to simplify things a bit...

Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py	(original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/__init__.py	Mon May  1 00:44:56 2006
@@ -5,5 +5,5 @@
 
 from _html import Link, \
      Factory, DefaultFactory, RobustFactory, \
-     FormsFactory, LinksFactory, pp_get_title, \
-     RobustFormsFactory, RobustLinksFactory, bs_get_title
+     FormsFactory, LinksFactory, TitleFactory, \
+     RobustFormsFactory, RobustLinksFactory, RobustTitleFactory

Modified: wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py
==============================================================================
--- wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py	(original)
+++ wwwsearch/mechanize/branch/mechanize-0.1.0-devel/mechanize/_html.py	Mon May  1 00:44:56 2006
@@ -74,7 +74,6 @@
     form_parser_class=None,
     request_class=None,
     backwards_compat=False,
-    encoding=DEFAULT_ENCODING,  # deprecated
     ):
     return Args(locals())
 
@@ -124,11 +123,21 @@
                 "iframe": "src",
                 }
         self.urltags = urltags
+        self._response = None
+        self._encoding = None
 
-    def links(self, fh, base_url, encoding=None):
+    def set_response(self, response, base_url, encoding):
+        self._response = response
+        self._encoding = encoding
+        self._base_url = base_url
+
+    def links(self):
         """Return an iterator that provides links of the document."""
         import pullparser
-        p = self.link_parser_class(fh, encoding=encoding)
+        response = self._response
+        encoding = self._encoding
+        base_url = self._base_url
+        p = self.link_parser_class(response, encoding=encoding)
 
         for token in p.tags(*(self.urltags.keys()+["base"])):
             if token.data == "base":
@@ -174,7 +183,6 @@
                  form_parser_class=None,
                  request_class=None,
                  backwards_compat=False,
-                 encoding=DEFAULT_ENCODING,  # deprecated
                  ):
         import ClientForm
         self.select_default = select_default
@@ -185,14 +193,18 @@
             request_class = ClientCookie.Request
         self.request_class = request_class
         self.backwards_compat = backwards_compat
+        self._response = None
+        self.encoding = None
+
+    def set_response(self, response, encoding):
+        self._response = response
         self.encoding = encoding
 
-    def parse_response(self, response, encoding=None):
+    def forms(self):
         import ClientForm
-        if encoding is None:
-            encoding = self.encoding
+        encoding = self.encoding
         return ClientForm.ParseResponse(
-            response,
+            self._response,
             select_default=self.select_default,
             form_parser_class=self.form_parser_class,
             request_class=self.request_class,
@@ -200,29 +212,24 @@
             encoding=encoding,
             )
 
-    def parse_file(self, file_obj, base_url, encoding=None):
-        import ClientForm
-        if encoding is None:
-            encoding = self.encoding
-        return ClientForm.ParseFile(
-            file_obj,
-            base_url,
-            select_default=self.select_default,
-            form_parser_class=self.form_parser_class,
-            request_class=self.request_class,
-            backwards_compat=self.backwards_compat,
-            encoding=encoding,
-            )
+class TitleFactory:
+    def __init__(self):
+        self._response = self._encoding = None
 
-def pp_get_title(response, encoding):
-    import pullparser
-    p = pullparser.TolerantPullParser(response, encoding=encoding)
-    try:
-        p.get_tag("title")
-    except pullparser.NoMoreTokensError:
-        return None
-    else:
-        return p.get_text()
+    def set_response(self, response, encoding):
+        self._response = response
+        self._encoding = encoding
+
+    def title(self):
+        import pullparser
+        p = pullparser.TolerantPullParser(
+            self._response, encoding=self._encoding)
+        try:
+            p.get_tag("title")
+        except pullparser.NoMoreTokensError:
+            return None
+        else:
+            return p.get_text()
 
 
 def unescape(data, entities, encoding):
@@ -288,6 +295,13 @@
     sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
     class MechanizeBs(BeautifulSoup.BeautifulSoup):
         _entitydefs = get_entitydefs()
+        # don't want the magic Microsoft-char workaround
+        PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
+                           lambda(x):x.group(1) + ' />'),
+                          (re.compile('<!\s+([^<>]*)>'),
+                           lambda(x):'<!' + x.group(1) + '>')
+                          ]
+
         def __init__(self, encoding, text=None, avoidParserProblems=True,
                      initialTextIsEverything=True):
             self._encoding = encoding
@@ -329,11 +343,20 @@
                 "iframe": "src",
                 }
         self.urltags = urltags
+        self._bs = None
+        self._encoding = None
+        self._base_url = None
+
+    def set_soup(self, soup, base_url, encoding):
+        self._bs = soup
+        self._base_url = base_url
+        self._encoding = encoding
 
-    def links(self, fh, base_url, encoding=None):
+    def links(self):
         import BeautifulSoup
-        data = fh.read()
-        bs = self.link_parser_class(encoding, data)
+        bs = self._bs
+        base_url = self._base_url
+        encoding = self._encoding
         gen = bs.recursiveChildGenerator()
         for ch in bs.recursiveChildGenerator():
             if (isinstance(ch, BeautifulSoup.Tag) and
@@ -342,7 +365,7 @@
                 attrs = bs.unescape_attrs(link.attrs)
                 attrs_dict = dict(attrs)
                 if link.name == "base":
-                    base_url = attrs_dict.get("href")
+                    base_url = attrs_dict.get("href").encode(encoding)
                     continue
                 url_attr = self.urltags[link.name]
                 url = attrs_dict.get(url_attr)
@@ -358,7 +381,9 @@
                         text = None
                 else:
                     text = self.compress_re.sub(" ", text.strip())
-                yield Link(base_url, url, text, link.name, attrs)
+                    text = text.encode(encoding)
+                linkname = link.name.encode(encoding)
+                yield Link(base_url, url, text, linkname, attrs)
 
 
 class RobustFormsFactory(FormsFactory):
@@ -369,15 +394,26 @@
             args.form_parser_class = ClientForm.RobustFormParser
         FormsFactory.__init__(self, **args.dictionary)
 
-def bs_get_title(response, encoding):
-    import BeautifulSoup
-    # XXXX encoding
-    bs = BeautifulSoup.BeautifulSoup(response.read())
-    title = bs.first("title")
-    if title == BeautifulSoup.Null:
-        return None
-    else:
-        return title.firstText(lambda t: True)
+    def set_response(self, response, encoding):
+        self._response = response
+        self.encoding = encoding
+
+
+class RobustTitleFactory:
+    def __init__(self):
+        self._bs = self._encoding = None
+
+    def set_soup(self, soup, encoding):
+        self._bs = soup
+        self._encoding = encoding
+
+    def title(soup):
+        import BeautifulSoup
+        title = self._bs.first("title")
+        if title == BeautifulSoup.Null:
+            return None
+        else:
+            return title.firstText(lambda t: True)
 
 
 class Factory:
@@ -403,7 +439,7 @@
 
     """
 
-    def __init__(self, forms_factory, links_factory, get_title,
+    def __init__(self, forms_factory, links_factory, title_factory,
                  get_encoding=encoding_finder(DEFAULT_ENCODING),
                  is_html_p=make_is_html(allow_xhtml=False),
                  ):
@@ -420,7 +456,7 @@
         """
         self._forms_factory = forms_factory
         self._links_factory = links_factory
-        self._get_title = get_title
+        self._title_factory = title_factory
         self._get_encoding = get_encoding
         self._is_html_p = is_html_p
 
@@ -444,6 +480,7 @@
         """
         self._response = response
         self._forms_genf = self._links_genf = None
+        self._get_title = None
         for name in ["encoding", "is_html", "title"]:
             try:
                 delattr(self, name)
@@ -463,7 +500,7 @@
                 return self.is_html
             elif name == "title":
                 if self.is_html:
-                    self.title = self._get_title(self._response, self.encoding)
+                    self.title = self._title_factory.title()
                 else:
                     self.title = None
                 return self.title
@@ -473,35 +510,60 @@
     def forms(self):
         """Return iterable over ClientForm.HTMLForm-like objects."""
         if self._forms_genf is None:
-            forms_gen = self._forms_factory.parse_response(
-                copy.copy(self._response), self.encoding)
-            self._forms_genf = CachingGeneratorFunction(forms_gen)
+            self._forms_genf = CachingGeneratorFunction(
+                self._forms_factory.forms())
         return self._forms_genf()
 
     def links(self):
         """Return iterable over mechanize.Link-like objects."""
         if self._links_genf is None:
-            links_gen = self._links_factory.links(
-                copy.copy(self._response), self._response.geturl(), self.encoding)
-            self._links_genf = CachingGeneratorFunction(links_gen)
+            self._links_genf = CachingGeneratorFunction(
+                self._links_factory.links())
         return self._links_genf()
 
 class DefaultFactory(Factory):
+    """Based on sgmllib."""
     def __init__(self, i_want_broken_xhtml_support=False):
         Factory.__init__(
             self,
             forms_factory=FormsFactory(),
             links_factory=LinksFactory(),
-            get_title=pp_get_title,
+            title_factory=TitleFactory(),
             is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
             )
 
+    def set_response(self, response):
+        Factory.set_response(self, response)
+        if response is not None:
+            self._forms_factory.set_response(
+                copy.copy(response), self.encoding)
+            self._links_factory.set_response(
+                copy.copy(response), self._response.geturl(), self.encoding)
+            self._title_factory.set_response(
+                copy.copy(response), self.encoding)
+
 class RobustFactory(Factory):
+    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
+    DefaultFactory.
+
+    """
     def __init__(self, i_want_broken_xhtml_support=False):
         Factory.__init__(
             self,
             forms_factory=RobustFormsFactory(),
             links_factory=RobustLinksFactory(),
-            get_title=bs_get_title,
+            title_factory=RobustTitleFactory(),
             is_html_p=make_is_html(allow_xhtml=i_want_broken_xhtml_support),
             )
+        self._bs = None
+
+    def set_response(self, response):
+        import BeautifulSoup
+        Factory.set_response(self, response)
+        if response is not None:
+            data = response.read()
+            self._bs = self.link_parser_class(self.encoding, data)
+            self._forms_factory.set_response(response, self.encoding)
+            self._links_factory.set_soup(
+                soup, response.geturl(), self.encoding)
+            self._title_factory.set_soup(soup, self.encoding)


More information about the wwwsearch-commits mailing list