[wwwsearch-commits] r17026 - in wwwsearch/mechanize/trunk: . mechanize

jjlee at codespeak.net jjlee at codespeak.net
Mon Aug 29 00:44:35 CEST 2005


Author: jjlee
Date: Mon Aug 29 00:44:33 2005
New Revision: 17026

Modified:
   wwwsearch/mechanize/trunk/mechanize/_mechanize.py
   wwwsearch/mechanize/trunk/test.py
Log:
Apply all of Stephan Richter's patch (an optimisation, plus avoiding use in the tests of a feature deprecated in ClientForm 0.2) except for the .back() fixes -- those need more work

Modified: wwwsearch/mechanize/trunk/mechanize/_mechanize.py
==============================================================================
--- wwwsearch/mechanize/trunk/mechanize/_mechanize.py	(original)
+++ wwwsearch/mechanize/trunk/mechanize/_mechanize.py	Mon Aug 29 00:44:33 2005
@@ -16,6 +16,8 @@
 # The stuff on web page's todo list.
 # Moof's emails about response object, .back(), etc.
 
+from __future__ import generators
+
 import urllib2, urlparse, re, sys
 
 import ClientCookie
@@ -256,8 +258,53 @@
             raise BrowserStateError("not viewing HTML")
         if kwds:
             return self._find_links(False, **kwds)
+        if self._links is None:
+            try:
+                self._links = list(self.get_links_iter())
+            finally:
+                self._response.seek(0)
         return list(self._links)
 
+    def get_links_iter(self):
+        """Return an iterator that provides links of the document."""
+        base = self._response.geturl()
+        self._response.seek(0)
+        p = pullparser.PullParser(
+            self._response, encoding=self._encoding(self._response))
+
+        for token in p.tags(*(self.urltags.keys()+["base"])):
+            if token.data == "base":
+                base = dict(token.attrs).get("href")
+                continue
+            if token.type == "endtag":
+                continue
+            attrs = dict(token.attrs)
+            tag = token.data
+            name = attrs.get("name")
+            text = None
+            url = attrs.get(self.urltags[tag])
+            if tag == "a":
+                if token.type != "startendtag":
+                    # XXX hmm, this'd break if end tag is missing
+                    text = p.get_compressed_text(("endtag", tag))
+                # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
+                #text = p.get_compressed_text()
+                # This is a hack from WWW::Mechanize to get some really basic
+                # JavaScript working, which I'm not yet convinced is a good
+                # idea.
+##                 onClick = attrs["onclick"]
+##                 m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
+##                 if onClick and m:
+##                     url = m.group(1)
+            if not url:
+                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
+                # For our purposes a link is something with a URL, so ignore
+                # this.
+                continue
+
+            yield Link(base, url, text, tag, token.attrs)
+
+
     def forms(self):
         """Return iterable over forms.
 
@@ -266,6 +313,13 @@
         """
         if not self.viewing_html():
             raise BrowserStateError("not viewing HTML")
+        if self._forms is None:
+            response = self._response
+            response.seek(0)
+            try:
+                self._forms = self._forms_factory.parse_response(response)
+            finally:
+                response.seek(0)
         return self._forms
 
     def viewing_html(self):
@@ -326,7 +380,7 @@
                 "at least one argument must be supplied to specify form")
 
         orig_nr = nr
-        for form in self._forms:
+        for form in self.forms():
             if name is not None and name != form.name:
                 continue
             if predicate is not None and not predicate(form):
@@ -484,10 +538,22 @@
         if not self.viewing_html():
             raise BrowserStateError("not viewing HTML")
 
-        links = []
+        found_links = []
         orig_nr = nr
 
-        for link in self._links:
+        # An optimization, so that if we look for a single link we do not have
+        # to necessarily parse the entire file.
+        if self._links is None and single:
+            all_links = self.get_links_iter()
+        else:
+            if self._links is None:
+                try:
+                    self._links = list(self.get_links_iter())
+                finally:
+                    self._response.seek(0)
+            all_links = self._links
+
+        for link in all_links:
             if url is not None and url != link.url:
                 continue
             if url_regex is not None and not url_regex.search(link.url):
@@ -514,11 +580,11 @@
             if single:
                 return link
             else:
-                links.append(link)
+                found_links.append(link)
                 nr = orig_nr
-        if not links:
+        if not found_links:
             raise LinkNotFoundError()
-        return links
+        return found_links
 
     def _encoding(self, response):
         # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
@@ -536,50 +602,4 @@
         if not self.viewing_html():
             # nothing to see here
             return
-        try:
-            self._forms = self._forms_factory.parse_response(response)
-        finally:
-            response.seek(0)
-        try:
-            self._links = self._extract_links(response)
-        finally:
-            response.seek(0)
-
-    def _extract_links(self, response):
-        base = response.geturl()
-        p = pullparser.TolerantPullParser(
-            response, encoding=self._encoding(response))
-        links = []
-        for token in p.tags(*(self.urltags.keys()+["base"])):
-            if token.data == "base":
-                base = dict(token.attrs).get("href")
-                continue
-            if token.type == "endtag":
-                continue
-            attrs = dict(token.attrs)
-            tag = token.data
-            name = attrs.get("name")
-            text = None
-            url = attrs.get(self.urltags[tag])
-            if tag == "a":
-                if token.type != "startendtag":
-                    # XXX hmm, this'd break if end tag is missing
-                    text = p.get_compressed_text(("endtag", tag))
-                # but this doesn't work for eg. <a href="blah"><b>Andy</b></a>
-                #text = p.get_compressed_text()
-                # This is a hack from WWW::Mechanize to get some really basic
-                # JavaScript working, which I'm not yet convinced is a good
-                # idea.
-##                 onClick = attrs["onclick"]
-##                 m = re.search(r"/^window\.open\(\s*'([^']+)'/", onClick)
-##                 if onClick and m:
-##                     url = m.group(1)
-            if not url:
-                # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
-                # For our purposes a link is something with a URL, so ignore
-                # this.
-                continue
-
-            link = Link(base, url, text, tag, token.attrs)
-            links.append(link)
-        return links
+        self._forms = self._links = None

Modified: wwwsearch/mechanize/trunk/test.py
==============================================================================
--- wwwsearch/mechanize/trunk/test.py	(original)
+++ wwwsearch/mechanize/trunk/test.py	Mon Aug 29 00:44:33 2005
@@ -257,7 +257,9 @@
         self.assertRaises(AttributeError, getattr, b, "possible_items")
         b.select_form("form1")
         # now unknown methods are fed through to selected ClientForm.HTMLForm
-        self.assertEqual(b.possible_items("cheeses"), ["cheddar", "edam"])
+        self.assertEqual(
+            [i.name for i in b.find_control('cheeses').items],
+            ["cheddar", "edam"])
         b["cheeses"] = ["cheddar", "edam"]
         self.assertEqual(b.click_pairs(), [
             ("cheeses", "cheddar"), ("cheeses", "edam"), ("one", "")])


More information about the wwwsearch-commits mailing list