[Lxml-checkins] r45166 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Tue Jul 17 19:46:23 CEST 2007


Author: ianb
Date: Tue Jul 17 19:46:23 2007
New Revision: 45166

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
Log:
Join form.action to the base_url.  Pass keyword arguments through to the parser (even if I can't get any keyword arguments to work at the moment).  Expose the tree base url (in docinfo.URL) as el.base_url.  Have .make_links_absolute() default to self.base_url.  Change CheckboxGroup.values to .value, so it's the same as all the other elements and groups of elements.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Tue Jul 17 19:46:23 2007
@@ -22,6 +22,16 @@
 
 class HtmlMixin(object):
 
+    def base_url(self):
+        """
+        Returns the base URL, given when the page was parsed.
+
+        Use with ``urlparse.urljoin(el.base_url, href)`` to get
+        absolute URLs.
+        """
+        return self.gettreeroot().docinfo.URL
+    base_url = property(base_url, doc=base_url.__doc__)
+
     def forms(self):
         """
         Return a list of all the forms
@@ -175,16 +185,21 @@
     ## Link functions
     ########################################
 
-    def make_links_absolute(self, base_href, resolve_base_href=True):
+    def make_links_absolute(self, base_url=None, resolve_base_href=True):
         """
         Make all links in the document absolute, given the
-        ``base_href`` for the document (the full URL where the
-        document came from).
+        ``base_url`` for the document (the full URL where the document
+        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
 
         If ``resolve_base_href`` is true, then any ``<base href>``
         tags in the document are used *and* removed from the document.
         If it is false then any such tag is ignored.
         """
+        if base_url is None:
+            base_url = self.base_url
+            if base_url is None:
+                raise TypeError(
+                    "No base_url given, and the document has no base_url")
         if resolve_base_href:
             self.resolve_base_href()
         def link_repl(href):
@@ -356,14 +371,14 @@
 html_parser = etree.HTMLParser()
 html_parser.setElementClassLookup(HtmlLookup())
 
-def document_fromstring(html):
+def document_fromstring(html, **kw):
     value = etree.HTML(html, html_parser)
     if value is None:
         raise etree.ParserError(
             "Document is empty")
     return value
 
-def fragments_fromstring(html, no_leading_text=False):
+def fragments_fromstring(html, no_leading_text=False, **kw):
     """
     Parses several HTML elements, returning a list of elements.
 
@@ -376,7 +391,7 @@
     start = html[:20].lstrip().lower()
     if not start.startswith('<html') and not start.startswith('<!doctype'):
         html = '<html><body>%s</body></html>' % html
-    doc = document_fromstring(html)
+    doc = document_fromstring(html, **kw)
     assert doc.tag == 'html'
     bodies = [e for e in doc if e.tag == 'body']
     assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
@@ -392,7 +407,7 @@
     # would be nice
     return elements
 
-def fragment_fromstring(html, create_parent=False):
+def fragment_fromstring(html, create_parent=False, **kw):
     """
     Parses a single HTML element; it is an error if there is more than
     one element, or if anything but whitespace precedes or follows the
@@ -405,7 +420,7 @@
         if not isinstance(create_parent, basestring):
             create_parent = 'div'
         return fragment_fromstring('<%s>%s</%s>' % (
-            create_parent, html, create_parent))
+            create_parent, html, create_parent), **kw)
     elements = fragments_fromstring(html, no_leading_text=True)
     if not elements:
         raise etree.ParserError(
@@ -421,7 +436,7 @@
     el.tail = None
     return el
 
-def fromstring(html):
+def fromstring(html, **kw):
     """
     Parse the html, returning a single element/document.
 
@@ -431,9 +446,9 @@
     start = html[:10].lstrip().lower()
     if start.startswith('<html') or start.startswith('<!doctype'):
         # Looks like a full HTML document
-        return document_fromstring(html)
+        return document_fromstring(html, **kw)
     # otherwise, lets parse it out...
-    doc = document_fromstring(html)
+    doc = document_fromstring(html, **kw)
     bodies = doc.findall('body')
     if bodies:
         body = bodies[0]
@@ -476,11 +491,14 @@
         body.tag = 'span'
     return body
 
-def parse(filename):
+def parse(filename, **kw):
     """
     Parse a filename, URL, or file-like object into an HTML document.
+
+    You may pass the keyword argument ``base_url='http://...'`` to set
+    the base URL.
     """
-    return etree.parse(filename, html_parser)
+    return etree.parse(filename, html_parser, **kw)
 
 def _contains_block_level_tag(el):
     # FIXME: I could do this with XPath, but would that just be
@@ -558,7 +576,12 @@
         """
         Get/set the form's ``action`` attribute.
         """
-        return self.get('action')
+        base_url = self.base_url
+        action = self.get('action')
+        if base_url and action is not None:
+            return urlparse.urljoin(base_url, action)
+        else:
+            return action
     def action__set(self, value):
         self.set('action', value)
     def action__del(self):
@@ -902,25 +925,24 @@
     Represents a group of checkboxes (``<input type=checkbox>``) that
     have the same name.
 
-    In addition to using this like a list, the ``.values`` attribute
+    In addition to using this like a list, the ``.value`` attribute
     returns a set-like object that you can add to or remove from to
     check and uncheck checkboxes.  You can also use ``.value_options``
     to get the possible values.
     """
 
-    ## FIXME: should this be named .value?
-    def values__get(self):
+    def value__get(self):
         """
         Return a set-like object that can be modified to check or
         uncheck individual checkboxes according to their value.
         """
         return CheckboxValues(self)
-    def values__set(self, value):
-        self.values.clear()
-        self.values |= value
-    def values__del(self):
-        self.values.clear()
-    values = property(values__get, values__set, values__del, doc=values__get.__doc__)
+    def value__set(self, value):
+        self.value.clear()
+        self.value |= value
+    def value__del(self):
+        self.value.clear()
+    value = property(value__get, value__set, value__del, doc=value__get.__doc__)
 
     def __repr__(self):
         return '%s(%s)' % (


More information about the lxml-checkins mailing list