[Lxml-checkins] r45166 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Tue Jul 17 19:46:23 CEST 2007
Author: ianb
Date: Tue Jul 17 19:46:23 2007
New Revision: 45166
Modified:
lxml/branch/html/src/lxml/html/__init__.py
Log:
Join form.action to the base_url. Pass keyword arguments through to the parser (even if I can't get any keyword arguments to work at the moment). Expose the tree's base URL (in docinfo.URL) as el.base_url. Have .make_links_absolute() default to self.base_url. Rename CheckboxGroup.values to .value, so it's the same as all the other elements and groups of elements.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Tue Jul 17 19:46:23 2007
@@ -22,6 +22,16 @@
class HtmlMixin(object):
+ def base_url(self):
+ """
+ Returns the base URL, given when the page was parsed.
+
+ Use with ``urlparse.urljoin(el.base_url, href)`` to get
+ absolute URLs.
+ """
+ return self.gettreeroot().docinfo.URL
+ base_url = property(base_url, doc=base_url.__doc__)
+
def forms(self):
"""
Return a list of all the forms
@@ -175,16 +185,21 @@
## Link functions
########################################
- def make_links_absolute(self, base_href, resolve_base_href=True):
+ def make_links_absolute(self, base_url=None, resolve_base_href=True):
"""
Make all links in the document absolute, given the
- ``base_href`` for the document (the full URL where the
- document came from).
+ ``base_url`` for the document (the full URL where the document
+ came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
If ``resolve_base_href`` is true, then any ``<base href>``
tags in the document are used *and* removed from the document.
If it is false then any such tag is ignored.
"""
+ if base_url is None:
+ base_url = self.base_url
+ if base_url is None:
+ raise TypeError(
+ "No base_url given, and the document has no base_url")
if resolve_base_href:
self.resolve_base_href()
def link_repl(href):
@@ -356,14 +371,14 @@
html_parser = etree.HTMLParser()
html_parser.setElementClassLookup(HtmlLookup())
-def document_fromstring(html):
+def document_fromstring(html, **kw):
value = etree.HTML(html, html_parser)
if value is None:
raise etree.ParserError(
"Document is empty")
return value
-def fragments_fromstring(html, no_leading_text=False):
+def fragments_fromstring(html, no_leading_text=False, **kw):
"""
Parses several HTML elements, returning a list of elements.
@@ -376,7 +391,7 @@
start = html[:20].lstrip().lower()
if not start.startswith('<html') and not start.startswith('<!doctype'):
html = '<html><body>%s</body></html>' % html
- doc = document_fromstring(html)
+ doc = document_fromstring(html, **kw)
assert doc.tag == 'html'
bodies = [e for e in doc if e.tag == 'body']
assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
@@ -392,7 +407,7 @@
# would be nice
return elements
-def fragment_fromstring(html, create_parent=False):
+def fragment_fromstring(html, create_parent=False, **kw):
"""
Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
@@ -405,7 +420,7 @@
if not isinstance(create_parent, basestring):
create_parent = 'div'
return fragment_fromstring('<%s>%s</%s>' % (
- create_parent, html, create_parent))
+ create_parent, html, create_parent), **kw)
elements = fragments_fromstring(html, no_leading_text=True)
if not elements:
raise etree.ParserError(
@@ -421,7 +436,7 @@
el.tail = None
return el
-def fromstring(html):
+def fromstring(html, **kw):
"""
Parse the html, returning a single element/document.
@@ -431,9 +446,9 @@
start = html[:10].lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
# Looks like a full HTML document
- return document_fromstring(html)
+ return document_fromstring(html, **kw)
# otherwise, lets parse it out...
- doc = document_fromstring(html)
+ doc = document_fromstring(html, **kw)
bodies = doc.findall('body')
if bodies:
body = bodies[0]
@@ -476,11 +491,14 @@
body.tag = 'span'
return body
-def parse(filename):
+def parse(filename, **kw):
"""
Parse a filename, URL, or file-like object into an HTML document.
+
+ You may pass the keyword argument ``base_url='http://...'`` to set
+ the base URL.
"""
- return etree.parse(filename, html_parser)
+ return etree.parse(filename, html_parser, **kw)
def _contains_block_level_tag(el):
# FIXME: I could do this with XPath, but would that just be
@@ -558,7 +576,12 @@
"""
Get/set the form's ``action`` attribute.
"""
- return self.get('action')
+ base_url = self.base_url
+ action = self.get('action')
+ if base_url and action is not None:
+ return urlparse.urljoin(base_url, action)
+ else:
+ return action
def action__set(self, value):
self.set('action', value)
def action__del(self):
@@ -902,25 +925,24 @@
Represents a group of checkboxes (``<input type=checkbox>``) that
have the same name.
- In addition to using this like a list, the ``.values`` attribute
+ In addition to using this like a list, the ``.value`` attribute
returns a set-like object that you can add to or remove from to
check and uncheck checkboxes. You can also use ``.value_options``
to get the possible values.
"""
- ## FIXME: should this be named .value?
- def values__get(self):
+ def value__get(self):
"""
Return a set-like object that can be modified to check or
uncheck individual checkboxes according to their value.
"""
return CheckboxValues(self)
- def values__set(self, value):
- self.values.clear()
- self.values |= value
- def values__del(self):
- self.values.clear()
- values = property(values__get, values__set, values__del, doc=values__get.__doc__)
+ def value__set(self, value):
+ self.value.clear()
+ self.value |= value
+ def value__del(self):
+ self.value.clear()
+ value = property(value__get, value__set, value__del, doc=value__get.__doc__)
def __repr__(self):
return '%s(%s)' % (
More information about the lxml-checkins
mailing list