[Lxml-checkins] r54353 - in lxml/trunk: . src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Fri May 2 21:56:35 CEST 2008
Author: scoder
Date: Fri May 2 21:56:34 2008
New Revision: 54353
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/html/__init__.py
lxml/trunk/src/lxml/html/clean.py
lxml/trunk/src/lxml/html/formfill.py
Log:
r4141 at delle: sbehnel | 2008-05-02 21:47:32 +0200
support XHTML tags in XPath expressions of lxml.html
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Fri May 2 21:56:34 2008
@@ -2,6 +2,21 @@
lxml changelog
==============
+Under development
+=================
+
+Features added
+--------------
+
+* Most features in lxml.html work for XHTML namespaced tag names.
+
+Bugs fixed
+----------
+
+Other changes
+-------------
+
+
2.1beta2 (2008-05-02)
=====================
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Fri May 2 21:56:34 2008
@@ -22,16 +22,30 @@
'find_rel_links', 'find_class', 'make_links_absolute',
'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
-_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
+ namespaces={'x':XHTML_NAMESPACE})
+_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
+ namespaces={'x':XHTML_NAMESPACE})
+_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
+ namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
-_label_xpath = etree.XPath("//label[@for=$id]")
+_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
+ namespaces={'x':XHTML_NAMESPACE})
_archive_re = re.compile(r'[^ ]+')
+def _nons(tag):
+ if isinstance(tag, basestring):
+ if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
+ return tag.split('}')[-1]
+ return tag
+
class HtmlMixin(object):
def base_url(self):
@@ -48,7 +62,7 @@
"""
Return a list of all the forms
"""
- return list(self.getiterator('form'))
+ return _forms_xpath(self)
forms = property(forms, doc=forms.__doc__)
def body(self):
@@ -56,7 +70,7 @@
Return the <body> element. Can be called from a child element
to get the document's head.
"""
- return self.xpath('//body')[0]
+ return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
body = property(body, doc=body.__doc__)
def head(self):
@@ -64,7 +78,7 @@
Returns the <head> element. Can be called from a child
element to get the document's head.
"""
- return self.xpath('//head')[0]
+ return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
@@ -85,7 +99,7 @@
raise TypeError(
"You cannot set a label for an element (%r) that has no id"
% self)
- if not label.tag == 'label':
+ if _nons(label.tag) != 'label':
raise TypeError(
"You can only assign label to a label element (not %r)"
% label)
@@ -228,7 +242,7 @@
tag once it has been applied.
"""
base_href = None
- basetags = self.xpath('//base[@href]')
+ basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
for b in basetags:
base_href = b.get('href')
b.drop_tree()
@@ -249,11 +263,12 @@
link_attrs = defs.link_attrs
for el in self.getiterator():
attribs = el.attrib
- if el.tag != 'object':
+ tag = _nons(el.tag)
+ if tag != 'object':
for attrib in link_attrs:
if attrib in attribs:
yield (el, attrib, attribs[attrib], 0)
- elif el.tag == 'object':
+ elif tag == 'object':
codebase = None
## <object> tags have attributes that are relative to
## codebase
@@ -272,7 +287,7 @@
if codebase is not None:
value = urlparse.urljoin(codebase, value)
yield (el, 'archive', value, match.start())
- if el.tag == 'param':
+ if tag == 'param':
valuetype = el.get('valuetype') or ''
if valuetype.lower() == 'ref':
## FIXME: while it's fine we *find* this link,
@@ -282,7 +297,7 @@
## doesn't have a valuetype="ref" (which seems to be the norm)
## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
yield (el, 'value', el.get('value'), 0)
- if el.tag == 'style' and el.text:
+ if tag == 'style' and el.text:
for match in _css_url_re.finditer(el.text):
yield (el, None, match.group(1), match.start(1))
for match in _css_import_re.finditer(el.text):
@@ -471,8 +486,8 @@
if not start.startswith('<html') and not start.startswith('<!doctype'):
html = '<html><body>%s</body></html>' % html
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
- assert doc.tag == 'html'
- bodies = [e for e in doc if e.tag == 'body']
+ assert _nons(doc.tag) == 'html'
+ bodies = [e for e in doc if _nons(e.tag) == 'body']
assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
body = bodies[0]
elements = []
@@ -540,6 +555,8 @@
# otherwise, lets parse it out...
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
bodies = doc.findall('body')
+ if not bodies:
+ bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
if bodies:
body = bodies[0]
if len(bodies) > 1:
@@ -558,6 +575,8 @@
else:
body = None
heads = doc.findall('head')
+ if not heads:
+ heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
if heads:
# Well, we have some sort of structure, so lets keep it all
head = heads[0]
@@ -598,7 +617,7 @@
# FIXME: I could do this with XPath, but would that just be
# unnecessarily slow?
for el in el.getiterator():
- if el.tag in defs.block_tags:
+ if _nons(el.tag) in defs.block_tags:
return True
return False
@@ -608,7 +627,7 @@
elif isinstance(el, basestring):
return 'string'
else:
- return el.tag
+ return _nons(el.tag)
################################################################################
# form handling
@@ -655,7 +674,10 @@
return self.get('name')
elif self.get('id'):
return '#' + self.get('id')
- return str(self.body.findall('form').index(self))
+ forms = self.body.findall('form')
+ if not forms:
+ forms = self.body.findall('{%s}form' % XHTML_NAMESPACE)
+ return str(forms.index(self))
def form_values(self):
"""
@@ -667,9 +689,10 @@
name = el.name
if not name:
continue
- if el.tag == 'textarea':
+ tag = _nons(el.tag)
+ if tag == 'textarea':
results.append((name, el.value))
- elif el.tag == 'select':
+ elif tag == 'select':
value = el.value
if el.multiple:
for v in value:
@@ -677,7 +700,7 @@
elif value is not None:
results.append((name, el.value))
else:
- assert el.tag == 'input', (
+ assert tag == 'input', (
"Unexpected tag: %r" % el)
if el.checkable and not el.checked:
continue
@@ -801,8 +824,8 @@
checkboxes and radio elements are returned individually.
"""
- _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
- _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")
+ _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
+ _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")
def __init__(self, form):
self.form = form
@@ -919,7 +942,7 @@
"""
if self.multiple:
return MultipleSelectOptions(self)
- for el in self.getiterator('option'):
+ for el in _options_xpath(self):
if 'selected' in el.attrib:
value = el.get('value')
# FIXME: If value is None, what to return?, get_text()?
@@ -935,7 +958,7 @@
self.value.update(value)
return
if value is not None:
- for el in self.getiterator('option'):
+ for el in _options_xpath(self):
# FIXME: also if el.get('value') is None?
if el.get('value') == value:
checked_option = el
@@ -943,7 +966,7 @@
else:
raise ValueError(
"There is no option with the value of %r" % value)
- for el in self.getiterator('option'):
+ for el in _options_xpath(self):
if 'selected' in el.attrib:
del el.attrib['selected']
if value is not None:
@@ -963,7 +986,7 @@
All the possible values this select can have (the ``value``
attribute of all the ``<option>`` elements.
"""
- return [el.get('value') for el in self.getiterator('option')]
+ return [el.get('value') for el in _options_xpath(self)]
value_options = property(value_options, doc=value_options.__doc__)
def _multiple__get(self):
@@ -995,7 +1018,7 @@
"""
Iterator of all the ``<option>`` elements.
"""
- return self.select.getiterator('option')
+ return iter(_options_xpath(self.select))
options = property(options)
def __iter__(self):
Modified: lxml/trunk/src/lxml/html/clean.py
==============================================================================
--- lxml/trunk/src/lxml/html/clean.py (original)
+++ lxml/trunk/src/lxml/html/clean.py Fri May 2 21:56:34 2008
@@ -9,7 +9,7 @@
import urlparse
from lxml import etree
from lxml.html import defs
-from lxml.html import fromstring, tostring
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE, _nons
try:
set
@@ -62,7 +62,9 @@
"descendant-or-self::*[@style]")
_find_external_links = etree.XPath(
- "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
+ ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+ "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+ namespaces={'x':XHTML_NAMESPACE})
class Cleaner(object):
"""
@@ -201,6 +203,11 @@
if hasattr(doc, 'getroot'):
# ElementTree instance, instead of an element
doc = doc.getroot()
+ # convert XHTML to HTML
+ for el in doc.iter():
+ tag = el.tag
+ if isinstance(tag, basestring):
+ el.tag = _nons(tag)
# Normalize a case that IE treats <image> like <img>, and that
# can confuse either this step or later steps.
for el in doc.iter('image'):
Modified: lxml/trunk/src/lxml/html/formfill.py
==============================================================================
--- lxml/trunk/src/lxml/html/formfill.py (original)
+++ lxml/trunk/src/lxml/html/formfill.py Fri May 2 21:56:34 2008
@@ -1,5 +1,6 @@
from lxml.etree import XPath, ElementBase
-from lxml.html import fromstring, tostring
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import _forms_xpath, _options_xpath, _nons
from lxml.html import defs
__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
@@ -11,9 +12,11 @@
Raised when no form can be found
"""
-_form_name_xpath = XPath('descendant-or-self::form[name=$name]')
-_input_xpath = XPath('descendant-or-self::input | descendant-or-self::select | descendant-or-self::textarea')
-_label_for_xpath = XPath('//label[@for=$for_id]')
+_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
+_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
+ namespaces={'x':XHTML_NAMESPACE})
+_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
+ namespaces={'x':XHTML_NAMESPACE})
_name_xpath = XPath('descendant-or-self::*[@name=$name]')
def fill_form(
@@ -69,7 +72,7 @@
_fill_single(input, value)
def _takes_multiple(input):
- if input.tag == 'select' and input.get('multiple'):
+ if _nons(input.tag) == 'select' and input.get('multiple'):
# FIXME: multiple="0"?
return True
type = input.get('type', '').lower()
@@ -96,8 +99,8 @@
v = input.get('value')
_check(input, v in value)
else:
- assert input.tag == 'select'
- for option in input.findall('option'):
+ assert _nons(input.tag) == 'select'
+ for option in _options_xpath(input):
v = option.get('value')
if v is None:
# This seems to be the default, at least on IE
@@ -120,7 +123,7 @@
del el.attrib['selected']
def _fill_single(input, value):
- if input.tag == 'textarea':
+ if _nons(input.tag) == 'textarea':
input.clear()
input.text = value
else:
@@ -128,7 +131,7 @@
def _find_form(el, form_id=None, form_index=None):
if form_id is None and form_index is None:
- forms = el.getiterator('form')
+ forms = _forms_xpath(el)
for form in forms:
return form
raise FormNotFound(
@@ -145,7 +148,7 @@
"No form with the name or id of %r (forms: %s)"
% (id, ', '.join(_find_form_ids(el))))
if form_index is not None:
- forms = el.getiterator('form')
+ forms = _forms_xpath(el)
try:
return forms[form_index]
except IndexError:
@@ -154,7 +157,7 @@
% (form_index, len(forms)))
def _find_form_ids(el):
- forms = el.getiterator('form')
+ forms = _forms_xpath(el)
if not forms:
yield '(no forms)'
return
@@ -254,11 +257,11 @@
return doc
def _insert_error(el, error, error_class, error_creator):
- if el.tag in defs.empty_tags or el.tag == 'textarea':
+ if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
is_block = False
else:
is_block = True
- if el.tag != 'form' and error_class:
+ if _nons(el.tag) != 'form' and error_class:
_add_class(el, error_class)
if el.get('id'):
labels = _label_for_xpath(el, for_id=el.get('id'))
More information about the lxml-checkins mailing list