[Lxml-checkins] r47957 - in lxml/trunk: . src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Thu Oct 25 18:25:51 CEST 2007
Author: ianb
Date: Thu Oct 25 18:25:51 2007
New Revision: 47957
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/html/__init__.py
Log:
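Added link parsing for the <object> tag, which has some special rules (its classid, data and archive links are resolved relative to the codebase attribute), including the space-separated archive attribute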
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Oct 25 18:25:51 2007
@@ -30,6 +30,10 @@
output, it will then be namespace-neutral (before the ellipsis was
treated as a real namespace).
+* In the ``lxml.html`` ``iter_links`` method, links in ``<object>``
+ tags weren't recognized. (Note: plugin-specific link parameters
+ still aren't recognized.)
+
Other changes
-------------
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Thu Oct 25 18:25:51 2007
@@ -27,6 +27,7 @@
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]")
+_archive_re = re.compile(r'[^ ]+')
class HtmlMixin(object):
@@ -245,9 +246,39 @@
link_attrs = defs.link_attrs
for el in self.getiterator():
attribs = el.attrib
- for attrib in link_attrs:
- if attrib in attribs:
- yield (el, attrib, attribs[attrib], 0)
+ if el.tag != 'object':
+ for attrib in link_attrs:
+ if attrib in attribs:
+ yield (el, attrib, attribs[attrib], 0)
+ elif el.tag == 'object':
+ codebase = None
+ ## <object> tags have attributes that are relative to
+ ## codebase
+ if 'codebase' in attribs:
+ codebase = el.get('codebase')
+ yield (el, 'codebase', codebase, 0)
+ for attrib in 'classid', 'data':
+ if attrib in attribs:
+ value = el.get(attrib)
+ if codebase is not None:
+ value = urlparse.urljoin(codebase, value)
+ yield (el, attrib, value, 0)
+ if 'archive' in attribs:
+ for match in _archive_re.finditer(el.get('archive')):
+ value = match.group(0)
+ if codebase is not None:
+ value = urlparse.urljoin(codebase, value)
+ yield (el, 'archive', value, match.start())
+ if el.tag == 'param':
+ valuetype = el.get('valuetype') or ''
+ if valuetype.lower() == 'ref':
+ ## FIXME: while it's fine we *find* this link,
+ ## according to the spec we aren't supposed to
+ ## actually change the value, including resolving
+ ## it. It can also still be a link, even if it
+ ## doesn't have a valuetype="ref" (which seems to be the norm)
+ ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
+ yield (el, 'value', el.get('value'), 0)
if el.tag == 'style' and el.text:
for match in _css_url_re.finditer(el.text):
yield (el, None, match.group(1), match.start(1))
More information about the lxml-checkins mailing list