[Lxml-checkins] r43884 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Tue May 29 22:42:18 CEST 2007
Author: ianb
Date: Tue May 29 22:42:18 2007
New Revision: 43884
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/rewritelinks.py
Log:
Added get_element_by_id and text_only; some comments for TODOs
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Tue May 29 22:42:18 2007
@@ -7,6 +7,7 @@
_rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
class HtmlMixin(object):
@@ -58,6 +59,27 @@
def find_class(self, class_name):
return _class_xpath(self, class_name=class_name.lower())
+ def get_element_by_id(self, id, default=None):
+ # FIXME: should this raise an exception when something isn't found?
+ try:
+ # FIXME: should this check for multiple matches?
+ # browsers just return the first one
+ return _id_xpath(self, id=id)[0]
+ except IndexError:
+ return default
+
+ def text_only(self, with_tail=False):
+ """
+ Return the text inside this element, without any tags. If with_tail
+ is true, then also include the text that follows this tag.
+ """
+ parts = [self.text or '']
+ for child in self:
+ parts.append(child.text_only(with_tail=True))
+ if with_tail:
+ parts.append(self.tail or '')
+ return ''.join(parts)
+
class HtmlComment(etree._Comment, HtmlMixin):
pass
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Tue May 29 22:42:18 2007
@@ -86,6 +86,8 @@
del el.attrib[attrib]
for attrib in defs.link_attrs:
# FIXME: should call lower-case()
+ # FIXME: starts-with isn't really good either, because
+ # href=" javascript:..." is also a problem
for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib):
if isinstance(el, basestring):
assert 0, repr(el)
Modified: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- lxml/branch/html/src/lxml/html/rewritelinks.py (original)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py Tue May 29 22:42:18 2007
@@ -39,6 +39,7 @@
if remove_base_tags:
resolve_base_href(doc)
+ # FIXME: should use defs.link_attrs
for attrib in 'href', 'src':
els = doc.xpath('//*[@%s]' % attrib)
for el in els:
More information about the lxml-checkins mailing list