[Lxml-checkins] r43884 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Tue May 29 22:42:18 CEST 2007


Author: ianb
Date: Tue May 29 22:42:18 2007
New Revision: 43884

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/rewritelinks.py
Log:
Added get_element_by_id and text_only; some comments for TODOs

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Tue May 29 22:42:18 2007
@@ -7,6 +7,7 @@
 _rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]")
 #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
 _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
 
 class HtmlMixin(object):
 
@@ -58,6 +59,27 @@
     def find_class(self, class_name):
         return _class_xpath(self, class_name=class_name.lower())
 
+    def get_element_by_id(self, id, default=None):
+        # FIXME: should this raise an exception when something isn't found?
+        try:
+            # FIXME: should this check for multiple matches?
+            # browsers just return the first one
+            return _id_xpath(self, id=id)[0]
+        except IndexError:
+            return default
+
+    def text_only(self, with_tail=False):
+        """
+        Return the text inside this element, without any tags.  If with_tail
+        is true, then also include the text that follows this tag.
+        """
+        parts = [self.text or '']
+        for child in self:
+            parts.append(child.text_only(with_tail=True))
+        if with_tail:
+            parts.append(self.tail or '')
+        return ''.join(parts)
+
 class HtmlComment(etree._Comment, HtmlMixin):
     pass
 

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Tue May 29 22:42:18 2007
@@ -86,6 +86,8 @@
                 del el.attrib[attrib]
         for attrib in defs.link_attrs:
             # FIXME: should call lower-case()
+            # FIXME: starts-with isn't really good either, because
+            #        href="   javascript:..." is also a problem
             for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib):
                 if isinstance(el, basestring):
                     assert 0, repr(el)

Modified: lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- lxml/branch/html/src/lxml/html/rewritelinks.py	(original)
+++ lxml/branch/html/src/lxml/html/rewritelinks.py	Tue May 29 22:42:18 2007
@@ -39,6 +39,7 @@
     if remove_base_tags:
         resolve_base_href(doc)
 
+    # FIXME: should use defs.link_attrs
     for attrib in 'href', 'src':
         els = doc.xpath('//*[@%s]' % attrib)
         for el in els:


More information about the lxml-checkins mailing list