[Lxml-checkins] r43900 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Wed May 30 17:59:35 CEST 2007


Author: ianb
Date: Wed May 30 17:59:35 2007
New Revision: 43900

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
Log:
Rename text_only to get_text_content.  Remove unnecessary <head> from the fragment parsing.  Add some docstrings.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Wed May 30 17:59:35 2007
@@ -32,6 +32,13 @@
         """
         Remove the tag, but not its children or text.  The children and text
         are merged into the parent.
+
+        Example::
+
+            >>> h = parse_element('<div>Hello <b>World!</b></div>')
+            >>> h.xpath('//b')[0].drop_tag()
+            >>> print tostring(h)
+            <div>Hello World!</div>
         """
         parent = self.getparent()
         assert parent
@@ -54,12 +61,27 @@
         parent[index:index+1] = list(self)
 
     def find_rel_links(self, rel):
+        """
+        Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
+        """
         return _rel_links_xpath(self, rel=rel.lower())
 
     def find_class(self, class_name):
+        """
+        Find any elements with the given class name.
+        """
         return _class_xpath(self, class_name=class_name.lower())
 
     def get_element_by_id(self, id, default=None):
+        """
+        Get the first element in a document with the given id.  If
+        none are found, return default (None).
+
+        Note that there can be more than one element with the same id,
+        and this isn't uncommon in HTML documents found in the wild.
+        Browsers return only the first match, and this function does
+        the same.
+        """
         # FIXME: should this raise an exception when something isn't found?
         try:
             # FIXME: should this check for multiple matches?
@@ -68,17 +90,11 @@
         except IndexError:
             return default
 
-    def text_only(self, with_tail=False):
+    def get_text_content(self):
         """
-        Return the text inside this element, without any tags.  If with_tail
-        is true, then also include the text that follows this tag.
+        Return the text content of the tag (and the text in any children).
         """
-        parts = [self.text or '']
-        for child in self:
-            parts.append(child.text_only(with_tail=True))
-        if with_tail:
-            parts.append(self.tail or '')
-        return ''.join(parts)
+        return self.xpath("string()")
 
 class HtmlComment(etree.CommentBase, HtmlMixin):
     pass
@@ -104,10 +120,11 @@
 
     The first item in the list may be a string (though leading
     whitespace is removed).  If no_leading_text is true, then it will
-    be an error if there is leading text.
+    be an error if there is leading text, and the result will always be a
+    list of only elements.
     """
     # FIXME: check what happens when you give html with a body, head, etc.
-    html = '<html><head></head><body>%s</body></html>' % html
+    html = '<html><body>%s</body></html>' % html
     doc = HTML(html)
     assert doc.tag == 'html'
     bodies = [e for e in doc if e.tag == 'body']
@@ -153,7 +170,6 @@
     return el
 
 def Element(*args, **kw):
-    # FIXME: this is totally broken; segfaults
     v = html_parser.makeelement(*args, **kw)
     return v
 


More information about the lxml-checkins mailing list