[Lxml-checkins] r43900 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Wed May 30 17:59:35 CEST 2007
Author: ianb
Date: Wed May 30 17:59:35 2007
New Revision: 43900
Modified:
lxml/branch/html/src/lxml/html/__init__.py
Log:
Rename text_only to get_text_content. Remove the unnecessary <head> from the fragment parsing. Add some docstrings.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Wed May 30 17:59:35 2007
@@ -32,6 +32,13 @@
"""
Remove the tag, but not its children or text. The children and text
are merged into the parent.
+
+ Example::
+
+ >>> h = parse_element('<div>Hello <b>World!</b></div>')
+ >>> h.xpath('//b')[0].drop_tag()
+ >>> print tostring(h)
+ <div>Hello World!</div>
"""
parent = self.getparent()
assert parent
@@ -54,12 +61,27 @@
parent[index:index+1] = list(self)
def find_rel_links(self, rel):
+ """
+ Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
+ """
return _rel_links_xpath(self, rel=rel.lower())
def find_class(self, class_name):
+ """
+ Find any elements with the given class name.
+ """
return _class_xpath(self, class_name=class_name.lower())
def get_element_by_id(self, id, default=None):
+ """
+ Get the first element in a document with the given id. If
+ none are found, return default (None).
+
+ Note that there can be more than one element with the same id,
+ and this isn't uncommon in HTML documents found in the wild.
+ Browsers return only the first match, and this function does
+ the same.
+ """
# FIXME: should this raise an exception when something isn't found?
try:
# FIXME: should this check for multiple matches?
@@ -68,17 +90,11 @@
except IndexError:
return default
- def text_only(self, with_tail=False):
+ def get_text_content(self):
"""
- Return the text inside this element, without any tags. If with_tail
- is true, then also include the text that follows this tag.
+ Return the text content of the tag (and the text in any children).
"""
- parts = [self.text or '']
- for child in self:
- parts.append(child.text_only(with_tail=True))
- if with_tail:
- parts.append(self.tail or '')
- return ''.join(parts)
+ return self.xpath("string()")
class HtmlComment(etree.CommentBase, HtmlMixin):
pass
@@ -104,10 +120,11 @@
The first item in the list may be a string (though leading
whitespace is removed). If no_leading_text is true, then it will
- be an error if there is leading text.
+ be an error if there is leading text, and it will always be a list
+ of only elements.
"""
# FIXME: check what happens when you give html with a body, head, etc.
- html = '<html><head></head><body>%s</body></html>' % html
+ html = '<html><body>%s</body></html>' % html
doc = HTML(html)
assert doc.tag == 'html'
bodies = [e for e in doc if e.tag == 'body']
@@ -153,7 +170,6 @@
return el
def Element(*args, **kw):
- # FIXME: this is totally broken; segfaults
v = html_parser.makeelement(*args, **kw)
return v
More information about the lxml-checkins mailing list