[Lxml-checkins] r44115 - in lxml/branch/html/src/lxml/html: . tests
scoder at codespeak.net
Sat Jun 9 16:44:53 CEST 2007
Author: scoder
Date: Sat Jun 9 16:44:51 2007
New Revision: 44115
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/diff.py
lxml/branch/html/src/lxml/html/tests/test_basic.txt
Log:
Renamed drop_tag() to drop_element() and the former drop_element() to drop_tree(); some further cleanup.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Sat Jun 9 16:44:51 2007
@@ -11,31 +11,31 @@
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
-_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_collect_string_content = etree.XPath("string()")
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
class HtmlMixin(object):
- def drop_element(self):
+ def drop_tree(self):
"""
Removes this element from the tree, including its children and
text. The tail text is joined to the previous element or
parent.
"""
parent = self.getparent()
- assert parent
- index = parent.index(self)
+ assert parent is not None
if self.tail:
- if index == 0:
+ previous = self.getprevious()
+ if previous is None:
parent.text = (parent.text or '') + self.tail
else:
- previous = parent[index-1]
previous.tail = (previous.tail or '') + self.tail
parent.remove(self)
- def drop_tag(self):
+ def drop_element(self):
"""
Remove the tag, but not its children or text. The children and text
are merged into the parent.
@@ -43,29 +43,28 @@
Example::
>>> h = parse_element('<div>Hello <b>World!</b></div>')
- >>> h.xpath('//b')[0].drop_tag()
+ >>> h.find('//b').drop_element()
>>> print tostring(h)
<div>Hello World!</div>
"""
parent = self.getparent()
- assert parent
- index = parent.index(self)
+ assert parent is not None
+ previous = self.getprevious()
if self.text:
- if index == 0:
+ if previous is None:
parent.text = (parent.text or '') + self.text
else:
- prev = parent[index-1]
- prev.tail = (prev.tail or '') + self.text
+ previous.tail = (previous.tail or '') + self.text
if self.tail:
if len(self):
last = self[-1]
last.tail = (last.tail or '') + self.tail
- elif index == 0:
+ elif previous is None:
parent.text = (parent.text or '') + self.tail
else:
- prev = parent[index-1]
- prev.tail = (prev.tail or '') + self.tail
- parent[index:index+1] = list(self)
+ previous.tail = (previous.tail or '') + self.tail
+ index = parent.index(self)
+ parent[index:index+1] = self[:]
def find_rel_links(self, rel):
"""
@@ -73,7 +72,7 @@
"""
rel = rel.lower()
return [el for el in _rel_links_xpath(self)
- if el.attrib['rel'].lower() == rel]
+ if el.get('rel').lower() == rel]
def find_class(self, class_name):
"""
@@ -103,7 +102,7 @@
"""
Return the text content of the tag (and the text in any children).
"""
- return self.xpath("string()")
+ return _collect_string_content(self)
########################################
## Link functions
@@ -134,8 +133,8 @@
base_href = None
basetags = self.xpath('//base[@href]')
for b in basetags:
- base_href = b.attrib['href']
- b.drop_element()
+ base_href = b.get('href')
+ b.drop_tree()
if not base_href:
return
self.make_links_absolute(base_href, resolve_base_href=False)
@@ -370,7 +369,7 @@
body.extend(other_body)
# We'll ignore tail
# I guess we are ignoring attributes too
- other_body.drop_element()
+ other_body.drop_tree()
else:
body = None
heads = doc.findall('head')
@@ -381,7 +380,7 @@
for other_head in heads[1:]:
head.extend(other_head)
# We don't care about text or tail in a head
- other_head.drop_element()
+ other_head.drop_tree()
return doc
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 9 16:44:51 2007
@@ -53,7 +53,7 @@
"descendant-or-self::*[@style]")
_find_external_links = etree.XPath(
- "descendant-or-self::a[string(@href) and substring(@href,0,1) != '#']")
+ "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
def clean_html(html, **kw):
"""
@@ -164,7 +164,7 @@
if _conditional_comment_re.search(el.text):
bad.append(el)
for el in bad:
- el.drop_element()
+ el.drop_tree()
# First, handle a case that IE treats <image> like <img>, and that can
# confuse either this step or later steps.
for el in doc.getiterator('image'):
@@ -205,7 +205,7 @@
el.set('style', new)
for el in list(doc.getiterator('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
- el.drop_element()
+ el.drop_tree()
continue
old = el.text or ''
new = _css_javascript_re.sub('', old)
@@ -230,9 +230,9 @@
# del el.attrib['xmlns']
if style:
kill_tags.add('style')
- for el in doc.getiterator('link'):
+ for el in list(doc.getiterator('link')):
if 'stylesheet' in el.get('rel', '').lower():
- el.drop_element()
+ el.drop_tree()
for el in _find_styled_elements(doc):
del el.attrib['style']
if links:
@@ -288,9 +288,9 @@
el.clear()
for el in _kill:
- el.drop_element()
+ el.drop_tree()
for el in _remove:
- el.drop_tag()
+ el.drop_element()
if remove_unknown_tags:
if allow_tags:
@@ -304,10 +304,10 @@
bad.append(el)
if strip_tags:
for el in bad:
- el.drop_tag()
+ el.drop_element()
else:
for el in bad:
- el.drop_element()
+ el.drop_tree()
if add_nofollow:
for el in _find_external_links(doc):
el.set('rel', 'nofollow')
@@ -356,7 +356,7 @@
"""
if el.tag in avoid_elements:
return
- class_name = el.attrib.get('class')
+ class_name = el.get('class')
if class_name:
class_name = class_name.split()
for match_class in avoid_classes:
@@ -428,7 +428,7 @@
assert not leading_text
leading_text = prev_text
anchor = factory('a')
- anchor.attrib['href'] = link
+ anchor.set('href', link)
body = best_match.group('body')
if not body:
body = link
@@ -478,7 +478,7 @@
# http://www.cs.tut.fi/~jkorpela/html/nobr.html
if el.tag in _avoid_word_break_elements:
return
- class_name = el.attrib.get('class')
+ class_name = el.get('class')
if class_name:
dont_break = False
class_name = class_name.split()
Modified: lxml/branch/html/src/lxml/html/diff.py
==============================================================================
--- lxml/branch/html/src/lxml/html/diff.py (original)
+++ lxml/branch/html/src/lxml/html/diff.py Sat Jun 9 16:44:51 2007
@@ -516,7 +516,7 @@
included as a special kind of diffable token."""
body_el = parse_html(html, cleanup=True)
# Then we split the document into text chunks for each tag, word, and end tag:
- chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs)
+ chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
# Finally re-joining them into token objects:
return fixup_chunks(chunks)
@@ -657,14 +657,14 @@
)
-def flatten_el(el, include_hrefs, drop_tag=False):
+def flatten_el(el, include_hrefs, skip_tag=False):
""" Takes an lxml element el, and generates all the text chunks for
that tag. Each start tag is a chunk, each word is a chunk, and each
end tag is a chunk.
- If drop_tag is true, then the outermost container tag is
+ If skip_tag is true, then the outermost container tag is
not returned (just its contents)."""
- if not drop_tag:
+ if not skip_tag:
if el.tag == 'img':
yield ('img', el.attrib['src'], start_tag(el))
else:
@@ -679,7 +679,7 @@
yield item
if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
yield ('href', el.attrib['href'])
- if not drop_tag:
+ if not skip_tag:
yield end_tag(el)
end_words = split_words(el.tail)
for word in end_words:
@@ -729,14 +729,14 @@
<p><ins>word</ins></p> """
doc = parse_html(html, cleanup=False)
_fixup_ins_del_tags(doc)
- html = serialize_html_fragment(doc, drop_outer=True)
+ html = serialize_html_fragment(doc, skip_outer=True)
return html
-def serialize_html_fragment(el, drop_outer=False):
+def serialize_html_fragment(el, skip_outer=False):
""" Serialize a single lxml element as HTML. The serialized form
includes the elements tail.
- If drop_outer is true, then don't serialize the outermost tag
+ If skip_outer is true, then don't serialize the outermost tag
"""
html_xsl = """\
@@ -751,13 +751,13 @@
assert not isinstance(el, basestring), (
"You should pass in an element, not a string like %r" % el)
html = str(transform(el))
- if drop_outer:
+ if skip_outer:
# Get rid of the extra starting tag:
html = html[html.find('>')+1:]
- if drop_outer:
+ if skip_outer:
# Get rid of the extra end tag:
html = html[:html.rfind('<')]
- if drop_outer:
+ if skip_outer:
return html.strip()
else:
return html.lstrip()
@@ -770,7 +770,7 @@
if not _contains_block_level_tag(el):
continue
_move_el_inside_block(el, tag=tag)
- el.drop_tag()
+ el.drop_element()
#_merge_element_contents(el)
def _contains_block_level_tag(el):
Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Sat Jun 9 16:44:51 2007
@@ -52,7 +52,7 @@
>>> el.text_content()
'This is a bold link'
-Or drop both tags (leaving content) or the entire element, like::
+Or drop an element (leaving its content) or the entire tree, like::
>>> doc = HTML('''
... <html>
@@ -63,7 +63,7 @@
... <div>footer</div>
... </body>
... </html>''')
- >>> doc.get_element_by_id('link').drop_tag()
+ >>> doc.get_element_by_id('link').drop_element()
>>> print tostring(doc)
<html>
<body>
@@ -73,7 +73,7 @@
<div>footer</div>
</body>
</html>
- >>> doc.get_element_by_id('body').drop_element()
+ >>> doc.get_element_by_id('body').drop_tree()
>>> print tostring(doc)
<html>
<body>
More information about the lxml-checkins mailing list