[Lxml-checkins] r44057 - lxml/branch/html/src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Wed Jun 6 19:55:51 CEST 2007
Author: scoder
Date: Wed Jun 6 19:55:50 2007
New Revision: 44057
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
cleanup: replace custom iteration function by call to getiterator(), use sets for kill_tags and remove_tags, avoid XPath calls where iteration is done anyway
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Wed Jun 6 19:55:50 2007
@@ -3,6 +3,11 @@
from lxml.html import defs
from lxml.html import parse, tostring
+try:
+ import set
+except ImportError:
+ from sets import Set as set
+
__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html',
'word_break', 'word_break_html']
@@ -52,14 +57,6 @@
clean(doc, **kw)
return tostring(doc)
-def _itertree(el):
- """
- Return the element's descendants, and the element itself
- """
- yield el
- for item in el.iterdescendants():
- yield item
-
# FIXME: I really have to figure out what a sane set of defaults is
# for these keyword arguments. And is this signature out of control?
# What about if we want things like whitelisting of <object> or other
@@ -148,12 +145,15 @@
This modifies the document *in place*.
"""
+ if hasattr(doc, 'getroot'):
+ # ElementTree
+ doc = doc.getroot()
# IE conditional comments basically embed HTML that the parser doesn't
# normally see. We can't allow anything like that, so we'll kill any
# comments that could be conditional
if not comments:
bad = []
- for el in _itertree(doc):
+ for el in doc.getiterator():
if (isinstance(el, etree.CommentBase)
and _conditional_comment_re.search(el.text)):
bad.append(el)
@@ -163,10 +163,10 @@
# confuse either this step or later steps.
for el in doc.xpath('descendant-or-self::image'):
el.tag = 'img'
- kill_tags = []
- remove_tags = list(remove_tags or [])
+ kill_tags = set()
+ remove_tags = set(remove_tags or ())
if scripts:
- kill_tags.append('script')
+ kill_tags.add('script')
if safe_attrs_only:
safe_attrs = set(defs.safe_attrs)
for el in doc.getiterator():
@@ -212,58 +212,85 @@
# FIXME: why either? I feel like there's some obscure reason
# because you can put PIs in comments...? But I've already
# forgotten it
- kill_tags.append(etree.Comment)
+ kill_tags.add(etree.Comment)
if processing_instructions:
- kill_tags.append(etree.ProcessingInstruction)
+ kill_tags.add(etree.ProcessingInstruction)
## SB: Does this actually work? Definitely not the right place to do this.
# if processing_instructions:
# # FIXME: is this really the right place to remove these attributes?
# for el in doc.xpath('descendant-or-self::*[@xmlns]'):
# del el.attrib['xmlns']
if style:
- kill_tags.append('style')
+ kill_tags.add('style')
for el in doc.xpath('descendant-or-self::link'):
if 'stylesheet' in el.attrib.get('rel', '').lower():
el.drop_element()
for el in doc.xpath('descendant-or-self::*[@style]'):
del el.attrib['style']
if links:
- kill_tags.append('link')
+ kill_tags.add('link')
elif javascript:
# FIXME: we should get rid of included stylesheets in this
# case, as you can put Javascript in them
pass
if meta:
- kill_tags.append('meta')
+ kill_tags.add('meta')
if page_structure:
- remove_tags.extend(['head', 'html', 'title'])
+ remove_tags.union(('head', 'html', 'title'))
if embedded:
# FIXME: is <layer> really embedded?
- kill_tags.extend(['applet', 'param'])
+ kill_tags.union(('applet', 'param'))
# The alternate contents that are in an iframe are a good fallback:
# FIXME: somehow embed seems to be getting data, but from what I
# can tell the embed tag is supposed to always be empty
- remove_tags.extend(['iframe', 'object', 'embed', 'layer'])
+ remove_tags.union(('iframe', 'object', 'embed', 'layer'))
if frames:
- kill_tags.extend(defs.frame_tags)
+ kill_tags.union(defs.frame_tags)
if forms:
- remove_tags.extend(['form'])
- kill_tags.extend(['button', 'input', 'select', 'textarea'])
+ remove_tags.add('form')
+ kill_tags.union(('button', 'input', 'select', 'textarea'))
if annoying_tags:
- remove_tags.extend(['blink', 'marque'])
- bad = []
- for el in _itertree(doc):
- if el.tag in kill_tags:
- bad.append(el)
- for el in bad:
+ remove_tags.union(('blink', 'marque'))
+
+ _remove = []
+ if strip_tags:
+ _kill = []
+ for el in doc.getiterator():
+ if el.tag in kill_tags:
+ _kill.append(el)
+ elif el.tag in remove_tags:
+ _remove.append(el)
+ else:
+ kill_tags.update(remove_tags)
+ _kill = [ el for el in doc.getiterator()
+ if el.tag in kill_tags ]
+
+ if _remove and _remove[0] == doc:
+ # We have to drop the parent-most tag, which we can't
+ # do. Instead we'll rewrite it:
+ el = _remove.pop(0)
+ el.tag = 'div'
+ el.attrib.clear()
+ elif _kill and _kill[0] == doc:
+ # We have to drop the parent-most element, which we can't
+ # do. Instead we'll clear it:
+ el = _kill.pop(0)
+ if el.tag != 'html':
+ el.tag = 'div'
+ el.clear()
+
+ for el in _kill:
el.drop_element()
- if remove_tags:
+ for el in _remove:
+ el.drop_tag()
+
+ if False and remove_tags:
xpath = ' | '.join([
"descendant-or-self::%s" % tag
for tag in remove_tags])
for el in doc.xpath(xpath):
if strip_tags:
- if el.getparent():
+ if el.getparent() is not None:
el.drop_tag()
else:
# We have to drop the parent-most tag, which we can't
@@ -273,6 +300,7 @@
else:
# FIXME: Should we test if this has been removed because of a parent?
el.drop_element()
+
if remove_unknown_tags:
if allow_tags:
raise ValueError(
@@ -280,7 +308,7 @@
allow_tags = defs.tags
if allow_tags:
bad = []
- for el in _itertree(doc):
+ for el in doc.getiterator():
if el.tag not in allow_tags:
bad.append(el)
for el in bad:
More information about the lxml-checkins mailing list