[Lxml-checkins] r44088 - lxml/branch/html/src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Thu Jun 7 14:00:52 CEST 2007
Author: scoder
Date: Thu Jun 7 14:00:51 2007
New Revision: 44088
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
more cleanup: use sets, precompiled XPath expressions and getiterator() where possible
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 14:00:51 2007
@@ -48,6 +48,12 @@
_conditional_comment_re = re.compile(
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+_find_styled_elements = etree.XPath(
+ "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(
+ "descendant-or-self::a[string(@href) and substring(@href,0,1) != '#']")
+
def clean_html(html, **kw):
"""
Like clean(), but takes a text input document, and returns a text
@@ -165,6 +171,8 @@
el.tag = 'img'
kill_tags = set()
remove_tags = set(remove_tags or ())
+ if allow_tags:
+ allow_tags = set(allow_tags)
if scripts:
kill_tags.add('script')
if safe_attrs_only:
@@ -186,7 +194,7 @@
if not style:
# If we're deleting style then we don't have to remove JS links
# from styles, otherwise...
- for el in doc.xpath('descendant-or-self::*[@style]'):
+ for el in _find_styled_elements(doc):
old = el.get('style')
new = _css_javascript_re.sub('', old)
new = _css_import_re.sub('', old)
@@ -195,7 +203,7 @@
del el.attrib['style']
elif new != old:
el.set('style', new)
- for el in doc.xpath('descendant-or-self::style'):
+ for el in list(doc.getiterator('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
el.drop_element()
continue
@@ -222,10 +230,10 @@
# del el.attrib['xmlns']
if style:
kill_tags.add('style')
- for el in doc.xpath('descendant-or-self::link'):
- if 'stylesheet' in el.attrib.get('rel', '').lower():
+ for el in doc.getiterator('link'):
+ if 'stylesheet' in el.get('rel', '').lower():
el.drop_element()
- for el in doc.xpath('descendant-or-self::*[@style]'):
+ for el in _find_styled_elements(doc):
del el.attrib['style']
if links:
kill_tags.add('link')
@@ -288,25 +296,21 @@
if allow_tags:
raise ValueError(
"It does not make sense to pass in both allow_tags and remove_unknown_tags")
- allow_tags = defs.tags
+ allow_tags = set(defs.tags)
if allow_tags:
bad = []
for el in doc.getiterator():
if el.tag not in allow_tags:
bad.append(el)
- for el in bad:
- if strip_tags:
+ if strip_tags:
+ for el in bad:
el.drop_tag()
- else:
- # FIXME: Should we test if this has been removed because of a parent?
+ else:
+ for el in bad:
el.drop_element()
if add_nofollow:
- for el in doc.xpath('descendant-or-self::a[@href]'):
- href = el.attrib['href']
- if not href or href.startswith('#'):
- # internal link, we don't care
- continue
- el.attrib['rel'] = 'nofollow'
+ for el in _find_external_links(doc):
+ el.set('rel', 'nofollow')
def _remove_javascript(link):
# links like "j a v a s c r i p t:" might be interpreted in IE
More information about the lxml-checkins mailing list