[Lxml-checkins] r50533 - in lxml/trunk: . src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Sat Jan 12 19:41:33 CET 2008
Author: scoder
Date: Sat Jan 12 19:41:32 2008
New Revision: 50533
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/html/clean.py
Log:
r3254 at delle: sbehnel | 2008-01-12 19:41:17 +0100
code cleanup
Modified: lxml/trunk/src/lxml/html/clean.py
==============================================================================
--- lxml/trunk/src/lxml/html/clean.py (original)
+++ lxml/trunk/src/lxml/html/clean.py Sat Jan 12 19:41:32 2008
@@ -44,7 +44,7 @@
# execution:
_javascript_scheme_re = re.compile(
r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
-_whitespace_re = re.compile(r'\s+')
+_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
@@ -57,15 +57,6 @@
_find_external_links = etree.XPath(
"descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
-def clean_html(html, **kw):
- """
- Like clean(), but takes a text input document, and returns a text
- document.
- """
- doc = fromstring(html)
- clean(doc, **kw)
- return tostring(doc)
-
class Cleaner(object):
"""
Instances cleans the document of each of the possible offending
@@ -205,7 +196,7 @@
doc = doc.getroot()
# Normalize a case that IE treats <image> like <img>, and that
# can confuse either this step or later steps.
- for el in doc.getiterator('image'):
+ for el in doc.iter('image'):
el.tag = 'img'
if not self.comments:
# Of course, if we were going to kill comments anyway, we don't
@@ -221,7 +212,7 @@
kill_tags.add('script')
if self.safe_attrs_only:
safe_attrs = set(defs.safe_attrs)
- for el in doc.getiterator():
+ for el in doc.iter():
attrib = el.attrib
for aname in attrib.keys():
if aname not in safe_attrs:
@@ -229,7 +220,7 @@
if self.javascript:
if not self.safe_attrs_only:
# safe_attrs handles events attributes itself
- for el in doc.getiterator():
+ for el in doc.iter():
attrib = el.attrib
for aname in attrib.keys():
if aname.startswith('on'):
@@ -248,7 +239,7 @@
del el.attrib['style']
elif new != old:
el.set('style', new)
- for el in list(doc.getiterator('style')):
+ for el in list(doc.iter('style')):
if el.get('type', '').lower().strip() == 'text/javascript':
el.drop_tree()
continue
@@ -277,7 +268,7 @@
elif self.style or self.javascript:
# We must get rid of included stylesheets if Javascript is not
# allowed, as you can put Javascript in them
- for el in list(doc.getiterator('link')):
+ for el in list(doc.iter('link')):
if 'stylesheet' in el.get('rel', '').lower():
# Note this kills alternate stylesheets as well
el.drop_tree()
@@ -289,7 +280,7 @@
# FIXME: is <layer> really embedded?
# We should get rid of any <param> tags not inside <applet>;
# These are not really valid anyway.
- for el in list(doc.getiterator('param')):
+ for el in list(doc.iter('param')):
found_parent = False
parent = el.getparent()
while parent is not None and parent.tag not in ('applet', 'object'):
@@ -312,7 +303,7 @@
_remove = []
_kill = []
- for el in doc.getiterator():
+ for el in doc.iter():
if el.tag in kill_tags:
if self.allow_element(el):
continue
@@ -349,7 +340,7 @@
allow_tags = set(defs.tags)
if allow_tags:
bad = []
- for el in doc.getiterator():
+ for el in doc.iter():
if el.tag not in allow_tags:
bad.append(el)
for el in bad:
@@ -408,7 +399,7 @@
def _kill_elements(self, doc, condition, iterate=None):
bad = []
- for el in doc.getiterator(iterate):
+ for el in doc.iter(iterate):
if condition(el):
bad.append(el)
for el in bad:
@@ -416,13 +407,13 @@
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
- new = _whitespace_re.sub('', link)
+ new = _substitute_whitespace('', link)
if _javascript_scheme_re.search(new):
# FIXME: should this be None to delete?
return ''
return link
- _decomment_re = re.compile(r'/\*.*?\*/', re.S)
+ _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
def _has_sneaky_javascript(self, style):
"""
@@ -435,9 +426,9 @@
that and remove only the Javascript from the style; this catches
more sneaky attempts.
"""
- style = self._decomment_re.sub('', style)
+ style = self._substitute_comments('', style)
style = style.replace('\\', '')
- style = _whitespace_re.sub('', style)
+ style = _substitute_whitespace('', style)
style = style.lower()
if 'javascript:' in style:
return True
More information about the lxml-checkins mailing list