[Lxml-checkins] r44685 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Tue Jul 3 03:29:50 CEST 2007
Author: ianb
Date: Tue Jul 3 03:29:49 2007
New Revision: 44685
Modified:
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/tests/test_clean.txt
Log:
Moved to a class-based cleaner instead of a function. Resulting rearrangement
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Tue Jul 3 03:29:49 2007
@@ -64,66 +64,44 @@
clean(doc, **kw)
return tostring(doc)
-# FIXME: I really have to figure out what a sane set of defaults is
-# for these keyword arguments. And is this signature out of control?
-# What about if we want things like whitelisting of <object> or other
-# controls? Maybe this has to be more than a function.
-def clean(doc,
- scripts=True,
- javascript=True,
- comments=True,
- style=False,
- links=True,
- meta=True,
- page_structure=True,
- processing_instructions=True,
- embedded=True,
- frames=True,
- forms=True,
- annoying_tags=True,
- remove_tags=None,
- allow_tags=None,
- strip_tags=True,
- remove_unknown_tags=True,
- safe_attrs_only=True,
- add_nofollow=False,
- # callbacks?
- ):
+class Cleaner(object):
"""
- Cleans the document of each of the possible offending elements:
+ Instances clean the document of each of the possible offending
+ elements. The cleaning is controlled by attributes; you can
+ override attributes in a subclass, or set them in the constructor.
``scripts``:
- Any ``<script>`` tags.
+ Removes any ``<script>`` tags.
``javascript``:
- Any Javascript, like an ``onclick`` attribute.
+ Removes any Javascript, like an ``onclick`` attribute.
``comments``:
- Any comments.
+ Removes any comments.
``style``:
- Any style tags or attributes.
+ Removes any style tags or attributes.
``links``:
- Remove any ``<link>`` tags
+ Removes any ``<link>`` tags
``meta``:
- Remove any ``<meta>`` tags
+ Removes any ``<meta>`` tags
``page_structure``:
Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
``processing_instructions``:
- Remove any processing instructions.
+ Removes any processing instructions.
``embedded``:
- Remove any embedded objects (flash, iframes)
+ Removes any embedded objects (flash, iframes)
``frames``:
- Remove any frame-related tags
+ Removes any frame-related tags
``forms``:
- Remove any form tags
+ Removes any form tags
``annoying_tags``:
Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>``
@@ -134,11 +112,6 @@
``allow_tags``:
A list of tags to include (default include all).
- ``strip_tags``:
- If true, then any tag taken out by remove_tags or allow_tags will
- leave its text in place; if false, then the tag and its content are
- removed.
-
``remove_unknown_tags``:
Remove any tags that aren't standard parts of HTML.
@@ -152,173 +125,249 @@
This modifies the document *in place*.
"""
- if hasattr(doc, 'getroot'):
- # ElementTree
- doc = doc.getroot()
- # IE conditional comments basically embed HTML that the parser doesn't
- # normally see. We can't allow anything like that, so we'll kill any
- # comments that could be conditional
- if not comments:
- bad = []
- for el in doc.getiterator(etree.Comment):
- if _conditional_comment_re.search(el.text):
- bad.append(el)
- for el in bad:
- el.drop_tree()
- # First, handle a case that IE treats <image> like <img>, and that can
- # confuse either this step or later steps.
- for el in doc.getiterator('image'):
- el.tag = 'img'
- kill_tags = set()
- remove_tags = set(remove_tags or ())
- if allow_tags:
- allow_tags = set(allow_tags)
- if scripts:
- kill_tags.add('script')
- if safe_attrs_only:
- safe_attrs = set(defs.safe_attrs)
- for el in doc.getiterator():
- attrib = el.attrib
- for aname in attrib.keys():
- if aname not in defs.safe_attrs:
- del attrib[aname]
- if javascript:
- if not safe_attrs_only:
- # safe_attrs handles events attributes itself
+
+ scripts = True
+ javascript = True
+ comments = True
+ style = False
+ links = True
+ meta = True
+ page_structure = True
+ processing_instructions = True
+ embedded = True
+ frames = True
+ forms = True
+ annoying_tags = True
+ remove_tags = None
+ allow_tags = None
+ remove_unknown_tags = True
+ safe_attrs_only = True
+ add_nofollow = False
+
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ if not hasattr(self, name):
+ raise TypeError(
+ "Unknown parameter: %s=%r" % (name, value))
+ setattr(self, name, value)
+
+ def __call__(self, doc):
+ """
+ Cleans the document.
+ """
+ if hasattr(doc, 'getroot'):
+ # ElementTree
+ doc = doc.getroot()
+ # Normalize a case that IE treats <image> like <img>, and that
+ # can confuse either this step or later steps.
+ for el in doc.getiterator('image'):
+ el.tag = 'img'
+ if not self.comments:
+ # Of course, if we were going to kill comments anyway, we don't
+ # need to worry about this
+ self.kill_conditional_comments(doc)
+ kill_tags = set()
+ remove_tags = set(self.remove_tags or ())
+ if self.allow_tags:
+ allow_tags = set(self.allow_tags)
+ else:
+ allow_tags = set()
+ if self.scripts:
+ kill_tags.add('script')
+ if self.safe_attrs_only:
+ safe_attrs = set(defs.safe_attrs)
for el in doc.getiterator():
attrib = el.attrib
for aname in attrib.keys():
- if aname.startswith('on'):
+ if aname not in safe_attrs:
del attrib[aname]
- doc.rewrite_links(_remove_javascript, resolve_base_href=False)
- if not style:
- # If we're deleting style then we don't have to remove JS links
- # from styles, otherwise...
+ if self.javascript:
+ if not self.safe_attrs_only:
+ # safe_attrs handles events attributes itself
+ for el in doc.getiterator():
+ attrib = el.attrib
+ for aname in attrib.keys():
+ if aname.startswith('on'):
+ del attrib[aname]
+ doc.rewrite_links(self._remove_javascript_link,
+ resolve_base_href=False)
+ if not self.style:
+ # If we're deleting style then we don't have to remove JS links
+ # from styles, otherwise...
+ for el in _find_styled_elements(doc):
+ old = el.get('style')
+ new = _css_javascript_re.sub('', old)
+ new = _css_import_re.sub('', old)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ del el.attrib['style']
+ elif new != old:
+ el.set('style', new)
+ for el in list(doc.getiterator('style')):
+ if el.get('type', '').lower().strip() == 'text/javascript':
+ el.drop_tree()
+ continue
+ old = el.text or ''
+ new = _css_javascript_re.sub('', old)
+ # The imported CSS can do anything; we just can't allow:
+ new = _css_import_re.sub('', old)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ el.text = '/* deleted */'
+ elif new != old:
+ el.text = new
+ if self.comments or self.processing_instructions:
+ # FIXME: why either? I feel like there's some obscure reason
+ # because you can put PIs in comments...? But I've already
+ # forgotten it
+ kill_tags.add(etree.Comment)
+ if self.processing_instructions:
+ kill_tags.add(etree.ProcessingInstruction)
+ if self.style:
+ kill_tags.add('style')
for el in _find_styled_elements(doc):
- old = el.get('style')
- new = _css_javascript_re.sub('', old)
- new = _css_import_re.sub('', old)
- if _has_sneaky_javascript(new):
- # Something tricky is going on...
- del el.attrib['style']
- elif new != old:
- el.set('style', new)
- for el in list(doc.getiterator('style')):
- if el.get('type', '').lower().strip() == 'text/javascript':
+ del el.attrib['style']
+ if self.links:
+ kill_tags.add('link')
+ elif self.style or self.javascript:
+ # We must get rid of included stylesheets if Javascript is not
+ # allowed, as you can put Javascript in them
+ for el in list(doc.getiterator('link')):
+ if 'stylesheet' in el.get('rel', '').lower():
+ # Note this kills alternate stylesheets as well
el.drop_tree()
- continue
- old = el.text or ''
- new = _css_javascript_re.sub('', old)
- # The imported CSS can do anything; we just can't allow:
- new = _css_import_re.sub('', old)
- if _has_sneaky_javascript(new):
- # Something tricky is going on...
- el.text = '/* deleted */'
- elif new != old:
- el.text = new
- if comments or processing_instructions:
- # FIXME: why either? I feel like there's some obscure reason
- # because you can put PIs in comments...? But I've already
- # forgotten it
- kill_tags.add(etree.Comment)
- if processing_instructions:
- kill_tags.add(etree.ProcessingInstruction)
-## SB: Does this actually work? Definitely not the right place to do this.
-# if processing_instructions:
-# # FIXME: is this really the right place to remove these attributes?
-# for el in doc.xpath('descendant-or-self::*[@xmlns]'):
-# del el.attrib['xmlns']
- if style:
- kill_tags.add('style')
- for el in list(doc.getiterator('link')):
- if 'stylesheet' in el.get('rel', '').lower():
- el.drop_tree()
- for el in _find_styled_elements(doc):
- del el.attrib['style']
- if links:
- kill_tags.add('link')
- elif javascript:
- # FIXME: we should get rid of included stylesheets in this
- # case, as you can put Javascript in them
- pass
- if meta:
- kill_tags.add('meta')
- if page_structure:
- remove_tags.update(('head', 'html', 'title'))
- if embedded:
- # FIXME: is <layer> really embedded?
- kill_tags.update(('applet', 'param'))
- # The alternate contents that are in an iframe are a good fallback:
- # FIXME: somehow embed seems to be getting data, but from what I
- # can tell the embed tag is supposed to always be empty
- remove_tags.update(('iframe', 'object', 'embed', 'layer'))
- if frames:
- kill_tags.update(defs.frame_tags)
- if forms:
- remove_tags.add('form')
- kill_tags.update(('button', 'input', 'select', 'textarea'))
- if annoying_tags:
- remove_tags.update(('blink', 'marque'))
+ if self.meta:
+ kill_tags.add('meta')
+ if self.page_structure:
+ remove_tags.update(('head', 'html', 'title'))
+ if self.embedded:
+ # FIXME: is <layer> really embedded?
+ kill_tags.update(('applet', 'param'))
+ # The alternate contents that are in an iframe are a good fallback:
+ # FIXME: somehow embed seems to be getting data, but from what I
+ # can tell the embed tag is supposed to always be empty
+ remove_tags.update(('iframe', 'object', 'embed', 'layer'))
+ if self.frames:
+ kill_tags.update(defs.frame_tags)
+ if self.forms:
+ remove_tags.add('form')
+ kill_tags.update(('button', 'input', 'select', 'textarea'))
+ if self.annoying_tags:
+ remove_tags.update(('blink', 'marque'))
- _remove = []
- if strip_tags:
+ _remove = []
_kill = []
for el in doc.getiterator():
if el.tag in kill_tags:
_kill.append(el)
elif el.tag in remove_tags:
_remove.append(el)
- else:
- kill_tags.update(remove_tags)
- _kill = [ el for el in doc.getiterator()
- if el.tag in kill_tags ]
-
- if _remove and _remove[0] == doc:
- # We have to drop the parent-most tag, which we can't
- # do. Instead we'll rewrite it:
- el = _remove.pop(0)
- el.tag = 'div'
- el.attrib.clear()
- elif _kill and _kill[0] == doc:
- # We have to drop the parent-most element, which we can't
- # do. Instead we'll clear it:
- el = _kill.pop(0)
- if el.tag != 'html':
+
+ if _remove and _remove[0] == doc:
+ # We have to drop the parent-most tag, which we can't
+ # do. Instead we'll rewrite it:
+ el = _remove.pop(0)
el.tag = 'div'
- el.clear()
+ el.attrib.clear()
+ elif _kill and _kill[0] == doc:
+ # We have to drop the parent-most element, which we can't
+ # do. Instead we'll clear it:
+ el = _kill.pop(0)
+ if el.tag != 'html':
+ el.tag = 'div'
+ el.clear()
- for el in _kill:
- el.drop_tree()
- for el in _remove:
- el.drop_tag()
+ for el in _kill:
+ el.drop_tree()
+ for el in _remove:
+ el.drop_tag()
- if remove_unknown_tags:
+ allow_tags = self.allow_tags
+ if self.remove_unknown_tags:
+ if allow_tags:
+ raise ValueError(
+ "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+ allow_tags = set(defs.tags)
if allow_tags:
- raise ValueError(
- "It does not make sense to pass in both allow_tags and remove_unknown_tags")
- allow_tags = set(defs.tags)
- if allow_tags:
- bad = []
- for el in doc.getiterator():
- if el.tag not in allow_tags:
- bad.append(el)
- if strip_tags:
+ bad = []
+ for el in doc.getiterator():
+ if el.tag not in allow_tags:
+ bad.append(el)
for el in bad:
el.drop_tag()
+ if self.add_nofollow:
+ for el in _find_external_links(doc):
+ el.set('rel', 'nofollow')
+
+ def kill_conditional_comments(self, doc):
+ """
+ IE conditional comments basically embed HTML that the parser
+ doesn't normally see. We can't allow anything like that, so
+ we'll kill any comments that could be conditional.
+ """
+ bad = []
+ self._kill_elements(
+ doc, lambda el: _conditional_comment_re.search(el.text),
+ etree.Comment)
+
+ def _kill_elements(self, doc, condition, iterate=None):
+ bad = []
+ for el in doc.getiterator(iterate):
+ if condition(el):
+ bad.append(el)
+ for el in bad:
+ el.drop_tree()
+
+ def _remove_javascript_link(self, link):
+ # links like "j a v a s c r i p t:" might be interpreted in IE
+ new = _whitespace_re.sub('', link)
+ if _javascript_scheme_re.search(new):
+ # FIXME: should this be None to delete?
+ return ''
+ return link
+
+ _decomment_re = re.compile(r'/\*.*?\*/', re.S)
+
+ def _has_sneaky_javascript(self, style):
+ """
+ Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+ can get interpreted, or ``expre/* stuff */ssion(...)``. This
+ checks for attempts to do stuff like this.
+
+ Typically the response will be to kill the entire style; if you
+ have just a bit of Javascript in the style another rule will catch
+ that and remove only the Javascript from the style; this catches
+ more sneaky attempts.
+ """
+ style = self._decomment_re.sub('', style)
+ style = style.replace('\\', '')
+ style = _whitespace_re.sub('', style)
+ style = style.lower()
+ if 'javascript:' in style:
+ return True
+ if 'expression(' in style:
+ return True
+ return False
+
+ def clean_html(self, html):
+ if isinstance(html, basestring):
+ return_string = True
+ doc = parse(html)
else:
- for el in bad:
- el.drop_tree()
- if add_nofollow:
- for el in _find_external_links(doc):
- el.set('rel', 'nofollow')
-
-def _remove_javascript(link):
- # links like "j a v a s c r i p t:" might be interpreted in IE
- new = _whitespace_re.sub('', link)
- if _javascript_scheme_re.search(new):
- # FIXME: should this be None to delete?
- return ''
- return link
+ return_string = False
+ doc = copy.deepcopy(doc)
+ self(doc)
+ if return_string:
+ return tostring(doc)
+ else:
+ return doc
+
+clean = Cleaner()
+clean_html = clean.clean_html
+
+############################################################
+## Autolinking
+############################################################
_link_regexes = [
re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
@@ -454,6 +503,10 @@
autolink_html.__doc__ = autolink.__doc__
+############################################################
+## Word wrapping
+############################################################
+
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']
@@ -530,27 +583,4 @@
word = word[len(start):]
result += word
return result
-
-_decomment_re = re.compile(r'/\*.*?\*/', re.S)
-
-def _has_sneaky_javascript(style):
- """
- Depending on the browser, stuff like ``e x p r e s s i o n(...)``
- can get interpreted, or ``expre/* stuff */ssion(...)``. This
- checks for attempt to do stuff like this.
-
- Typically the response will be to kill the entire style; if you
- have just a bit of Javascript in the style another rule will catch
- that and remove only the Javascript from the style; this catches
- more sneaky attempts.
- """
- style = _decomment_re.sub('', style)
- style = style.replace('\\', '')
- style = _whitespace_re.sub('', style)
- style = style.lower()
- if 'javascript:' in style:
- return True
- if 'expression(' in style:
- return True
- return False
Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Tue Jul 3 03:29:49 2007
@@ -1,5 +1,5 @@
>>> from lxml.html import parse, tostring
->>> from lxml.html.clean import clean, clean_html
+>>> from lxml.html.clean import clean, clean_html, Cleaner
>>> from lxml.html import usedoctest
>>> doc = '''<html>
@@ -82,7 +82,7 @@
</body>
</html>
->>> print clean_html(doc, page_structure=False, safe_attrs_only=False)
+>>> print Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc)
<html>
<head>
<style>/* deleted */</style>
@@ -100,8 +100,8 @@
</body>
</html>
->>> print clean_html(doc, style=True, links=True, add_nofollow=True,
-... page_structure=False, safe_attrs_only=False)
+>>> print Cleaner(style=True, links=True, add_nofollow=True,
+... page_structure=False, safe_attrs_only=False).clean_html(doc)
<html>
<head>
</head>
More information about the lxml-checkins
mailing list