[Lxml-checkins] r44685 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Tue Jul 3 03:29:50 CEST 2007


Author: ianb
Date: Tue Jul  3 03:29:49 2007
New Revision: 44685

Modified:
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/tests/test_clean.txt
Log:
Moved to a class-based cleaner instead of a function.  Resulting rearrangement

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Tue Jul  3 03:29:49 2007
@@ -64,66 +64,44 @@
     clean(doc, **kw)
     return tostring(doc)
 
-# FIXME: I really have to figure out what a sane set of defaults is
-# for these keyword arguments.  And is this signature out of control?
-# What about if we want things like whitelisting of <object> or other
-# controls?  Maybe this has to be more than a function.
-def clean(doc,
-          scripts=True,
-          javascript=True,
-          comments=True,
-          style=False,
-          links=True,
-          meta=True,
-          page_structure=True,
-          processing_instructions=True,
-          embedded=True,
-          frames=True,
-          forms=True,
-          annoying_tags=True,
-          remove_tags=None,
-          allow_tags=None,
-          strip_tags=True,
-          remove_unknown_tags=True,
-          safe_attrs_only=True,
-          add_nofollow=False,
-          # callbacks?
-          ):
+class Cleaner(object):
     """
-    Cleans the document of each of the possible offending elements:
+    Instances clean the document of each of the possible offending
+    elements.  The cleaning is controlled by attributes; you can
+    override attributes in a subclass, or set them in the constructor.
 
     ``scripts``:
-        Any ``<script>`` tags.
+        Removes any ``<script>`` tags.
 
     ``javascript``:
-        Any Javascript, like an ``onclick`` attribute.
+        Removes any Javascript, like an ``onclick`` attribute.
 
     ``comments``:
-        Any comments.
+        Removes any comments.
 
     ``style``:
-        Any style tags or attributes.
+        Removes any style tags or attributes.
 
     ``links``:
-        Remove any ``<link>`` tags
+        Removes any ``<link>`` tags
 
     ``meta``:
-        Remove any ``<meta>`` tags
+        Removes any ``<meta>`` tags
 
     ``page_structure``:
         Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
 
     ``processing_instructions``:
-        Remove any processing instructions.
+        Removes any processing instructions.
 
     ``embedded``:
-        Remove any embedded objects (flash, iframes)
+        Removes any embedded objects (flash, iframes)
 
     ``frames``:
-        Remove any frame-related tags
+        Removes any frame-related tags
 
     ``forms``:
-        Remove any form tags
+        Removes any form tags
 
     ``annoying_tags``:
         Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marque>``
@@ -134,11 +112,6 @@
     ``allow_tags``:
         A list of tags to include (default include all).
 
-    ``strip_tags``:
-        If true, then any tag taken out by remove_tags or allow_tags will
-        leave its text in place; if false, then the tag and its content are
-        removed.
-
     ``remove_unknown_tags``:
         Remove any tags that aren't standard parts of HTML.
 
@@ -152,173 +125,249 @@
 
     This modifies the document *in place*.
     """
-    if hasattr(doc, 'getroot'):
-        # ElementTree
-        doc = doc.getroot()
-    # IE conditional comments basically embed HTML that the parser doesn't
-    # normally see.  We can't allow anything like that, so we'll kill any
-    # comments that could be conditional
-    if not comments:
-        bad = []
-        for el in doc.getiterator(etree.Comment):
-            if _conditional_comment_re.search(el.text):
-                bad.append(el)
-        for el in bad:
-            el.drop_tree()
-    # First, handle a case that IE treats <image> like <img>, and that can
-    # confuse either this step or later steps.
-    for el in doc.getiterator('image'):
-        el.tag = 'img'
-    kill_tags = set()
-    remove_tags = set(remove_tags or ())
-    if allow_tags:
-        allow_tags  = set(allow_tags)
-    if scripts:
-        kill_tags.add('script')
-    if safe_attrs_only:
-        safe_attrs = set(defs.safe_attrs)
-        for el in doc.getiterator():
-            attrib = el.attrib
-            for aname in attrib.keys():
-                if aname not in defs.safe_attrs:
-                    del attrib[aname]
-    if javascript:
-        if not safe_attrs_only:
-            # safe_attrs handles events attributes itself
+
+    scripts = True
+    javascript = True
+    comments = True
+    style = False
+    links = True
+    meta = True
+    page_structure = True
+    processing_instructions = True
+    embedded = True
+    frames = True
+    forms = True
+    annoying_tags = True
+    remove_tags = None
+    allow_tags = None
+    remove_unknown_tags = True
+    safe_attrs_only = True
+    add_nofollow = False
+
+    def __init__(self, **kw):
+        for name, value in kw.items():
+            if not hasattr(self, name):
+                raise TypeError(
+                    "Unknown parameter: %s=%r" % (name, value))
+            setattr(self, name, value)
+
+    def __call__(self, doc):
+        """
+        Cleans the document.
+        """
+        if hasattr(doc, 'getroot'):
+            # ElementTree
+            doc = doc.getroot()
+        # Normalize <image> to <img>: IE treats <image> like <img>, and
+        # that can confuse either this step or later steps.
+        for el in doc.getiterator('image'):
+            el.tag = 'img'
+        if not self.comments:
+            # Of course, if we were going to kill comments anyway, we don't
+            # need to worry about this
+            self.kill_conditional_comments(doc)
+        kill_tags = set()
+        remove_tags = set(self.remove_tags or ())
+        if self.allow_tags:
+            allow_tags = set(self.allow_tags)
+        else:
+            allow_tags = set()
+        if self.scripts:
+            kill_tags.add('script')
+        if self.safe_attrs_only:
+            safe_attrs = set(defs.safe_attrs)
             for el in doc.getiterator():
                 attrib = el.attrib
                 for aname in attrib.keys():
-                    if aname.startswith('on'):
+                    if aname not in safe_attrs:
                         del attrib[aname]
-        doc.rewrite_links(_remove_javascript, resolve_base_href=False)
-        if not style:
-            # If we're deleting style then we don't have to remove JS links
-            # from styles, otherwise...
+        if self.javascript:
+            if not self.safe_attrs_only:
+                # safe_attrs handles events attributes itself
+                for el in doc.getiterator():
+                    attrib = el.attrib
+                    for aname in attrib.keys():
+                        if aname.startswith('on'):
+                            del attrib[aname]
+            doc.rewrite_links(self._remove_javascript_link,
+                              resolve_base_href=False)
+            if not self.style:
+                # If we're deleting style then we don't have to remove JS links
+                # from styles, otherwise...
+                for el in _find_styled_elements(doc):
+                    old = el.get('style')
+                    new = _css_javascript_re.sub('', old)
+                    new = _css_import_re.sub('', old)
+                    if self._has_sneaky_javascript(new):
+                        # Something tricky is going on...
+                        del el.attrib['style']
+                    elif new != old:
+                        el.set('style', new)
+                for el in list(doc.getiterator('style')):
+                    if el.get('type', '').lower().strip() == 'text/javascript':
+                        el.drop_tree()
+                        continue
+                    old = el.text or ''
+                    new = _css_javascript_re.sub('', old)
+                    # The imported CSS can do anything; we just can't allow:
+                    new = _css_import_re.sub('', old)
+                    if self._has_sneaky_javascript(new):
+                        # Something tricky is going on...
+                        el.text = '/* deleted */'
+                    elif new != old:
+                        el.text = new
+        if self.comments or self.processing_instructions:
+            # FIXME: why either?  I feel like there's some obscure reason
+            # because you can put PIs in comments...?  But I've already
+            # forgotten it
+            kill_tags.add(etree.Comment)
+        if self.processing_instructions:
+            kill_tags.add(etree.ProcessingInstruction)
+        if self.style:
+            kill_tags.add('style')
             for el in _find_styled_elements(doc):
-                old = el.get('style')
-                new = _css_javascript_re.sub('', old)
-                new = _css_import_re.sub('', old)
-                if _has_sneaky_javascript(new):
-                    # Something tricky is going on...
-                    del el.attrib['style']
-                elif new != old:
-                    el.set('style', new)
-            for el in list(doc.getiterator('style')):
-                if el.get('type', '').lower().strip() == 'text/javascript':
+                del el.attrib['style']
+        if self.links:
+            kill_tags.add('link')
+        elif self.style or self.javascript:
+            # We must get rid of included stylesheets if Javascript is not
+            # allowed, as you can put Javascript in them
+            for el in list(doc.getiterator('link')):
+                if 'stylesheet' in el.get('rel', '').lower():
+                    # Note this kills alternate stylesheets as well
                     el.drop_tree()
-                    continue
-                old = el.text or ''
-                new = _css_javascript_re.sub('', old)
-                # The imported CSS can do anything; we just can't allow:
-                new = _css_import_re.sub('', old)
-                if _has_sneaky_javascript(new):
-                    # Something tricky is going on...
-                    el.text = '/* deleted */'
-                elif new != old:
-                    el.text = new
-    if comments or processing_instructions:
-        # FIXME: why either?  I feel like there's some obscure reason
-        # because you can put PIs in comments...?  But I've already
-        # forgotten it
-        kill_tags.add(etree.Comment)
-    if processing_instructions:
-        kill_tags.add(etree.ProcessingInstruction)
-## SB: Does this actually work? Definitely not the right place to do this.
-#    if processing_instructions:
-#        # FIXME: is this really the right place to remove these attributes?
-#        for el in doc.xpath('descendant-or-self::*[@xmlns]'):
-#            del el.attrib['xmlns']
-    if style:
-        kill_tags.add('style')
-        for el in list(doc.getiterator('link')):
-            if 'stylesheet' in el.get('rel', '').lower():
-                el.drop_tree()
-        for el in _find_styled_elements(doc):
-            del el.attrib['style']
-    if links:
-        kill_tags.add('link')
-    elif javascript:
-        # FIXME: we should get rid of included stylesheets in this
-        # case, as you can put Javascript in them
-        pass
-    if meta:
-        kill_tags.add('meta')
-    if page_structure:
-        remove_tags.update(('head', 'html', 'title'))
-    if embedded:
-        # FIXME: is <layer> really embedded?
-        kill_tags.update(('applet', 'param'))
-        # The alternate contents that are in an iframe are a good fallback:
-        # FIXME: somehow embed seems to be getting data, but from what I
-        # can tell the embed tag is supposed to always be empty
-        remove_tags.update(('iframe', 'object', 'embed', 'layer'))
-    if frames:
-        kill_tags.update(defs.frame_tags)
-    if forms:
-        remove_tags.add('form')
-        kill_tags.update(('button', 'input', 'select', 'textarea'))
-    if annoying_tags:
-        remove_tags.update(('blink', 'marque'))
+        if self.meta:
+            kill_tags.add('meta')
+        if self.page_structure:
+            remove_tags.update(('head', 'html', 'title'))
+        if self.embedded:
+            # FIXME: is <layer> really embedded?
+            kill_tags.update(('applet', 'param'))
+            # The alternate contents that are in an iframe are a good fallback:
+            # FIXME: somehow embed seems to be getting data, but from what I
+            # can tell the embed tag is supposed to always be empty
+            remove_tags.update(('iframe', 'object', 'embed', 'layer'))
+        if self.frames:
+            kill_tags.update(defs.frame_tags)
+        if self.forms:
+            remove_tags.add('form')
+            kill_tags.update(('button', 'input', 'select', 'textarea'))
+        if self.annoying_tags:
+            remove_tags.update(('blink', 'marque'))
 
-    _remove = []
-    if strip_tags:
+        _remove = []
         _kill = []
         for el in doc.getiterator():
             if el.tag in kill_tags:
                 _kill.append(el)
             elif el.tag in remove_tags:
                 _remove.append(el)
-    else:
-        kill_tags.update(remove_tags)
-        _kill = [ el for el in doc.getiterator()
-                  if el.tag in kill_tags ]
-
-    if _remove and _remove[0] == doc:
-        # We have to drop the parent-most tag, which we can't
-        # do.  Instead we'll rewrite it:
-        el = _remove.pop(0)
-        el.tag = 'div'
-        el.attrib.clear()
-    elif _kill and _kill[0] == doc:
-        # We have to drop the parent-most element, which we can't
-        # do.  Instead we'll clear it:
-        el = _kill.pop(0)
-        if el.tag != 'html':
+
+        if _remove and _remove[0] == doc:
+            # We have to drop the parent-most tag, which we can't
+            # do.  Instead we'll rewrite it:
+            el = _remove.pop(0)
             el.tag = 'div'
-        el.clear()
+            el.attrib.clear()
+        elif _kill and _kill[0] == doc:
+            # We have to drop the parent-most element, which we can't
+            # do.  Instead we'll clear it:
+            el = _kill.pop(0)
+            if el.tag != 'html':
+                el.tag = 'div'
+            el.clear()
 
-    for el in _kill:
-        el.drop_tree()
-    for el in _remove:
-        el.drop_tag()
+        for el in _kill:
+            el.drop_tree()
+        for el in _remove:
+            el.drop_tag()
 
-    if remove_unknown_tags:
+        allow_tags = self.allow_tags
+        if self.remove_unknown_tags:
+            if allow_tags:
+                raise ValueError(
+                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+            allow_tags = set(defs.tags)
         if allow_tags:
-            raise ValueError(
-                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
-        allow_tags = set(defs.tags)
-    if allow_tags:
-        bad = []
-        for el in doc.getiterator():
-            if el.tag not in allow_tags:
-                bad.append(el)
-        if strip_tags:
+            bad = []
+            for el in doc.getiterator():
+                if el.tag not in allow_tags:
+                    bad.append(el)
             for el in bad:
                 el.drop_tag()
+        if self.add_nofollow:
+            for el in _find_external_links(doc):
+                el.set('rel', 'nofollow')
+
+    def kill_conditional_comments(self, doc):
+        """
+        IE conditional comments basically embed HTML that the parser
+        doesn't normally see.  We can't allow anything like that, so
+        we'll kill any comments that could be conditional.
+        """
+        bad = []
+        self._kill_elements(
+            doc, lambda el: _conditional_comment_re.search(el.text),
+            etree.Comment)                
+
+    def _kill_elements(self, doc, condition, iterate=None):
+        bad = []
+        for el in doc.getiterator(iterate):
+            if condition(el):
+                bad.append(el)
+        for el in bad:
+            el.drop_tree()
+
+    def _remove_javascript_link(self, link):
+        # links like "j a v a s c r i p t:" might be interpreted in IE
+        new = _whitespace_re.sub('', link)
+        if _javascript_scheme_re.search(new):
+            # FIXME: should this be None to delete?
+            return ''
+        return link
+
+    _decomment_re = re.compile(r'/\*.*?\*/', re.S)
+
+    def _has_sneaky_javascript(self, style):
+        """
+        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
+        checks for attempts to do stuff like this.
+
+        Typically the response will be to kill the entire style; if you
+        have just a bit of Javascript in the style another rule will catch
+        that and remove only the Javascript from the style; this catches
+        more sneaky attempts.
+        """
+        style = self._decomment_re.sub('', style)
+        style = style.replace('\\', '')
+        style = _whitespace_re.sub('', style)
+        style = style.lower()
+        if 'javascript:' in style:
+            return True
+        if 'expression(' in style:
+            return True
+        return False
+
+    def clean_html(self, html):
+        if isinstance(html, basestring):
+            return_string = True
+            doc = parse(html)
         else:
-            for el in bad:
-                el.drop_tree()
-    if add_nofollow:
-        for el in _find_external_links(doc):
-            el.set('rel', 'nofollow')
-
-def _remove_javascript(link):
-    # links like "j a v a s c r i p t:" might be interpreted in IE
-    new = _whitespace_re.sub('', link)
-    if _javascript_scheme_re.search(new):
-        # FIXME: should this be None to delete?
-        return ''
-    return link
+            return_string = False
+            doc = copy.deepcopy(doc)
+        self(doc)
+        if return_string:
+            return tostring(doc)
+        else:
+            return doc
+
+clean = Cleaner()
+clean_html = clean.clean_html
+
+############################################################
+## Autolinking
+############################################################
 
 _link_regexes = [
     re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
@@ -454,6 +503,10 @@
 
 autolink_html.__doc__ = autolink.__doc__
 
+############################################################
+## Word wrapping
+############################################################
+
 _avoid_word_break_elements = ['pre', 'textarea', 'code']
 _avoid_word_break_classes = ['nobreak']
 
@@ -530,27 +583,4 @@
         word = word[len(start):]
     result += word
     return result
-
-_decomment_re = re.compile(r'/\*.*?\*/', re.S)
-
-def _has_sneaky_javascript(style):
-    """
-    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
-    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
-    checks for attempt to do stuff like this.
-
-    Typically the response will be to kill the entire style; if you
-    have just a bit of Javascript in the style another rule will catch
-    that and remove only the Javascript from the style; this catches
-    more sneaky attempts.
-    """
-    style = _decomment_re.sub('', style)
-    style = style.replace('\\', '')
-    style = _whitespace_re.sub('', style)
-    style = style.lower()
-    if 'javascript:' in style:
-        return True
-    if 'expression(' in style:
-        return True
-    return False
     

Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_clean.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt	Tue Jul  3 03:29:49 2007
@@ -1,5 +1,5 @@
 >>> from lxml.html import parse, tostring
->>> from lxml.html.clean import clean, clean_html
+>>> from lxml.html.clean import clean, clean_html, Cleaner
 >>> from lxml.html import usedoctest
 
 >>> doc = '''<html>
@@ -82,7 +82,7 @@
   </body>
 </html>
 
->>> print clean_html(doc, page_structure=False, safe_attrs_only=False)
+>>> print Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc)
 <html>
   <head>
     <style>/* deleted */</style>
@@ -100,8 +100,8 @@
   </body>
 </html>
 
->>> print clean_html(doc, style=True, links=True, add_nofollow=True,
-...                  page_structure=False, safe_attrs_only=False)
+>>> print Cleaner(style=True, links=True, add_nofollow=True,
+...               page_structure=False, safe_attrs_only=False).clean_html(doc)
 <html>
   <head>
   </head>


More information about the lxml-checkins mailing list