[Lxml-checkins] r43976 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 20:11:25 CEST 2007


Author: ianb
Date: Fri Jun  1 20:11:25 2007
New Revision: 43976

Added:
   lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py   (contents, props changed)
   lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py   (contents, props changed)
Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/defs.py
Log:
Added tests from feedparser.  Make sure to traverse the root element as well as children (_itertree).  Keep contents of some tags like <iframe>.  Add filter for <blink>.  Add new parser that handles random HTML a bit better.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Fri Jun  1 20:11:25 2007
@@ -269,7 +269,7 @@
     # FIXME: should this notice a fragment and parse accordingly?
     value = etree.HTML(html, html_parser)
     if value is None:
-        raise ParserError(
+        raise etree.ParserError(
             "Could not parse document")
     return value
 
@@ -283,15 +283,18 @@
     of only elements.
     """
     # FIXME: check what happens when you give html with a body, head, etc.
-    html = '<html><body>%s</body></html>' % html
+    start = html[:20].lstrip().lower()
+    if not start.startswith('<html') and not start.startswith('<!doctype'):
+        # FIXME: That test doesn't work with a doctype or PI
+        html = '<html><body>%s</body></html>' % html
     doc = HTML(html)
     assert doc.tag == 'html'
     bodies = [e for e in doc if e.tag == 'body']
-    assert len(bodies) == 1
+    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
     body = bodies[0]
     elements = []
     if no_leading_text and body.text and body.text.strip():
-        raise ParserError(
+        raise etree.ParserError(
             "There is leading text: %r" % body.text)
     if body.text and body.text.strip():
         elements.append(body.text)
@@ -313,21 +316,81 @@
         if not isinstance(create_parent, basestring):
             create_parent = 'div'
         return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent))
+    else:
+        print '----------\n', html
     elements = parse_elements(html, no_leading_text=True)
     if not elements:
-        raise ParserError(
+        raise etree.ParserError(
             "No elements found")
     if len(elements) > 1:
-        raise ParserError(
+        raise etree.ParserError(
             "Multiple elements found (%s)"
-            % ', '.join([e.tag for e in elements]))
+            % ', '.join([_element_name(e) for e in elements]))
     el = elements[0]
     if el.tail and el.tail.strip():
-        raise ParserError(
+        raise etree.ParserError(
             "Element followed by text: %r" % el.tail)
     el.tail = None
     return el
 
+def parse(html):
+    """
+    Parse the html, returning a single element/document.
+
+    This tries to minimally parse the chunk of text, without knowing if it
+    is a fragment or a document.
+    """
+    start = html[:10].lstrip().lower()
+    if start.startswith('<html') or start.startswith('<!doctype'):
+        # Looks like a full HTML document
+        return HTML(html)
+    # otherwise, let's parse it out...
+    doc = HTML(html)
+    bodies = doc.findall('body')
+    body = bodies[0]
+    if len(bodies) > 1:
+        # Somehow there are multiple bodies, which is bad, but just
+        # smash them into one body
+        for other_body in bodies[1:]:
+            if other_body.text:
+                if len(body):
+                    body[-1].tail = (body[-1].tail or '') + other_body.text
+                else:
+                    body.text = (body.text or '') + other_body.text
+            body.extend(other_body)
+            # We'll ignore tail
+            # I guess we are ignoring attributes too
+            other_body.drop_element()
+    heads = doc.findall('head')
+    if heads:
+        # Well, we have some sort of structure, so let's keep it all
+        head = heads[0]
+        if len(heads) > 1:
+            for other_head in heads[1:]:
+                head.extend(other_head)
+                # We don't care about text or tail in a head
+                other_head.drop_element()
+        return doc
+    
+    if (len(body) == 1 and (not body.text or not body.text.strip())
+        and (not body[-1].tail or not body[-1].tail.strip())):
+        # The body has just one element, so it was probably a single
+        # element passed in
+        return body[0]
+    # Now we have a body which represents a bunch of tags which have the
+    # content that was passed in.  We will create a fake container, which
+    # is the body tag, except body implies too much structure.
+    body.tag = 'div'
+    return body
+
+def _element_name(el):
+    if isinstance(el, etree.CommentBase):
+        return 'comment'
+    elif isinstance(el, basestring):
+        return 'string'
+    else:
+        return el.tag
+
 def Element(*args, **kw):
     v = html_parser.makeelement(*args, **kw)
     return v

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Fri Jun  1 20:11:25 2007
@@ -1,7 +1,7 @@
 import re
 from lxml import etree
 from lxml.html import defs
-from lxml.html import parse_element, tostring
+from lxml.html import parse, tostring
 
 __all__ = ['clean_html', 'clean']
 
@@ -25,26 +25,36 @@
     Like clean(), but takes a text input document, and returns a text
     document.
     """
-    doc = parse_element(html, create_parent=True)
+    doc = parse(html)
     clean(doc, **kw)
     return tostring(doc)
 
+def _itertree(el):
+    """
+    Yield the element itself, followed by all of its descendants
+    """
+    yield el
+    for item in el.iterdescendants():
+        yield item
+
 def clean(doc,
           scripts=True,
           javascript=True,
           comments=True,
           # process instructions?
           style=False,
-          links=False,
-          meta=False,
-          page_structure=False,
+          links=True,
+          meta=True,
+          page_structure=True,
           embedded=True,
           frames=True,
           forms=True,
+          annoying_tags=True,
           remove_tags=None,
           allow_tags=None,
           strip_tags=True,
           remove_unknown_tags=True,
+          safe_attrs_only=True,
           add_nofollow=False,
           # callbacks?
           ):
@@ -70,7 +80,8 @@
         Remove any ``<meta>`` tags
 
     ``page_structure``:
-        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``
+        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
+        ``xmlns`` attributes are also removed when this is set.
 
     ``embedded``:
         Remove any embedded objects (flash, iframes)
@@ -81,6 +92,9 @@
     ``forms``:
         Remove any form tags
 
+    ``annoying_tags``:
+        Tags that aren't *wrong*, but are annoying.  ``<blink>`` (FIXME: marquee?)
+
     ``remove_tags``:
         A list of tags to remove.
 
@@ -95,6 +109,11 @@
     ``remove_unknown_tags``:
         Remove any tags that aren't standard parts of HTML.
 
+    ``safe_attrs_only``:
+        If true, only include 'safe' attributes (specifically the list
+        from `feedparser
+        <http://feedparser.org/docs/html-sanitization.html>`_).
+
     ``add_nofollow``:
         If true, then any <a> tags will have ``rel="nofollow"`` added to them.
 
@@ -108,12 +127,23 @@
     remove_tags = list(remove_tags or [])
     if scripts:
         kill_tags.append('script')
+    if safe_attrs_only:
+        safe_attrs = set(defs.safe_attrs)
+        for el in _itertree(doc):
+            for aname in el.attrib.keys():
+                if aname not in defs.safe_attrs:
+                    del el.attrib[aname]
     if javascript:
-        for attrib in defs.event_attrs:
-            for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
-                del el.attrib[attrib]
+        if not safe_attrs_only:
+            # safe_attrs_only already removes event attributes itself
+            for el in _itertree(doc):
+                for aname in el.attrib.keys():
+                    if aname.startswith('on'):
+                        del el.attrib[aname]
         doc.rewrite_links(_remove_javascript, resolve_base_href=False)
         if not style:
+            # If we're deleting style then we don't have to remove JS links
+            # from styles, otherwise...
             for el in doc.xpath('descendant-or-self::*[@style]'):
                 old = el.attrib['style']
                 new = _css_javascript_re.sub('', old)
@@ -127,7 +157,7 @@
     if comments:
         # Easier way?
         bad = []
-        for el in doc.iterdescendants():
+        for el in _itertree(doc):
             if isinstance(el, etree._Comment):
                 bad.append(el)
         for el in bad:
@@ -144,16 +174,25 @@
         kill_tags.append('meta')
     if page_structure:
         remove_tags.extend(['head', 'html', 'title'])
+        # FIXME: is this really the right place to remove these attributes?
+        for el in doc.xpath('descendant-or-self::*[@xmlns]'):
+            del el.attrib['xmlns']
     if embedded:
         # FIXME: is <layer> really embedded?
-        kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer'])
+        kill_tags.extend(['applet', 'param'])
+        # The alternate contents that are in an iframe are a good fallback:
+        # FIXME: somehow embed seems to be getting data, but from what I
+        # can tell the embed tag is supposed to always be empty
+        remove_tags.extend(['iframe', 'object', 'embed', 'layer'])
     if frames:
         kill_tags.extend(defs.frame_tags)
     if forms:
         remove_tags.extend(['form'])
         kill_tags.extend(['button', 'input', 'select', 'textarea'])
+    if annoying_tags:
+        remove_tags.extend(['blink'])
     bad = []
-    for el in doc.iterdescendants():
+    for el in _itertree(doc):
         if el.tag in kill_tags:
             bad.append(el)
     for el in bad:
@@ -164,7 +203,13 @@
             for tag in remove_tags])
         for el in doc.xpath(xpath):
             if strip_tags:
-                el.drop_tag()
+                if el.getparent():
+                    el.drop_tag()
+                else:
+                    # We have to drop the parent-most tag, which we can't
+                    # do.  Instead we'll rewrite it:
+                    el.tag = 'div'
+                    el.attrib.clear()
             else:
                 # FIXME: Should we test if this has been removed because of a parent?
                 el.drop_element()
@@ -175,7 +220,7 @@
         allow_tags = defs.tags
     if allow_tags:
         bad = []
-        for el in doc.iterdescendants():
+        for el in _itertree(doc):
             if el.tag not in allow_tags:
                 bad.append(el)
         for el in bad:

Modified: lxml/branch/html/src/lxml/html/defs.py
==============================================================================
--- lxml/branch/html/src/lxml/html/defs.py	(original)
+++ lxml/branch/html/src/lxml/html/defs.py	Fri Jun  1 20:11:25 2007
@@ -19,14 +19,27 @@
     'usemap']
 
 # Not in the HTML 4 spec:
-# onerror
+# onerror, onresize
 event_attrs = [
     'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
     'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
     'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
-    'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload',
+    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
+    'onunload',
     ]
 
+safe_attrs = [
+    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
+    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
+    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
+    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
+    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
+    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
+    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+    'type', 'usemap', 'valign', 'value', 'vspace', 'width']
+
 # From http://htmlhelp.com/reference/html40/olist.html
 top_level_tags = [
     'html', 'head', 'body', 'frameset',

Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py	Fri Jun  1 20:11:25 2007
@@ -0,0 +1,83 @@
+import os
+import re
+import rfc822
+import unittest
+from lxml.tests.common_imports import doctest
+from lxml.doctestcompare import LHTMLOutputChecker
+
+from lxml.html import HTML, parse_element
+from lxml.html.clean import clean, clean_html
+
+feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data')
+bar_re = re.compile(r"-----+")
+
+class DummyInput:
+    def __init__(self, **kw):
+        for name, value in kw.items():
+            setattr(self, name, value)
+
+class FeedTestCase(unittest.TestCase):
+
+    def __init__(self, filename):
+        self.filename = filename
+        unittest.TestCase.__init__(self)
+
+    def parse(self):
+        f = open(self.filename, 'rb')
+        headers = rfc822.Message(f)
+        c = f.read()
+        f.close()
+        if not headers.keys():
+            raise Exception(
+                "File %s has no headers" % self.filename)
+        self.description = headers['Description']
+        self.expect = headers['Expect']
+        self.ignore = headers.get('Ignore')
+        self.options = [
+            o.strip() for o in headers['Options'].split(',')
+            if o.strip()]
+        parts = bar_re.split(c)
+        self.input = parts[0].rstrip() + '\n'
+        if parts[1:]:
+            self.expect = parts[1].rstrip() + '\n'
+        else:
+            self.expect = None
+
+    def runTest(self):
+        self.parse()
+        if self.ignore:
+            # We've marked this test to be ignored.
+            return
+        kw = {}
+        for name in self.options:
+            if name.startswith('-'):
+                kw[name[1:]] = False
+            else:
+                kw[name] = True
+        transformed = clean_html(self.input, **kw)
+        assert self.expect is not None, (
+            "No expected output in %s" % self.filename)
+        checker = LHTMLOutputChecker()
+        if not checker.check_output(self.expect, transformed, 0):
+            result = checker.output_difference(
+                DummyInput(want=self.expect), transformed, 0)
+            #result += '\noptions: %s %r' % (', '.join(self.options), kw)
+            #result += repr(transformed)
+            raise Exception("\n"+result)
+
+    def shortDescription(self):
+        return self.filename
+
+def test_suite():
+    suite = unittest.TestSuite()
+    for fn in os.listdir(feed_dir):
+        fn = os.path.join(feed_dir, fn)
+        if fn.endswith('.data'):
+            case = FeedTestCase(fn)
+            suite.addTests([case])
+            # This is my lazy way of stopping on first error:
+            try:
+                case.runTest()
+            except:
+                break
+    return suite

Added: lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py	Fri Jun  1 20:11:25 2007
@@ -0,0 +1,110 @@
+"""
+This takes the feedparser tests from here:
+
+  http://feedparser.org/tests/wellformed/sanitize/
+
+and rewrites them to be easier to handle (not using the internal model
+of feedparser).  The input format is::
+
+  <!--
+  Description: {description}
+  Expect: {expression}
+  -->
+  ...
+  <content ...>{content}</content>
+  ...
+
+The Expect expression is checked for
+``entries[0]['content'][0]['value'] == {data}``.
+
+The output format is::
+
+  Description: {description}
+  Expect: {expression} (if data couldn't be parsed)
+  Options: 
+
+  {content, unescaped}
+  ----------
+  {data, unescaped, if found}
+
+"""
+
+import re
+import os
+import traceback
+
+_desc_re = re.compile(r'\s*Description:\s*(.*)')
+_expect_re = re.compile(r'\s*Expect:\s*(.*)')
+_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
+_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")
+
+def parse_content(content):
+    match = _desc_re.search(content)
+    desc = match.group(1)
+    match = _expect_re.search(content)
+    expect = match.group(1)
+    data = None
+    for regex in [_data_expect_re, _feed_data_expect_re]:
+        match = regex.search(expect)
+        if match:
+            # Icky, but I'll trust it
+            data = eval(match.group(1).strip())
+            break
+    c = None
+    for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']:
+        regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
+        match = regex.search(content)
+        if match:
+            c = match.group(1)
+            break
+    assert c is not None
+    # Seems like body content isn't entity-escaped
+    if tag != 'body':
+        c = c.replace('&lt;', '<')
+        c = c.replace('&amp;', '&')
+    # FIXME: I should really do more unescaping...
+    return {
+        'Description': desc,
+        'Expect': expect,
+        'data': data,
+        'content': c}
+
+def serialize_content(d):
+    s = '''\
+Description: %(Description)s
+Expect: %(Expect)s
+Options: 
+
+%(content)s
+''' % d
+    if d.get('data') is not None:
+        s += '----------\n%s' % d['data']
+    return s
+
+def translate_file(filename):
+    f = open(filename, 'rb')
+    c = f.read()
+    f.close()
+    try:
+        output = serialize_content(parse_content(c))
+    except:
+        print 'Bad data in %s:' % filename
+        print c
+        traceback.print_exc()
+        print '-'*60
+        return
+    new = os.path.splitext(filename)[0] + '.data'
+    f = open(new, 'wb')
+    f.write(output)
+    f.close()
+
+def translate_all(dir):
+    for fn in os.listdir(dir):
+        fn = os.path.join(dir, fn)
+        if fn.endswith('.xml'):
+            translate_file(fn)
+        
+if __name__ == '__main__':
+    import sys
+    translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))
+


More information about the lxml-checkins mailing list