[Lxml-checkins] r43976 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 20:11:25 CEST 2007
Author: ianb
Date: Fri Jun 1 20:11:25 2007
New Revision: 43976
Added:
lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (contents, props changed)
lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py (contents, props changed)
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/defs.py
Log:
Added tests from feedparser. Make sure to traverse the root element as well as children (_itertree). Keep contents of some tags like <iframe>. Add filter for <blink>. Add new parser that handles random HTML a bit better.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 20:11:25 2007
@@ -269,7 +269,7 @@
# FIXME: should this notice a fragment and parse accordingly?
value = etree.HTML(html, html_parser)
if value is None:
- raise ParserError(
+ raise etree.ParserError(
"Could not parse document")
return value
@@ -283,15 +283,18 @@
of only elements.
"""
# FIXME: check what happens when you give html with a body, head, etc.
- html = '<html><body>%s</body></html>' % html
+ start = html[:20].lstrip().lower()
+ if not start.startswith('<html') and not start.startswith('<!doctype'):
+ # FIXME: That test doesn't work with a doctype or PI
+ html = '<html><body>%s</body></html>' % html
doc = HTML(html)
assert doc.tag == 'html'
bodies = [e for e in doc if e.tag == 'body']
- assert len(bodies) == 1
+ assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
body = bodies[0]
elements = []
if no_leading_text and body.text and body.text.strip():
- raise ParserError(
+ raise etree.ParserError(
"There is leading text: %r" % body.text)
if body.text and body.text.strip():
elements.append(body.text)
@@ -313,21 +316,81 @@
if not isinstance(create_parent, basestring):
create_parent = 'div'
return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent))
+ else:
+ print '----------\n', html
elements = parse_elements(html, no_leading_text=True)
if not elements:
- raise ParserError(
+ raise etree.ParserError(
"No elements found")
if len(elements) > 1:
- raise ParserError(
+ raise etree.ParserError(
"Multiple elements found (%s)"
- % ', '.join([e.tag for e in elements]))
+ % ', '.join([_element_name(e) for e in elements]))
el = elements[0]
if el.tail and el.tail.strip():
- raise ParserError(
+ raise etree.ParserError(
"Element followed by text: %r" % el.tail)
el.tail = None
return el
+def parse(html):
+ """
+ Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+ """
+ start = html[:10].lstrip().lower()
+ if start.startswith('<html') or start.startswith('<!doctype'):
+ # Looks like a full HTML document
+ return HTML(html)
+ # otherwise, lets parse it out...
+ doc = HTML(html)
+ bodies = doc.findall('body')
+ body = bodies[0]
+ if len(bodies) > 1:
+ # Somehow there are multiple bodies, which is bad, but just
+ # smash them into one body
+ for other_body in bodies[1:]:
+ if other_body.text:
+ if len(body):
+ body[-1].tail = (body[-1].tail or '') + other_body.text
+ else:
+ body.text = (body.text or '') + other_body.text
+ body.extend(other_body)
+ # We'll ignore tail
+ # I guess we are ignoring attributes too
+ other_body.drop_element()
+ heads = doc.findall('head')
+ if heads:
+ # Well, we have some sort of structure, so lets keep it all
+ head = heads[0]
+ if len(heads) > 1:
+ for other_head in heads[1:]:
+ head.extend(other_head)
+ # We don't care about text or tail in a head
+ other_head.drop_element()
+ return doc
+
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ # The body has just one element, so it was probably a single
+ # element passed in
+ return body[0]
+ # Now we have a body which represents a bunch of tags which have the
+ # content that was passed in. We will create a fake container, which
+ # is the body tag, except body implies too much structure.
+ body.tag = 'div'
+ return body
+
+def _element_name(el):
+ if isinstance(el, etree.CommentBase):
+ return 'comment'
+ elif isinstance(el, basestring):
+ return 'string'
+ else:
+ return el.tag
+
def Element(*args, **kw):
v = html_parser.makeelement(*args, **kw)
return v
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 20:11:25 2007
@@ -1,7 +1,7 @@
import re
from lxml import etree
from lxml.html import defs
-from lxml.html import parse_element, tostring
+from lxml.html import parse, tostring
__all__ = ['clean_html', 'clean']
@@ -25,26 +25,36 @@
Like clean(), but takes a text input document, and returns a text
document.
"""
- doc = parse_element(html, create_parent=True)
+ doc = parse(html)
clean(doc, **kw)
return tostring(doc)
+def _itertree(el):
+ """
+ Return the element's descendants, and the element itself
+ """
+ yield el
+ for item in el.iterdescendants():
+ yield item
+
def clean(doc,
scripts=True,
javascript=True,
comments=True,
# process instructions?
style=False,
- links=False,
- meta=False,
- page_structure=False,
+ links=True,
+ meta=True,
+ page_structure=True,
embedded=True,
frames=True,
forms=True,
+ annoying_tags=True,
remove_tags=None,
allow_tags=None,
strip_tags=True,
remove_unknown_tags=True,
+ safe_attrs_only=True,
add_nofollow=False,
# callbacks?
):
@@ -70,7 +80,8 @@
Remove any ``<meta>`` tags
``page_structure``:
- Structural parts of a page: ``<head>``, ``<html>``, ``<title>``
+ Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
+ Also xmlns attributes are removed with this.
``embedded``:
Remove any embedded objects (flash, iframes)
@@ -81,6 +92,9 @@
``forms``:
Remove any form tags
+ ``annoying_tags``:
+ Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?)
+
``remove_tags``:
A list of tags to remove.
@@ -95,6 +109,11 @@
``remove_unknown_tags``:
Remove any tags that aren't standard parts of HTML.
+ ``safe_attrs_only``:
+ If true, only include 'safe' attributes (specifically the list
+ from `feedparser
+ <http://feedparser.org/docs/html-sanitization.html>`_).
+
``add_nofollow``:
If true, then any <a> tags will have ``rel="nofollow"`` added to them.
@@ -108,12 +127,23 @@
remove_tags = list(remove_tags or [])
if scripts:
kill_tags.append('script')
+ if safe_attrs_only:
+ safe_attrs = set(defs.safe_attrs)
+ for el in _itertree(doc):
+ for aname in el.attrib.keys():
+ if aname not in defs.safe_attrs:
+ del el.attrib[aname]
if javascript:
- for attrib in defs.event_attrs:
- for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
- del el.attrib[attrib]
+ if not safe_attrs_only:
+ # safe_attrs handles events attributes itself
+ for el in _itertree(doc):
+ for aname in el.attrib.keys():
+ if aname.startswith('on'):
+ del el.attrib[aname]
doc.rewrite_links(_remove_javascript, resolve_base_href=False)
if not style:
+ # If we're deleting style then we don't have to remove JS links
+ # from styles, otherwise...
for el in doc.xpath('descendant-or-self::*[@style]'):
old = el.attrib['style']
new = _css_javascript_re.sub('', old)
@@ -127,7 +157,7 @@
if comments:
# Easier way?
bad = []
- for el in doc.iterdescendants():
+ for el in _itertree(doc):
if isinstance(el, etree._Comment):
bad.append(el)
for el in bad:
@@ -144,16 +174,25 @@
kill_tags.append('meta')
if page_structure:
remove_tags.extend(['head', 'html', 'title'])
+ # FIXME: is this really the right place to remove these attributes?
+ for el in doc.xpath('descendant-or-self::*[@xmlns]'):
+ del el.attrib['xmlns']
if embedded:
# FIXME: is <layer> really embedded?
- kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer'])
+ kill_tags.extend(['applet', 'param'])
+ # The alternate contents that are in an iframe are a good fallback:
+ # FIXME: somehow embed seems to be getting data, but from what I
+ # can tell the embed tag is supposed to always be empty
+ remove_tags.extend(['iframe', 'object', 'embed', 'layer'])
if frames:
kill_tags.extend(defs.frame_tags)
if forms:
remove_tags.extend(['form'])
kill_tags.extend(['button', 'input', 'select', 'textarea'])
+ if annoying_tags:
+ remove_tags.extend(['blink'])
bad = []
- for el in doc.iterdescendants():
+ for el in _itertree(doc):
if el.tag in kill_tags:
bad.append(el)
for el in bad:
@@ -164,7 +203,13 @@
for tag in remove_tags])
for el in doc.xpath(xpath):
if strip_tags:
- el.drop_tag()
+ if el.getparent():
+ el.drop_tag()
+ else:
+ # We have to drop the parent-most tag, which we can't
+ # do. Instead we'll rewrite it:
+ el.tag = 'div'
+ el.attrib.clear()
else:
# FIXME: Should we test if this has been removed because of a parent?
el.drop_element()
@@ -175,7 +220,7 @@
allow_tags = defs.tags
if allow_tags:
bad = []
- for el in doc.iterdescendants():
+ for el in _itertree(doc):
if el.tag not in allow_tags:
bad.append(el)
for el in bad:
Modified: lxml/branch/html/src/lxml/html/defs.py
==============================================================================
--- lxml/branch/html/src/lxml/html/defs.py (original)
+++ lxml/branch/html/src/lxml/html/defs.py Fri Jun 1 20:11:25 2007
@@ -19,14 +19,27 @@
'usemap']
# Not in the HTML 4 spec:
-# onerror
+# onerror, onresize
event_attrs = [
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
- 'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload',
+ 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
+ 'onunload',
]
+safe_attrs = [
+ 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
+ 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
+ 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
+ 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
+ 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
+ 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
+ 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
+
# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = [
'html', 'head', 'body', 'frameset',
Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Fri Jun 1 20:11:25 2007
@@ -0,0 +1,83 @@
+import os
+import re
+import rfc822
+import unittest
+from lxml.tests.common_imports import doctest
+from lxml.doctestcompare import LHTMLOutputChecker
+
+from lxml.html import HTML, parse_element
+from lxml.html.clean import clean, clean_html
+
+feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data')
+bar_re = re.compile(r"-----+")
+
+class DummyInput:
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ setattr(self, name, value)
+
+class FeedTestCase(unittest.TestCase):
+
+ def __init__(self, filename):
+ self.filename = filename
+ unittest.TestCase.__init__(self)
+
+ def parse(self):
+ f = open(self.filename, 'rb')
+ headers = rfc822.Message(f)
+ c = f.read()
+ f.close()
+ if not headers.keys():
+ raise Exception(
+ "File %s has no headers" % self.filename)
+ self.description = headers['Description']
+ self.expect = headers['Expect']
+ self.ignore = headers.get('Ignore')
+ self.options = [
+ o.strip() for o in headers['Options'].split(',')
+ if o.strip()]
+ parts = bar_re.split(c)
+ self.input = parts[0].rstrip() + '\n'
+ if parts[1:]:
+ self.expect = parts[1].rstrip() + '\n'
+ else:
+ self.expect = None
+
+ def runTest(self):
+ self.parse()
+ if self.ignore:
+ # We've marked this test to be ignored.
+ return
+ kw = {}
+ for name in self.options:
+ if name.startswith('-'):
+ kw[name[1:]] = False
+ else:
+ kw[name] = True
+ transformed = clean_html(self.input, **kw)
+ assert self.expect is not None, (
+ "No expected output in %s" % self.filename)
+ checker = LHTMLOutputChecker()
+ if not checker.check_output(self.expect, transformed, 0):
+ result = checker.output_difference(
+ DummyInput(want=self.expect), transformed, 0)
+ #result += '\noptions: %s %r' % (', '.join(self.options), kw)
+ #result += repr(transformed)
+ raise Exception("\n"+result)
+
+ def shortDescription(self):
+ return self.filename
+
+def test_suite():
+ suite = unittest.TestSuite()
+ for fn in os.listdir(feed_dir):
+ fn = os.path.join(feed_dir, fn)
+ if fn.endswith('.data'):
+ case = FeedTestCase(fn)
+ suite.addTests([case])
+ # This is my lazy way of stopping on first error:
+ try:
+ case.runTest()
+ except:
+ break
+ return suite
Added: lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py Fri Jun 1 20:11:25 2007
@@ -0,0 +1,110 @@
+"""
+This takes the feedparser tests from here:
+
+ http://feedparser.org/tests/wellformed/sanitize/
+
+and rewrites them to be easier to handle (not using the internal model
+of feedparser). The input format is::
+
+ <!--
+ Description: {description}
+ Expect: {expression}
+ -->
+ ...
+ <content ...>{content}</content>
+ ...
+
+The Expect expression is checked for
+``entries[0]['content'][0]['value'] == {data}``.
+
+The output format is::
+
+ Description: {description}
+ Expect: {expression} (if data couldn't be parsed)
+ Options:
+
+ {content, unescaped}
+ ----------
+ {data, unescaped, if found}
+
+"""
+
+import re
+import os
+import traceback
+
+_desc_re = re.compile(r'\s*Description:\s*(.*)')
+_expect_re = re.compile(r'\s*Expect:\s*(.*)')
+_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
+_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")
+
+def parse_content(content):
+ match = _desc_re.search(content)
+ desc = match.group(1)
+ match = _expect_re.search(content)
+ expect = match.group(1)
+ data = None
+ for regex in [_data_expect_re, _feed_data_expect_re]:
+ match = regex.search(expect)
+ if match:
+ # Icky, but I'll trust it
+ data = eval(match.group(1).strip())
+ break
+ c = None
+ for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']:
+ regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
+ match = regex.search(content)
+ if match:
+ c = match.group(1)
+ break
+ assert c is not None
+ # Seems like body isn't quoted
+ if tag != 'body':
+ c = c.replace('&lt;', '<')
+ c = c.replace('&amp;', '&')
+ # FIXME: I should really do more unescaping...
+ return {
+ 'Description': desc,
+ 'Expect': expect,
+ 'data': data,
+ 'content': c}
+
+def serialize_content(d):
+ s = '''\
+Description: %(Description)s
+Expect: %(Expect)s
+Options:
+
+%(content)s
+''' % d
+ if d.get('data') is not None:
+ s += '----------\n%s' % d['data']
+ return s
+
+def translate_file(filename):
+ f = open(filename, 'rb')
+ c = f.read()
+ f.close()
+ try:
+ output = serialize_content(parse_content(c))
+ except:
+ print 'Bad data in %s:' % filename
+ print c
+ traceback.print_exc()
+ print '-'*60
+ return
+ new = os.path.splitext(filename)[0] + '.data'
+ f = open(new, 'wb')
+ f.write(output)
+ f.close()
+
+def translate_all(dir):
+ for fn in os.listdir(dir):
+ fn = os.path.join(dir, fn)
+ if fn.endswith('.xml'):
+ translate_file(fn)
+
+if __name__ == '__main__':
+ import sys
+ translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))
+
More information about the lxml-checkins
mailing list