[Lxml-checkins] r43968 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 08:35:35 CEST 2007
Author: ianb
Date: Fri Jun 1 08:35:34 2007
New Revision: 43968
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/tests/test_clean.txt
Log:
Clean using rewrite_links; catch expression() in styles
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 08:35:34 2007
@@ -178,6 +178,9 @@
If you give ``base_href`` then all links passed to
``link_repl_func()`` will take that into account.
+
+ If the ``link_repl_func`` returns None, the attribute or
+ tag text will be removed completely.
"""
if base_href is not None:
# FIXME: this can be done in one pass with a wrapper
@@ -189,6 +192,13 @@
new_link = link_repl_func(link)
if new_link == link:
continue
+ if new_link is None:
+ # Remove the attribute or element content
+ if attrib is None:
+ el.text = ''
+ else:
+ del el.attrib[attrib]
+ continue
if attrib is None:
new = el.text[:pos] + new_link + el.text[pos+len(link):]
el.text = new
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:35:34 2007
@@ -1,3 +1,4 @@
+import re
from lxml import etree
from lxml.html import defs
from lxml.html import HTML, tostring
@@ -5,9 +6,6 @@
__all__ = ['clean_html', 'clean']
# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
-# In CSS/style attribute:
-# url(javascript:...)
-# expression(...)
# Other on* attributes that aren't standard?
# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
@@ -19,6 +17,10 @@
# CSS stuff?
# remove images?
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_css_javascript_re = re.compile(
+ r'expression\(.*?\)', re.S|re.I)
def clean_html(html, **kw):
"""
@@ -108,14 +110,18 @@
for attrib in defs.event_attrs:
for el in doc.xpath('descendant-or-self::*[@%s]' % attrib):
del el.attrib[attrib]
- for attrib in defs.link_attrs:
- # FIXME: should call lower-case()
- # FIXME: starts-with isn't really good either, because
- # href=" javascript:..." is also a problem
- for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib):
- if isinstance(el, basestring):
- assert 0, repr(el)
- el.attrib[attrib] = ""
+ doc.rewrite_links(_remove_javascript, resolve_base_href=False)
+ if not style:
+ for el in doc.xpath('descendant-or-self::*[@style]'):
+ old = el.attrib['style']
+ new = _css_javascript_re.sub('', old)
+ if new != old:
+ el.attrib['style'] = new
+ for el in doc.xpath('descendant-or-self::style'):
+ old = el.text or ''
+ new = _css_javascript_re.sub('', old)
+ if new != old:
+ el.text = new
if comments:
# Easier way?
bad = []
@@ -183,3 +189,9 @@
continue
el.attrib['rel'] = 'nofollow'
+def _remove_javascript(link):
+ if link.strip().startswith('javascript:'):
+ # FIXME: should this be None to delete?
+ return ''
+ return link
+
Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Fri Jun 1 08:35:34 2007
@@ -5,6 +5,10 @@
... <head>
... <script type="text/javascript" src="evil-site"></script>
... <link rel="alternate" type="text/rss" src="evil-rss">
+... <style>
+... body {background-image: url(javascript:do_evil)};
+... div {color: expression(evil)};
+... </style>
... </head>
... <body onload="evil_function()">
... <!-- I am interpreted for EVIL! -->
@@ -27,6 +31,10 @@
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
+ <style>
+ body {background-image: url(javascript:do_evil)};
+ div {color: expression(evil)};
+ </style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
@@ -49,6 +57,10 @@
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
+ <style>
+ body {background-image: url(javascript:do_evil)};
+ div {color: expression(evil)};
+ </style>
</head>
<body onload="evil_function()">
<!-- I am interpreted for EVIL! -->
@@ -70,6 +82,10 @@
<html>
<head>
<link rel="alternate" type="text/rss" src="evil-rss">
+ <style>
+ body {background-image: url()};
+ div {color: };
+ </style>
</head>
<body>
<a href="">a link</a>
More information about the lxml-checkins mailing list