[Lxml-checkins] r43989 - in lxml/branch/html/src/lxml/html: . tests tests/hackers-org-data
ianb at codespeak.net
ianb at codespeak.net
Sat Jun 2 03:38:49 CEST 2007
Author: ianb
Date: Sat Jun 2 03:38:49 2007
New Revision: 43989
Added:
lxml/branch/html/src/lxml/html/tests/hackers-org-data/
lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data
lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data
Modified:
lxml/branch/html/src/lxml/html/clean.py
lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py
Log:
Fix a number of smaller XSS attacks
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 2 03:38:49 2007
@@ -15,17 +15,30 @@
# Some way to whitelist object, iframe, etc (e.g., if you want to
# allow *just* embedded YouTube movies)
# Log what was deleted and why?
+# style="behavior: ..." might be bad in IE?
+# Should we have something for just <meta http-equiv>? That's the worst of the
+# metas.
+# UTF-7 detections? Example:
+# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
+# you don't always have to have the charset set, if the page has no charset
+# and there's UTF7-like code in it.
+
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
r'expression\s*\(.*?\)', re.S|re.I)
+# Do I have to worry about @\nimport?
+_css_import_re = re.compile(
+ r'@\s*import', re.I)
+
# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
- r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I)
+ r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
_whitespace_re = re.compile(r'\s+')
+# FIXME: should data: be blocked?
def clean_html(html, **kw):
"""
@@ -53,6 +66,7 @@
links=True,
meta=True,
page_structure=True,
+ processing_instructions=True,
embedded=True,
frames=True,
forms=True,
@@ -88,7 +102,10 @@
``page_structure``:
Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
- Also xmlns attributes are removed with this.
+
+ ``processing_instructions``:
+ Remove any processing instructions. Also xmlns attributes are
+ removed with this.
``embedded``:
Remove any embedded objects (flash, iframes)
@@ -154,21 +171,39 @@
for el in doc.xpath('descendant-or-self::*[@style]'):
old = el.attrib['style']
new = _css_javascript_re.sub('', old)
- if new != old:
+ new = _css_import_re.sub('', old)
+ if _has_sneaky_javascript(new):
+ # Something tricky is going on...
+ del el.attrib['style']
+ elif new != old:
el.attrib['style'] = new
for el in doc.xpath('descendant-or-self::style'):
+ if el.attrib.get('type', '').lower().strip() == 'text/javascript':
+ el.drop_element()
+ continue
old = el.text or ''
new = _css_javascript_re.sub('', old)
- if new != old:
+ # The imported CSS can do anything; we just can't allow:
+ new = _css_import_re.sub('', old)
+ if _has_sneaky_javascript(new):
+ # Something tricky is going on...
+ el.text = '/* deleted */'
+ elif new != old:
el.text = new
- if comments:
+ if comments or processing_instructions:
# Easier way?
bad = []
for el in _itertree(doc):
- if isinstance(el, etree._Comment):
+ if comments and isinstance(el, etree._Comment):
+ bad.append(el)
+ if processing_instructions and isinstance(el, etree._ProcessingInstruction):
bad.append(el)
for el in bad:
el.drop_element()
+ if processing_instructions:
+ # FIXME: is this really the right place to remove these attributes?
+ for el in doc.xpath('descendant-or-self::*[@xmlns]'):
+ del el.attrib['xmlns']
if style:
kill_tags.append('style')
for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
@@ -177,13 +212,14 @@
del el.attrib['style']
if links:
kill_tags.append('link')
+ elif javascript:
+ # FIXME: we should get rid of included stylesheets in this
+ # case, as you can put Javascript in them
+ pass
if meta:
kill_tags.append('meta')
if page_structure:
remove_tags.extend(['head', 'html', 'title'])
- # FIXME: is this really the right place to remove these attributes?
- for el in doc.xpath('descendant-or-self::*[@xmlns]'):
- del el.attrib['xmlns']
if embedded:
# FIXME: is <layer> really embedded?
kill_tags.extend(['applet', 'param'])
@@ -452,6 +488,16 @@
word = word[len(start):]
result += word
return result
-
-
+
+_decomment_re = re.compile(r'/\*.*?\*/', re.S)
+
+def _has_sneaky_javascript(style):
+ style = _decomment_re.sub('', style)
+ style = style.replace('\\', '')
+ style = _whitespace_re.sub('', style)
+ if 'javascript:' in style:
+ return True
+ if 'expression(' in style:
+ return True
+ return False
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,8 @@
+Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8.13, 12288, 65279)
+ http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus
+Options: -safe_attrs_only
+Notes: As you see, the CSS gets corrupted, but I don't really care that much.
+
+<DIV STYLE="background-image: url(javascript:alert('XSS'))">text</div>
+----------
+<div style="background-image: url(">text</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,10 @@
+Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail.
+ http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode
+Options: -safe_attrs_only
+Ignore: true
+Notes: I don't understand how this exploit works. It seems like the description actually refers to
+ the unicode you'd import, but why that matters I don't know.
+
+<DIV STYLE="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">text</div>
+----------
+<div style="background-image: ">text</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,11 @@
+Description: Downlevel-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore not in need of removal, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job.
+ http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden
+Options: -comments
+
+<div><!--[if gte IE 4]>
+<SCRIPT>alert('XSS');</SCRIPT>
+<![endif]--></div>
+----------
+<div>[if gte IE 4]>
+<SCRIPT>alert('XSS');</SCRIPT>
+<![endif]</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,12 @@
+Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work
+ http://ha.ckers.org/xss.html#XSS_HTML_plus_time
+Ignore: true
+Notes: I don't understand the vector here, or how this is supposed to work.
+
+<div>
+<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>">
+</BODY></HTML></div>
+----------
+<div>
+<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>">
+</BODY></HTML>x</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,15 @@
+Description: javascript: in many forms
+
+<div>
+ <a href="java
+script:alert()">x</a>
+ <a href="j a v a s c r i p t:alert()">x</a>
+ <a href="jscript
+:alert()">x</a>
+</div>
+----------
+<div>
+ <a href="">x</a>
+ <a href="">x</a>
+ <a href="">x</a>
+</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,8 @@
+Description: to break up expression (Thanks to Roman Ivanov for this one)
+ http://ha.ckers.org/xss.html#XSS_STYLE_comment
+Options: -safe_attrs_only
+Notes: Because of the suspicious stuff in there, the style is removed entirely
+
+<IMG STYLE="xss:expr/*XSS*/ession(alert('XSS'))">
+----------
+<img>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,10 @@
+Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop)
+ http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression
+Options: -safe_attrs_only
+Notes: Modified to avoid a parsing issue in libxml2 that ruins the XSS (the " marks).
+ Also there seemed to be an extra "p" in exppression
+
+<div><img style="xss: ex/*<A STYLE='no\xss:noxss(*//*);
+xss:ex/*XSS*//*/*/pression(alert('XSS'))"></div>
+----------
+<div><img></div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,8 @@
+Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts)
+ http://ha.ckers.org/xss.html#XSS_STYLE
+Options: -safe_attrs_only
+
+<div><STYLE>@im\port'\ja\vasc\ript:alert("XSS")';</STYLE></div>
+----------
+<div><style>/* deleted */</style></div>
+
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,7 @@
+Description: (Older versions of Netscape only)
+ http://ha.ckers.org/xss.html#XSS_STYLE_tag
+Options: -safe_attrs_only
+
+<div><STYLE TYPE="text/javascript">alert('XSS');</STYLE></div>
+----------
+<div></div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,8 @@
+Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image
+Options: -style, -safe_attrs_only
+Notes: The CSS is messed up here, but so it goes
+
+<div><STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A></div>
+----------
+<div><style>.XSS{background-image:url()");}</style><a class="XSS"></a></div>
+
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,10 @@
+Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive)
+ http://ha.ckers.org/xss.html#XSS_XML_data_island_comment
+Ignore: true
+Notes: I don't understand the vector here. Maybe datasrc should be filtered?
+
+<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML>
+<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN></div>
+----------
+<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML>
+<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>x</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,9 @@
+Description: Locally hosted XML with embedded JavaScript that is generated using an XML data island. This is the same as above but instead refers to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. You can see the result here <http://ha.ckers.org/xssxmltest.html>
+ http://ha.ckers.org/xss.html#XSS_Local_XML
+
+<div><XML SRC="xsstest.xml" ID=I></XML>
+<SPAN DATASRC=#I DATAFLD=C DATAFORMATAS=HTML></SPAN></div>
+----------
+<div>
+ <span></span>
+</div>
Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data Sat Jun 2 03:38:49 2007
@@ -0,0 +1,16 @@
+Description: XML namespace. The htc file must be located on the same server as your XSS vector
+ http://ha.ckers.org/xss.html#XSS_XML_namespace
+Note: I don't completely understand the vector here. page_structure is what does this.
+
+<HTML xmlns:xss>
+ <body>
+ <?import namespace="xss" implementation="http://ha.ckers.org/xss.htc">
+ <xss:xss>XSS</xss:xss>
+ </body>
+</HTML>
+----------
+<HTML>
+ <body>
+ <div>XSS</div>
+ </body>
+</HTML>
Modified: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (original)
+++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Sat Jun 2 03:38:49 2007
@@ -8,7 +8,10 @@
from lxml.html import HTML, parse_element
from lxml.html.clean import clean, clean_html
-feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data')
+feed_dirs = [
+ os.path.join(os.path.dirname(__file__), 'feedparser-data'),
+ os.path.join(os.path.dirname(__file__), 'hackers-org-data'),
+ ]
bar_re = re.compile(r"-----+")
class DummyInput:
@@ -31,10 +34,10 @@
raise Exception(
"File %s has no headers" % self.filename)
self.description = headers['Description']
- self.expect = headers['Expect']
+ self.expect = headers.get('Expect', '')
self.ignore = headers.get('Ignore')
self.options = [
- o.strip() for o in headers['Options'].split(',')
+ o.strip() for o in headers.get('Options', '').split(',')
if o.strip()]
parts = bar_re.split(c)
self.input = parts[0].rstrip() + '\n'
@@ -54,7 +57,10 @@
kw[name[1:]] = False
else:
kw[name] = True
- transformed = clean_html(self.input, **kw)
+ if kw.get('clean', True):
+ transformed = clean_html(self.input, **kw)
+ else:
+ transformed = self.input
assert self.expect is not None, (
"No expected output in %s" % self.filename)
checker = LHTMLOutputChecker()
@@ -70,14 +76,15 @@
def test_suite():
suite = unittest.TestSuite()
- for fn in os.listdir(feed_dir):
- fn = os.path.join(feed_dir, fn)
- if fn.endswith('.data'):
- case = FeedTestCase(fn)
- suite.addTests([case])
- # This is my lazy way of stopping on first error:
- try:
- case.runTest()
- except:
- break
+ for dir in feed_dirs:
+ for fn in os.listdir(dir):
+ fn = os.path.join(dir, fn)
+ if fn.endswith('.data'):
+ case = FeedTestCase(fn)
+ suite.addTests([case])
+ # This is my lazy way of stopping on first error:
+ try:
+ case.runTest()
+ except:
+ break
return suite
More information about the lxml-checkins
mailing list