[Lxml-checkins] r43898 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Wed May 30 17:46:38 CEST 2007
Author: ianb
Date: Wed May 30 17:46:38 2007
New Revision: 43898
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/clean.py
Log:
Rename remove_element to drop_element and remove_tag to drop_tag. Add clean() support for dropping meta tags, and drop applet along with the other embedded objects.
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Wed May 30 17:46:38 2007
@@ -11,7 +11,7 @@
class HtmlMixin(object):
- def remove_element(self):
+ def drop_element(self):
"""
Removes this element from the tree, including its children and
text. The tail text is joined to the previous element or
@@ -28,7 +28,7 @@
previous.tail = (previous.tail or '') + self.tail
parent.remove(self)
- def remove_tag(self):
+ def drop_tag(self):
"""
Remove the tag, but not its children or text. The children and text
are merged into the parent.
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Wed May 30 17:46:38 2007
@@ -4,6 +4,13 @@
__all__ = ['clean_html', 'clean']
+# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html
+# In CSS/style attribute:
+# url(javascript:...)
+# expression(...)
+# Other on* attributes that aren't standard?
+# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
+
def clean_html(html, **kw):
"""
Like clean(), but takes a text input document, and returns a text
@@ -20,6 +27,7 @@
# process instructions?
style=False,
links=False,
+ meta=False,
embedded=True,
frames=True,
forms=True,
@@ -48,6 +56,9 @@
``links``:
Remove any ``<link>`` tags
+ ``meta``:
+ Remove any ``<meta>`` tags
+
``frames``:
Remove any frame-related tags
@@ -99,17 +110,19 @@
if isinstance(el, etree._Comment):
bad.append(el)
for el in bad:
- el.remove_element()
+ el.drop_element()
if style:
kill_tags.append('style')
for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
- el.remove_element()
+ el.drop_element()
for el in doc.xpath('descendant-or-self::*[@style]'):
del el.attrib['style']
if links:
kill_tags.append('link')
+ if meta:
+ kill_tags.append('meta')
if embedded:
- kill_tags.extend(['object', 'embed', 'iframe'])
+ kill_tags.extend(['object', 'embed', 'iframe', 'applet'])
if frames:
kill_tags.extend(defs.frame_tags)
if forms:
@@ -122,17 +135,17 @@
if el.tag in kill_tags:
bad.append(el)
for el in bad:
- el.remove_element()
+ el.drop_element()
if remove_tags:
xpath = ' | '.join([
"descendant-or-self::%s" % tag
for tag in remove_tags])
for el in doc.xpath(xpath):
if strip_tags:
- el.remove_tag()
+ el.drop_tag()
else:
# FIXME: Should we test if this has been removed because of a parent?
- el.remove_element()
+ el.drop_element()
if remove_unknown_tags:
if allow_tags:
raise ValueError(
@@ -145,10 +158,10 @@
bad.append(el)
for el in bad:
if strip_tags:
- el.remove_tag()
+ el.drop_tag()
else:
# FIXME: Should we test if this has been removed because of a parent?
- el.remove_element()
+ el.drop_element()
if add_nofollow:
for el in doc.xpath('descendant-or-self::a[@href]'):
href = el.attrib['href']
More information about the lxml-checkins mailing list