[Lxml-checkins] r43970 - lxml/branch/html/src/lxml/html
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 08:41:26 CEST 2007
Author: ianb
Date: Fri Jun 1 08:41:25 2007
New Revision: 43970
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
add page_structure removal; for clean_html parse the html as a fragment
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:41:25 2007
@@ -1,7 +1,7 @@
import re
from lxml import etree
from lxml.html import defs
-from lxml.html import HTML, tostring
+from lxml.html import parse_element, tostring
__all__ = ['clean_html', 'clean']
@@ -9,7 +9,6 @@
# Other on* attributes that aren't standard?
# Try these tests: http://feedparser.org/tests/wellformed/sanitize/
# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-# <head> and <title> is fishy in a fragment
# max width for words
# max height?
# autolink?
@@ -26,7 +25,7 @@
Like clean(), but takes a text input document, and returns a text
document.
"""
- doc = HTML(html)
+ doc = parse_element(html, create_parent=True)
clean(doc, **kw)
return tostring(doc)
@@ -38,6 +37,7 @@
style=False,
links=False,
meta=False,
+ page_structure=False,
embedded=True,
frames=True,
forms=True,
@@ -69,12 +69,15 @@
``meta``:
Remove any ``<meta>`` tags
- ``frames``:
- Remove any frame-related tags
+ ``page_structure``:
+ Remove structural parts of a page: ``<head>``, ``<html>``, ``<title>``
``embedded``:
Remove any embedded objects (flash, iframes)
+ ``frames``:
+ Remove any frame-related tags
+
``forms``:
Remove any form tags
@@ -139,6 +142,8 @@
kill_tags.append('link')
if meta:
kill_tags.append('meta')
+ if page_structure:
+ remove_tags.extend(['head', 'html', 'title'])
if embedded:
# FIXME: is <layer> really embedded?
kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer'])
More information about the lxml-checkins mailing list