[Lxml-checkins] r43970 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Fri Jun 1 08:41:26 CEST 2007


Author: ianb
Date: Fri Jun  1 08:41:25 2007
New Revision: 43970

Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
Add a page_structure option that removes the <head>, <html>, and <title> tags; clean_html now parses its input as an HTML fragment.

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Fri Jun  1 08:41:25 2007
@@ -1,7 +1,7 @@
 import re
 from lxml import etree
 from lxml.html import defs
-from lxml.html import HTML, tostring
+from lxml.html import parse_element, tostring
 
 __all__ = ['clean_html', 'clean']
 
@@ -9,7 +9,6 @@
 # Other on* attributes that aren't standard?
 # Try these tests: http://feedparser.org/tests/wellformed/sanitize/
 # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
-# <head> and <title> is fishy in a fragment
 # max width for words
 # max height?
 # autolink?
@@ -26,7 +25,7 @@
     Like clean(), but takes a text input document, and returns a text
     document.
     """
-    doc = HTML(html)
+    doc = parse_element(html, create_parent=True)
     clean(doc, **kw)
     return tostring(doc)
 
@@ -38,6 +37,7 @@
           style=False,
           links=False,
           meta=False,
+          page_structure=False,
           embedded=True,
           frames=True,
           forms=True,
@@ -69,12 +69,15 @@
     ``meta``:
         Remove any ``<meta>`` tags
 
-    ``frames``:
-        Remove any frame-related tags
+    ``page_structure``:
+        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``
 
     ``embedded``:
         Remove any embedded objects (flash, iframes)
 
+    ``frames``:
+        Remove any frame-related tags
+
     ``forms``:
         Remove any form tags
 
@@ -139,6 +142,8 @@
         kill_tags.append('link')
     if meta:
         kill_tags.append('meta')
+    if page_structure:
+        remove_tags.extend(['head', 'html', 'title'])
     if embedded:
         # FIXME: is <layer> really embedded?
         kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer'])


More information about the lxml-checkins mailing list