[Lxml-checkins] r44011 - lxml/branch/html/src/lxml/html
scoder at codespeak.net
scoder at codespeak.net
Sun Jun 3 18:36:47 CEST 2007
Author: scoder
Date: Sun Jun 3 18:36:47 2007
New Revision: 44011
Modified:
lxml/branch/html/src/lxml/html/clean.py
Log:
better way to deal with comments and PIs
Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py (original)
+++ lxml/branch/html/src/lxml/html/clean.py Sun Jun 3 18:36:47 2007
@@ -61,7 +61,6 @@
scripts=True,
javascript=True,
comments=True,
- # process instructions?
style=False,
links=True,
meta=True,
@@ -104,8 +103,7 @@
Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
``processing_instructions``:
- Remove any processing instructions. Also xmlns attributes are
- removed with this.
+ Remove any processing instructions.
``embedded``:
Remove any embedded objects (flash, iframes)
@@ -191,19 +189,14 @@
elif new != old:
el.text = new
if comments or processing_instructions:
- # Easier way?
- bad = []
- for el in _itertree(doc):
- if comments and isinstance(el, etree._Comment):
- bad.append(el)
- if processing_instructions and isinstance(el, etree._ProcessingInstruction):
- bad.append(el)
- for el in bad:
- el.drop_element()
+ kill_tags.append(etree.Comment)
if processing_instructions:
- # FIXME: is this really the right place to remove these attributes?
- for el in doc.xpath('descendant-or-self::*[@xmlns]'):
- del el.attrib['xmlns']
+ kill_tags.append(etree.ProcessingInstruction)
+## SB: Does this actually work? Definitely not the right place to do this.
+# if processing_instructions:
+# # FIXME: is this really the right place to remove these attributes?
+# for el in doc.xpath('descendant-or-self::*[@xmlns]'):
+# del el.attrib['xmlns']
if style:
kill_tags.append('style')
for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):
More information about the lxml-checkins mailing list