[Lxml-checkins] r44011 - lxml/branch/html/src/lxml/html

scoder at codespeak.net scoder at codespeak.net
Sun Jun 3 18:36:47 CEST 2007


Author: scoder
Date: Sun Jun  3 18:36:47 2007
New Revision: 44011

Modified:
   lxml/branch/html/src/lxml/html/clean.py
Log:
Better way to deal with comments and processing instructions (PIs).

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Sun Jun  3 18:36:47 2007
@@ -61,7 +61,6 @@
           scripts=True,
           javascript=True,
           comments=True,
-          # process instructions?
           style=False,
           links=True,
           meta=True,
@@ -104,8 +103,7 @@
         Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
 
     ``processing_instructions``:
-        Remove any processing instructions.  Also xmlns attributes are
-        removed with this.
+        Remove any processing instructions.
 
     ``embedded``:
         Remove any embedded objects (flash, iframes)
@@ -191,19 +189,14 @@
                 elif new != old:
                     el.text = new
     if comments or processing_instructions:
-        # Easier way?
-        bad = []
-        for el in _itertree(doc):
-            if comments and isinstance(el, etree._Comment):
-                bad.append(el)
-            if processing_instructions and isinstance(el, etree._ProcessingInstruction):
-                bad.append(el)
-        for el in bad:
-            el.drop_element()
+        kill_tags.append(etree.Comment)
     if processing_instructions:
-        # FIXME: is this really the right place to remove these attributes?
-        for el in doc.xpath('descendant-or-self::*[@xmlns]'):
-            del el.attrib['xmlns']
+        kill_tags.append(etree.ProcessingInstruction)
+## SB: Does this actually work? Definitely not the right place to do this.
+#    if processing_instructions:
+#        # FIXME: is this really the right place to remove these attributes?
+#        for el in doc.xpath('descendant-or-self::*[@xmlns]'):
+#            del el.attrib['xmlns']
     if style:
         kill_tags.append('style')
         for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'):


More information about the lxml-checkins mailing list