[Lxml-checkins] r44115 - in lxml/branch/html/src/lxml/html: . tests

scoder at codespeak.net scoder at codespeak.net
Sat Jun 9 16:44:53 CEST 2007


Author: scoder
Date: Sat Jun  9 16:44:51 2007
New Revision: 44115

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
   lxml/branch/html/src/lxml/html/clean.py
   lxml/branch/html/src/lxml/html/diff.py
   lxml/branch/html/src/lxml/html/tests/test_basic.txt
Log:
Renamed drop_tag to drop_element and drop_element to drop_tree; some more cleanup.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Sat Jun  9 16:44:51 2007
@@ -11,31 +11,31 @@
 
 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
 #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
-_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_collect_string_content = etree.XPath("string()")
 _css_url_re = re.compile(r'url\((.*?)\)', re.I)
 _css_import_re = re.compile(r'@import "(.*?)"')
 
 class HtmlMixin(object):
 
-    def drop_element(self):
+    def drop_tree(self):
         """
         Removes this element from the tree, including its children and
         text.  The tail text is joined to the previous element or
         parent.
         """
         parent = self.getparent()
-        assert parent
-        index = parent.index(self)
+        assert parent is not None
         if self.tail:
-            if index == 0:
+            previous = self.getprevious()
+            if previous is None:
                 parent.text = (parent.text or '') + self.tail
             else:
-                previous = parent[index-1]
                 previous.tail = (previous.tail or '') + self.tail
         parent.remove(self)
 
-    def drop_tag(self):
+    def drop_element(self):
         """
         Remove the tag, but not its children or text.  The children and text
         are merged into the parent.
@@ -43,29 +43,28 @@
         Example::
 
             >>> h = parse_element('<div>Hello <b>World!</b></div>')
-            >>> h.xpath('//b')[0].drop_tag()
+            >>> h.find('//b').drop_element()
             >>> print tostring(h)
             <div>Hello World!</div>
         """
         parent = self.getparent()
-        assert parent
-        index = parent.index(self)
+        assert parent is not None
+        previous = self.getprevious()
         if self.text:
-            if index == 0:
+            if previous is None:
                 parent.text = (parent.text or '') + self.text
             else:
-                prev = parent[index-1]
-                prev.tail = (prev.tail or '') + self.text
+                previous.tail = (previous.tail or '') + self.text
         if self.tail:
             if len(self):
                 last = self[-1]
                 last.tail = (last.tail or '') + self.tail
-            elif index == 0:
+            elif previous is None:
                 parent.text = (parent.text or '') + self.tail
             else:
-                prev = parent[index-1]
-                prev.tail = (prev.tail or '') + self.tail
-        parent[index:index+1] = list(self)
+                previous.tail = (previous.tail or '') + self.tail
+        index = parent.index(self)
+        parent[index:index+1] = self[:]
 
     def find_rel_links(self, rel):
         """
@@ -73,7 +72,7 @@
         """
         rel = rel.lower()
         return [el for el in _rel_links_xpath(self)
-                if el.attrib['rel'].lower() == rel]
+                if el.get('rel').lower() == rel]
 
     def find_class(self, class_name):
         """
@@ -103,7 +102,7 @@
         """
         Return the text content of the tag (and the text in any children).
         """
-        return self.xpath("string()")
+        return _collect_string_content(self)
 
     ########################################
     ## Link functions
@@ -134,8 +133,8 @@
         base_href = None
         basetags = self.xpath('//base[@href]')
         for b in basetags:
-            base_href = b.attrib['href']
-            b.drop_element()
+            base_href = b.get('href')
+            b.drop_tree()
         if not base_href:
             return
         self.make_links_absolute(base_href, resolve_base_href=False)
@@ -370,7 +369,7 @@
                 body.extend(other_body)
                 # We'll ignore tail
                 # I guess we are ignoring attributes too
-                other_body.drop_element()
+                other_body.drop_tree()
     else:
         body = None
     heads = doc.findall('head')
@@ -381,7 +380,7 @@
             for other_head in heads[1:]:
                 head.extend(other_head)
                 # We don't care about text or tail in a head
-                other_head.drop_element()
+                other_head.drop_tree()
         return doc
     if (len(body) == 1 and (not body.text or not body.text.strip())
         and (not body[-1].tail or not body[-1].tail.strip())):

Modified: lxml/branch/html/src/lxml/html/clean.py
==============================================================================
--- lxml/branch/html/src/lxml/html/clean.py	(original)
+++ lxml/branch/html/src/lxml/html/clean.py	Sat Jun  9 16:44:51 2007
@@ -53,7 +53,7 @@
     "descendant-or-self::*[@style]")
 
 _find_external_links = etree.XPath(
-    "descendant-or-self::a[string(@href) and substring(@href,0,1) != '#']")
+    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
 
 def clean_html(html, **kw):
     """
@@ -164,7 +164,7 @@
             if _conditional_comment_re.search(el.text):
                 bad.append(el)
         for el in bad:
-            el.drop_element()
+            el.drop_tree()
     # First, handle a case that IE treats <image> like <img>, and that can
     # confuse either this step or later steps.
     for el in doc.getiterator('image'):
@@ -205,7 +205,7 @@
                     el.set('style', new)
             for el in list(doc.getiterator('style')):
                 if el.get('type', '').lower().strip() == 'text/javascript':
-                    el.drop_element()
+                    el.drop_tree()
                     continue
                 old = el.text or ''
                 new = _css_javascript_re.sub('', old)
@@ -230,9 +230,9 @@
 #            del el.attrib['xmlns']
     if style:
         kill_tags.add('style')
-        for el in doc.getiterator('link'):
+        for el in list(doc.getiterator('link')):
             if 'stylesheet' in el.get('rel', '').lower():
-                el.drop_element()
+                el.drop_tree()
         for el in _find_styled_elements(doc):
             del el.attrib['style']
     if links:
@@ -288,9 +288,9 @@
         el.clear()
 
     for el in _kill:
-        el.drop_element()
+        el.drop_tree()
     for el in _remove:
-        el.drop_tag()
+        el.drop_element()
 
     if remove_unknown_tags:
         if allow_tags:
@@ -304,10 +304,10 @@
                 bad.append(el)
         if strip_tags:
             for el in bad:
-                el.drop_tag()
+                el.drop_element()
         else:
             for el in bad:
-                el.drop_element()
+                el.drop_tree()
     if add_nofollow:
         for el in _find_external_links(doc):
             el.set('rel', 'nofollow')
@@ -356,7 +356,7 @@
     """
     if el.tag in avoid_elements:
         return
-    class_name = el.attrib.get('class')
+    class_name = el.get('class')
     if class_name:
         class_name = class_name.split()
         for match_class in avoid_classes:
@@ -428,7 +428,7 @@
             assert not leading_text
             leading_text = prev_text
         anchor = factory('a')
-        anchor.attrib['href'] = link
+        anchor.set('href', link)
         body = best_match.group('body')
         if not body:
             body = link
@@ -478,7 +478,7 @@
     #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
     if el.tag in _avoid_word_break_elements:
         return
-    class_name = el.attrib.get('class')
+    class_name = el.get('class')
     if class_name:
         dont_break = False
         class_name = class_name.split()

Modified: lxml/branch/html/src/lxml/html/diff.py
==============================================================================
--- lxml/branch/html/src/lxml/html/diff.py	(original)
+++ lxml/branch/html/src/lxml/html/diff.py	Sat Jun  9 16:44:51 2007
@@ -516,7 +516,7 @@
     included as a special kind of diffable token."""
     body_el = parse_html(html, cleanup=True)
     # Then we split the document into text chunks for each tag, word, and end tag:
-    chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs)
+    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
     # Finally re-joining them into token objects:
     return fixup_chunks(chunks)
 
@@ -657,14 +657,14 @@
     )
 
 
-def flatten_el(el, include_hrefs, drop_tag=False):
+def flatten_el(el, include_hrefs, skip_tag=False):
     """ Takes an lxml element el, and generates all the text chunks for
     that tag.  Each start tag is a chunk, each word is a chunk, and each
     end tag is a chunk.
 
-    If drop_tag is true, then the outermost container tag is
+    If skip_tag is true, then the outermost container tag is
     not returned (just its contents)."""
-    if not drop_tag:
+    if not skip_tag:
         if el.tag == 'img':
             yield ('img', el.attrib['src'], start_tag(el))
         else:
@@ -679,7 +679,7 @@
             yield item
     if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
         yield ('href', el.attrib['href'])
-    if not drop_tag:
+    if not skip_tag:
         yield end_tag(el)
         end_words = split_words(el.tail)
         for word in end_words:
@@ -729,14 +729,14 @@
     <p><ins>word</ins></p> """
     doc = parse_html(html, cleanup=False)
     _fixup_ins_del_tags(doc)
-    html = serialize_html_fragment(doc, drop_outer=True)
+    html = serialize_html_fragment(doc, skip_outer=True)
     return html
 
-def serialize_html_fragment(el, drop_outer=False):
+def serialize_html_fragment(el, skip_outer=False):
     """ Serialize a single lxml element as HTML.  The serialized form
     includes the elements tail.  
 
-    If drop_outer is true, then don't serialize the outermost tag
+    If skip_outer is true, then don't serialize the outermost tag
     """
     
     html_xsl = """\
@@ -751,13 +751,13 @@
     assert not isinstance(el, basestring), (
         "You should pass in an element, not a string like %r" % el)
     html = str(transform(el))
-    if drop_outer:
+    if skip_outer:
         # Get rid of the extra starting tag:
         html = html[html.find('>')+1:]
-    if drop_outer:
+    if skip_outer:
         # Get rid of the extra end tag:
         html = html[:html.rfind('<')]
-    if drop_outer:
+    if skip_outer:
         return html.strip()
     else:
         return html.lstrip()
@@ -770,7 +770,7 @@
             if not _contains_block_level_tag(el):
                 continue
             _move_el_inside_block(el, tag=tag)
-            el.drop_tag()
+            el.drop_element()
             #_merge_element_contents(el)
 
 def _contains_block_level_tag(el):

Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_basic.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_basic.txt	Sat Jun  9 16:44:51 2007
@@ -52,7 +52,7 @@
     >>> el.text_content()
     'This is a bold link'
 
-Or drop both tags (leaving content) or the entire element, like::
+Or drop an element (leaving its content) or the entire tree, like::
 
     >>> doc = HTML('''
     ... <html>
@@ -63,7 +63,7 @@
     ...   <div>footer</div>
     ...  </body>
     ... </html>''')
-    >>> doc.get_element_by_id('link').drop_tag()
+    >>> doc.get_element_by_id('link').drop_element()
     >>> print tostring(doc)
     <html>
      <body>
@@ -73,7 +73,7 @@
       <div>footer</div>
      </body>
     </html>
-    >>> doc.get_element_by_id('body').drop_element()
+    >>> doc.get_element_by_id('body').drop_tree()
     >>> print tostring(doc)
     <html>
      <body>


More information about the lxml-checkins mailing list