[Lxml-checkins] r44049 - lxml/branch/html/src/lxml/html

ianb at codespeak.net ianb at codespeak.net
Wed Jun 6 10:10:04 CEST 2007


Author: ianb
Date: Wed Jun  6 10:10:04 2007
New Revision: 44049

Modified:
   lxml/branch/html/src/lxml/html/__init__.py
Log:
Make iter_links also find links in the root element (this wouldn't work with iterdescendants).  Make the function alternatives to the methods use parse() instead of an explicit keyword argument (maybe an explicit option should also be allowed, though).  Made the parser use <span> when possible and necessary.

Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py	(original)
+++ lxml/branch/html/src/lxml/html/__init__.py	Wed Jun  6 10:10:04 2007
@@ -151,7 +151,7 @@
         link you get is exactly the link in the document.
         """
         link_attrs = defs.link_attrs
-        for el in self.iterdescendants():
+        for el in _itertree(self):
             for attrib in link_attrs:
                 if attrib in el.attrib:
                     yield (el, attrib, el.attrib[attrib], 0)
@@ -226,34 +226,42 @@
     # make a copy of the document.  The problem is it changes the
     # return type, as it should return the copied document and not a
     # serialization.  Is that odd?
-    def __init__(self, name, fragment=False, source_class=HtmlMixin):
+    def __init__(self, name, copy=False, source_class=HtmlMixin):
         self.name = name
-        self.fragment = fragment
+        self.copy = copy
         self.__doc__ = getattr(source_class, self.name).__doc__
     def __call__(self, doc, *args, **kw):
-        if 'fragment' in kw:
-            fragment = kw.pop('fragment')
-        else:
-            fragment = self.fragment
         if isinstance(doc, basestring):
-            if fragment:
-                doc = parse_element(doc)
+            if 'copy' in kw:
+                raise TypeError(
+                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
+            return_string = True
+            doc = parse(doc)
+        else:
+            if 'copy' in kw:
+                copy = kw.pop('copy')
             else:
-                doc = HTML(doc)
+                copy = self.copy
+            return_string = False
+            if copy:
+                doc = copy.deepcopy(doc)
         meth = getattr(doc, self.name)
         result = meth(*args, **kw)
         if result is None:
-            # Then serialize and return
-            return tostring(doc)
+            # Then return what we got in
+            if return_string:
+                return tostring(doc)
+            else:
+                return doc
         else:
             return result
 
-find_rel_links = _MethodFunc('find_rel_links')
-find_class = _MethodFunc('find_class')
-make_links_absolute = _MethodFunc('make_links_absolute')
-resolve_base_href = _MethodFunc('resolve_base_href')
-iter_links = _MethodFunc('iter_links')
-rewrite_links = _MethodFunc('rewrite_links')
+find_rel_links = _MethodFunc('find_rel_links', copy=False)
+find_class = _MethodFunc('find_class', copy=False)
+make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
+resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
+iter_links = _MethodFunc('iter_links', copy=False)
+rewrite_links = _MethodFunc('rewrite_links', copy=True)
 
 class HtmlComment(etree.CommentBase, HtmlMixin):
     pass
@@ -382,10 +390,21 @@
         return body[0]
     # Now we have a body which represents a bunch of tags which have the
     # content that was passed in.  We will create a fake container, which
-    # is the body tag, except body implies too much structure.
-    body.tag = 'div'
+    # is the body tag, except <body> implies too much structure.
+    if _contains_block_level_tag(el):
+        body.tag = 'div'
+    else:
+        body.tag = 'span'
     return body
 
+def _contains_block_level_tag(el):
+    # FIXME: I could do this with XPath, but would that just be
+    # unnecessarily slow?
+    for el in _itertree(el):
+        if el.tag in defs.block_tags:
+            return True
+    return False
+
 def _element_name(el):
     if isinstance(el, etree.CommentBase):
         return 'comment'
@@ -394,6 +413,16 @@
     else:
         return el.tag
 
+# FIXME: should this be a method?  It's convenient, but I can't find a
+# method that does something like it.
+def _itertree(el):
+    """
+    Return the element's descendants, and the element itself
+    """
+    yield el
+    for item in el.iterdescendants():
+        yield item
+
 def Element(*args, **kw):
     v = html_parser.makeelement(*args, **kw)
     return v


More information about the lxml-checkins mailing list