From ianb at codespeak.net Fri Jun 1 06:18:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:18:44 +0200 (CEST) Subject: [Lxml-checkins] r43954 - lxml/branch/html/src/lxml Message-ID: <20070601041844.70BB48093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:18:43 2007 New Revision: 43954 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: Change the diff output a bit; only parse if both got *and* want look like HTML/XML Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 06:18:43 2007 @@ -78,9 +78,11 @@ parser = HTML elif PARSE_XML & optionflags: parser = etree.XML - elif want.strip().lower().startswith('' % tag @@ -297,7 +299,7 @@ if not got: return '' return self.format_text(got, strip) - text = '%s (not %s)' % (got, want) + text = '%s (got: %s)' % (want, got) return self.format_text(text, strip) class LHTMLOutputChecker(LXMLOutputChecker): From ianb at codespeak.net Fri Jun 1 06:25:27 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:25:27 +0200 (CEST) Subject: [Lxml-checkins] r43955 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601042527.A7A778093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:25:27 2007 New Revision: 43955 Removed: lxml/branch/html/src/lxml/html/rewritelinks.py Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: Move all the link functions directly into __init__; change rewriting to all use iter_links Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:25:27 2007 @@ -1,13 +1,19 @@ import threading import re +import urlparse from lxml import etree +from lxml.html import defs -__all__ = ['HTML', 'tostring', 'Element'] +__all__ = ['HTML', 'tostring', 'Element', 'defs', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iter_links', 'rewrite_links'] _rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_css_url_re = re.compile(r'url\((.*?)\)', re.I) +_css_import_re = re.compile(r'@import "(.*?)"') class HtmlMixin(object): @@ -110,8 +116,11 @@ tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. """ - from lxml.html.rewritelinks import make_links_absolute - make_links_absolute(self, base_href, resolve_base_href=resolve_base_href) + if resolve_base_href: + self.resolve_base_href() + def link_repl(href): + return urlparse.urljoin(base_href, href) + self.rewrite_links(link_repl) def resolve_base_href(self): """ @@ -119,25 +128,38 @@ values to all links found in the document. Also remove the tag once it has been applied. """ - from lxml.html.rewritelinks import resolve_base_href - resolve_base_href(self) - - def iter_links(self, in_order=True): - """ - Iterate over all the links in the document, yielding - ``(element, attribute, link)``. - - The ``element`` contains the link. ``attribute`` is a string - like ``'href'`` or ``'src'``. It may be None, which means - that the link is in the body of the element. The only type - this occurs is with `` - ... - ... - ...
- ...
|
- ...
- ... Hi world!
- ... |
- ...