[Lxml-checkins] r43955 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 1 06:25:27 CEST 2007
Author: ianb
Date: Fri Jun 1 06:25:27 2007
New Revision: 43955
Removed:
lxml/branch/html/src/lxml/html/rewritelinks.py
Modified:
lxml/branch/html/src/lxml/html/__init__.py
lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
Log:
Move all the link functions directly into __init__; change rewriting to all use iter_links
Modified: lxml/branch/html/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/html/src/lxml/html/__init__.py (original)
+++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:25:27 2007
@@ -1,13 +1,19 @@
import threading
import re
+import urlparse
from lxml import etree
+from lxml.html import defs
-__all__ = ['HTML', 'tostring', 'Element']
+__all__ = ['HTML', 'tostring', 'Element', 'defs',
+ 'find_rel_links', 'find_class', 'make_links_absolute',
+ 'resolve_base_href', 'iter_links', 'rewrite_links']
_rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_css_url_re = re.compile(r'url\((.*?)\)', re.I)
+_css_import_re = re.compile(r'@import "(.*?)"')
class HtmlMixin(object):
@@ -110,8 +116,11 @@
tags in the document are used *and* removed from the document.
If it is false then any such tag is ignored.
"""
- from lxml.html.rewritelinks import make_links_absolute
- make_links_absolute(self, base_href, resolve_base_href=resolve_base_href)
+ if resolve_base_href:
+ self.resolve_base_href()
+ def link_repl(href):
+ return urlparse.urljoin(base_href, href)
+ self.rewrite_links(link_repl)
def resolve_base_href(self):
"""
@@ -119,25 +128,38 @@
values to all links found in the document. Also remove the
tag once it has been applied.
"""
- from lxml.html.rewritelinks import resolve_base_href
- resolve_base_href(self)
-
- def iter_links(self, in_order=True):
- """
- Iterate over all the links in the document, yielding
- ``(element, attribute, link)``.
-
- The ``element`` contains the link. ``attribute`` is a string
- like ``'href'`` or ``'src'``. It may be None, which means
- that the link is in the body of the element. The only time
- this occurs is with ``<style>`` tags that contain links like
- ``url(...)``. ``link`` is the actual link, like
- ``'http://codespeak.net'``
-
- Note: links are not returned in document order.
- """
- from lxml.html.rewritelinks import iter_links
- return iter_links(self, in_order=in_order)
+ base_href = None
+ basetags = self.xpath('//base[@href]')
+ for b in basetags:
+ base_href = b.attrib['href']
+ b.drop_element()
+ if not base_href:
+ return
+ self.make_links_absolute(base_href, resolve_base_href=False)
+
+ def iter_links(self):
+ """
+ Yield (element, attribute, link, pos), where attribute may be None
+ (indicating the link is in the text). ``pos`` is the position
+ where the link occurs; often 0, but sometimes something else in
+ the case of links in stylesheets or style tags.
+
+ Note: <base href> is *not* taken into account in any way. The
+ link you get is exactly the link in the document.
+ """
+ link_attrs = defs.link_attrs
+ for el in self.iterdescendants():
+ for attrib in link_attrs:
+ if attrib in el.attrib:
+ yield (el, attrib, el.attrib[attrib], 0)
+ if el.tag == 'style' and el.text:
+ for match in _css_url_re.finditer(el.text):
+ yield (el, None, match.group(1), match.start(1))
+ for match in _css_import_re.finditer(el.text):
+ yield (el, None, match.group(1), match.start(1))
+ if 'style' in el.attrib:
+ for match in _css_url_re.finditer(el.attrib['style']):
+ yield (el, 'style', match.group(1), match.start(1))
def rewrite_links(self, link_repl_func, resolve_base_href=True,
base_href=None):
@@ -152,15 +174,30 @@
``'mailto:email'`` or ``'javascript:expr'``.
If you give ``base_href`` then all links passed to
- ``link_repl_func()`` will be absolute.
+ ``link_repl_func()`` will take that into account.
"""
- from lxml.html.rewritelinks import rewrite_links
if base_href is not None:
# FIXME: this can be done in one pass with a wrapper
# around link_repl_func
self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
- resolve_base_href = False
- rewrite_links(self, link_repl_func, remove_base_tags=resolve_base_href)
+ elif resolve_base_href:
+ self.resolve_base_href()
+ for el, attrib, link, pos in self.iter_links():
+ new_link = link_repl_func(link)
+ if new_link == link:
+ continue
+ if attrib is None:
+ new = el.text[:pos] + new_link + el.text[pos+len(link):]
+ el.text = new
+ else:
+ cur = el.attrib[attrib]
+ if not pos and len(cur) == len(link):
+ # Most common case
+ el.attrib[attrib] = new_link
+ else:
+ new = cur[:pos] + new_link + cur[pos+len(link):]
+ el.attrib[attrib] = new
+
class _MethodFunc(object):
def __init__(self, name, fragment=False, source_class=HtmlMixin):
Deleted: /lxml/branch/html/src/lxml/html/rewritelinks.py
==============================================================================
--- /lxml/branch/html/src/lxml/html/rewritelinks.py Fri Jun 1 06:25:27 2007
+++ (empty file)
@@ -1,148 +0,0 @@
-"""
-utilities for manipulating html links
-"""
-
-
-from lxml.html import tostring, HTML
-from lxml.html import defs
-import urlparse
-import re
-
-__all__ = ['make_links_absolute', 'make_links_absolute_html',
- 'rewrite_links', 'rewrite_links_html',
- 'Relocator']
-
-def make_links_absolute(doc, base_href, resolve_base_href=True):
- def link_repl(href):
- return urlparse.urljoin(base_href, href)
- rewrite_links(doc, link_repl_func, remove_base_tags=resolve_base_href)
-
-def make_links_absolute_html(html, base_href):
- doc = HTML(html)
- make_links_absolute(doc, base_href)
- return tostring(doc)
-
-def rewrite_links_html(html, link_repl_func, remove_base_tags=True):
- """
- rewrite_links(), but work on text and returns text
- """
- doc = HTML(html)
- rewrite_links(doc, link_repl_func, remove_base_tags=remove_base_tags)
- return tostring(doc)
-
-def rewrite_links(doc, link_repl_func,
- remove_base_tags=True):
- """
- Takes a given document (already parsed by lxml) and modifies it
- in-place. Every link is passed through link_repl_func, and the
- output of that function replaces the link.
- """
- if remove_base_tags:
- resolve_base_href(doc)
-
- # FIXME: should use defs.link_attrs
- for attrib in 'href', 'src':
- els = doc.xpath('//*[@%s]' % attrib)
- for el in els:
- el.attrib[attrib] = link_repl_func(el.attrib[attrib])
-
- rewrite_css_links(doc, link_repl_func)
- rewrite_style_links(doc, link_repl_func)
-
-def resolve_base_href(doc):
- """
- removes all html <base href=""> tags
- from the document given.
- """
- base_href = None
- basetags = doc.xpath('//base[@href]')
- for b in basetags:
- base_href = b.attrib['href']
- b.getparent().remove(b)
- if base_href is None:
- return
- # Now that we have a base_href (blech) we have to fix up all the
- # links in the document with this new information.
- def link_repl(href):
- return urlparse.urljoin(base_href, href)
- rewrite_links(doc, link_repl, remove_base_tags=False)
-
-CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)
-CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"')
-def rewrite_css_links(doc, link_repl_func):
- """
- Fixes up any url(...) links in CSS style elements
- """
- def absuri(matchobj):
- return 'url(%s)' % link_repl_func(matchobj.group(1))
- def absimport(matchobj):
- return '@import "%s"' % link_repl_func(matchobj.group(1))
- els = doc.xpath('//head/style')
- for el in els:
- if el.text:
- el.text = CSS_URL_PAT.sub(absuri, el.text)
- el.text = CSS_IMPORT_PAT.sub(absimport, el.text)
-
-def rewrite_style_links(doc, link_repl_func):
- def absuri(matchobj):
- return 'url(%s)' % link_repl_func(matchobj.group(1))
- for el in doc.xpath("//*[contains(@style, 'url(')]"):
- el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])
-
-def iter_links(doc):
- """
- Yield (element, attribute, link, pos), where attribute may be None
- (indicating the link is in the text). ``pos`` is the position
- where the link occurs; often 0, but sometimes something else in
- the case of links in stylesheets or style tags.
-
- Note: <base href> is *not* taken into account in any way. The
- link you get is exactly the link in the document.
- """
- link_attrs = defs.link_attrs
- for el in doc.iterdescendants():
- for attrib in link_attrs:
- if attrib in el.attrib:
- yield (el, attrib, el.attrib[attrib], 0)
- if el.tag == 'style' and el.text:
- for match in CSS_URL_PAT.finditer(el.text):
- yield (el, None, match.group(1), match.start(1))
- for match in CSS_IMPORT_PAT.finditer(el.text):
- yield (el, None, match.group(1), match.start(1))
- if 'style' in el.attrib:
- for match in CSS_URL_PAT.finditer(el.attrib['style']):
- yield (el, 'style', match.group(1), match.start(1))
-
-class Relocator(object):
- """
- This helper can be used to move all links in a document from one
- location to another. Typically you use this like::
-
- rewrite_links_html(
- html, Relocator('http://old-domain/', 'http://new-domain',
- base_href='http://old-domain/foo/bar.html'))
-
- This means that the document was located at
- ``http://old-domain/foo/bar.html`` (used to resolve relative
- links), and that you want to change every occurrence of
- ``http://old-domain/`` to ``http://new-domain``
- """
- # This catches the case of http://foo, which is equivalent to
- # http://foo/ :
- _domain_no_slash_re = re.compile(r'^[a-z]+://[^/]+$', re.I)
-
- def __init__(self, old_href, new_href, base_href=None):
- self.old_href = old_href
- self.new_href = new_href
- self.base_href = base_href
-
- def __call__(self, href):
- if self.base_href is not None:
- real_href = urlparse.urljoin(self.base_href, href)
- if self._domain_no_slash_re.search(real_href):
- real_href += '/'
- if not real_href.startswith(self.old_href):
- # A link somewhere else entirely
- return href
- return self.new_href + real_href[len(self.old_href):]
-
Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Fri Jun 1 06:25:27 2007
@@ -154,35 +154,3 @@
a href="/other.html"
td style="/td-bg.png"@22
img src="/logo.gif"
- >>> print_iter(iter_links('''
- ... <html>
- ... <head>
- ... <link rel="stylesheet" href="style.css">
- ... <style type="text/css">
- ... body {
- ... background-image: url(/bg.gif);
- ... }
- ... @import "/other-styles.css";
- ... </style>
- ... <script src="/js-funcs.js"></script>
- ... </head>
- ... <body>
- ... <table>
- ... <tr><td><ul>
- ... <li><a href="/test.html">Test stuff</a></li>
- ... <li><a href="/other.html">Other stuff</a></li>
- ... </td></tr>
- ... <td style="background-image: url(/td-bg.png)">
- ... <img src="/logo.gif">
- ... Hi world!
- ... </td></tr>
- ... </table>
- ... </body></html>''', False))
- link href="style.css"
- a href="/test.html"
- a href="/other.html"
- script src="/js-funcs.js"
- img src="/logo.gif"
- style None="/bg.gif"@40
- style None="/other-styles.css"@69
- td style="/td-bg.png"@22
More information about the lxml-checkins
mailing list