"""
utilities for manipulating html links
"""
from deliverance.htmlserialize import decodeAndParseHTML, tostring
from deliverance.utils import get_serializer
import urlparse
import re
def fixup_text_links(environ, doc, link_repl_func, remove_base_tags=True):
    """
    Text-in / text-out wrapper around fixup_links().

    ``doc`` is HTML source text; it is parsed with decodeAndParseHTML,
    its links are rewritten in place by fixup_links() (which receives
    ``link_repl_func`` and ``remove_base_tags`` unchanged), and the
    resulting tree is rendered with the serializer that
    get_serializer() selects for ``environ``.

    Returns whatever that serializer returns for the modified tree.
    """
    parsed = decodeAndParseHTML(doc)
    fixup_links(parsed, link_repl_func, remove_base_tags=remove_base_tags)
    # The serializer is looked up only after the tree has been rewritten,
    # and is then applied straight away.
    return get_serializer(environ, tostring)(parsed)
def fixup_links(doc, link_repl_func,
                remove_base_tags=True):
    """
    Rewrite every link of an already-parsed (lxml) document in place.

    Each ``href`` and ``src`` attribute value found in ``doc`` is handed
    to ``link_repl_func`` and replaced by that function's return value.
    url(...) references in stylesheets are then processed by
    fixup_css_links() with the same callback.

    When ``remove_base_tags`` is true, resolve_base_tags_in_document()
    is run first, before any link is rewritten.
    """
    if remove_base_tags:
        resolve_base_tags_in_document(doc)

    for attr_name in ('href', 'src'):
        # Select every element that carries this attribute.
        query = '//*[@%s]' % attr_name
        for node in doc.xpath(query):
            current = node.attrib[attr_name]
            node.attrib[attr_name] = link_repl_func(current)

    fixup_css_links(doc, link_repl_func)
def resolve_base_tags_in_document(doc):
    """
    Removes every <base href="..."> tag from the document given and
    resolves the document's links against the base href it declared.

    The document (already parsed by lxml) is modified in place.  If it
    contains no <base> tag with an href attribute, nothing is changed.
    Otherwise all such tags are removed and every link is joined with
    the base href via fixup_links().

    When several <base href> tags are present, the first one in
    document order supplies the base href; the HTML standard says the
    later ones are to be ignored.
    """
    base_tags = doc.xpath('//base[@href]')
    if not base_tags:
        return

    # Read the href before detaching the tags; only the first counts.
    base_href = base_tags[0].attrib['href']
    for tag in base_tags:
        tag.getparent().remove(tag)

    # Now that we have a base_href (blech) we have to fix up all the
    # links in the document with this new information.
    def link_repl(href):
        return urlparse.urljoin(base_href, href)

    # remove_base_tags=False: the tags are already gone, and this keeps
    # fixup_links() from calling back into this function.
    fixup_links(doc, link_repl, remove_base_tags=False)
# Matches a CSS url(...) reference, case-insensitively; group 1 is the raw
# text between the parentheses (may include padding and quote characters).
CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I)


def fixup_css_links(doc, link_repl_func):
    """
    Rewrites the url(...) references found in the text of the
    document's //head/style elements, in place.

    The target of each url(...) is passed through ``link_repl_func``
    and replaced by that function's return value.  Whitespace padding
    and a matching pair of surrounding quotes are stripped before the
    callback is invoked, so it receives the bare URL; the same quotes
    are put back around the result.

    Style elements without text are left untouched.
    """
    def rewrite_url(matchobj):
        target = matchobj.group(1).strip()
        quote = ''
        # url("x") and url('x') are valid CSS; the quotes are syntax,
        # not part of the link, so keep them away from link_repl_func.
        if (len(target) >= 2 and target[0] in '"\''
                and target[-1] == target[0]):
            quote = target[0]
            target = target[1:-1]
        return 'url(%s%s%s)' % (quote, link_repl_func(target), quote)

    for el in doc.xpath('//head/style'):
        if el.text:
            el.text = CSS_URL_PAT.sub(rewrite_url, el.text)