import threading
import re
import urlparse
import copy
from lxml import etree
from lxml.html import defs
from lxml import cssselect
from lxml.html.setmixin import SetMixin
try:
from UserDict import DictMixin
except ImportError:
# DictMixin was introduced in Python 2.4
from lxml.html._dictmixin import DictMixin
import sets
# Names exported by ``from <this module> import *``.
__all__ = ['document_fromstring', 'tostring', 'Element', 'defs',
'find_rel_links', 'find_class', 'make_links_absolute',
'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser']
# Pre-compiled XPath expressions and regular expressions used by the
# HtmlMixin methods.
# Every <a> element (context node included) that carries a rel attribute.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Every element whose whitespace-normalised class attribute contains
# $class_name as a complete, space-delimited token.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Every element (context node included) whose id attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# The XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# Captures whatever sits between ``url(`` and the next ``)``
# (case-insensitive, non-greedy); surrounding quotes, if any, are captured too.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Captures the target of ``@import "..."``; only the double-quoted form matches.
_css_import_re = re.compile(r'@import "(.*?)"')
# Any <label> anywhere in the document whose for attribute equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")
class HtmlMixin(object):
    """
    Mixin that adds HTML-specific helpers to tree nodes.

    The methods rely on the host class providing the usual element API
    (``get``, ``xpath``, ``getparent``, ``getiterator``, ``attrib``, ...).
    """

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms
        """
        return list(self.getiterator('form'))
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        Raises IndexError if the document has no <body>.
        """
        return self.xpath('//body')[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        Raises IndexError if the document has no <head>.
        """
        return self.xpath('//head')[0]
    head = property(head, doc=head.__doc__)

    def label__get(self):
        """
        Get or set any <label> element associated with this element.

        Reading returns the first <label> whose ``for`` attribute equals
        this element's ``id``, or None when there is no id or no match.
        Assigning requires this element to have an id and the value to
        be a <label> element (TypeError otherwise).  Deleting removes
        the ``for`` attribute from the associated label, if any.
        """
        element_id = self.get('id')
        if not element_id:
            return None
        matches = _label_xpath(self, id=element_id)
        if not matches:
            return None
        return matches[0]

    def label__set(self, label):
        element_id = self.get('id')
        if not element_id:
            raise TypeError(
                "You cannot set a label for an element (%r) that has no id"
                % self)
        if label.tag != 'label':
            raise TypeError(
                "You can only assign label to a label element (not %r)"
                % label)
        label.set('for', element_id)

    def label__del(self):
        label = self.label
        if label is not None:
            del label.attrib['for']
    label = property(label__get, label__set, label__del,
                     doc=label__get.__doc__)

    def drop_tree(self):
        """
        Removes this element from the tree, including its children and
        text.  The tail text is joined to the previous element or
        parent.
        """
        parent = self.getparent()
        assert parent is not None
        if self.tail:
            # Keep the text that followed this element by handing it to
            # whatever precedes it.
            previous = self.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        parent.remove(self)

    def drop_tag(self):
        """
        Remove the tag, but not its children or text.  The children and text
        are merged into the parent.

        Example::

            >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
            >>> h.find('//b').drop_tag()
            >>> print tostring(h)
            <div>Hello World!</div>
        """
        parent = self.getparent()
        assert parent is not None
        previous = self.getprevious()
        if self.text and isinstance(self.tag, basestring):
            # not a Comment, etc.
            if previous is None:
                parent.text = (parent.text or '') + self.text
            else:
                previous.tail = (previous.tail or '') + self.text
        if self.tail:
            if len(self):
                # The tail follows our last child once the children are
                # moved up into the parent.
                last = self[-1]
                last.tail = (last.tail or '') + self.tail
            elif previous is None:
                parent.text = (parent.text or '') + self.tail
            else:
                previous.tail = (previous.tail or '') + self.tail
        # Replace this element by its children, in place.
        index = parent.index(self)
        parent[index:index+1] = self[:]

    def find_rel_links(self, rel):
        """
        Find any links like ``<a rel="{rel}">...</a>``; returns a list of
        elements.  The comparison of ``rel`` is case-insensitive.
        """
        rel = rel.lower()
        return [el for el in _rel_links_xpath(self)
                if el.get('rel').lower() == rel]

    def find_class(self, class_name):
        """
        Find any elements with the given class name.
        """
        return _class_xpath(self, class_name=class_name)

    def get_element_by_id(self, id, *default):
        """
        Get the first element in a document with the given id.  If none is
        found, return the default argument if provided or raise KeyError
        otherwise.

        Note that there can be more than one element with the same id,
        and this isn't uncommon in HTML documents found in the wild.
        Browsers return only the first match, and this function does
        the same.
        """
        try:
            # FIXME: should this check for multiple matches?
            # browsers just return the first one
            return _id_xpath(self, id=id)[0]
        except IndexError:
            if default:
                return default[0]
            # Call form of raise: valid syntax on every Python version.
            raise KeyError(id)

    def text_content(self):
        """
        Return the text content of the tag (and the text in any children).
        """
        return _collect_string_content(self)

    def cssselect(self, expr):
        """
        Run the CSS expression on this element and its children,
        returning a list of the results.

        Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
        that pre-compiling the expression can provide a substantial
        speedup.
        """
        return cssselect.CSSSelect(expr)(self)

    ########################################
    ## Link functions
    ########################################

    def make_links_absolute(self, base_url=None, resolve_base_href=True):
        """
        Make all links in the document absolute, given the
        ``base_url`` for the document (the full URL where the document
        came from), or if no ``base_url`` is given, then the ``.base_url``
        of the document.

        If ``resolve_base_href`` is true, then any ``<base href>``
        tags in the document are used *and* removed from the document.
        If it is false then any such tag is ignored.

        Raises TypeError if no base URL is given and the document has
        none either.
        """
        if base_url is None:
            base_url = self.base_url
            if base_url is None:
                raise TypeError(
                    "No base_url given, and the document has no base_url")
        if resolve_base_href:
            self.resolve_base_href()

        def link_repl(href):
            return urlparse.urljoin(base_url, href)
        # <base href> has been dealt with above if that was requested.
        # Passing False keeps rewrite_links() from resolving it anyway,
        # so that resolve_base_href=False really ignores the tag.
        self.rewrite_links(link_repl, resolve_base_href=False)

    def resolve_base_href(self):
        """
        Find any ``<base href>`` tag in the document, and apply its
        values to all links found in the document.  Also remove the
        tag once it has been applied.

        If several such tags exist they are all removed and the href of
        the last one is used.
        """
        base_href = None
        for base_tag in self.xpath('//base[@href]'):
            base_href = base_tag.get('href')
            base_tag.drop_tree()
        if not base_href:
            return
        self.make_links_absolute(base_href, resolve_base_href=False)

    def iterlinks(self):
        """
        Yield (element, attribute, link, pos), where attribute may be None
        (indicating the link is in the text).  ``pos`` is the position
        where the link occurs; often 0, but sometimes something else in
        the case of links in stylesheets or style tags.

        Note: ``<base href>`` is *not* taken into account in any way.  The
        link you get is exactly the link in the document.
        """
        link_attrs = defs.link_attrs
        for el in self.getiterator():
            for attrib in link_attrs:
                if attrib in el.attrib:
                    yield (el, attrib, el.attrib[attrib], 0)
            if el.tag == 'style' and el.text:
                for match in _css_url_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
                for match in _css_import_re.finditer(el.text):
                    yield (el, None, match.group(1), match.start(1))
            if 'style' in el.attrib:
                for match in _css_url_re.finditer(el.attrib['style']):
                    yield (el, 'style', match.group(1), match.start(1))

    def rewrite_links(self, link_repl_func, resolve_base_href=True,
                      base_href=None):
        """
        Rewrite all the links in the document.  For each link
        ``link_repl_func(link)`` will be called, and the return value
        will replace the old link.

        Note that links may not be absolute (unless you first called
        ``make_links_absolute()``), and may be internal (e.g.,
        ``'#anchor'``).  They can also be values like
        ``'mailto:email'`` or ``'javascript:expr'``.

        If you give ``base_href`` then all links passed to
        ``link_repl_func()`` will take that into account.

        If the ``link_repl_func`` returns None, the attribute or
        tag text will be removed completely; any further links that
        were found in the same attribute or text are then skipped.
        """
        if base_href is not None:
            # FIXME: this can be done in one pass with a wrapper
            # around link_repl_func
            self.make_links_absolute(base_href,
                                     resolve_base_href=resolve_base_href)
        elif resolve_base_href:
            self.resolve_base_href()
        # The positions reported by iterlinks() refer to the text as it
        # was before any rewriting.  Snapshot the links first, then track
        # how far earlier replacements have moved the rest of each text.
        shifts = {}    # (element, attrib) -> [(original pos, length change)]
        dropped = {}   # (element, attrib) -> True once its content is gone
        for el, attrib, link, pos in list(self.iterlinks()):
            target = (el, attrib)
            if target in dropped:
                continue
            new_link = link_repl_func(link)
            if new_link == link:
                continue
            if new_link is None:
                # Remove the attribute or element content
                if attrib is None:
                    el.text = ''
                else:
                    del el.attrib[attrib]
                dropped[target] = True
                continue
            # Translate the original position into the current text.
            start = pos
            for earlier_pos, delta in shifts.get(target, ()):
                if earlier_pos < pos:
                    start += delta
            if attrib is None:
                current = el.text
            else:
                current = el.attrib[attrib]
            if not start and len(current) == len(link):
                # Most common case: the link is the whole value
                new = new_link
            else:
                new = current[:start] + new_link + current[start+len(link):]
            if attrib is None:
                el.text = new
            else:
                el.attrib[attrib] = new
            shifts.setdefault(target, []).append(
                (pos, len(new_link) - len(link)))
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.

    ``name`` is the method to call, ``copy`` is the default for whether
    an element argument is deep-copied before the method runs, and
    ``source_class`` is the class the docstring is taken from.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        ``doc`` may be an HTML string, which is parsed with
        ``fromstring(doc, **kw)``, or an element.  For element input the
        keyword ``copy`` overrides the default given to the constructor;
        for string input ``copy`` raises TypeError.

        When the method returns None, the result is the serialized
        document for string input, or the (possibly copied) element for
        element input; otherwise the method's own return value.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow
            # the ``copy`` module for the whole function and break the
            # deepcopy() call below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result
# Module-level function forms of the HtmlMixin methods of the same name.
# Each accepts an element or an HTML string (see _MethodFunc.__call__).
# ``copy`` is the default for whether an element argument is deep-copied
# before the method runs; it is True for the three functions whose methods
# modify the tree.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class: ``etree.CommentBase`` plus the HtmlMixin helpers."""
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class: ``etree.ElementBase`` plus the HtmlMixin helpers."""
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction class: ``etree.PIBase`` plus the HtmlMixin helpers."""
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class: ``etree.EntityBase`` plus the HtmlMixin helpers."""
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes that are present in ``classes``.
    """
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # The documentation promises a mapping, while the loop below
            # needs (name, class) pairs -- accept both forms.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            # tag name -> list of mixin classes to put in front of it
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # Only tags that already have an entry in ``classes``
                    # receive the wildcard mixin.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                # New class with the mixins ahead of the original class
                # in the MRO, keeping the original class name.
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node.

        Elements are looked up by lower-cased tag name, falling back to
        HtmlElement; comments, processing instructions and entities map
        to their Html* classes.  Any other node type returns None.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
# Shared module-level HTML parser, used by document_fromstring(), parse()
# and Element().
html_parser = etree.HTMLParser()
def document_fromstring(html, **kw):
    """
    Parse ``html`` with the shared ``html_parser`` and return the result
    of ``etree.HTML``.

    Extra keyword arguments are passed through to ``etree.HTML``.
    Raises ``etree.ParserError`` when parsing yields None.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
def fragments_fromstring(html, no_leading_text=False, **kw):
"""
Parses several HTML elements, returning a list of elements.
The first item in the list may be a string (though leading
whitespace is removed). If no_leading_text is true, then it will
be an error if there is leading text, and it will always be a list
of only elements.
"""
# FIXME: check what happens when you give html with a body, head, etc.
# Only the first 20 characters are inspected, lower-cased and with leading
# whitespace removed.
start = html[:20].lstrip().lower()
# NOTE(review): the next two lines look truncated. The argument of
# startswith(...) runs straight into a '%s%s>' format string that uses
# ``create_parent``, a name this function never binds. Presumably markup-like
# text was stripped from this file, taking the rest of this function and the
# header of a separate single-element parser with it. Restore the missing
# text from the upstream source before changing any logic here -- TODO confirm.
if not start.startswith('%s%s>' % (
create_parent, html, create_parent), **kw)
# From here on the code insists on exactly one element: ParserError is raised
# for no elements, for more than one element, and for an element followed by
# non-whitespace tail text.
elements = fragments_fromstring(html, no_leading_text=True)
if not elements:
raise etree.ParserError(
"No elements found")
if len(elements) > 1:
raise etree.ParserError(
"Multiple elements found (%s)"
% ', '.join([_element_name(e) for e in elements]))
el = elements[0]
if el.tail and el.tail.strip():
raise etree.ParserError(
"Element followed by text: %r" % el.tail)
# Any remaining tail is whitespace only; drop it.
el.tail = None
return el
def fromstring(html, **kw):
"""
Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
"""
# Only the first 10 characters are inspected, lower-cased and with leading
# whitespace removed.
start = html[:10].lstrip().lower()
# NOTE(review): the next line looks truncated. The string literal is never
# closed, and ``doc``, ``bodies`` and ``body`` used below are never bound in
# the visible text. Presumably markup-like text was stripped from this file,
# taking the parsing step and the lookup of the body elements with it.
# Restore the missing text from the upstream source before changing any
# logic here -- TODO confirm.
if start.startswith(' 1:
# Somehow there are multiple bodies, which is bad, but just
# smash them into one body
for other_body in bodies[1:]:
if other_body.text:
# The extra body's leading text goes after the last child of the first
# body, or onto the first body's own text when it has no children.
if len(body):
body[-1].tail = (body[-1].tail or '') + other_body.text
else:
body.text = (body.text or '') + other_body.text
# Move the extra body's children into the first body.
body.extend(other_body)
# We'll ignore tail
# I guess we are ignoring attributes too
other_body.drop_tree()
else:
# NOTE(review): the ``if`` this ``else`` belongs to is not visible. If
# ``body`` can still be None when no head is found, len(body) further down
# would raise TypeError -- confirm against the upstream source.
body = None
heads = doc.findall('head')
if heads:
# Well, we have some sort of structure, so lets keep it all
head = heads[0]
if len(heads) > 1:
for other_head in heads[1:]:
head.extend(other_head)
# We don't care about text or tail in a head
other_head.drop_tree()
return doc
# A single child with nothing but whitespace before and after it is
# returned on its own.
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
# The body has just one element, so it was probably a single
# element passed in
return body[0]
# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body
def parse(filename, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    The shared ``html_parser`` is used.  You may pass the keyword
    argument ``base_url='http://...'`` to set the base URL; all keyword
    arguments are handed on to ``etree.parse``.
    """
    tree = etree.parse(filename, html_parser, **kw)
    return tree
def _contains_block_level_tag(el):
    """
    Return True if ``el`` or any node yielded by ``el.getiterator()`` has
    a tag listed in ``defs.block_tags``, otherwise False.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    found = False
    for node in el.getiterator():
        if node.tag in defs.block_tags:
            found = True
            break
    return found
def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    ``'comment'`` for comment nodes, ``'string'`` for strings, and the
    tag of anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
def Element(*args, **kw):
    """
    Create a new element through the shared ``html_parser``.

    All arguments are passed on to ``html_parser.makeelement``.
    """
    return html_parser.makeelement(*args, **kw)
class FormElement(HtmlElement):
"""
Represents a