import threading
import re
import urlparse
import copy
from lxml import etree
from lxml.html import defs
from lxml import cssselect
from lxml.html.setmixin import SetMixin
try:
    from UserDict import DictMixin
except ImportError:
    # DictMixin was introduced in Python 2.4
    # Fall back to the copy bundled under lxml.html when UserDict does not
    # provide it.
    from lxml.html._dictmixin import DictMixin
import sets

# Names exported by ``from <this module> import *``.
__all__ = ['document_fromstring', 'tostring', 'Element', 'defs',
           'find_rel_links', 'find_class', 'make_links_absolute',
           'resolve_base_href', 'iterlinks', 'rewrite_links',
           'open_in_browser']

# Precompiled XPath expressions and regular expressions used by this module.

# ``a`` elements carrying a ``rel`` attribute, searched from the context node
# downwards (the context node itself included).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Elements whose ``class`` attribute contains ``$class_name`` as a whole
# whitespace-separated token: the attribute is space-normalized and padded
# with spaces so that only complete class names match.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Elements (context node or descendants) whose ``id`` attribute equals ``$id``.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# XPath ``string()``: the string value of the context node.
_collect_string_content = etree.XPath("string()")
# Matches ``url(...)`` case-insensitively; group 1 is the (non-greedy) text
# between the parentheses.
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# Matches ``@import "..."``; group 1 is the text between the double quotes.
# Only the double-quoted form is recognized, and the match is case-sensitive.
_css_import_re = re.compile(r'@import "(.*?)"')
# ``label`` elements anywhere in the document whose ``for`` attribute
# equals ``$id``.
_label_xpath = etree.XPath("//label[@for=$id]")


class HtmlMixin(object):
    # Convenience read-only properties built on the element API
    # (``getroottree``, ``getiterator``, ``xpath``); presumably mixed into
    # lxml element classes -- confirm against the classes that inherit it.

    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        The value is read from ``self.getroottree().docinfo.URL``.

        Use with ``urlparse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        return self.getroottree().docinfo.URL
    # Expose as a read-only property, reusing the function's docstring.
    base_url = property(base_url, doc=base_url.__doc__)

    def forms(self):
        """
        Return a list of all the forms.

        The list holds every ``form`` element produced by
        ``self.getiterator('form')``.
        """
        return list(self.getiterator('form'))
    forms = property(forms, doc=forms.__doc__)

    def body(self):
        """
        Return the ``<body>`` element.  Can be called from a child
        element to get the document's body.

        The ``//body`` query is evaluated from the document root and the
        first match is returned; indexing raises ``IndexError`` when the
        query yields no match.
        """
        return self.xpath('//body')[0]
    body = property(body, doc=body.__doc__)

    def head(self):
        """
        Returns the ``<head>`` element.  Can be called from a child
        element to get the document's head.

        The ``//head`` query is evaluated from the document root and the
        first match is returned; indexing raises ``IndexError`` when the
        query yields no match.
        """
        return self.xpath('//head')[0]
    head = property(head, doc=head.__doc__)

    def label__get(self):
        """
        Get or set any