From scoder at codespeak.net Fri Sep 4 21:15:31 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:31 +0200 (CEST) Subject: [Lxml-checkins] r67502 - in lxml/trunk: . src/lxml Message-ID: <20090904191531.4CE38168011@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:29 2009 New Revision: 67502 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/extensions.pxi Log: r5225 at delle: sbehnel | 2009-08-22 15:46:24 +0200 fix smart string property types Modified: lxml/trunk/src/lxml/extensions.pxi ============================================================================== --- lxml/trunk/src/lxml/extensions.pxi (original) +++ lxml/trunk/src/lxml/extensions.pxi Fri Sep 4 21:15:29 2009 @@ -589,9 +589,9 @@ cdef class _ElementUnicodeResult(python.unicode): cdef _Element _parent - cdef readonly bint is_tail - cdef readonly bint is_text - cdef readonly bint is_attribute + cdef readonly object is_tail + cdef readonly object is_text + cdef readonly object is_attribute cdef readonly object attrname def getparent(self): From scoder at codespeak.net Fri Sep 4 21:15:35 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:35 +0200 (CEST) Subject: [Lxml-checkins] r67503 - in lxml/trunk: . 
src/lxml Message-ID: <20090904191535.3B41416800B@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:33 2009 New Revision: 67503 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.objectify.pyx Log: r5226 at delle: sbehnel | 2009-08-22 22:06:24 +0200 doctest fix Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Sep 4 21:15:33 2009 @@ -1260,7 +1260,7 @@ >>> html = M.html( M.body( M.p('hello', M.br, 'objectify') ) ) >>> from lxml.etree import tostring - >>> print(tostring(html, method='html')) + >>> print(tostring(html, method='html').decode('ASCII'))

<html><body><p>hello<br>objectify</p></body></html>

Note that this module has a predefined ElementMaker instance called ``E``. From scoder at codespeak.net Fri Sep 4 21:15:38 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:38 +0200 (CEST) Subject: [Lxml-checkins] r67504 - in lxml/trunk: . src/lxml/html Message-ID: <20090904191538.8D250168016@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:38 2009 New Revision: 67504 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/__init__.py Log: r5227 at delle: sbehnel | 2009-09-04 20:29:01 +0200 comments Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Fri Sep 4 21:15:38 2009 @@ -1471,11 +1471,26 @@ ################################################################################ class HTMLParser(etree.HTMLParser): + """An HTML parser that is configured to return lxml.html Element + objects. + """ def __init__(self, **kwargs): super(HTMLParser, self).__init__(**kwargs) self.set_element_class_lookup(HtmlElementClassLookup()) class XHTMLParser(etree.XMLParser): + """An XML parser that is configured to return lxml.html Element + objects. + + Note that this parser is not really XHTML aware unless you let it + load a DTD that declares the HTML entities. To do this, make sure + you have the XHTML DTDs installed in your catalogs, and create the + parser like this:: + + parser = XHTMLParser(load_dtd=True) + + For catalog support, see http://www.xmlsoft.org/catalog.html. + """ def __init__(self, **kwargs): super(XHTMLParser, self).__init__(**kwargs) self.set_element_class_lookup(HtmlElementClassLookup()) From scoder at codespeak.net Fri Sep 4 21:15:44 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:44 +0200 (CEST) Subject: [Lxml-checkins] r67505 - in lxml/trunk: . 
doc Message-ID: <20090904191544.06D43168014@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:43 2009 New Revision: 67505 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/performance.txt Log: r5228 at delle: sbehnel | 2009-09-04 20:36:48 +0200 link to parser benchmarks on xml.com Modified: lxml/trunk/doc/performance.txt ============================================================================== --- lxml/trunk/doc/performance.txt (original) +++ lxml/trunk/doc/performance.txt Fri Sep 4 21:15:43 2009 @@ -252,6 +252,21 @@ .. _`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ +Finally, `xml.com`_ has a couple of publications about XML parser +performance. Farwick and Hafner have written two interesting articles +that compare the parser of libxml2 to some major Java based XML +parsers. One deals with `event-driven parser performance`_, the other +one presents `benchmark results comparing DOM parsers`_. Both +comparisons suggest that libxml2's parser performance is largely +superiour to all commonly used Java parsers in almost all cases. Note +that the C parser benchmark results are based on xmlbench_, which uses +a simpler setup for libxml2 than lxml does. + +.. _`xml.com`: http://www.xml.com/ +.. _`event-driven parser performance`: http://www.xml.com/lpt/a/1702 +.. _`benchmark results comparing DOM parsers`: http://www.xml.com/lpt/a/1703 +.. _xmlbench: http://xmlbench.sourceforge.net/ + The ElementTree API =================== From scoder at codespeak.net Fri Sep 4 21:15:48 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:48 +0200 (CEST) Subject: [Lxml-checkins] r67506 - in lxml/trunk: . 
doc Message-ID: <20090904191548.16C6B168014@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:47 2009 New Revision: 67506 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/build.txt Log: r5229 at delle: sbehnel | 2009-09-04 20:37:50 +0200 show how to use the STATIC_DEPS env var with easy_install Modified: lxml/trunk/doc/build.txt ============================================================================== --- lxml/trunk/doc/build.txt (original) +++ lxml/trunk/doc/build.txt Fri Sep 4 21:15:47 2009 @@ -16,9 +16,10 @@ 2 Subversion 3 Setuptools 4 Running the tests and reporting errors - 5 Contributing an egg - 6 Static linking on Windows - 7 Building Debian packages from SVN sources + 5 Building an egg + 6 Building lxml on MacOS-X + 7 Static linking on Windows + 8 Building Debian packages from SVN sources Cython @@ -211,6 +212,18 @@ Instead of ``build``, you can use any target, like ``bdist_egg`` if you want to use setuptools to build an installable egg. +Note that this also works with EasyInstall_. Since you can't pass +command line options in this case, you have to use an environment +variable instead:: + + STATIC_DEPS=true easy_install lxml + +Some machines may require an additional run with "sudo" to install the +package into the Python package directory:: + + STATIC_DEPS=true sudo easy_install lxml + + Static linking on Windows ------------------------- From scoder at codespeak.net Fri Sep 4 21:15:51 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 4 Sep 2009 21:15:51 +0200 (CEST) Subject: [Lxml-checkins] r67507 - in lxml/trunk: . 
doc Message-ID: <20090904191551.8E6D5168014@codespeak.net> Author: scoder Date: Fri Sep 4 21:15:50 2009 New Revision: 67507 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/elementsoup.txt Log: r5230 at delle: sbehnel | 2009-09-04 21:11:10 +0200 show how to use BeautifulSoup to detect encodings Modified: lxml/trunk/doc/elementsoup.txt ============================================================================== --- lxml/trunk/doc/elementsoup.txt (original) +++ lxml/trunk/doc/elementsoup.txt Fri Sep 4 21:15:50 2009 @@ -23,6 +23,13 @@ document, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. +.. contents:: +.. + 1 Parsing with the soupparser + 2 Entity handling + 3 Using soupparser as a fallback + 4 Using only the encoding detection + Parsing with the soupparser =========================== @@ -168,3 +175,27 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) + + +Using only the encoding detection +================================= + +If you prefer a 'real' (and fast) HTML parser instead of the regular +expression based one in BeautifulSoup, you can still benefit from +BeautifulSoup's _`support for encoding detection` in the +``UnicodeDammit`` class. + +.. sourcecode:: pycon + + >>> from BeautifulSoup import UnicodeDammit + + >>> def decode_html(html_string): + ... converted = UnicodeDammit(html_string, isHTML=True) + ... if not converted.unicode: + ... raise UnicodeDecodeError( + ... "Failed to detect encoding, tried [%s]", + ... ', '.join(converted.triedEncodings)) + ... # print converted.originalEncoding + ... return converted.unicode + + >>> root = lxml.html.fromstring(decode_html(tag_soup)) From scoder at codespeak.net Sun Sep 6 14:54:31 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Sun, 6 Sep 2009 14:54:31 +0200 (CEST) Subject: [Lxml-checkins] r67542 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090906125431.0862316800D@codespeak.net> Author: scoder Date: Sun Sep 6 14:54:30 2009 New Revision: 67542 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/cssselect.py lxml/trunk/src/lxml/tests/test_css.txt Log: r5237 at delle: sbehnel | 2009-09-06 14:50:11 +0200 fix several unicode and character escape issues in lxml.cssselect Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Sun Sep 6 14:54:30 2009 @@ -19,6 +19,11 @@ Bugs fixed ---------- +* CSS special character escapes were not properly handled in + ``lxml.cssselect``. + +* CSS Unicode escapes were not properly decoded in ``lxml.cssselect``. + * Select options in HTML forms that had no explicit ``value`` attribute were not handled correctly. The HTML standard dictates that their value is defined by their text content. This is now Modified: lxml/trunk/src/lxml/cssselect.py ============================================================================== --- lxml/trunk/src/lxml/cssselect.py (original) +++ lxml/trunk/src/lxml/cssselect.py Sun Sep 6 14:54:30 2009 @@ -49,9 +49,11 @@ try: _unicode = unicode + _unichr = unichr except NameError: # Python 3 _unicode = str + _unichr = chr class _UniToken(_unicode): def __new__(cls, contents, pos): @@ -99,7 +101,7 @@ def xpath(self): sel_xpath = self.selector.xpath() sel_xpath.add_condition( - "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' ')) + "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_literal(' '+self.class_name+' ')) return sel_xpath class Function(object): @@ -194,7 +196,7 @@ if isinstance(expr, Element): expr = expr._format_element() xpath.add_condition('contains(css:lower-case(string(.)), %s)' - % xpath_repr(expr.lower())) + % xpath_literal(expr.lower())) # FIXME: Currently case insensitive matching doesn't 
seem to be happening return xpath @@ -349,34 +351,34 @@ path.add_condition(attrib) elif self.operator == '=': path.add_condition('%s = %s' % (attrib, - xpath_repr(value))) + xpath_literal(value))) elif self.operator == '!=': # FIXME: this seems like a weird hack... if value: path.add_condition('not(%s) or %s != %s' - % (attrib, attrib, xpath_repr(value))) + % (attrib, attrib, xpath_literal(value))) else: path.add_condition('%s != %s' - % (attrib, xpath_repr(value))) - #path.add_condition('%s != %s' % (attrib, xpath_repr(value))) + % (attrib, xpath_literal(value))) + #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) elif self.operator == '~=': - path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' '))) + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) elif self.operator == '|=': # Weird, but true... path.add_condition('%s = %s or starts-with(%s, %s)' % ( - attrib, xpath_repr(value), - attrib, xpath_repr(value + '-'))) + attrib, xpath_literal(value), + attrib, xpath_literal(value + '-'))) elif self.operator == '^=': path.add_condition('starts-with(%s, %s)' % ( - attrib, xpath_repr(value))) + attrib, xpath_literal(value))) elif self.operator == '$=': # Oddly there is a starts-with in XPath 1.0, but not ends-with path.add_condition('substring(%s, string-length(%s)-%s) = %s' - % (attrib, attrib, len(value)-1, xpath_repr(value))) + % (attrib, attrib, len(value)-1, xpath_literal(value))) elif self.operator == '*=': # FIXME: case sensitive? 
path.add_condition('contains(%s, %s)' % ( - attrib, xpath_repr(value))) + attrib, xpath_literal(value))) else: assert 0, ("Unknown operator: %r" % self.operator) return path @@ -425,7 +427,7 @@ def xpath(self): path = self.selector.xpath() - path.add_condition('@id = %s' % xpath_repr(self.id)) + path.add_condition('@id = %s' % xpath_literal(self.id)) return path class Or(object): @@ -501,9 +503,9 @@ ############################## ## XPathExpr objects: -_el_re = re.compile(r'^\w+\s*$') -_id_re = re.compile(r'^(\w*)#(\w+)\s*$') -_class_re = re.compile(r'^(\w*)\.(\w+)\s*$') +_el_re = re.compile(r'^\w+\s*$', re.UNICODE) +_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) def css_to_xpath(css_expr, prefix='descendant-or-self::'): if isinstance(css_expr, _basestring): @@ -524,7 +526,7 @@ "Got None for xpath expression from %s" % repr(css_expr)) if prefix: expr.add_prefix(prefix) - return str(expr) + return _unicode(expr) class XPathExpr(object): @@ -539,10 +541,10 @@ def __str__(self): path = '' if self.prefix is not None: - path += str(self.prefix) + path += _unicode(self.prefix) if self.path is not None: - path += str(self.path) - path += str(self.element) + path += _unicode(self.path) + path += _unicode(self.element) if self.condition: path += '[%s]' % self.condition return path @@ -574,7 +576,7 @@ if self.element == '*': # We weren't doing a test anyway return - self.add_condition("name() = %s" % xpath_repr(self.element)) + self.add_condition("name() = %s" % xpath_literal(self.element)) self.element = '*' def add_star_prefix(self): @@ -589,7 +591,7 @@ self.star_prefix = True def join(self, combiner, other): - prefix = str(self) + prefix = _unicode(self) prefix += combiner path = (other.prefix or '') + (other.path or '') # We don't need a star prefix if we are joining to this other @@ -615,16 +617,26 @@ def __str__(self): prefix = self.prefix or '' - return ' | '.join([prefix + str(i) for i in self.items]) + 
return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) -def xpath_repr(s): - # FIXME: I don't think this is right, but lacking any reasonable - # specification on what XPath literals look like (which doesn't seem - # to be in the XPath specification) it is hard to do 'right' +split_at_single_quotes = re.compile("('+)").split + +def xpath_literal(s): if isinstance(s, Element): # This is probably a symbol that looks like an expression... s = s._format_element() - return repr(str(s)) + else: + s = _unicode(s) + if "'" not in s: + s = "'%s'" % s + elif '"' not in s: + s = '"%s"' % s + else: + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) + return s ############################## ## Parsing functions @@ -814,9 +826,9 @@ ## Tokenizing ############################################################ -_whitespace_re = re.compile(r'\s+') +_whitespace_re = re.compile(r'\s+', re.UNICODE) -_comment_re = re.compile(r'/\*.*?\*/', re.S) +_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL) _count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?') @@ -861,6 +873,28 @@ yield Symbol(sym, old_pos) continue +split_at_string_escapes = re.compile(r'(\\(?:%s))' + % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', + '[^A-Fa-f0-9]'])).split + +def unescape_string_literal(literal): + substrings = [] + for substring in split_at_string_escapes(literal): + if not substring: + continue + elif '\\' in substring: + if substring[0] == '\\' and len(substring) > 1: + substring = substring[1:] + if substring[0] in '0123456789ABCDEFabcdef': + # int() correctly ignores the potentially trailing whitespace + substring = _unichr(int(substring, 16)) + else: + raise SelectorSyntaxError( + "Invalid escape sequence %r in string %r" + % (substring.split('\\')[1], literal)) + substrings.append(substring) + return ''.join(substrings) + def tokenize_escaped_string(s, pos): quote = s[pos] assert quote in ('"', "'") @@ -873,13 +907,13 @@ "Expected 
closing %s for string in: %r" % (quote, s[start:])) result = s[start:next] - try: - result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') - except UnicodeDecodeError: - # Probably a hanging \ + if result.endswith('\\'): + # next quote character is escaped pos = next+1 - else: - return result, next+1 + continue + if '\\' in result: + result = unescape_string_literal(result) + return result, next+1 _illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) Modified: lxml/trunk/src/lxml/tests/test_css.txt ============================================================================== --- lxml/trunk/src/lxml/tests/test_css.txt (original) +++ lxml/trunk/src/lxml/tests/test_css.txt Sun Sep 6 14:54:30 2009 @@ -123,7 +123,44 @@ ... NotImplementedError: *:only-of-type is not implemented -Then of parse_series: +Now a Unicode character test: + + >>> from lxml.cssselect import css_to_xpath + >>> import sys + >>> if sys.version_info[0] >= 3: + ... css_expr = '.a\xc1b' + ... else: + ... 
css_expr = '.a\xc1b'.decode('ISO-8859-1') + + >>> xpath_expr = css_to_xpath(css_expr) + >>> print( css_expr[1:] in xpath_expr ) + True + >>> print( xpath_expr.encode('ascii', 'xmlcharrefreplace').decode('ASCII') ) + descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' a&#193;b ')] + +And some special character tests: + + >>> print( css_to_xpath('*[aval="\'"]') ) + descendant-or-self::*[@aval = "'"] + >>> print( css_to_xpath('*[aval="\'\'\'"]') ) + descendant-or-self::*[@aval = "'''"] + >>> print( css_to_xpath('*[aval=\'"\']') ) + descendant-or-self::*[@aval = '"'] + >>> print( css_to_xpath('*[aval=\'"""\']') ) + descendant-or-self::*[@aval = '"""'] + +Some Unicode escape tests (including the trailing whitespace rules): + + >>> print( css_to_xpath(r'*[aval="\'\22\'"]') ) # \22 == '"' + descendant-or-self::*[@aval = concat("'",'"',"'")] + >>> print( css_to_xpath(r'*[aval="\'\22 2\'"]') ) + descendant-or-self::*[@aval = concat("'",'"2',"'")] + >>> print( css_to_xpath(r'*[aval="\'\20 \'"]') ) # \20 == ' ' + descendant-or-self::*[@aval = "' '"] + >>> print( css_to_xpath('*[aval="\'\\20\r\n \'"]') ) + descendant-or-self::*[@aval = "' '"] + +Then some test for parse_series: >>> from lxml.cssselect import parse_series >>> parse_series('1n+3') From scoder at codespeak.net Thu Sep 10 06:49:32 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Thu, 10 Sep 2009 06:49:32 +0200 (CEST) Subject: [Lxml-checkins] r67600 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090910044932.C5305168024@codespeak.net> Author: scoder Date: Thu Sep 10 06:49:30 2009 New Revision: 67600 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/cssselect.py lxml/trunk/src/lxml/tests/test_css.txt Log: r5239 at delle: sbehnel | 2009-09-10 06:45:11 +0200 cssselect: fix error reporting and infinite loop on syntax error Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Thu Sep 10 06:49:30 2009 @@ -19,6 +19,12 @@ Bugs fixed ---------- +* Syntax errors in ``lxml.cssselect`` could result in misleading error + messages. + +* Invalid syntax in CSS expressions could lead to an infinite loop in + the parser of ``lxml.cssselect``. + * CSS special character escapes were not properly handled in ``lxml.cssselect``. Modified: lxml/trunk/src/lxml/cssselect.py ============================================================================== --- lxml/trunk/src/lxml/cssselect.py (original) +++ lxml/trunk/src/lxml/cssselect.py Thu Sep 10 06:49:30 2009 @@ -649,8 +649,12 @@ except SelectorSyntaxError: import sys e = sys.exc_info()[1] - e.args = tuple(["%s at %s -> %s" % ( - e, stream.used, list(stream))]) + message = "%s at %s -> %r" % ( + e, stream.used, stream.peek()) + e.msg = message + if sys.version_info < (2,6): + e.message = message + e.args = tuple([message]) raise def parse_selector_group(stream): @@ -677,7 +681,11 @@ combinator = stream.next() else: combinator = ' ' + consumed = len(stream.used) next_selector = parse_simple_selector(stream) + if consumed == len(stream.used): + raise SelectorSyntaxError( + "Expected selector, got '%s'" % stream.peek()) result = CombinedSelector(result, combinator, next_selector) return result @@ -689,14 +697,14 @@ next = stream.next() if next != '*' and not isinstance(next, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % 
next) + "Expected symbol, got '%s'" % next) if stream.peek() == '|': namespace = next stream.next() element = stream.next() if element != '*' and not isinstance(next, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % next) + "Expected symbol, got '%s'" % next) else: namespace = '*' element = next @@ -723,14 +731,14 @@ next = stream.next() if not next == ']': raise SelectorSyntaxError( - "] expected, got %r" % next) + "] expected, got '%s'" % next) continue elif peek == ':' or peek == '::': type = stream.next() ident = stream.next() if not isinstance(ident, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % ident) + "Expected symbol, got '%s'" % ident) if stream.peek() == '(': stream.next() peek = stream.peek() @@ -744,7 +752,7 @@ next = stream.next() if not next == ')': raise SelectorSyntaxError( - "Expected ), got %r and %r" + "Expected ')', got '%s' and '%s'" % (next, selector)) result = Function(result, type, ident, selector) else: @@ -778,11 +786,11 @@ op = stream.next() if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): raise SelectorSyntaxError( - "Operator expected, got %r" % op) + "Operator expected, got '%s'" % op) value = stream.next() if not isinstance(value, (Symbol, String)): raise SelectorSyntaxError( - "Expected string or symbol, got %r" % value) + "Expected string or symbol, got '%s'" % value) return Attrib(selector, namespace, attrib, op, value) def parse_series(s): Modified: lxml/trunk/src/lxml/tests/test_css.txt ============================================================================== --- lxml/trunk/src/lxml/tests/test_css.txt (original) +++ lxml/trunk/src/lxml/tests/test_css.txt Thu Sep 10 06:49:30 2009 @@ -49,6 +49,14 @@ >>> parse('td ~ th') CombinedSelector[Element[td] ~ Element[th]] +Some parse error tests: + + >>> try: parse('attributes(href)/html/body/a') + ... except: # Py2, Py3, ... + ... import sys + ... 
print(str(sys.exc_info()[1]).replace("(u'", "('")) + Expected selector, got '(' at [Symbol('attributes', 0)] -> Token('(', 10) + Now of translation: >>> def xpath(css): From scoder at codespeak.net Fri Sep 11 12:29:47 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Sep 2009 12:29:47 +0200 (CEST) Subject: [Lxml-checkins] r67652 - in lxml/trunk: . src/lxml Message-ID: <20090911102947.32CB0168024@codespeak.net> Author: scoder Date: Fri Sep 11 12:29:45 2009 New Revision: 67652 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/cleanup.pxi lxml/trunk/src/lxml/lxml.objectify.pyx Log: r5241 at delle: sbehnel | 2009-09-11 10:55:09 +0200 doc comments Modified: lxml/trunk/src/lxml/cleanup.pxi ============================================================================== --- lxml/trunk/src/lxml/cleanup.pxi (original) +++ lxml/trunk/src/lxml/cleanup.pxi Fri Sep 11 12:29:45 2009 @@ -4,7 +4,7 @@ u"""cleanup_namespaces(tree_or_element) Remove all namespace declarations from a subtree that are not used - by any of the elements in that tree. + by any of the elements or attributes in that tree. """ cdef _Element element element = _rootNodeOrRaise(tree_or_element) Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Sep 11 12:29:45 2009 @@ -1722,8 +1722,7 @@ tree.xmlSetNsProp(c_node, c_ns, "nil", "true") tree.END_FOR_EACH_ELEMENT_FROM(c_node) -cdef object _strip_attributes -_strip_attributes = etree.strip_attributes +cdef object _strip_attributes = etree.strip_attributes def deannotate(element_or_tree, *, pytype=True, xsi=True, xsi_nil=False): u"""deannotate(element_or_tree, pytype=True, xsi=True, xsi_nil=False) @@ -1736,6 +1735,10 @@ default), 'xsi:type' attributes will be removed. 
If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil' attributes will be removed. + + Note that this does not touch the namespace declarations. If you + want to remove unused namespace declarations from the tree, use + ``lxml.etree.cleanup_namespaces()``. """ cdef list attribute_names = [] From scoder at codespeak.net Fri Sep 11 12:29:50 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Fri, 11 Sep 2009 12:29:50 +0200 (CEST) Subject: [Lxml-checkins] r67653 - in lxml/trunk: . doc Message-ID: <20090911102950.00682168025@codespeak.net> Author: scoder Date: Fri Sep 11 12:29:49 2009 New Revision: 67653 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/objectify.txt Log: r5242 at delle: sbehnel | 2009-09-11 12:25:24 +0200 objectify doc fixes Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Fri Sep 11 12:29:49 2009 @@ -361,63 +361,74 @@ Namespace handling ------------------ -Namespaces are handled mostly behind the scenes. If you access a child of an -Element without specifying a namespace, the lookup will use the namespace of -the parent: +During tag lookups, namespaces are handled mostly behind the scenes. +If you access a child of an Element without specifying a namespace, +the lookup will use the namespace of the parent: .. sourcecode:: pycon - >>> root = objectify.Element("{ns}root") - >>> b = etree.SubElement(root, "{ns}b") - >>> c = etree.SubElement(root, "{other}c") + >>> root = objectify.Element("{http://ns/}root") + >>> b = etree.SubElement(root, "{http://ns/}b") + >>> c = etree.SubElement(root, "{http://other/}c") >>> print(root.b.tag) - {ns}b - >>> print(root.c) - Traceback (most recent call last): - ... 
- AttributeError: no such child: {ns}c + {http://ns/}b -You can access elements with different namespaces via ``getattr()``: +Note that the ``SubElement()`` factory of ``lxml.etree`` does not +inherit any namespaces when creating a new subelement. Element +creation must be explicit about the namespace, and is simplified +through the E-factory as described above. Lookups, however, inherit +namespaces implicitly. + +To access an element in a different namespace than its parent, you can +use ``getattr()``: .. sourcecode:: pycon - >>> print(getattr(root, "{other}c").tag) - {other}c + >>> print (root.tag) + {http://ns/}root + + >>> print(root.c) + Traceback (most recent call last): + ... + AttributeError: no such child: {http://ns/}c + + >>> print(getattr(root, "{http://other/}c").tag) + {http://other/}c For convenience, there is also a quick way through item access: .. sourcecode:: pycon - >>> print(root["{other}c"].tag) - {other}c + >>> print(root["{http://other/}c"].tag) + {http://other/}c The same approach must be used to access children with tag names that are not valid Python identifiers: .. 
sourcecode:: pycon - >>> el = etree.SubElement(root, "{ns}tag-name") + >>> el = etree.SubElement(root, "{http://ns/}tag-name") >>> print(root["tag-name"].tag) - {ns}tag-name + {http://ns/}tag-name - >>> new_el = objectify.Element("{ns}new-element") - >>> el = etree.SubElement(new_el, "{ns}child") - >>> el = etree.SubElement(new_el, "{ns}child") - >>> el = etree.SubElement(new_el, "{ns}child") + >>> new_el = objectify.Element("{http://ns/}new-element") + >>> el = etree.SubElement(new_el, "{http://ns/}child") + >>> el = etree.SubElement(new_el, "{http://ns/}child") + >>> el = etree.SubElement(new_el, "{http://ns/}child") >>> root["tag-name"] = [ new_el, new_el ] >>> print(len(root["tag-name"])) 2 >>> print(root["tag-name"].tag) - {ns}tag-name + {http://ns/}tag-name >>> print(len(root["tag-name"].child)) 3 >>> print(root["tag-name"].child.tag) - {ns}child + {http://ns/}child >>> print(root["tag-name"][1].child.tag) - {ns}child + {http://ns/}child or for names that have a special meaning in lxml.objectify: @@ -505,11 +516,11 @@ .. 
sourcecode:: pycon - >>> root = objectify.Element("{ns}root") - >>> b1 = etree.SubElement(root, "{ns}b") - >>> c = etree.SubElement(b1, "{ns}c") - >>> b2 = etree.SubElement(root, "{ns}b") - >>> d = etree.SubElement(root, "{other}d") + >>> root = objectify.Element("{http://ns/}root") + >>> b1 = etree.SubElement(root, "{http://ns/}b") + >>> c = etree.SubElement(b1, "{http://ns/}c") + >>> b2 = etree.SubElement(root, "{http://ns/}b") + >>> d = etree.SubElement(root, "{http://other/}d") >>> path = objectify.ObjectPath("root.b.c") >>> print(path) @@ -517,15 +528,15 @@ >>> path.hasattr(root) True >>> print(path.find(root).tag) - {ns}c + {http://ns/}c >>> find = objectify.ObjectPath("root.b.c") >>> print(find(root).tag) - {ns}c + {http://ns/}c - >>> find = objectify.ObjectPath("root.{other}d") + >>> find = objectify.ObjectPath("root.{http://other/}d") >>> print(find(root).tag) - {other}d + {http://other/}d >>> find = objectify.ObjectPath("root.{not}there") >>> print(find(root).tag) @@ -537,15 +548,15 @@ >>> print(find(root).tag) Traceback (most recent call last): ... - ValueError: root element does not match: need {not}there, got {ns}root + ValueError: root element does not match: need {not}there, got {http://ns/}root >>> find = objectify.ObjectPath("root.b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b - >>> find = objectify.ObjectPath("root.{ns}b[1]") + >>> find = objectify.ObjectPath("root.{http://ns/}b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b Apart from strings, ObjectPath also accepts lists of path segments: @@ -553,11 +564,11 @@ >>> find = objectify.ObjectPath(['root', 'b', 'c']) >>> print(find(root).tag) - {ns}c + {http://ns/}c - >>> find = objectify.ObjectPath(['root', '{ns}b[1]']) + >>> find = objectify.ObjectPath(['root', '{http://ns/}b[1]']) >>> print(find(root).tag) - {ns}b + {http://ns/}b You can also use relative paths starting with a '.' 
to ignore the actual root element and only inherit its namespace: @@ -566,23 +577,23 @@ >>> find = objectify.ObjectPath(".b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b >>> find = objectify.ObjectPath(['', 'b[1]']) >>> print(find(root).tag) - {ns}b + {http://ns/}b >>> find = objectify.ObjectPath(".unknown[1]") >>> print(find(root).tag) Traceback (most recent call last): ... - AttributeError: no such child: {ns}unknown + AttributeError: no such child: {http://ns/}unknown - >>> find = objectify.ObjectPath(".{other}unknown[1]") + >>> find = objectify.ObjectPath(".{http://other/}unknown[1]") >>> print(find(root).tag) Traceback (most recent call last): ... - AttributeError: no such child: {other}unknown + AttributeError: no such child: {http://other/}unknown For convenience, a single dot represents the empty ObjectPath (identity): @@ -590,28 +601,28 @@ >>> find = objectify.ObjectPath(".") >>> print(find(root).tag) - {ns}root + {http://ns/}root ObjectPath objects can be used to manipulate trees: .. sourcecode:: pycon - >>> root = objectify.Element("{ns}root") + >>> root = objectify.Element("{http://ns/}root") - >>> path = objectify.ObjectPath(".some.child.{other}unknown") + >>> path = objectify.ObjectPath(".some.child.{http://other/}unknown") >>> path.hasattr(root) False >>> path.find(root) Traceback (most recent call last): ... - AttributeError: no such child: {ns}some + AttributeError: no such child: {http://ns/}some >>> path.setattr(root, "my value") # creates children as necessary >>> path.hasattr(root) True >>> print(path.find(root).text) my value - >>> print(root.some.child["{other}unknown"].text) + >>> print(root.some.child["{http://other/}unknown"].text) my value >>> print(len( path.find(root) )) From jholg at codespeak.net Mon Sep 28 17:31:33 2009 From: jholg at codespeak.net (jholg at codespeak.net) Date: Mon, 28 Sep 2009 17:31:33 +0200 (CEST) Subject: [Lxml-checkins] r67943 - in lxml/trunk: . 
src/lxml src/lxml/tests Message-ID: <20090928153133.789B7168003@codespeak.net> Author: jholg Date: Mon Sep 28 17:31:28 2009 New Revision: 67943 Modified: lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/lxml.objectify.pyx lxml/trunk/src/lxml/tests/test_objectify.py Log: Fixed ObjectifiedElement.__setattr__ to not raise a ValueError but still create an empty-string child element for non-ascii/non-unicode values. Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Mon Sep 28 17:31:28 2009 @@ -51,6 +51,9 @@ * Diverting the error logging to Python's logging system was broken. +* ObjectifiedElement.__setattr__ created an empty-string child element when the + attribute value was rejected as a non-unicode/non-ascii string + Other changes ------------- Modified: lxml/trunk/src/lxml/lxml.objectify.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.objectify.pyx (original) +++ lxml/trunk/src/lxml/lxml.objectify.pyx Mon Sep 28 17:31:28 2009 @@ -523,9 +523,10 @@ for item in value: _appendValue(parent, tag, item) else: - new_element = cetree.makeSubElement( - parent, tag, None, None, None, None) + new_element = cetree.makeElement( + tag, parent._doc, None, None, None, None, None) _setElementValue(new_element, value) + cetree.appendChild(parent, new_element) cdef _setElementValue(_Element element, value): cdef python.PyObject* _pytype Modified: lxml/trunk/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_objectify.py (original) +++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Sep 28 17:31:28 2009 @@ -346,6 +346,25 @@ self.assertRaises(AttributeError, getattr, root.c1, "NOT_THERE") self.assertRaises(AttributeError, getattr, root.c1, "{unknownNS}c2") + def test_setattr(self): + for val in [ + 2, 2**32, 
1.2, "Won't get fooled again", + _str("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1'), True, + False, None]: + root = self.Element('root') + attrname = 'val' + setattr(root, attrname, val) + result = getattr(root, attrname) + self.assertEquals(val, result) + self.assertEquals(type(val), type(result.pyval)) + + def test_setattr_nonunicode(self): + root = self.Element('root') + attrname = 'val' + val = _bytes("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1') + self.assertRaises(ValueError, setattr, root, attrname, val) + self.assertRaises(AttributeError, getattr, root, attrname) + def test_addattr(self): root = self.XML(xml_str) self.assertEquals(1, len(root.c1)) From scoder at codespeak.net Tue Sep 29 22:16:19 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 29 Sep 2009 22:16:19 +0200 (CEST) Subject: [Lxml-checkins] r68006 - in lxml/trunk: . doc Message-ID: <20090929201619.CBC81168010@codespeak.net> Author: scoder Date: Tue Sep 29 22:16:19 2009 New Revision: 68006 Modified: lxml/trunk/ (props changed) lxml/trunk/doc/objectify.txt Log: r5245 at delle: sbehnel | 2009-09-11 15:37:45 +0200 objectify docs Modified: lxml/trunk/doc/objectify.txt ============================================================================== --- lxml/trunk/doc/objectify.txt (original) +++ lxml/trunk/doc/objectify.txt Tue Sep 29 22:16:19 2009 @@ -377,30 +377,35 @@ Note that the ``SubElement()`` factory of ``lxml.etree`` does not inherit any namespaces when creating a new subelement. Element creation must be explicit about the namespace, and is simplified -through the E-factory as described above. Lookups, however, inherit -namespaces implicitly. +through the E-factory as described above. -To access an element in a different namespace than its parent, you can -use ``getattr()``: +Lookups, however, inherit namespaces implicitly: .. 
sourcecode:: pycon - >>> print (root.tag) - {http://ns/}root + >>> print(root.b.tag) + {http://ns/}b >>> print(root.c) Traceback (most recent call last): ... AttributeError: no such child: {http://ns/}c - >>> print(getattr(root, "{http://other/}c").tag) +To access an element in a different namespace than its parent, you can +use ``getattr()``: + +.. sourcecode:: pycon + + >>> c = getattr(root, "{http://other/}c") + >>> print(c.tag) {http://other/}c For convenience, there is also a quick way through item access: .. sourcecode:: pycon - >>> print(root["{http://other/}c"].tag) + >>> c = root["{http://other/}c"] + >>> print(c.tag) {http://other/}c The same approach must be used to access children with tag names that are not From scoder at codespeak.net Tue Sep 29 22:16:35 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 29 Sep 2009 22:16:35 +0200 (CEST) Subject: [Lxml-checkins] r68007 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090929201635.63B7516800D@codespeak.net> Author: scoder Date: Tue Sep 29 22:16:35 2009 New Revision: 68007 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/etree_defs.h lxml/trunk/src/lxml/tests/test_etree.py Log: r5248 at delle: sbehnel | 2009-09-29 22:16:07 +0200 fix tree traversal for parsed entity references Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 29 22:16:35 2009 @@ -19,6 +19,9 @@ Bugs fixed ---------- +* Modifying trees that contain parsed entity references could result + in an infinite loop. + * Syntax errors in ``lxml.cssselect`` could result in misleading error messages. 
Modified: lxml/trunk/src/lxml/etree_defs.h ============================================================================== --- lxml/trunk/src/lxml/etree_defs.h (original) +++ lxml/trunk/src/lxml/etree_defs.h Tue Sep 29 22:16:35 2009 @@ -221,8 +221,14 @@ #define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \ { \ /* walk through children first */ \ - xmlNode* _lx__next = c_node->children; \ - _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + xmlNode* _lx__next = c_node->children; \ + if (_lx__next != 0) { \ + if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \ + _lx__next = 0; \ + } else { \ + _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + } \ + } \ if ((_lx__next == 0) && (c_node != c_stop_node)) { \ /* try siblings */ \ _lx__next = c_node->next; \ Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue Sep 29 22:16:35 2009 @@ -1049,7 +1049,7 @@ parser = self.etree.XMLParser(resolve_entities=False) Entity = self.etree.Entity - xml = '&myentity;' + xml = _bytes('&myentity;') tree = parse(BytesIO(xml), parser) root = tree.getroot() self.assertEquals(root[0].tag, Entity) @@ -1060,6 +1060,25 @@ self.assertEquals(_bytes('&myentity;'), tostring(root)) + def test_entity_restructure(self): + xml = _bytes(''' ]> + + + +   + ''') + + parser = self.etree.XMLParser(resolve_entities=False) + root = etree.fromstring(xml, parser) + self.assertEquals([ el.tag for el in root ], + ['child1', 'child2', 'child3']) + + root[0] = root[-1] + self.assertEquals([ el.tag for el in root ], + ['child3', 'child2']) + self.assertEquals(root[0][0].text, ' ') + self.assertEquals(root[0][0].name, 'nbsp') + def test_entity_append(self): Entity = self.etree.Entity Element = self.etree.Element From scoder at codespeak.net Tue Sep 29 22:16:52 2009 From: scoder at codespeak.net (scoder at 
codespeak.net) Date: Tue, 29 Sep 2009 22:16:52 +0200 (CEST) Subject: [Lxml-checkins] r68008 - lxml/trunk Message-ID: <20090929201652.A3954168008@codespeak.net> Author: scoder Date: Tue Sep 29 22:16:52 2009 New Revision: 68008 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt Log: r5251 at delle: sbehnel | 2009-09-29 22:16:44 +0200 cleanup Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 29 22:16:52 2009 @@ -22,6 +22,9 @@ * Modifying trees that contain parsed entity references could result in an infinite loop. +* ObjectifiedElement.__setattr__ created an empty-string child element when the + attribute value was rejected as a non-unicode/non-ascii string + * Syntax errors in ``lxml.cssselect`` could result in misleading error messages. @@ -54,9 +57,6 @@ * Diverting the error logging to Python's logging system was broken. -* ObjectifiedElement.__setattr__ created an empty-string child element when the - attribute value was rejected as a non-unicode/non-ascii string - Other changes ------------- From scoder at codespeak.net Tue Sep 29 22:51:59 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 29 Sep 2009 22:51:59 +0200 (CEST) Subject: [Lxml-checkins] r68009 - in lxml/branch/lxml-2.2: . 
doc src/lxml src/lxml/html src/lxml/html/tests src/lxml/tests Message-ID: <20090929205159.616B9168003@codespeak.net> Author: scoder Date: Tue Sep 29 22:51:58 2009 New Revision: 68009 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/doc/build.txt lxml/branch/lxml-2.2/doc/elementsoup.txt lxml/branch/lxml-2.2/doc/objectify.txt lxml/branch/lxml-2.2/doc/performance.txt lxml/branch/lxml-2.2/doc/xpathxslt.txt lxml/branch/lxml-2.2/setup.py lxml/branch/lxml-2.2/setupinfo.py lxml/branch/lxml-2.2/src/lxml/cleanup.pxi lxml/branch/lxml-2.2/src/lxml/cssselect.py lxml/branch/lxml-2.2/src/lxml/etree_defs.h lxml/branch/lxml-2.2/src/lxml/extensions.pxi lxml/branch/lxml-2.2/src/lxml/html/__init__.py lxml/branch/lxml-2.2/src/lxml/html/tests/test_forms.txt lxml/branch/lxml-2.2/src/lxml/lxml.objectify.pyx lxml/branch/lxml-2.2/src/lxml/parser.pxi lxml/branch/lxml-2.2/src/lxml/tests/test_css.txt lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py lxml/branch/lxml-2.2/version.txt Log: trunk merge of all recent bug fixes Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Tue Sep 29 22:51:58 2009 @@ -2,7 +2,7 @@ lxml changelog ============== -Under development +2.2.2 (2009-10-??) ================== Features added @@ -11,6 +11,38 @@ Bugs fixed ---------- +* Modifying trees that contain parsed entity references could result + in an infinite loop. + +* ObjectifiedElement.__setattr__ created an empty-string child element when the + attribute value was rejected as a non-unicode/non-ascii string + +* Syntax errors in ``lxml.cssselect`` could result in misleading error + messages. + +* Invalid syntax in CSS expressions could lead to an infinite loop in + the parser of ``lxml.cssselect``. 
+ +* CSS special character escapes were not properly handled in + ``lxml.cssselect``. + +* CSS Unicode escapes were not properly decoded in ``lxml.cssselect``. + +* Select options in HTML forms that had no explicit ``value`` + attribute were not handled correctly. The HTML standard dictates + that their value is defined by their text content. This is now + supported by lxml.html. + +* XPath raised a TypeError when finding CDATA sections. This is now + fully supported. + +* Calling ``help(lxml.objectify)`` didn't work at the prompt. + +* The ``ElementMaker`` in lxml.objectify no longer defines the default + namespaces when annotation is disabled. + +* Feed parser failed to honour the 'recover' option on parse errors. + * Diverting the error logging to Python's logging system was broken. Other changes Modified: lxml/branch/lxml-2.2/doc/build.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/build.txt (original) +++ lxml/branch/lxml-2.2/doc/build.txt Tue Sep 29 22:51:58 2009 @@ -16,9 +16,10 @@ 2 Subversion 3 Setuptools 4 Running the tests and reporting errors - 5 Contributing an egg - 6 Static linking on Windows - 7 Building Debian packages from SVN sources + 5 Building an egg + 6 Building lxml on MacOS-X + 7 Static linking on Windows + 8 Building Debian packages from SVN sources Cython @@ -211,6 +212,18 @@ Instead of ``build``, you can use any target, like ``bdist_egg`` if you want to use setuptools to build an installable egg. +Note that this also works with EasyInstall_. 
Since you can't pass +command line options in this case, you have to use an environment +variable instead:: + + STATIC_DEPS=true easy_install lxml + +Some machines may require an additional run with "sudo" to install the +package into the Python package directory:: + + STATIC_DEPS=true sudo easy_install lxml + + Static linking on Windows ------------------------- Modified: lxml/branch/lxml-2.2/doc/elementsoup.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/elementsoup.txt (original) +++ lxml/branch/lxml-2.2/doc/elementsoup.txt Tue Sep 29 22:51:58 2009 @@ -23,6 +23,13 @@ document, and ``convert_tree()`` to convert an existing BeautifulSoup tree into a list of top-level Elements. +.. contents:: +.. + 1 Parsing with the soupparser + 2 Entity handling + 3 Using soupparser as a fallback + 4 Using only the encoding detection + Parsing with the soupparser =========================== @@ -70,6 +77,25 @@ ``makeelement`` factory function to ``parse()`` and ``fromstring()``. By default, this is based on the HTML parser defined in ``lxml.html``. +For a quick comparison, libxml2 2.6.32 parses the same tag soup as +follows. The main difference is that libxml2 tries harder to adhere +to the structure of an HTML document and moves misplaced tags where +they (likely) belong. Note, however, that the result can vary between +parser versions. + +.. sourcecode:: html + + + + + Hello + + +

Hi all

+

+ + + Entity handling =============== @@ -149,3 +175,27 @@ ... ignore = tostring(root, encoding=unicode) ... except UnicodeDecodeError: ... root = lxml.html.soupparser.fromstring(tag_soup) + + +Using only the encoding detection +================================= + +If you prefer a 'real' (and fast) HTML parser instead of the regular +expression based one in BeautifulSoup, you can still benefit from +BeautifulSoup's _`support for encoding detection` in the +``UnicodeDammit`` class. + +.. sourcecode:: pycon + + >>> from BeautifulSoup import UnicodeDammit + + >>> def decode_html(html_string): + ... converted = UnicodeDammit(html_string, isHTML=True) + ... if not converted.unicode: + ... raise UnicodeDecodeError( + ... "Failed to detect encoding, tried [%s]", + ... ', '.join(converted.triedEncodings)) + ... # print converted.originalEncoding + ... return converted.unicode + + >>> root = lxml.html.fromstring(decode_html(tag_soup)) Modified: lxml/branch/lxml-2.2/doc/objectify.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/objectify.txt (original) +++ lxml/branch/lxml-2.2/doc/objectify.txt Tue Sep 29 22:51:58 2009 @@ -361,63 +361,79 @@ Namespace handling ------------------ -Namespaces are handled mostly behind the scenes. If you access a child of an -Element without specifying a namespace, the lookup will use the namespace of -the parent: +During tag lookups, namespaces are handled mostly behind the scenes. +If you access a child of an Element without specifying a namespace, +the lookup will use the namespace of the parent: .. 
sourcecode:: pycon - >>> root = objectify.Element("{ns}root") - >>> b = etree.SubElement(root, "{ns}b") - >>> c = etree.SubElement(root, "{other}c") + >>> root = objectify.Element("{http://ns/}root") + >>> b = etree.SubElement(root, "{http://ns/}b") + >>> c = etree.SubElement(root, "{http://other/}c") >>> print(root.b.tag) - {ns}b + {http://ns/}b + +Note that the ``SubElement()`` factory of ``lxml.etree`` does not +inherit any namespaces when creating a new subelement. Element +creation must be explicit about the namespace, and is simplified +through the E-factory as described above. + +Lookups, however, inherit namespaces implicitly: + +.. sourcecode:: pycon + + >>> print(root.b.tag) + {http://ns/}b + >>> print(root.c) Traceback (most recent call last): ... - AttributeError: no such child: {ns}c + AttributeError: no such child: {http://ns/}c -You can access elements with different namespaces via ``getattr()``: +To access an element in a different namespace than its parent, you can +use ``getattr()``: .. sourcecode:: pycon - >>> print(getattr(root, "{other}c").tag) - {other}c + >>> c = getattr(root, "{http://other/}c") + >>> print(c.tag) + {http://other/}c For convenience, there is also a quick way through item access: .. sourcecode:: pycon - >>> print(root["{other}c"].tag) - {other}c + >>> c = root["{http://other/}c"] + >>> print(c.tag) + {http://other/}c The same approach must be used to access children with tag names that are not valid Python identifiers: .. 
sourcecode:: pycon - >>> el = etree.SubElement(root, "{ns}tag-name") + >>> el = etree.SubElement(root, "{http://ns/}tag-name") >>> print(root["tag-name"].tag) - {ns}tag-name + {http://ns/}tag-name - >>> new_el = objectify.Element("{ns}new-element") - >>> el = etree.SubElement(new_el, "{ns}child") - >>> el = etree.SubElement(new_el, "{ns}child") - >>> el = etree.SubElement(new_el, "{ns}child") + >>> new_el = objectify.Element("{http://ns/}new-element") + >>> el = etree.SubElement(new_el, "{http://ns/}child") + >>> el = etree.SubElement(new_el, "{http://ns/}child") + >>> el = etree.SubElement(new_el, "{http://ns/}child") >>> root["tag-name"] = [ new_el, new_el ] >>> print(len(root["tag-name"])) 2 >>> print(root["tag-name"].tag) - {ns}tag-name + {http://ns/}tag-name >>> print(len(root["tag-name"].child)) 3 >>> print(root["tag-name"].child.tag) - {ns}child + {http://ns/}child >>> print(root["tag-name"][1].child.tag) - {ns}child + {http://ns/}child or for names that have a special meaning in lxml.objectify: @@ -505,11 +521,11 @@ .. 
sourcecode:: pycon - >>> root = objectify.Element("{ns}root") - >>> b1 = etree.SubElement(root, "{ns}b") - >>> c = etree.SubElement(b1, "{ns}c") - >>> b2 = etree.SubElement(root, "{ns}b") - >>> d = etree.SubElement(root, "{other}d") + >>> root = objectify.Element("{http://ns/}root") + >>> b1 = etree.SubElement(root, "{http://ns/}b") + >>> c = etree.SubElement(b1, "{http://ns/}c") + >>> b2 = etree.SubElement(root, "{http://ns/}b") + >>> d = etree.SubElement(root, "{http://other/}d") >>> path = objectify.ObjectPath("root.b.c") >>> print(path) @@ -517,15 +533,15 @@ >>> path.hasattr(root) True >>> print(path.find(root).tag) - {ns}c + {http://ns/}c >>> find = objectify.ObjectPath("root.b.c") >>> print(find(root).tag) - {ns}c + {http://ns/}c - >>> find = objectify.ObjectPath("root.{other}d") + >>> find = objectify.ObjectPath("root.{http://other/}d") >>> print(find(root).tag) - {other}d + {http://other/}d >>> find = objectify.ObjectPath("root.{not}there") >>> print(find(root).tag) @@ -537,15 +553,15 @@ >>> print(find(root).tag) Traceback (most recent call last): ... - ValueError: root element does not match: need {not}there, got {ns}root + ValueError: root element does not match: need {not}there, got {http://ns/}root >>> find = objectify.ObjectPath("root.b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b - >>> find = objectify.ObjectPath("root.{ns}b[1]") + >>> find = objectify.ObjectPath("root.{http://ns/}b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b Apart from strings, ObjectPath also accepts lists of path segments: @@ -553,11 +569,11 @@ >>> find = objectify.ObjectPath(['root', 'b', 'c']) >>> print(find(root).tag) - {ns}c + {http://ns/}c - >>> find = objectify.ObjectPath(['root', '{ns}b[1]']) + >>> find = objectify.ObjectPath(['root', '{http://ns/}b[1]']) >>> print(find(root).tag) - {ns}b + {http://ns/}b You can also use relative paths starting with a '.' 
to ignore the actual root element and only inherit its namespace: @@ -566,23 +582,23 @@ >>> find = objectify.ObjectPath(".b[1]") >>> print(find(root).tag) - {ns}b + {http://ns/}b >>> find = objectify.ObjectPath(['', 'b[1]']) >>> print(find(root).tag) - {ns}b + {http://ns/}b >>> find = objectify.ObjectPath(".unknown[1]") >>> print(find(root).tag) Traceback (most recent call last): ... - AttributeError: no such child: {ns}unknown + AttributeError: no such child: {http://ns/}unknown - >>> find = objectify.ObjectPath(".{other}unknown[1]") + >>> find = objectify.ObjectPath(".{http://other/}unknown[1]") >>> print(find(root).tag) Traceback (most recent call last): ... - AttributeError: no such child: {other}unknown + AttributeError: no such child: {http://other/}unknown For convenience, a single dot represents the empty ObjectPath (identity): @@ -590,28 +606,28 @@ >>> find = objectify.ObjectPath(".") >>> print(find(root).tag) - {ns}root + {http://ns/}root ObjectPath objects can be used to manipulate trees: .. sourcecode:: pycon - >>> root = objectify.Element("{ns}root") + >>> root = objectify.Element("{http://ns/}root") - >>> path = objectify.ObjectPath(".some.child.{other}unknown") + >>> path = objectify.ObjectPath(".some.child.{http://other/}unknown") >>> path.hasattr(root) False >>> path.find(root) Traceback (most recent call last): ... 
- AttributeError: no such child: {ns}some + AttributeError: no such child: {http://ns/}some >>> path.setattr(root, "my value") # creates children as necessary >>> path.hasattr(root) True >>> print(path.find(root).text) my value - >>> print(root.some.child["{other}unknown"].text) + >>> print(root.some.child["{http://other/}unknown"].text) my value >>> print(len( path.find(root) )) Modified: lxml/branch/lxml-2.2/doc/performance.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/performance.txt (original) +++ lxml/branch/lxml-2.2/doc/performance.txt Tue Sep 29 22:51:58 2009 @@ -252,6 +252,21 @@ .. _`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ +Finally, `xml.com`_ has a couple of publications about XML parser +performance. Farwick and Hafner have written two interesting articles +that compare the parser of libxml2 to some major Java based XML +parsers. One deals with `event-driven parser performance`_, the other +one presents `benchmark results comparing DOM parsers`_. Both +comparisons suggest that libxml2's parser performance is largely +superior to all commonly used Java parsers in almost all cases. Note +that the C parser benchmark results are based on xmlbench_, which uses +a simpler setup for libxml2 than lxml does. + +.. _`xml.com`: http://www.xml.com/ +.. _`event-driven parser performance`: http://www.xml.com/lpt/a/1702 +.. _`benchmark results comparing DOM parsers`: http://www.xml.com/lpt/a/1703 +.. 
_xmlbench: http://xmlbench.sourceforge.net/ + The ElementTree API =================== Modified: lxml/branch/lxml-2.2/doc/xpathxslt.txt ============================================================================== --- lxml/branch/lxml-2.2/doc/xpathxslt.txt (original) +++ lxml/branch/lxml-2.2/doc/xpathxslt.txt Tue Sep 29 22:51:58 2009 @@ -2,8 +2,8 @@ XPath and XSLT with lxml ======================== -lxml supports both XPath and XSLT through libxml2 and libxslt in a standards -compliant way. +lxml supports XPath 1.0, XSLT 1.0 and the EXSLT extensions through +libxml2 and libxslt in a standards compliant way. .. contents:: .. Modified: lxml/branch/lxml-2.2/setup.py ============================================================================== --- lxml/branch/lxml-2.2/setup.py (original) +++ lxml/branch/lxml-2.2/setup.py Tue Sep 29 22:51:58 2009 @@ -1,5 +1,8 @@ import sys, os +# for command line options and supported environment variables, please +# see the end of 'setupinfo.py' + extra_options = {} try: Modified: lxml/branch/lxml-2.2/setupinfo.py ============================================================================== --- lxml/branch/lxml-2.2/setupinfo.py (original) +++ lxml/branch/lxml-2.2/setupinfo.py Tue Sep 29 22:51:58 2009 @@ -46,6 +46,7 @@ 'libs', 'build/tmp', static_include_dirs, static_library_dirs, static_cflags, static_binaries, + libiconv_version=OPTION_LIBICONV_VERSION, libxml2_version=OPTION_LIBXML2_VERSION, libxslt_version=OPTION_LIBXSLT_VERSION) if CYTHON_INSTALLED: @@ -320,7 +321,7 @@ env_val = os.getenv(name.upper().replace('-', '_')) return env_val -# pick up any commandline options +# pick up any commandline options and/or env variables OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify') OPTION_WITHOUT_ASSERT = has_option('without-assert') OPTION_WITHOUT_THREADING = has_option('without-threading') @@ -337,3 +338,4 @@ OPTION_STATIC = True OPTION_LIBXML2_VERSION = option_value('libxml2-version') OPTION_LIBXSLT_VERSION = 
option_value('libxslt-version') +OPTION_LIBICONV_VERSION = option_value('libiconv-version') Modified: lxml/branch/lxml-2.2/src/lxml/cleanup.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/cleanup.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/cleanup.pxi Tue Sep 29 22:51:58 2009 @@ -4,7 +4,7 @@ u"""cleanup_namespaces(tree_or_element) Remove all namespace declarations from a subtree that are not used - by any of the elements in that tree. + by any of the elements or attributes in that tree. """ cdef _Element element element = _rootNodeOrRaise(tree_or_element) Modified: lxml/branch/lxml-2.2/src/lxml/cssselect.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/cssselect.py (original) +++ lxml/branch/lxml-2.2/src/lxml/cssselect.py Tue Sep 29 22:51:58 2009 @@ -49,9 +49,11 @@ try: _unicode = unicode + _unichr = unichr except NameError: # Python 3 _unicode = str + _unichr = chr class _UniToken(_unicode): def __new__(cls, contents, pos): @@ -99,7 +101,7 @@ def xpath(self): sel_xpath = self.selector.xpath() sel_xpath.add_condition( - "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' ')) + "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_literal(' '+self.class_name+' ')) return sel_xpath class Function(object): @@ -194,7 +196,7 @@ if isinstance(expr, Element): expr = expr._format_element() xpath.add_condition('contains(css:lower-case(string(.)), %s)' - % xpath_repr(expr.lower())) + % xpath_literal(expr.lower())) # FIXME: Currently case insensitive matching doesn't seem to be happening return xpath @@ -349,34 +351,34 @@ path.add_condition(attrib) elif self.operator == '=': path.add_condition('%s = %s' % (attrib, - xpath_repr(value))) + xpath_literal(value))) elif self.operator == '!=': # FIXME: this seems like a weird hack... 
if value: path.add_condition('not(%s) or %s != %s' - % (attrib, attrib, xpath_repr(value))) + % (attrib, attrib, xpath_literal(value))) else: path.add_condition('%s != %s' - % (attrib, xpath_repr(value))) - #path.add_condition('%s != %s' % (attrib, xpath_repr(value))) + % (attrib, xpath_literal(value))) + #path.add_condition('%s != %s' % (attrib, xpath_literal(value))) elif self.operator == '~=': - path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' '))) + path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' '))) elif self.operator == '|=': # Weird, but true... path.add_condition('%s = %s or starts-with(%s, %s)' % ( - attrib, xpath_repr(value), - attrib, xpath_repr(value + '-'))) + attrib, xpath_literal(value), + attrib, xpath_literal(value + '-'))) elif self.operator == '^=': path.add_condition('starts-with(%s, %s)' % ( - attrib, xpath_repr(value))) + attrib, xpath_literal(value))) elif self.operator == '$=': # Oddly there is a starts-with in XPath 1.0, but not ends-with path.add_condition('substring(%s, string-length(%s)-%s) = %s' - % (attrib, attrib, len(value)-1, xpath_repr(value))) + % (attrib, attrib, len(value)-1, xpath_literal(value))) elif self.operator == '*=': # FIXME: case sensitive? 
path.add_condition('contains(%s, %s)' % ( - attrib, xpath_repr(value))) + attrib, xpath_literal(value))) else: assert 0, ("Unknown operator: %r" % self.operator) return path @@ -425,7 +427,7 @@ def xpath(self): path = self.selector.xpath() - path.add_condition('@id = %s' % xpath_repr(self.id)) + path.add_condition('@id = %s' % xpath_literal(self.id)) return path class Or(object): @@ -501,9 +503,9 @@ ############################## ## XPathExpr objects: -_el_re = re.compile(r'^\w+\s*$') -_id_re = re.compile(r'^(\w*)#(\w+)\s*$') -_class_re = re.compile(r'^(\w*)\.(\w+)\s*$') +_el_re = re.compile(r'^\w+\s*$', re.UNICODE) +_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE) +_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE) def css_to_xpath(css_expr, prefix='descendant-or-self::'): if isinstance(css_expr, _basestring): @@ -524,7 +526,7 @@ "Got None for xpath expression from %s" % repr(css_expr)) if prefix: expr.add_prefix(prefix) - return str(expr) + return _unicode(expr) class XPathExpr(object): @@ -539,10 +541,10 @@ def __str__(self): path = '' if self.prefix is not None: - path += str(self.prefix) + path += _unicode(self.prefix) if self.path is not None: - path += str(self.path) - path += str(self.element) + path += _unicode(self.path) + path += _unicode(self.element) if self.condition: path += '[%s]' % self.condition return path @@ -574,7 +576,7 @@ if self.element == '*': # We weren't doing a test anyway return - self.add_condition("name() = %s" % xpath_repr(self.element)) + self.add_condition("name() = %s" % xpath_literal(self.element)) self.element = '*' def add_star_prefix(self): @@ -589,7 +591,7 @@ self.star_prefix = True def join(self, combiner, other): - prefix = str(self) + prefix = _unicode(self) prefix += combiner path = (other.prefix or '') + (other.path or '') # We don't need a star prefix if we are joining to this other @@ -615,16 +617,26 @@ def __str__(self): prefix = self.prefix or '' - return ' | '.join([prefix + str(i) for i in self.items]) + 
return ' | '.join(["%s%s" % (prefix,i) for i in self.items]) -def xpath_repr(s): - # FIXME: I don't think this is right, but lacking any reasonable - # specification on what XPath literals look like (which doesn't seem - # to be in the XPath specification) it is hard to do 'right' +split_at_single_quotes = re.compile("('+)").split + +def xpath_literal(s): if isinstance(s, Element): # This is probably a symbol that looks like an expression... s = s._format_element() - return repr(str(s)) + else: + s = _unicode(s) + if "'" not in s: + s = "'%s'" % s + elif '"' not in s: + s = '"%s"' % s + else: + s = "concat(%s)" % ','.join([ + (("'" in part) and '"%s"' or "'%s'") % part + for part in split_at_single_quotes(s) if part + ]) + return s ############################## ## Parsing functions @@ -637,8 +649,12 @@ except SelectorSyntaxError: import sys e = sys.exc_info()[1] - e.args = tuple(["%s at %s -> %s" % ( - e, stream.used, list(stream))]) + message = "%s at %s -> %r" % ( + e, stream.used, stream.peek()) + e.msg = message + if sys.version_info < (2,6): + e.message = message + e.args = tuple([message]) raise def parse_selector_group(stream): @@ -665,7 +681,11 @@ combinator = stream.next() else: combinator = ' ' + consumed = len(stream.used) next_selector = parse_simple_selector(stream) + if consumed == len(stream.used): + raise SelectorSyntaxError( + "Expected selector, got '%s'" % stream.peek()) result = CombinedSelector(result, combinator, next_selector) return result @@ -677,14 +697,14 @@ next = stream.next() if next != '*' and not isinstance(next, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % next) + "Expected symbol, got '%s'" % next) if stream.peek() == '|': namespace = next stream.next() element = stream.next() if element != '*' and not isinstance(next, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % next) + "Expected symbol, got '%s'" % next) else: namespace = '*' element = next @@ -711,14 +731,14 @@ next = stream.next() if 
not next == ']': raise SelectorSyntaxError( - "] expected, got %r" % next) + "] expected, got '%s'" % next) continue elif peek == ':' or peek == '::': type = stream.next() ident = stream.next() if not isinstance(ident, Symbol): raise SelectorSyntaxError( - "Expected symbol, got %r" % ident) + "Expected symbol, got '%s'" % ident) if stream.peek() == '(': stream.next() peek = stream.peek() @@ -732,7 +752,7 @@ next = stream.next() if not next == ')': raise SelectorSyntaxError( - "Expected ), got %r and %r" + "Expected ')', got '%s' and '%s'" % (next, selector)) result = Function(result, type, ident, selector) else: @@ -766,11 +786,11 @@ op = stream.next() if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='): raise SelectorSyntaxError( - "Operator expected, got %r" % op) + "Operator expected, got '%s'" % op) value = stream.next() if not isinstance(value, (Symbol, String)): raise SelectorSyntaxError( - "Expected string or symbol, got %r" % value) + "Expected string or symbol, got '%s'" % value) return Attrib(selector, namespace, attrib, op, value) def parse_series(s): @@ -814,9 +834,9 @@ ## Tokenizing ############################################################ -_whitespace_re = re.compile(r'\s+') +_whitespace_re = re.compile(r'\s+', re.UNICODE) -_comment_re = re.compile(r'/\*.*?\*/', re.S) +_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL) _count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?') @@ -861,6 +881,28 @@ yield Symbol(sym, old_pos) continue +split_at_string_escapes = re.compile(r'(\\(?:%s))' + % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?', + '[^A-Fa-f0-9]'])).split + +def unescape_string_literal(literal): + substrings = [] + for substring in split_at_string_escapes(literal): + if not substring: + continue + elif '\\' in substring: + if substring[0] == '\\' and len(substring) > 1: + substring = substring[1:] + if substring[0] in '0123456789ABCDEFabcdef': + # int() correctly ignores the potentially trailing whitespace + substring = _unichr(int(substring, 16)) + else: + 
raise SelectorSyntaxError( + "Invalid escape sequence %r in string %r" + % (substring.split('\\')[1], literal)) + substrings.append(substring) + return ''.join(substrings) + def tokenize_escaped_string(s, pos): quote = s[pos] assert quote in ('"', "'") @@ -873,13 +915,13 @@ "Expected closing %s for string in: %r" % (quote, s[start:])) result = s[start:next] - try: - result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape') - except UnicodeDecodeError: - # Probably a hanging \ + if result.endswith('\\'): + # next quote character is escaped pos = next+1 - else: - return result, next+1 + continue + if '\\' in result: + result = unescape_string_literal(result) + return result, next+1 _illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE) Modified: lxml/branch/lxml-2.2/src/lxml/etree_defs.h ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/etree_defs.h (original) +++ lxml/branch/lxml-2.2/src/lxml/etree_defs.h Tue Sep 29 22:51:58 2009 @@ -221,8 +221,14 @@ #define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \ { \ /* walk through children first */ \ - xmlNode* _lx__next = c_node->children; \ - _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + xmlNode* _lx__next = c_node->children; \ + if (_lx__next != 0) { \ + if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \ + _lx__next = 0; \ + } else { \ + _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \ + } \ + } \ if ((_lx__next == 0) && (c_node != c_stop_node)) { \ /* try siblings */ \ _lx__next = c_node->next; \ Modified: lxml/branch/lxml-2.2/src/lxml/extensions.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/extensions.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/extensions.pxi Tue Sep 29 22:51:58 2009 @@ -543,7 +543,8 @@ results.append( _fakeDocElementFactory(doc, c_node)) elif c_node.type == tree.XML_TEXT_NODE or \ - c_node.type == 
tree.XML_ATTRIBUTE_NODE: + c_node.type == tree.XML_CDATA_SECTION_NODE or \ + c_node.type == tree.XML_ATTRIBUTE_NODE: results.append( _buildElementStringResult(doc, c_node, smart_string)) elif c_node.type == tree.XML_NAMESPACE_DECL: @@ -572,7 +573,7 @@ pass else: raise NotImplementedError, \ - u"Not yet implemented result node type: %d" % unicode(c_node.type) + u"Not yet implemented result node type: %d" % c_node.type cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj): u"""Free the XPath object, but *never* free the *content* of node sets. @@ -642,7 +643,7 @@ tree.xmlFree(s) c_element = NULL else: - #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type" + #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type" is_attribute = 0 # may be tail text or normal text value = funicode(c_node.content) Modified: lxml/branch/lxml-2.2/src/lxml/html/__init__.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/html/__init__.py (original) +++ lxml/branch/lxml-2.2/src/lxml/html/__init__.py Tue Sep 29 22:51:58 2009 @@ -991,9 +991,12 @@ if self.multiple: return MultipleSelectOptions(self) for el in _options_xpath(self): - if 'selected' in el.attrib: + if el.get('selected') is not None: value = el.get('value') - # FIXME: If value is None, what to return?, get_text()? + if value is None: + value = el.text or '' + if value: + value = value.strip() return value return None @@ -1006,9 +1009,14 @@ self.value.update(value) return if value is not None: + value = value.strip() for el in _options_xpath(self): - # FIXME: also if el.get('value') is None? 
- if el.get('value') == value: + opt_value = el.get('value') + if opt_value is None: + opt_value = el.text or '' + if opt_value: + opt_value = opt_value.strip() + if opt_value == value: checked_option = el break else: @@ -1034,7 +1042,15 @@ All the possible values this select can have (the ``value`` attribute of all the ``

hello
objectify

+ + Note that this module has a predefined ElementMaker instance called ``E``. + """ + cdef object _makeelement + cdef object _namespace + cdef object _nsmap + cdef bint _annotate + def __init__(self, *, namespace=None, nsmap=None, annotate=True, + makeelement=None): + if nsmap is None: + if annotate: + nsmap = _DEFAULT_NSMAP + else: + nsmap = {} + self._nsmap = nsmap + if namespace is None: + self._namespace = None + else: + self._namespace = u"{%s}" % namespace + self._annotate = annotate + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement + else: + self._makeelement = None + + def __getattr__(self, tag): + cdef _ObjectifyElementMakerCaller element_maker + if is_special_method(tag): + return object.__getattr__(self, tag) + if self._namespace is not None and tag[0] != u"{": + tag = self._namespace + tag + element_maker = NEW_ELEMENT_MAKER(_ObjectifyElementMakerCaller) + element_maker._tag = tag + element_maker._nsmap = self._nsmap + element_maker._annotate = self._annotate + element_maker._element_factory = self._makeelement + return element_maker + ################################################################################ # Recursive element dumping @@ -1700,8 +1723,7 @@ tree.xmlSetNsProp(c_node, c_ns, "nil", "true") tree.END_FOR_EACH_ELEMENT_FROM(c_node) -cdef object _strip_attributes -_strip_attributes = etree.strip_attributes +cdef object _strip_attributes = etree.strip_attributes def deannotate(element_or_tree, *, pytype=True, xsi=True, xsi_nil=False): u"""deannotate(element_or_tree, pytype=True, xsi=True, xsi_nil=False) @@ -1714,6 +1736,10 @@ default), 'xsi:type' attributes will be removed. If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil' attributes will be removed. + + Note that this does not touch the namespace declarations. If you + want to remove unused namespace declarations from the tree, use + ``lxml.etree.cleanup_namespaces()``. 
""" cdef list attribute_names = [] Modified: lxml/branch/lxml-2.2/src/lxml/parser.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/parser.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/parser.pxi Tue Sep 29 22:51:58 2009 @@ -1042,6 +1042,7 @@ cdef char* c_encoding cdef int buffer_len cdef int error + cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER if python.PyString_Check(data): if self._default_encoding is None: c_encoding = NULL @@ -1078,10 +1079,10 @@ xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options) error = xmlparser.xmlCtxtResetPush( pctxt, c_data, buffer_len, NULL, c_encoding) - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + buffer_len + py_buffer_len -= buffer_len + c_data += buffer_len - while error == 0 and py_buffer_len > 0: + while (recover or error == 0) and py_buffer_len > 0: if py_buffer_len > python.INT_MAX: buffer_len = python.INT_MAX else: @@ -1090,11 +1091,10 @@ error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0) else: error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0) - py_buffer_len = py_buffer_len - buffer_len - c_data = c_data + buffer_len + py_buffer_len -= buffer_len + c_data += buffer_len - if error or (not pctxt.wellFormed and - not self._parse_options & xmlparser.XML_PARSE_RECOVER): + if not recover and (error or not pctxt.wellFormed): self._feed_parser_running = 0 try: context._handleParseResult(self, NULL, None) Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_css.txt ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_css.txt (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_css.txt Tue Sep 29 22:51:58 2009 @@ -49,6 +49,14 @@ >>> parse('td ~ th') CombinedSelector[Element[td] ~ Element[th]] +Some parse error tests: + + >>> try: parse('attributes(href)/html/body/a') + ... except: # Py2, Py3, ... + ... import sys + ... 
print(str(sys.exc_info()[1]).replace("(u'", "('")) + Expected selector, got '(' at [Symbol('attributes', 0)] -> Token('(', 10) + Now of translation: >>> def xpath(css): @@ -123,7 +131,44 @@ ... NotImplementedError: *:only-of-type is not implemented -Then of parse_series: +Now a Unicode character test: + + >>> from lxml.cssselect import css_to_xpath + >>> import sys + >>> if sys.version_info[0] >= 3: + ... css_expr = '.a\xc1b' + ... else: + ... css_expr = '.a\xc1b'.decode('ISO-8859-1') + + >>> xpath_expr = css_to_xpath(css_expr) + >>> print( css_expr[1:] in xpath_expr ) + True + >>> print( xpath_expr.encode('ascii', 'xmlcharrefreplace').decode('ASCII') ) + descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' aÁb ')] + +And some special character tests: + + >>> print( css_to_xpath('*[aval="\'"]') ) + descendant-or-self::*[@aval = "'"] + >>> print( css_to_xpath('*[aval="\'\'\'"]') ) + descendant-or-self::*[@aval = "'''"] + >>> print( css_to_xpath('*[aval=\'"\']') ) + descendant-or-self::*[@aval = '"'] + >>> print( css_to_xpath('*[aval=\'"""\']') ) + descendant-or-self::*[@aval = '"""'] + +Some Unicode escape tests (including the trailing whitespace rules): + + >>> print( css_to_xpath(r'*[aval="\'\22\'"]') ) # \22 == '"' + descendant-or-self::*[@aval = concat("'",'"',"'")] + >>> print( css_to_xpath(r'*[aval="\'\22 2\'"]') ) + descendant-or-self::*[@aval = concat("'",'"2',"'")] + >>> print( css_to_xpath(r'*[aval="\'\20 \'"]') ) # \20 == ' ' + descendant-or-self::*[@aval = "' '"] + >>> print( css_to_xpath('*[aval="\'\\20\r\n \'"]') ) + descendant-or-self::*[@aval = "' '"] + +Then some test for parse_series: >>> from lxml.cssselect import parse_series >>> parse_series('1n+3') Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py Tue Sep 29 22:51:58 
2009 @@ -580,6 +580,27 @@ self.etree.XMLParser(encoding="utf-8") self.etree.XMLParser(encoding="iso-8859-1") + def test_feed_parser_recover(self): + parser = self.etree.XMLParser(recover=True) + + parser.feed('<') + parser.feed('a test="works"') + parser.feed('> not closed! + parser.feed('>') + + root = parser.close() + + self.assertEquals(root.tag, "root") + self.assertEquals(len(root), 1) + self.assertEquals(root[0].tag, "a") + self.assertEquals(root[0].get("test"), "works") + self.assertEquals(len(root[0]), 1) + self.assertEquals(root[0][0].tag, "othertag") + # FIXME: would be nice to get some errors logged ... + #self.assert_(len(parser.error_log) > 0, "error log is empty") + def test_elementtree_parser_target_type_error(self): assertEquals = self.assertEquals assertFalse = self.assertFalse @@ -970,7 +991,7 @@ parser = self.etree.XMLParser(resolve_entities=False) Entity = self.etree.Entity - xml = '&myentity;' + xml = _bytes('&myentity;') tree = parse(BytesIO(xml), parser) root = tree.getroot() self.assertEquals(root[0].tag, Entity) @@ -981,6 +1002,25 @@ self.assertEquals(_bytes('&myentity;'), tostring(root)) + def test_entity_restructure(self): + xml = _bytes(''' ]> + + + +   + ''') + + parser = self.etree.XMLParser(resolve_entities=False) + root = etree.fromstring(xml, parser) + self.assertEquals([ el.tag for el in root ], + ['child1', 'child2', 'child3']) + + root[0] = root[-1] + self.assertEquals([ el.tag for el in root ], + ['child3', 'child2']) + self.assertEquals(root[0][0].text, ' ') + self.assertEquals(root[0][0].name, 'nbsp') + def test_entity_append(self): Entity = self.etree.Entity Element = self.etree.Element @@ -1061,6 +1101,15 @@ self.assertEquals(_bytes(''), tostring(root)) + def test_cdata_xpath(self): + tostring = self.etree.tostring + parser = self.etree.XMLParser(strip_cdata=False) + root = self.etree.XML(_bytes(''), parser) + self.assertEquals(_bytes(''), + tostring(root)) + + self.assertEquals(['test'], root.xpath('//text()')) + # 
TypeError in etree, AssertionError in ElementTree; def test_setitem_assert(self): Element = self.etree.Element Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py Tue Sep 29 22:51:58 2009 @@ -346,6 +346,25 @@ self.assertRaises(AttributeError, getattr, root.c1, "NOT_THERE") self.assertRaises(AttributeError, getattr, root.c1, "{unknownNS}c2") + def test_setattr(self): + for val in [ + 2, 2**32, 1.2, "Won't get fooled again", + _str("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1'), True, + False, None]: + root = self.Element('root') + attrname = 'val' + setattr(root, attrname, val) + result = getattr(root, attrname) + self.assertEquals(val, result) + self.assertEquals(type(val), type(result.pyval)) + + def test_setattr_nonunicode(self): + root = self.Element('root') + attrname = 'val' + val = _bytes("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1') + self.assertRaises(ValueError, setattr, root, attrname, val) + self.assertRaises(AttributeError, getattr, root, attrname) + def test_addattr(self): root = self.XML(xml_str) self.assertEquals(1, len(root.c1)) @@ -2453,6 +2472,7 @@ def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ObjectifyTestCase)]) + suite.addTests(doctest.DocTestSuite(objectify)) if sys.version_info >= (2,4): suite.addTests( [make_doctest('../../../doc/objectify.txt')]) Modified: lxml/branch/lxml-2.2/version.txt ============================================================================== --- lxml/branch/lxml-2.2/version.txt (original) +++ lxml/branch/lxml-2.2/version.txt Tue Sep 29 22:51:58 2009 @@ -1 +1 @@ -2.2.2 +2.2.3 From scoder at codespeak.net Tue Sep 29 23:23:40 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 29 Sep 2009 23:23:40 +0200 (CEST) Subject: 
[Lxml-checkins] r68010 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090929212340.8CEBB168005@codespeak.net> Author: scoder Date: Tue Sep 29 23:23:40 2009 New Revision: 68010 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/tests/test_etree.py Log: r5253 at delle: sbehnel | 2009-09-29 23:23:34 +0200 more robust clean up when errors occur in SubElement() Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Tue Sep 29 23:23:40 2009 @@ -19,6 +19,9 @@ Bugs fixed ---------- +* Late errors during calls to ``SubElement()`` (e.g. attribute related + ones) could leave a partially initialised element in the tree. + * Modifying trees that contain parsed entity references could result in an infinite loop. Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Tue Sep 29 23:23:40 2009 @@ -196,15 +196,20 @@ return python.PyErr_NoMemory() tree.xmlAddChild(parent._c_node, c_node) - if text is not None: - _setNodeText(c_node, text) - if tail is not None: - _setTailText(c_node, tail) - - # add namespaces to node if necessary - _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) - _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) - return _elementFactory(parent._doc, c_node) + try: + if text is not None: + _setNodeText(c_node, text) + if tail is not None: + _setTailText(c_node, tail) + + # add namespaces to node if necessary + _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) + _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) + return _elementFactory(parent._doc, c_node) + except: + # make sure we clean up in case of an error + _removeNode(parent._doc, c_node) + raise cdef int 
_initNodeNamespaces(xmlNode* c_node, _Document doc, object node_ns_utf, object nsmap) except -1: Modified: lxml/trunk/src/lxml/tests/test_etree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_etree.py (original) +++ lxml/trunk/src/lxml/tests/test_etree.py Tue Sep 29 23:23:40 2009 @@ -138,6 +138,15 @@ self.assertRaises(ValueError, SubElement, el, 'na me') self.assertRaises(ValueError, SubElement, el, '{test} name') + def test_subelement_attribute_invalid(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + el = Element('name') + self.assertRaises(ValueError, SubElement, el, 'name', {'a b c' : 'abc'}) + self.assertRaises(ValueError, SubElement, el, 'name', {'a' : 'a\0\n'}) + self.assertEquals(0, len(el)) + def test_qname_empty(self): QName = self.etree.QName self.assertRaises(ValueError, QName, '') From scoder at codespeak.net Tue Sep 29 23:24:20 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Tue, 29 Sep 2009 23:24:20 +0200 (CEST) Subject: [Lxml-checkins] r68011 - in lxml/branch/lxml-2.2: . src/lxml src/lxml/tests Message-ID: <20090929212420.1F065168002@codespeak.net> Author: scoder Date: Tue Sep 29 23:24:19 2009 New Revision: 68011 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py Log: trunk merge: SubElement() fix Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Tue Sep 29 23:24:19 2009 @@ -11,6 +11,9 @@ Bugs fixed ---------- +* Late errors during calls to ``SubElement()`` (e.g. attribute related + ones) could leave a partially initialised element in the tree. + * Modifying trees that contain parsed entity references could result in an infinite loop. 
Modified: lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi Tue Sep 29 23:24:19 2009 @@ -194,15 +194,20 @@ return python.PyErr_NoMemory() tree.xmlAddChild(parent._c_node, c_node) - if text is not None: - _setNodeText(c_node, text) - if tail is not None: - _setTailText(c_node, tail) - - # add namespaces to node if necessary - _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) - _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) - return _elementFactory(parent._doc, c_node) + try: + if text is not None: + _setNodeText(c_node, text) + if tail is not None: + _setTailText(c_node, tail) + + # add namespaces to node if necessary + _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) + _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) + return _elementFactory(parent._doc, c_node) + except: + # make sure we clean up in case of an error + _removeNode(parent._doc, c_node) + raise cdef int _initNodeNamespaces(xmlNode* c_node, _Document doc, object node_ns_utf, object nsmap) except -1: Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py Tue Sep 29 23:24:19 2009 @@ -138,6 +138,15 @@ self.assertRaises(ValueError, SubElement, el, 'na me') self.assertRaises(ValueError, SubElement, el, '{test} name') + def test_subelement_attribute_invalid(self): + Element = self.etree.Element + SubElement = self.etree.SubElement + + el = Element('name') + self.assertRaises(ValueError, SubElement, el, 'name', {'a b c' : 'abc'}) + self.assertRaises(ValueError, SubElement, el, 'name', {'a' : 'a\0\n'}) + self.assertEquals(0, len(el)) + def test_qname_empty(self): QName = self.etree.QName 
self.assertRaises(ValueError, QName, '') From scoder at codespeak.net Wed Sep 30 19:54:51 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 19:54:51 +0200 (CEST) Subject: [Lxml-checkins] r68047 - in lxml/trunk: . src/lxml Message-ID: <20090930175451.93E26168013@codespeak.net> Author: scoder Date: Wed Sep 30 19:54:51 2009 New Revision: 68047 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/lxml.etree.pyx Log: r5256 at delle: sbehnel | 2009-09-30 13:37:40 +0200 use explicit bytes/string type instead of bare literal Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Wed Sep 30 19:54:51 2009 @@ -85,7 +85,7 @@ if _FILENAME_ENCODING is None: _FILENAME_ENCODING = sys.getdefaultencoding() if _FILENAME_ENCODING is None: - _FILENAME_ENCODING = 'ascii' + _FILENAME_ENCODING = b'ascii' else: _FILENAME_ENCODING = _FILENAME_ENCODING.encode(u"UTF-8") cdef char* _C_FILENAME_ENCODING @@ -94,17 +94,17 @@ # set up some default namespace prefixes cdef object _DEFAULT_NAMESPACE_PREFIXES _DEFAULT_NAMESPACE_PREFIXES = { - "http://www.w3.org/1999/xhtml": "html", - "http://www.w3.org/1999/XSL/Transform": "xsl", - "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", - "http://schemas.xmlsoap.org/wsdl/": "wsdl", + b"http://www.w3.org/1999/xhtml": b"html", + b"http://www.w3.org/1999/XSL/Transform": b"xsl", + b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf", + b"http://schemas.xmlsoap.org/wsdl/": b"wsdl", # xml schema - "http://www.w3.org/2001/XMLSchema": "xs", - "http://www.w3.org/2001/XMLSchema-instance": "xsi", + b"http://www.w3.org/2001/XMLSchema": b"xs", + b"http://www.w3.org/2001/XMLSchema-instance": b"xsi", # dublin core - "http://purl.org/dc/elements/1.1/": "dc", + b"http://purl.org/dc/elements/1.1/": b"dc", # objectify - "http://codespeak.net/lxml/objectify/pytype" : "py", + 
b"http://codespeak.net/lxml/objectify/pytype" : b"py", } # Error superclass for ElementTree compatibility @@ -1508,7 +1508,7 @@ if isinstance(text_or_uri_or_element, _Element): text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag if not _isString(text_or_uri_or_element): - raise ValueError, ("Invalid input tag of type %r" % + raise ValueError, (u"Invalid input tag of type %r" % type(text_or_uri_or_element)) elif isinstance(text_or_uri_or_element, QName): text_or_uri_or_element = (text_or_uri_or_element).text @@ -1766,8 +1766,8 @@ start = path[:1] if start == u"/": path = u"." + path - elif start == "/": - path = "." + path + elif start == b"/": + path = b"." + path return root.find(path) def findtext(self, path, default=None): @@ -1782,8 +1782,8 @@ start = path[:1] if start == u"/": path = u"." + path - elif start == "/": - path = "." + path + elif start == b"/": + path = b"." + path return root.findtext(path, default) def findall(self, path): @@ -1798,8 +1798,8 @@ start = path[:1] if start == u"/": path = u"." + path - elif start == "/": - path = "." + path + elif start == b"/": + path = b"." + path return root.findall(path) def iterfind(self, path): @@ -1814,8 +1814,8 @@ start = path[:1] if start == u"/": path = u"." + path - elif start == "/": - path = "." + path + elif start == b"/": + path = b"." + path return root.iterfind(path) def xpath(self, _path, *, namespaces=None, extensions=None, @@ -2373,7 +2373,7 @@ cdef xmlNode* c_node cdef xmlDoc* c_doc if text is None: - text = '' + text = b'' else: text = _utf8(text) c_doc = _newXMLDoc() @@ -2393,7 +2393,7 @@ cdef xmlDoc* c_doc target = _utf8(target) if text is None: - text = '' + text = b'' else: text = _utf8(text) c_doc = _newXMLDoc() From scoder at codespeak.net Wed Sep 30 19:54:55 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 19:54:55 +0200 (CEST) Subject: [Lxml-checkins] r68048 - in lxml/trunk: . 
src/lxml Message-ID: <20090930175455.BB233168013@codespeak.net> Author: scoder Date: Wed Sep 30 19:54:55 2009 New Revision: 68048 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/apihelpers.pxi Log: r5257 at delle: sbehnel | 2009-09-30 13:38:05 +0200 better exception message Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Sep 30 19:54:55 2009 @@ -1254,7 +1254,7 @@ raise TypeError, u"Argument must be string or unicode." if invalid: raise ValueError, \ - u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes" + u"All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters" return s cdef bint _isFilePath(char* c_path): From scoder at codespeak.net Wed Sep 30 19:55:00 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 19:55:00 +0200 (CEST) Subject: [Lxml-checkins] r68049 - in lxml/trunk: . src/lxml/html Message-ID: <20090930175500.28424168016@codespeak.net> Author: scoder Date: Wed Sep 30 19:54:59 2009 New Revision: 68049 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/html/__init__.py Log: r5258 at delle: sbehnel | 2009-09-30 19:42:59 +0200 doc comment Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Wed Sep 30 19:54:59 2009 @@ -1487,7 +1487,11 @@ you have the XHTML DTDs installed in your catalogs, and create the parser like this:: - parser = XHTMLParser(load_dtd=True) + >>> parser = XHTMLParser(load_dtd=True) + + If you additionally want to validate the document, use this:: + + >>> parser = XHTMLParser(dtd_validation=True) For catalog support, see http://www.xmlsoft.org/catalog.html. 
""" From scoder at codespeak.net Wed Sep 30 19:55:04 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 19:55:04 +0200 (CEST) Subject: [Lxml-checkins] r68050 - in lxml/trunk: . src/lxml/html Message-ID: <20090930175504.2ABF8168013@codespeak.net> Author: scoder Date: Wed Sep 30 19:55:03 2009 New Revision: 68050 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/html/__init__.py Log: r5259 at delle: sbehnel | 2009-09-30 19:46:32 +0200 forms lacking an 'action' attribute now use the base URL Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 30 19:55:03 2009 @@ -8,6 +8,9 @@ Features added -------------- +* Forms that lack an ``action`` attribute default to the base URL of + the document on submit. + * XPath attribute result strings have an ``attrname`` property. * Namespace URIs get validated against RFC 3986 at the API level Modified: lxml/trunk/src/lxml/html/__init__.py ============================================================================== --- lxml/trunk/src/lxml/html/__init__.py (original) +++ lxml/trunk/src/lxml/html/__init__.py Wed Sep 30 19:55:03 2009 @@ -818,10 +818,16 @@ values.extend(extra_values) if open_http is None: open_http = open_http_urllib - return open_http(form.method, form.action, values) + if form.action: + url = form.action + else: + url = form.base_url + return open_http(form.method, url, values) def open_http_urllib(method, url, values): import urllib + if not url: + raise ValueError("cannot submit, no URL provided") ## FIXME: should test that it's not a relative URL or something if method == 'GET': if '?' 
in url: From scoder at codespeak.net Wed Sep 30 22:26:22 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 22:26:22 +0200 (CEST) Subject: [Lxml-checkins] r68059 - lxml/trunk/src/lxml Message-ID: <20090930202622.7F14E168013@codespeak.net> Author: scoder Date: Wed Sep 30 22:26:22 2009 New Revision: 68059 Modified: lxml/trunk/src/lxml/apihelpers.pxi Log: reduce code size by typing a very common function Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Sep 30 22:26:22 2009 @@ -1345,7 +1345,7 @@ else: raise TypeError, u"Argument must be string or unicode." -cdef _getNsTag(tag): +cdef tuple _getNsTag(tag): u"""Given a tag, find namespace URI and tag name. Return None for NS uri if no namespace URI available. """ From scoder at codespeak.net Wed Sep 30 22:49:12 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 22:49:12 +0200 (CEST) Subject: [Lxml-checkins] r68063 - in lxml/trunk: . 
src/lxml/tests Message-ID: <20090930204912.7EBAE168011@codespeak.net> Author: scoder Date: Wed Sep 30 22:49:11 2009 New Revision: 68063 Modified: lxml/trunk/ (props changed) lxml/trunk/src/lxml/tests/test_elementtree.py Log: r5265 at delle: sbehnel | 2009-09-30 21:33:55 +0200 test cleanup Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 30 22:49:11 2009 @@ -554,6 +554,26 @@ root.set("attr", "TEST") self.assertEquals("TEST", root.get("attr")) + def test_attribute_iterator(self): + XML = self.etree.XML + + root = XML(_bytes('')) + result = [] + for key in root.attrib: + result.append(key) + result.sort() + self.assertEquals(['alpha', 'beta', 'gamma'], result) + + def test_attribute_manipulation(self): + Element = self.etree.Element + + a = Element('a') + a.attrib['foo'] = 'Foo' + a.attrib['bar'] = 'Bar' + self.assertEquals('Foo', a.attrib['foo']) + del a.attrib['foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + def test_XML(self): XML = self.etree.XML @@ -714,16 +734,6 @@ result.append(el1.tag) self.assertEquals(['one','one', 'two', 'two', 'one', 'two'], result) - def test_attribute_iterator(self): - XML = self.etree.XML - - root = XML(_bytes('')) - result = [] - for key in root.attrib: - result.append(key) - result.sort() - self.assertEquals(['alpha', 'beta', 'gamma'], result) - def test_itertext(self): # ET 1.3+ XML = self.etree.XML @@ -1793,16 +1803,6 @@ [a2], list(e.getiterator('a'))) - def test_attribute_manipulation(self): - Element = self.etree.Element - - a = Element('a') - a.attrib['foo'] = 'Foo' - a.attrib['bar'] = 'Bar' - self.assertEquals('Foo', a.attrib['foo']) - del a.attrib['foo'] - self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') - def test_getslice(self): Element = self.etree.Element SubElement = 
self.etree.SubElement @@ -3240,15 +3240,15 @@ # feed parser interface - def test_feed_parser(self): + def test_feed_parser_bytes(self): parser = self.etree.XMLParser() - parser.feed('<') - parser.feed('a test="works"/') - parser.feed('>') + parser.feed(_bytes('<')) + parser.feed(_bytes('a test="works"/')) + parser.feed(_bytes('>')) root = parser.close() From scoder at codespeak.net Wed Sep 30 22:51:44 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 22:51:44 +0200 (CEST) Subject: [Lxml-checkins] r68065 - in lxml/trunk: . src/lxml src/lxml/tests Message-ID: <20090930205144.A2F86168013@codespeak.net> Author: scoder Date: Wed Sep 30 22:51:44 2009 New Revision: 68065 Modified: lxml/trunk/ (props changed) lxml/trunk/CHANGES.txt lxml/trunk/src/lxml/apihelpers.pxi lxml/trunk/src/lxml/lxml.etree.pyx lxml/trunk/src/lxml/tests/test_elementtree.py Log: r5268 at delle: sbehnel | 2009-09-30 22:48:58 +0200 fix attribute lookup and deletion for plain (unnamespaced) attribute names Modified: lxml/trunk/CHANGES.txt ============================================================================== --- lxml/trunk/CHANGES.txt (original) +++ lxml/trunk/CHANGES.txt Wed Sep 30 22:51:44 2009 @@ -22,6 +22,9 @@ Bugs fixed ---------- +* Looking up and deleting attributes without a namespace could hit a + namespaced attribute of the same name instead. + * Late errors during calls to ``SubElement()`` (e.g. attribute related ones) could leave a partially initialised element in the tree. 
Modified: lxml/trunk/src/lxml/apihelpers.pxi ============================================================================== --- lxml/trunk/src/lxml/apihelpers.pxi (original) +++ lxml/trunk/src/lxml/apihelpers.pxi Wed Sep 30 22:51:44 2009 @@ -440,43 +440,39 @@ cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node): cdef char* value - cdef char* href - href = _getNs(c_attrib_node) - if href is NULL: - value = tree.xmlGetNoNsProp(c_element, c_attrib_node.name) - else: - value = tree.xmlGetNsProp(c_element, c_attrib_node.name, href) - result = funicode(value) - tree.xmlFree(value) + cdef char* c_href + c_href = _getNs(c_attrib_node) + value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href) + try: + result = funicode(value) + finally: + tree.xmlFree(value) return result cdef object _attributeValueFromNsName(xmlNode* c_element, char* c_href, char* c_name): - cdef char* c_result - if c_href is NULL: - c_result = tree.xmlGetNoNsProp(c_element, c_name) - else: - c_result = tree.xmlGetNsProp(c_element, c_name, c_href) + cdef char* c_result = tree.xmlGetNsProp(c_element, c_name, c_href) if c_result is NULL: return None - result = funicode(c_result) - tree.xmlFree(c_result) + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) return result cdef object _getNodeAttributeValue(xmlNode* c_node, key, default): cdef char* c_result - cdef char* c_tag + cdef char* c_href ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - if ns is None: - c_result = tree.xmlGetNoNsProp(c_node, c_tag) - else: - c_result = tree.xmlGetNsProp(c_node, c_tag, _cstr(ns)) + c_href = NULL if ns is None else _cstr(ns) + c_result = tree.xmlGetNsProp(c_node, _cstr(tag), c_href) if c_result is NULL: # XXX free namespace that is not in use..? 
return default - result = funicode(c_result) - tree.xmlFree(c_result) + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) return result cdef inline object _getAttributeValue(_Element element, key, default): @@ -506,20 +502,14 @@ cdef int _delAttribute(_Element element, key) except -1: cdef char* c_href ns, tag = _getNsTag(key) - if ns is None: - c_href = NULL - else: - c_href = _cstr(ns) + c_href = NULL if ns is None else _cstr(ns) if _delAttributeFromNsName(element._c_node, c_href, _cstr(tag)): raise KeyError, key return 0 cdef int _delAttributeFromNsName(xmlNode* c_node, char* c_href, char* c_name): cdef xmlAttr* c_attr - if c_href is NULL: - c_attr = tree.xmlHasProp(c_node, c_name) - else: - c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) + c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) if c_attr is NULL: # XXX free namespace that is not in use..? return -1 Modified: lxml/trunk/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/trunk/src/lxml/lxml.etree.pyx (original) +++ lxml/trunk/src/lxml/lxml.etree.pyx Wed Sep 30 22:51:44 2009 @@ -2048,20 +2048,14 @@ def __contains__(self, key): cdef xmlNode* c_node - cdef char* c_result - cdef char* c_tag + cdef char* c_href ns, tag = _getNsTag(key) - c_tag = _cstr(tag) c_node = self._element._c_node - if ns is None: - c_result = tree.xmlGetNoNsProp(c_node, c_tag) + c_href = NULL if ns is None else _cstr(ns) + if tree.xmlHasNsProp(c_node, _cstr(tag), c_href): + return 1 else: - c_result = tree.xmlGetNsProp(c_node, c_tag, _cstr(ns)) - if c_result is NULL: return 0 - else: - tree.xmlFree(c_result) - return 1 def __richcmp__(one, other, int op): if not python.PyDict_Check(one): Modified: lxml/trunk/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/trunk/src/lxml/tests/test_elementtree.py (original) +++ lxml/trunk/src/lxml/tests/test_elementtree.py Wed Sep 30 22:51:44 
2009 @@ -574,6 +574,52 @@ del a.attrib['foo'] self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + def test_del_attribute_ns(self): + Element = self.etree.Element + + a = Element('a') + a.attrib['{http://a/}foo'] = 'Foo' + a.attrib['{http://a/}bar'] = 'Bar' + self.assertEquals(None, a.get('foo')) + self.assertEquals('Foo', a.get('{http://a/}foo')) + self.assertEquals('Foo', a.attrib['{http://a/}foo']) + + self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo') + self.assertEquals('Foo', a.attrib['{http://a/}foo']) + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + def test_del_attribute_ns_parsed(self): + XML = self.etree.XML + + a = XML(_bytes('<a xmlns:nsa="http://a/" nsa:foo="FooNS" foo="Foo" />')) + + self.assertEquals('Foo', a.attrib['foo']) + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + + del a.attrib['foo'] + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo') + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + a = XML(_bytes('<a xmlns:nsa="http://a/" foo="Foo" nsa:foo="FooNS" />')) + + self.assertEquals('Foo', a.attrib['foo']) + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + + del a.attrib['foo'] + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + def test_XML(self): XML = self.etree.XML From scoder at codespeak.net Wed Sep 30 23:05:52 2009 From: scoder at codespeak.net (scoder at codespeak.net) Date: Wed, 30 Sep 2009 23:05:52 +0200 (CEST) Subject: [Lxml-checkins] r68066 - in lxml/branch/lxml-2.2: . 
src/lxml src/lxml/tests Message-ID: <20090930210552.484B7168011@codespeak.net> Author: scoder Date: Wed Sep 30 23:05:51 2009 New Revision: 68066 Modified: lxml/branch/lxml-2.2/ (props changed) lxml/branch/lxml-2.2/CHANGES.txt lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx lxml/branch/lxml-2.2/src/lxml/tests/test_elementtree.py Log: trunk merge: fix attribute lookup and deletion Modified: lxml/branch/lxml-2.2/CHANGES.txt ============================================================================== --- lxml/branch/lxml-2.2/CHANGES.txt (original) +++ lxml/branch/lxml-2.2/CHANGES.txt Wed Sep 30 23:05:51 2009 @@ -11,6 +11,9 @@ Bugs fixed ---------- +* Looking up and deleting attributes without a namespace could hit a + namespaced attribute of the same name instead. + * Late errors during calls to ``SubElement()`` (e.g. attribute related ones) could leave a partially initialised element in the tree. Modified: lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi (original) +++ lxml/branch/lxml-2.2/src/lxml/apihelpers.pxi Wed Sep 30 23:05:51 2009 @@ -435,43 +435,39 @@ cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node): cdef char* value - cdef char* href - href = _getNs(c_attrib_node) - if href is NULL: - value = tree.xmlGetNoNsProp(c_element, c_attrib_node.name) - else: - value = tree.xmlGetNsProp(c_element, c_attrib_node.name, href) - result = funicode(value) - tree.xmlFree(value) + cdef char* c_href + c_href = _getNs(c_attrib_node) + value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href) + try: + result = funicode(value) + finally: + tree.xmlFree(value) return result cdef object _attributeValueFromNsName(xmlNode* c_element, char* c_href, char* c_name): - cdef char* c_result - if c_href is NULL: - c_result = tree.xmlGetNoNsProp(c_element, c_name) - else: - c_result = 
tree.xmlGetNsProp(c_element, c_name, c_href) + cdef char* c_result = tree.xmlGetNsProp(c_element, c_name, c_href) if c_result is NULL: return None - result = funicode(c_result) - tree.xmlFree(c_result) + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) return result cdef object _getNodeAttributeValue(xmlNode* c_node, key, default): cdef char* c_result - cdef char* c_tag + cdef char* c_href ns, tag = _getNsTag(key) - c_tag = _cstr(tag) - if ns is None: - c_result = tree.xmlGetNoNsProp(c_node, c_tag) - else: - c_result = tree.xmlGetNsProp(c_node, c_tag, _cstr(ns)) + c_href = NULL if ns is None else _cstr(ns) + c_result = tree.xmlGetNsProp(c_node, _cstr(tag), c_href) if c_result is NULL: # XXX free namespace that is not in use..? return default - result = funicode(c_result) - tree.xmlFree(c_result) + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) return result cdef inline object _getAttributeValue(_Element element, key, default): @@ -501,20 +497,14 @@ cdef int _delAttribute(_Element element, key) except -1: cdef char* c_href ns, tag = _getNsTag(key) - if ns is None: - c_href = NULL - else: - c_href = _cstr(ns) + c_href = NULL if ns is None else _cstr(ns) if _delAttributeFromNsName(element._c_node, c_href, _cstr(tag)): raise KeyError, key return 0 cdef int _delAttributeFromNsName(xmlNode* c_node, char* c_href, char* c_name): cdef xmlAttr* c_attr - if c_href is NULL: - c_attr = tree.xmlHasProp(c_node, c_name) - else: - c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) + c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) if c_attr is NULL: # XXX free namespace that is not in use..? 
return -1 Modified: lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx (original) +++ lxml/branch/lxml-2.2/src/lxml/lxml.etree.pyx Wed Sep 30 23:05:51 2009 @@ -2037,20 +2037,14 @@ def __contains__(self, key): cdef xmlNode* c_node - cdef char* c_result - cdef char* c_tag + cdef char* c_href ns, tag = _getNsTag(key) - c_tag = _cstr(tag) c_node = self._element._c_node - if ns is None: - c_result = tree.xmlGetNoNsProp(c_node, c_tag) + c_href = NULL if ns is None else _cstr(ns) + if tree.xmlHasNsProp(c_node, _cstr(tag), c_href): + return 1 else: - c_result = tree.xmlGetNsProp(c_node, c_tag, _cstr(ns)) - if c_result is NULL: return 0 - else: - tree.xmlFree(c_result) - return 1 def __richcmp__(one, other, int op): if not python.PyDict_Check(one): Modified: lxml/branch/lxml-2.2/src/lxml/tests/test_elementtree.py ============================================================================== --- lxml/branch/lxml-2.2/src/lxml/tests/test_elementtree.py (original) +++ lxml/branch/lxml-2.2/src/lxml/tests/test_elementtree.py Wed Sep 30 23:05:51 2009 @@ -554,6 +554,52 @@ root.set("attr", "TEST") self.assertEquals("TEST", root.get("attr")) + def test_del_attribute_ns(self): + Element = self.etree.Element + + a = Element('a') + a.attrib['{http://a/}foo'] = 'Foo' + a.attrib['{http://a/}bar'] = 'Bar' + self.assertEquals(None, a.get('foo')) + self.assertEquals('Foo', a.get('{http://a/}foo')) + self.assertEquals('Foo', a.attrib['{http://a/}foo']) + + self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo') + self.assertEquals('Foo', a.attrib['{http://a/}foo']) + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + def test_del_attribute_ns_parsed(self): + XML = self.etree.XML + + a = XML(_bytes('<a xmlns:nsa="http://a/" nsa:foo="FooNS" foo="Foo" />')) + + self.assertEquals('Foo', a.attrib['foo']) + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + + del
a.attrib['foo'] + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo') + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + a = XML(_bytes('<a xmlns:nsa="http://a/" foo="Foo" nsa:foo="FooNS" />')) + + self.assertEquals('Foo', a.attrib['foo']) + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + + del a.attrib['foo'] + self.assertEquals('FooNS', a.attrib['{http://a/}foo']) + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + + del a.attrib['{http://a/}foo'] + self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo') + self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo') + def test_XML(self): XML = self.etree.XML