From scoder at codespeak.net Fri Sep 4 21:15:31 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:31 +0200 (CEST)
Subject: [Lxml-checkins] r67502 - in lxml/trunk: . src/lxml
Message-ID: <20090904191531.4CE38168011@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:29 2009
New Revision: 67502
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/extensions.pxi
Log:
r5225 at delle: sbehnel | 2009-08-22 15:46:24 +0200
fix smart string property types
Modified: lxml/trunk/src/lxml/extensions.pxi
==============================================================================
--- lxml/trunk/src/lxml/extensions.pxi (original)
+++ lxml/trunk/src/lxml/extensions.pxi Fri Sep 4 21:15:29 2009
@@ -589,9 +589,9 @@
cdef class _ElementUnicodeResult(python.unicode):
cdef _Element _parent
- cdef readonly bint is_tail
- cdef readonly bint is_text
- cdef readonly bint is_attribute
+ cdef readonly object is_tail
+ cdef readonly object is_text
+ cdef readonly object is_attribute
cdef readonly object attrname
def getparent(self):
From scoder at codespeak.net Fri Sep 4 21:15:35 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:35 +0200 (CEST)
Subject: [Lxml-checkins] r67503 - in lxml/trunk: . src/lxml
Message-ID: <20090904191535.3B41416800B@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:33 2009
New Revision: 67503
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/lxml.objectify.pyx
Log:
r5226 at delle: sbehnel | 2009-08-22 22:06:24 +0200
doctest fix
Modified: lxml/trunk/src/lxml/lxml.objectify.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.objectify.pyx (original)
+++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Sep 4 21:15:33 2009
@@ -1260,7 +1260,7 @@
>>> html = M.html( M.body( M.p('hello', M.br, 'objectify') ) )
>>> from lxml.etree import tostring
- >>> print(tostring(html, method='html'))
+ >>> print(tostring(html, method='html').decode('ASCII'))
hello
objectify
Note that this module has a predefined ElementMaker instance called ``E``.
From scoder at codespeak.net Fri Sep 4 21:15:38 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:38 +0200 (CEST)
Subject: [Lxml-checkins] r67504 - in lxml/trunk: . src/lxml/html
Message-ID: <20090904191538.8D250168016@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:38 2009
New Revision: 67504
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/html/__init__.py
Log:
r5227 at delle: sbehnel | 2009-09-04 20:29:01 +0200
comments
Modified: lxml/trunk/src/lxml/html/__init__.py
==============================================================================
--- lxml/trunk/src/lxml/html/__init__.py (original)
+++ lxml/trunk/src/lxml/html/__init__.py Fri Sep 4 21:15:38 2009
@@ -1471,11 +1471,26 @@
################################################################################
class HTMLParser(etree.HTMLParser):
+ """An HTML parser that is configured to return lxml.html Element
+ objects.
+ """
def __init__(self, **kwargs):
super(HTMLParser, self).__init__(**kwargs)
self.set_element_class_lookup(HtmlElementClassLookup())
class XHTMLParser(etree.XMLParser):
+ """An XML parser that is configured to return lxml.html Element
+ objects.
+
+ Note that this parser is not really XHTML aware unless you let it
+ load a DTD that declares the HTML entities. To do this, make sure
+ you have the XHTML DTDs installed in your catalogs, and create the
+ parser like this::
+
+ parser = XHTMLParser(load_dtd=True)
+
+ For catalog support, see http://www.xmlsoft.org/catalog.html.
+ """
def __init__(self, **kwargs):
super(XHTMLParser, self).__init__(**kwargs)
self.set_element_class_lookup(HtmlElementClassLookup())
From scoder at codespeak.net Fri Sep 4 21:15:44 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:44 +0200 (CEST)
Subject: [Lxml-checkins] r67505 - in lxml/trunk: . doc
Message-ID: <20090904191544.06D43168014@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:43 2009
New Revision: 67505
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/performance.txt
Log:
r5228 at delle: sbehnel | 2009-09-04 20:36:48 +0200
link to parser benchmarks on xml.com
Modified: lxml/trunk/doc/performance.txt
==============================================================================
--- lxml/trunk/doc/performance.txt (original)
+++ lxml/trunk/doc/performance.txt Fri Sep 4 21:15:43 2009
@@ -252,6 +252,21 @@
.. _`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
+Finally, `xml.com`_ has a couple of publications about XML parser
+performance. Farwick and Hafner have written two interesting articles
+that compare the parser of libxml2 to some major Java based XML
+parsers. One deals with `event-driven parser performance`_, the other
+one presents `benchmark results comparing DOM parsers`_. Both
+comparisons suggest that libxml2's parser performance is largely
+superior to all commonly used Java parsers in almost all cases. Note
+that the C parser benchmark results are based on xmlbench_, which uses
+a simpler setup for libxml2 than lxml does.
+
+.. _`xml.com`: http://www.xml.com/
+.. _`event-driven parser performance`: http://www.xml.com/lpt/a/1702
+.. _`benchmark results comparing DOM parsers`: http://www.xml.com/lpt/a/1703
+.. _xmlbench: http://xmlbench.sourceforge.net/
+
The ElementTree API
===================
From scoder at codespeak.net Fri Sep 4 21:15:48 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:48 +0200 (CEST)
Subject: [Lxml-checkins] r67506 - in lxml/trunk: . doc
Message-ID: <20090904191548.16C6B168014@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:47 2009
New Revision: 67506
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/build.txt
Log:
r5229 at delle: sbehnel | 2009-09-04 20:37:50 +0200
show how to use the STATIC_DEPS env var with easy_install
Modified: lxml/trunk/doc/build.txt
==============================================================================
--- lxml/trunk/doc/build.txt (original)
+++ lxml/trunk/doc/build.txt Fri Sep 4 21:15:47 2009
@@ -16,9 +16,10 @@
2 Subversion
3 Setuptools
4 Running the tests and reporting errors
- 5 Contributing an egg
- 6 Static linking on Windows
- 7 Building Debian packages from SVN sources
+ 5 Building an egg
+ 6 Building lxml on MacOS-X
+ 7 Static linking on Windows
+ 8 Building Debian packages from SVN sources
Cython
@@ -211,6 +212,18 @@
Instead of ``build``, you can use any target, like ``bdist_egg`` if
you want to use setuptools to build an installable egg.
+Note that this also works with EasyInstall_. Since you can't pass
+command line options in this case, you have to use an environment
+variable instead::
+
+ STATIC_DEPS=true easy_install lxml
+
+Some machines may require an additional run with "sudo" to install the
+package into the Python package directory::
+
+ STATIC_DEPS=true sudo easy_install lxml
+
+
Static linking on Windows
-------------------------
From scoder at codespeak.net Fri Sep 4 21:15:51 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 4 Sep 2009 21:15:51 +0200 (CEST)
Subject: [Lxml-checkins] r67507 - in lxml/trunk: . doc
Message-ID: <20090904191551.8E6D5168014@codespeak.net>
Author: scoder
Date: Fri Sep 4 21:15:50 2009
New Revision: 67507
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/elementsoup.txt
Log:
r5230 at delle: sbehnel | 2009-09-04 21:11:10 +0200
show how to use BeautifulSoup to detect encodings
Modified: lxml/trunk/doc/elementsoup.txt
==============================================================================
--- lxml/trunk/doc/elementsoup.txt (original)
+++ lxml/trunk/doc/elementsoup.txt Fri Sep 4 21:15:50 2009
@@ -23,6 +23,13 @@
document, and ``convert_tree()`` to convert an existing BeautifulSoup
tree into a list of top-level Elements.
+.. contents::
+..
+ 1 Parsing with the soupparser
+ 2 Entity handling
+ 3 Using soupparser as a fallback
+ 4 Using only the encoding detection
+
Parsing with the soupparser
===========================
@@ -168,3 +175,27 @@
... ignore = tostring(root, encoding=unicode)
... except UnicodeDecodeError:
... root = lxml.html.soupparser.fromstring(tag_soup)
+
+
+Using only the encoding detection
+=================================
+
+If you prefer a 'real' (and fast) HTML parser instead of the regular
+expression based one in BeautifulSoup, you can still benefit from
+BeautifulSoup's _`support for encoding detection` in the
+``UnicodeDammit`` class.
+
+.. sourcecode:: pycon
+
+ >>> from BeautifulSoup import UnicodeDammit
+
+ >>> def decode_html(html_string):
+ ... converted = UnicodeDammit(html_string, isHTML=True)
+ ... if not converted.unicode:
+ ... raise UnicodeDecodeError(
+ ... "Failed to detect encoding, tried [%s]",
+ ... ', '.join(converted.triedEncodings))
+ ... # print converted.originalEncoding
+ ... return converted.unicode
+
+ >>> root = lxml.html.fromstring(decode_html(tag_soup))
From scoder at codespeak.net Sun Sep 6 14:54:31 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Sun, 6 Sep 2009 14:54:31 +0200 (CEST)
Subject: [Lxml-checkins] r67542 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20090906125431.0862316800D@codespeak.net>
Author: scoder
Date: Sun Sep 6 14:54:30 2009
New Revision: 67542
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/cssselect.py
lxml/trunk/src/lxml/tests/test_css.txt
Log:
r5237 at delle: sbehnel | 2009-09-06 14:50:11 +0200
fix several unicode and character escape issues in lxml.cssselect
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Sun Sep 6 14:54:30 2009
@@ -19,6 +19,11 @@
Bugs fixed
----------
+* CSS special character escapes were not properly handled in
+ ``lxml.cssselect``.
+
+* CSS Unicode escapes were not properly decoded in ``lxml.cssselect``.
+
* Select options in HTML forms that had no explicit ``value``
attribute were not handled correctly. The HTML standard dictates
that their value is defined by their text content. This is now
Modified: lxml/trunk/src/lxml/cssselect.py
==============================================================================
--- lxml/trunk/src/lxml/cssselect.py (original)
+++ lxml/trunk/src/lxml/cssselect.py Sun Sep 6 14:54:30 2009
@@ -49,9 +49,11 @@
try:
_unicode = unicode
+ _unichr = unichr
except NameError:
# Python 3
_unicode = str
+ _unichr = chr
class _UniToken(_unicode):
def __new__(cls, contents, pos):
@@ -99,7 +101,7 @@
def xpath(self):
sel_xpath = self.selector.xpath()
sel_xpath.add_condition(
- "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' '))
+ "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_literal(' '+self.class_name+' '))
return sel_xpath
class Function(object):
@@ -194,7 +196,7 @@
if isinstance(expr, Element):
expr = expr._format_element()
xpath.add_condition('contains(css:lower-case(string(.)), %s)'
- % xpath_repr(expr.lower()))
+ % xpath_literal(expr.lower()))
# FIXME: Currently case insensitive matching doesn't seem to be happening
return xpath
@@ -349,34 +351,34 @@
path.add_condition(attrib)
elif self.operator == '=':
path.add_condition('%s = %s' % (attrib,
- xpath_repr(value)))
+ xpath_literal(value)))
elif self.operator == '!=':
# FIXME: this seems like a weird hack...
if value:
path.add_condition('not(%s) or %s != %s'
- % (attrib, attrib, xpath_repr(value)))
+ % (attrib, attrib, xpath_literal(value)))
else:
path.add_condition('%s != %s'
- % (attrib, xpath_repr(value)))
- #path.add_condition('%s != %s' % (attrib, xpath_repr(value)))
+ % (attrib, xpath_literal(value)))
+ #path.add_condition('%s != %s' % (attrib, xpath_literal(value)))
elif self.operator == '~=':
- path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
+ path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
elif self.operator == '|=':
# Weird, but true...
path.add_condition('%s = %s or starts-with(%s, %s)' % (
- attrib, xpath_repr(value),
- attrib, xpath_repr(value + '-')))
+ attrib, xpath_literal(value),
+ attrib, xpath_literal(value + '-')))
elif self.operator == '^=':
path.add_condition('starts-with(%s, %s)' % (
- attrib, xpath_repr(value)))
+ attrib, xpath_literal(value)))
elif self.operator == '$=':
# Oddly there is a starts-with in XPath 1.0, but not ends-with
path.add_condition('substring(%s, string-length(%s)-%s) = %s'
- % (attrib, attrib, len(value)-1, xpath_repr(value)))
+ % (attrib, attrib, len(value)-1, xpath_literal(value)))
elif self.operator == '*=':
# FIXME: case sensitive?
path.add_condition('contains(%s, %s)' % (
- attrib, xpath_repr(value)))
+ attrib, xpath_literal(value)))
else:
assert 0, ("Unknown operator: %r" % self.operator)
return path
@@ -425,7 +427,7 @@
def xpath(self):
path = self.selector.xpath()
- path.add_condition('@id = %s' % xpath_repr(self.id))
+ path.add_condition('@id = %s' % xpath_literal(self.id))
return path
class Or(object):
@@ -501,9 +503,9 @@
##############################
## XPathExpr objects:
-_el_re = re.compile(r'^\w+\s*$')
-_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
-_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
+_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
+_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
+_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
def css_to_xpath(css_expr, prefix='descendant-or-self::'):
if isinstance(css_expr, _basestring):
@@ -524,7 +526,7 @@
"Got None for xpath expression from %s" % repr(css_expr))
if prefix:
expr.add_prefix(prefix)
- return str(expr)
+ return _unicode(expr)
class XPathExpr(object):
@@ -539,10 +541,10 @@
def __str__(self):
path = ''
if self.prefix is not None:
- path += str(self.prefix)
+ path += _unicode(self.prefix)
if self.path is not None:
- path += str(self.path)
- path += str(self.element)
+ path += _unicode(self.path)
+ path += _unicode(self.element)
if self.condition:
path += '[%s]' % self.condition
return path
@@ -574,7 +576,7 @@
if self.element == '*':
# We weren't doing a test anyway
return
- self.add_condition("name() = %s" % xpath_repr(self.element))
+ self.add_condition("name() = %s" % xpath_literal(self.element))
self.element = '*'
def add_star_prefix(self):
@@ -589,7 +591,7 @@
self.star_prefix = True
def join(self, combiner, other):
- prefix = str(self)
+ prefix = _unicode(self)
prefix += combiner
path = (other.prefix or '') + (other.path or '')
# We don't need a star prefix if we are joining to this other
@@ -615,16 +617,26 @@
def __str__(self):
prefix = self.prefix or ''
- return ' | '.join([prefix + str(i) for i in self.items])
+ return ' | '.join(["%s%s" % (prefix,i) for i in self.items])
-def xpath_repr(s):
- # FIXME: I don't think this is right, but lacking any reasonable
- # specification on what XPath literals look like (which doesn't seem
- # to be in the XPath specification) it is hard to do 'right'
+split_at_single_quotes = re.compile("('+)").split
+
+def xpath_literal(s):
if isinstance(s, Element):
# This is probably a symbol that looks like an expression...
s = s._format_element()
- return repr(str(s))
+ else:
+ s = _unicode(s)
+ if "'" not in s:
+ s = "'%s'" % s
+ elif '"' not in s:
+ s = '"%s"' % s
+ else:
+ s = "concat(%s)" % ','.join([
+ (("'" in part) and '"%s"' or "'%s'") % part
+ for part in split_at_single_quotes(s) if part
+ ])
+ return s
##############################
## Parsing functions
@@ -814,9 +826,9 @@
## Tokenizing
############################################################
-_whitespace_re = re.compile(r'\s+')
+_whitespace_re = re.compile(r'\s+', re.UNICODE)
-_comment_re = re.compile(r'/\*.*?\*/', re.S)
+_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL)
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
@@ -861,6 +873,28 @@
yield Symbol(sym, old_pos)
continue
+split_at_string_escapes = re.compile(r'(\\(?:%s))'
+ % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
+ '[^A-Fa-f0-9]'])).split
+
+def unescape_string_literal(literal):
+ substrings = []
+ for substring in split_at_string_escapes(literal):
+ if not substring:
+ continue
+ elif '\\' in substring:
+ if substring[0] == '\\' and len(substring) > 1:
+ substring = substring[1:]
+ if substring[0] in '0123456789ABCDEFabcdef':
+ # int() correctly ignores the potentially trailing whitespace
+ substring = _unichr(int(substring, 16))
+ else:
+ raise SelectorSyntaxError(
+ "Invalid escape sequence %r in string %r"
+ % (substring.split('\\')[1], literal))
+ substrings.append(substring)
+ return ''.join(substrings)
+
def tokenize_escaped_string(s, pos):
quote = s[pos]
assert quote in ('"', "'")
@@ -873,13 +907,13 @@
"Expected closing %s for string in: %r"
% (quote, s[start:]))
result = s[start:next]
- try:
- result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
- except UnicodeDecodeError:
- # Probably a hanging \
+ if result.endswith('\\'):
+ # next quote character is escaped
pos = next+1
- else:
- return result, next+1
+ continue
+ if '\\' in result:
+ result = unescape_string_literal(result)
+ return result, next+1
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
Modified: lxml/trunk/src/lxml/tests/test_css.txt
==============================================================================
--- lxml/trunk/src/lxml/tests/test_css.txt (original)
+++ lxml/trunk/src/lxml/tests/test_css.txt Sun Sep 6 14:54:30 2009
@@ -123,7 +123,44 @@
...
NotImplementedError: *:only-of-type is not implemented
-Then of parse_series:
+Now a Unicode character test:
+
+ >>> from lxml.cssselect import css_to_xpath
+ >>> import sys
+ >>> if sys.version_info[0] >= 3:
+ ... css_expr = '.a\xc1b'
+ ... else:
+ ... css_expr = '.a\xc1b'.decode('ISO-8859-1')
+
+ >>> xpath_expr = css_to_xpath(css_expr)
+ >>> print( css_expr[1:] in xpath_expr )
+ True
+ >>> print( xpath_expr.encode('ascii', 'xmlcharrefreplace').decode('ASCII') )
+ descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' aÁb ')]
+
+And some special character tests:
+
+ >>> print( css_to_xpath('*[aval="\'"]') )
+ descendant-or-self::*[@aval = "'"]
+ >>> print( css_to_xpath('*[aval="\'\'\'"]') )
+ descendant-or-self::*[@aval = "'''"]
+ >>> print( css_to_xpath('*[aval=\'"\']') )
+ descendant-or-self::*[@aval = '"']
+ >>> print( css_to_xpath('*[aval=\'"""\']') )
+ descendant-or-self::*[@aval = '"""']
+
+Some Unicode escape tests (including the trailing whitespace rules):
+
+ >>> print( css_to_xpath(r'*[aval="\'\22\'"]') ) # \22 == '"'
+ descendant-or-self::*[@aval = concat("'",'"',"'")]
+ >>> print( css_to_xpath(r'*[aval="\'\22 2\'"]') )
+ descendant-or-self::*[@aval = concat("'",'"2',"'")]
+ >>> print( css_to_xpath(r'*[aval="\'\20 \'"]') ) # \20 == ' '
+ descendant-or-self::*[@aval = "' '"]
+ >>> print( css_to_xpath('*[aval="\'\\20\r\n \'"]') )
+ descendant-or-self::*[@aval = "' '"]
+
+Then some tests for parse_series:
>>> from lxml.cssselect import parse_series
>>> parse_series('1n+3')
From scoder at codespeak.net Thu Sep 10 06:49:32 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Thu, 10 Sep 2009 06:49:32 +0200 (CEST)
Subject: [Lxml-checkins] r67600 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20090910044932.C5305168024@codespeak.net>
Author: scoder
Date: Thu Sep 10 06:49:30 2009
New Revision: 67600
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/cssselect.py
lxml/trunk/src/lxml/tests/test_css.txt
Log:
r5239 at delle: sbehnel | 2009-09-10 06:45:11 +0200
cssselect: fix error reporting and infinite loop on syntax error
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Thu Sep 10 06:49:30 2009
@@ -19,6 +19,12 @@
Bugs fixed
----------
+* Syntax errors in ``lxml.cssselect`` could result in misleading error
+ messages.
+
+* Invalid syntax in CSS expressions could lead to an infinite loop in
+ the parser of ``lxml.cssselect``.
+
* CSS special character escapes were not properly handled in
``lxml.cssselect``.
Modified: lxml/trunk/src/lxml/cssselect.py
==============================================================================
--- lxml/trunk/src/lxml/cssselect.py (original)
+++ lxml/trunk/src/lxml/cssselect.py Thu Sep 10 06:49:30 2009
@@ -649,8 +649,12 @@
except SelectorSyntaxError:
import sys
e = sys.exc_info()[1]
- e.args = tuple(["%s at %s -> %s" % (
- e, stream.used, list(stream))])
+ message = "%s at %s -> %r" % (
+ e, stream.used, stream.peek())
+ e.msg = message
+ if sys.version_info < (2,6):
+ e.message = message
+ e.args = tuple([message])
raise
def parse_selector_group(stream):
@@ -677,7 +681,11 @@
combinator = stream.next()
else:
combinator = ' '
+ consumed = len(stream.used)
next_selector = parse_simple_selector(stream)
+ if consumed == len(stream.used):
+ raise SelectorSyntaxError(
+ "Expected selector, got '%s'" % stream.peek())
result = CombinedSelector(result, combinator, next_selector)
return result
@@ -689,14 +697,14 @@
next = stream.next()
if next != '*' and not isinstance(next, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % next)
+ "Expected symbol, got '%s'" % next)
if stream.peek() == '|':
namespace = next
stream.next()
element = stream.next()
if element != '*' and not isinstance(next, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % next)
+ "Expected symbol, got '%s'" % next)
else:
namespace = '*'
element = next
@@ -723,14 +731,14 @@
next = stream.next()
if not next == ']':
raise SelectorSyntaxError(
- "] expected, got %r" % next)
+ "] expected, got '%s'" % next)
continue
elif peek == ':' or peek == '::':
type = stream.next()
ident = stream.next()
if not isinstance(ident, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % ident)
+ "Expected symbol, got '%s'" % ident)
if stream.peek() == '(':
stream.next()
peek = stream.peek()
@@ -744,7 +752,7 @@
next = stream.next()
if not next == ')':
raise SelectorSyntaxError(
- "Expected ), got %r and %r"
+ "Expected ')', got '%s' and '%s'"
% (next, selector))
result = Function(result, type, ident, selector)
else:
@@ -778,11 +786,11 @@
op = stream.next()
if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
raise SelectorSyntaxError(
- "Operator expected, got %r" % op)
+ "Operator expected, got '%s'" % op)
value = stream.next()
if not isinstance(value, (Symbol, String)):
raise SelectorSyntaxError(
- "Expected string or symbol, got %r" % value)
+ "Expected string or symbol, got '%s'" % value)
return Attrib(selector, namespace, attrib, op, value)
def parse_series(s):
Modified: lxml/trunk/src/lxml/tests/test_css.txt
==============================================================================
--- lxml/trunk/src/lxml/tests/test_css.txt (original)
+++ lxml/trunk/src/lxml/tests/test_css.txt Thu Sep 10 06:49:30 2009
@@ -49,6 +49,14 @@
>>> parse('td ~ th')
CombinedSelector[Element[td] ~ Element[th]]
+Some parse error tests:
+
+ >>> try: parse('attributes(href)/html/body/a')
+ ... except: # Py2, Py3, ...
+ ... import sys
+ ... print(str(sys.exc_info()[1]).replace("(u'", "('"))
+ Expected selector, got '(' at [Symbol('attributes', 0)] -> Token('(', 10)
+
Now of translation:
>>> def xpath(css):
From scoder at codespeak.net Fri Sep 11 12:29:47 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 11 Sep 2009 12:29:47 +0200 (CEST)
Subject: [Lxml-checkins] r67652 - in lxml/trunk: . src/lxml
Message-ID: <20090911102947.32CB0168024@codespeak.net>
Author: scoder
Date: Fri Sep 11 12:29:45 2009
New Revision: 67652
Modified:
lxml/trunk/ (props changed)
lxml/trunk/src/lxml/cleanup.pxi
lxml/trunk/src/lxml/lxml.objectify.pyx
Log:
r5241 at delle: sbehnel | 2009-09-11 10:55:09 +0200
doc comments
Modified: lxml/trunk/src/lxml/cleanup.pxi
==============================================================================
--- lxml/trunk/src/lxml/cleanup.pxi (original)
+++ lxml/trunk/src/lxml/cleanup.pxi Fri Sep 11 12:29:45 2009
@@ -4,7 +4,7 @@
u"""cleanup_namespaces(tree_or_element)
Remove all namespace declarations from a subtree that are not used
- by any of the elements in that tree.
+ by any of the elements or attributes in that tree.
"""
cdef _Element element
element = _rootNodeOrRaise(tree_or_element)
Modified: lxml/trunk/src/lxml/lxml.objectify.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.objectify.pyx (original)
+++ lxml/trunk/src/lxml/lxml.objectify.pyx Fri Sep 11 12:29:45 2009
@@ -1722,8 +1722,7 @@
tree.xmlSetNsProp(c_node, c_ns, "nil", "true")
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
-cdef object _strip_attributes
-_strip_attributes = etree.strip_attributes
+cdef object _strip_attributes = etree.strip_attributes
def deannotate(element_or_tree, *, pytype=True, xsi=True, xsi_nil=False):
u"""deannotate(element_or_tree, pytype=True, xsi=True, xsi_nil=False)
@@ -1736,6 +1735,10 @@
default), 'xsi:type' attributes will be removed.
If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil'
attributes will be removed.
+
+ Note that this does not touch the namespace declarations. If you
+ want to remove unused namespace declarations from the tree, use
+ ``lxml.etree.cleanup_namespaces()``.
"""
cdef list attribute_names = []
From scoder at codespeak.net Fri Sep 11 12:29:50 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Fri, 11 Sep 2009 12:29:50 +0200 (CEST)
Subject: [Lxml-checkins] r67653 - in lxml/trunk: . doc
Message-ID: <20090911102950.00682168025@codespeak.net>
Author: scoder
Date: Fri Sep 11 12:29:49 2009
New Revision: 67653
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/objectify.txt
Log:
r5242 at delle: sbehnel | 2009-09-11 12:25:24 +0200
objectify doc fixes
Modified: lxml/trunk/doc/objectify.txt
==============================================================================
--- lxml/trunk/doc/objectify.txt (original)
+++ lxml/trunk/doc/objectify.txt Fri Sep 11 12:29:49 2009
@@ -361,63 +361,74 @@
Namespace handling
------------------
-Namespaces are handled mostly behind the scenes. If you access a child of an
-Element without specifying a namespace, the lookup will use the namespace of
-the parent:
+During tag lookups, namespaces are handled mostly behind the scenes.
+If you access a child of an Element without specifying a namespace,
+the lookup will use the namespace of the parent:
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
- >>> b = etree.SubElement(root, "{ns}b")
- >>> c = etree.SubElement(root, "{other}c")
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b = etree.SubElement(root, "{http://ns/}b")
+ >>> c = etree.SubElement(root, "{http://other/}c")
>>> print(root.b.tag)
- {ns}b
- >>> print(root.c)
- Traceback (most recent call last):
- ...
- AttributeError: no such child: {ns}c
+ {http://ns/}b
-You can access elements with different namespaces via ``getattr()``:
+Note that the ``SubElement()`` factory of ``lxml.etree`` does not
+inherit any namespaces when creating a new subelement. Element
+creation must be explicit about the namespace, and is simplified
+through the E-factory as described above. Lookups, however, inherit
+namespaces implicitly.
+
+To access an element in a different namespace than its parent, you can
+use ``getattr()``:
.. sourcecode:: pycon
- >>> print(getattr(root, "{other}c").tag)
- {other}c
+ >>> print (root.tag)
+ {http://ns/}root
+
+ >>> print(root.c)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {http://ns/}c
+
+ >>> print(getattr(root, "{http://other/}c").tag)
+ {http://other/}c
For convenience, there is also a quick way through item access:
.. sourcecode:: pycon
- >>> print(root["{other}c"].tag)
- {other}c
+ >>> print(root["{http://other/}c"].tag)
+ {http://other/}c
The same approach must be used to access children with tag names that are not
valid Python identifiers:
.. sourcecode:: pycon
- >>> el = etree.SubElement(root, "{ns}tag-name")
+ >>> el = etree.SubElement(root, "{http://ns/}tag-name")
>>> print(root["tag-name"].tag)
- {ns}tag-name
+ {http://ns/}tag-name
- >>> new_el = objectify.Element("{ns}new-element")
- >>> el = etree.SubElement(new_el, "{ns}child")
- >>> el = etree.SubElement(new_el, "{ns}child")
- >>> el = etree.SubElement(new_el, "{ns}child")
+ >>> new_el = objectify.Element("{http://ns/}new-element")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
>>> root["tag-name"] = [ new_el, new_el ]
>>> print(len(root["tag-name"]))
2
>>> print(root["tag-name"].tag)
- {ns}tag-name
+ {http://ns/}tag-name
>>> print(len(root["tag-name"].child))
3
>>> print(root["tag-name"].child.tag)
- {ns}child
+ {http://ns/}child
>>> print(root["tag-name"][1].child.tag)
- {ns}child
+ {http://ns/}child
or for names that have a special meaning in lxml.objectify:
@@ -505,11 +516,11 @@
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
- >>> b1 = etree.SubElement(root, "{ns}b")
- >>> c = etree.SubElement(b1, "{ns}c")
- >>> b2 = etree.SubElement(root, "{ns}b")
- >>> d = etree.SubElement(root, "{other}d")
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b1 = etree.SubElement(root, "{http://ns/}b")
+ >>> c = etree.SubElement(b1, "{http://ns/}c")
+ >>> b2 = etree.SubElement(root, "{http://ns/}b")
+ >>> d = etree.SubElement(root, "{http://other/}d")
>>> path = objectify.ObjectPath("root.b.c")
>>> print(path)
@@ -517,15 +528,15 @@
>>> path.hasattr(root)
True
>>> print(path.find(root).tag)
- {ns}c
+ {http://ns/}c
>>> find = objectify.ObjectPath("root.b.c")
>>> print(find(root).tag)
- {ns}c
+ {http://ns/}c
- >>> find = objectify.ObjectPath("root.{other}d")
+ >>> find = objectify.ObjectPath("root.{http://other/}d")
>>> print(find(root).tag)
- {other}d
+ {http://other/}d
>>> find = objectify.ObjectPath("root.{not}there")
>>> print(find(root).tag)
@@ -537,15 +548,15 @@
>>> print(find(root).tag)
Traceback (most recent call last):
...
- ValueError: root element does not match: need {not}there, got {ns}root
+ ValueError: root element does not match: need {not}there, got {http://ns/}root
>>> find = objectify.ObjectPath("root.b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
- >>> find = objectify.ObjectPath("root.{ns}b[1]")
+ >>> find = objectify.ObjectPath("root.{http://ns/}b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
Apart from strings, ObjectPath also accepts lists of path segments:
@@ -553,11 +564,11 @@
>>> find = objectify.ObjectPath(['root', 'b', 'c'])
>>> print(find(root).tag)
- {ns}c
+ {http://ns/}c
- >>> find = objectify.ObjectPath(['root', '{ns}b[1]'])
+ >>> find = objectify.ObjectPath(['root', '{http://ns/}b[1]'])
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
You can also use relative paths starting with a '.' to ignore the actual root
element and only inherit its namespace:
@@ -566,23 +577,23 @@
>>> find = objectify.ObjectPath(".b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
>>> find = objectify.ObjectPath(['', 'b[1]'])
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
>>> find = objectify.ObjectPath(".unknown[1]")
>>> print(find(root).tag)
Traceback (most recent call last):
...
- AttributeError: no such child: {ns}unknown
+ AttributeError: no such child: {http://ns/}unknown
- >>> find = objectify.ObjectPath(".{other}unknown[1]")
+ >>> find = objectify.ObjectPath(".{http://other/}unknown[1]")
>>> print(find(root).tag)
Traceback (most recent call last):
...
- AttributeError: no such child: {other}unknown
+ AttributeError: no such child: {http://other/}unknown
For convenience, a single dot represents the empty ObjectPath (identity):
@@ -590,28 +601,28 @@
>>> find = objectify.ObjectPath(".")
>>> print(find(root).tag)
- {ns}root
+ {http://ns/}root
ObjectPath objects can be used to manipulate trees:
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
+ >>> root = objectify.Element("{http://ns/}root")
- >>> path = objectify.ObjectPath(".some.child.{other}unknown")
+ >>> path = objectify.ObjectPath(".some.child.{http://other/}unknown")
>>> path.hasattr(root)
False
>>> path.find(root)
Traceback (most recent call last):
...
- AttributeError: no such child: {ns}some
+ AttributeError: no such child: {http://ns/}some
>>> path.setattr(root, "my value") # creates children as necessary
>>> path.hasattr(root)
True
>>> print(path.find(root).text)
my value
- >>> print(root.some.child["{other}unknown"].text)
+ >>> print(root.some.child["{http://other/}unknown"].text)
my value
>>> print(len( path.find(root) ))
From jholg at codespeak.net Mon Sep 28 17:31:33 2009
From: jholg at codespeak.net (jholg at codespeak.net)
Date: Mon, 28 Sep 2009 17:31:33 +0200 (CEST)
Subject: [Lxml-checkins] r67943 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20090928153133.789B7168003@codespeak.net>
Author: jholg
Date: Mon Sep 28 17:31:28 2009
New Revision: 67943
Modified:
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/lxml.objectify.pyx
lxml/trunk/src/lxml/tests/test_objectify.py
Log:
Fixed ObjectifiedElement.__setattr__ to no longer create an empty-string
child element when it raises a ValueError for non-ascii/non-unicode values.
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Mon Sep 28 17:31:28 2009
@@ -51,6 +51,9 @@
* Diverting the error logging to Python's logging system was broken.
+* ObjectifiedElement.__setattr__ created an empty-string child element when the
+ attribute value was rejected as a non-unicode/non-ascii string
+
Other changes
-------------
Modified: lxml/trunk/src/lxml/lxml.objectify.pyx
==============================================================================
--- lxml/trunk/src/lxml/lxml.objectify.pyx (original)
+++ lxml/trunk/src/lxml/lxml.objectify.pyx Mon Sep 28 17:31:28 2009
@@ -523,9 +523,10 @@
for item in value:
_appendValue(parent, tag, item)
else:
- new_element = cetree.makeSubElement(
- parent, tag, None, None, None, None)
+ new_element = cetree.makeElement(
+ tag, parent._doc, None, None, None, None, None)
_setElementValue(new_element, value)
+ cetree.appendChild(parent, new_element)
cdef _setElementValue(_Element element, value):
cdef python.PyObject* _pytype
Modified: lxml/trunk/src/lxml/tests/test_objectify.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_objectify.py (original)
+++ lxml/trunk/src/lxml/tests/test_objectify.py Mon Sep 28 17:31:28 2009
@@ -346,6 +346,25 @@
self.assertRaises(AttributeError, getattr, root.c1, "NOT_THERE")
self.assertRaises(AttributeError, getattr, root.c1, "{unknownNS}c2")
+ def test_setattr(self):
+ for val in [
+ 2, 2**32, 1.2, "Won't get fooled again",
+ _str("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1'), True,
+ False, None]:
+ root = self.Element('root')
+ attrname = 'val'
+ setattr(root, attrname, val)
+ result = getattr(root, attrname)
+ self.assertEquals(val, result)
+ self.assertEquals(type(val), type(result.pyval))
+
+ def test_setattr_nonunicode(self):
+ root = self.Element('root')
+ attrname = 'val'
+ val = _bytes("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1')
+ self.assertRaises(ValueError, setattr, root, attrname, val)
+ self.assertRaises(AttributeError, getattr, root, attrname)
+
def test_addattr(self):
root = self.XML(xml_str)
self.assertEquals(1, len(root.c1))
From scoder at codespeak.net Tue Sep 29 22:16:19 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 29 Sep 2009 22:16:19 +0200 (CEST)
Subject: [Lxml-checkins] r68006 - in lxml/trunk: . doc
Message-ID: <20090929201619.CBC81168010@codespeak.net>
Author: scoder
Date: Tue Sep 29 22:16:19 2009
New Revision: 68006
Modified:
lxml/trunk/ (props changed)
lxml/trunk/doc/objectify.txt
Log:
r5245 at delle: sbehnel | 2009-09-11 15:37:45 +0200
objectify docs
Modified: lxml/trunk/doc/objectify.txt
==============================================================================
--- lxml/trunk/doc/objectify.txt (original)
+++ lxml/trunk/doc/objectify.txt Tue Sep 29 22:16:19 2009
@@ -377,30 +377,35 @@
Note that the ``SubElement()`` factory of ``lxml.etree`` does not
inherit any namespaces when creating a new subelement. Element
creation must be explicit about the namespace, and is simplified
-through the E-factory as described above. Lookups, however, inherit
-namespaces implicitly.
+through the E-factory as described above.
-To access an element in a different namespace than its parent, you can
-use ``getattr()``:
+Lookups, however, inherit namespaces implicitly:
.. sourcecode:: pycon
- >>> print (root.tag)
- {http://ns/}root
+ >>> print(root.b.tag)
+ {http://ns/}b
>>> print(root.c)
Traceback (most recent call last):
...
AttributeError: no such child: {http://ns/}c
- >>> print(getattr(root, "{http://other/}c").tag)
+To access an element in a different namespace than its parent, you can
+use ``getattr()``:
+
+.. sourcecode:: pycon
+
+ >>> c = getattr(root, "{http://other/}c")
+ >>> print(c.tag)
{http://other/}c
For convenience, there is also a quick way through item access:
.. sourcecode:: pycon
- >>> print(root["{http://other/}c"].tag)
+ >>> c = root["{http://other/}c"]
+ >>> print(c.tag)
{http://other/}c
The same approach must be used to access children with tag names that are not
From scoder at codespeak.net Tue Sep 29 22:16:35 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 29 Sep 2009 22:16:35 +0200 (CEST)
Subject: [Lxml-checkins] r68007 - in lxml/trunk: . src/lxml src/lxml/tests
Message-ID: <20090929201635.63B7516800D@codespeak.net>
Author: scoder
Date: Tue Sep 29 22:16:35 2009
New Revision: 68007
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
lxml/trunk/src/lxml/etree_defs.h
lxml/trunk/src/lxml/tests/test_etree.py
Log:
r5248 at delle: sbehnel | 2009-09-29 22:16:07 +0200
fix tree traversal for parsed entity references
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Tue Sep 29 22:16:35 2009
@@ -19,6 +19,9 @@
Bugs fixed
----------
+* Modifying trees that contain parsed entity references could result
+ in an infinite loop.
+
* Syntax errors in ``lxml.cssselect`` could result in misleading error
messages.
Modified: lxml/trunk/src/lxml/etree_defs.h
==============================================================================
--- lxml/trunk/src/lxml/etree_defs.h (original)
+++ lxml/trunk/src/lxml/etree_defs.h Tue Sep 29 22:16:35 2009
@@ -221,8 +221,14 @@
#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
{ \
/* walk through children first */ \
- xmlNode* _lx__next = c_node->children; \
- _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ xmlNode* _lx__next = c_node->children; \
+ if (_lx__next != 0) { \
+ if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
+ _lx__next = 0; \
+ } else { \
+ _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ } \
+ } \
if ((_lx__next == 0) && (c_node != c_stop_node)) { \
/* try siblings */ \
_lx__next = c_node->next; \
Modified: lxml/trunk/src/lxml/tests/test_etree.py
==============================================================================
--- lxml/trunk/src/lxml/tests/test_etree.py (original)
+++ lxml/trunk/src/lxml/tests/test_etree.py Tue Sep 29 22:16:35 2009
@@ -1049,7 +1049,7 @@
parser = self.etree.XMLParser(resolve_entities=False)
Entity = self.etree.Entity
- xml = '&myentity;'
+ xml = _bytes('&myentity;')
tree = parse(BytesIO(xml), parser)
root = tree.getroot()
self.assertEquals(root[0].tag, Entity)
@@ -1060,6 +1060,25 @@
self.assertEquals(_bytes('&myentity;'),
tostring(root))
+ def test_entity_restructure(self):
+ xml = _bytes(''' ]>
+
+
+
+
+ ''')
+
+ parser = self.etree.XMLParser(resolve_entities=False)
+ root = etree.fromstring(xml, parser)
+ self.assertEquals([ el.tag for el in root ],
+ ['child1', 'child2', 'child3'])
+
+ root[0] = root[-1]
+ self.assertEquals([ el.tag for el in root ],
+ ['child3', 'child2'])
+ self.assertEquals(root[0][0].text, ' ')
+ self.assertEquals(root[0][0].name, 'nbsp')
+
def test_entity_append(self):
Entity = self.etree.Entity
Element = self.etree.Element
From scoder at codespeak.net Tue Sep 29 22:16:52 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 29 Sep 2009 22:16:52 +0200 (CEST)
Subject: [Lxml-checkins] r68008 - lxml/trunk
Message-ID: <20090929201652.A3954168008@codespeak.net>
Author: scoder
Date: Tue Sep 29 22:16:52 2009
New Revision: 68008
Modified:
lxml/trunk/ (props changed)
lxml/trunk/CHANGES.txt
Log:
r5251 at delle: sbehnel | 2009-09-29 22:16:44 +0200
cleanup
Modified: lxml/trunk/CHANGES.txt
==============================================================================
--- lxml/trunk/CHANGES.txt (original)
+++ lxml/trunk/CHANGES.txt Tue Sep 29 22:16:52 2009
@@ -22,6 +22,9 @@
* Modifying trees that contain parsed entity references could result
in an infinite loop.
+* ObjectifiedElement.__setattr__ created an empty-string child element when the
+ attribute value was rejected as a non-unicode/non-ascii string
+
* Syntax errors in ``lxml.cssselect`` could result in misleading error
messages.
@@ -54,9 +57,6 @@
* Diverting the error logging to Python's logging system was broken.
-* ObjectifiedElement.__setattr__ created an empty-string child element when the
- attribute value was rejected as a non-unicode/non-ascii string
-
Other changes
-------------
From scoder at codespeak.net Tue Sep 29 22:51:59 2009
From: scoder at codespeak.net (scoder at codespeak.net)
Date: Tue, 29 Sep 2009 22:51:59 +0200 (CEST)
Subject: [Lxml-checkins] r68009 - in lxml/branch/lxml-2.2: . doc src/lxml
src/lxml/html src/lxml/html/tests src/lxml/tests
Message-ID: <20090929205159.616B9168003@codespeak.net>
Author: scoder
Date: Tue Sep 29 22:51:58 2009
New Revision: 68009
Modified:
lxml/branch/lxml-2.2/ (props changed)
lxml/branch/lxml-2.2/CHANGES.txt
lxml/branch/lxml-2.2/doc/build.txt
lxml/branch/lxml-2.2/doc/elementsoup.txt
lxml/branch/lxml-2.2/doc/objectify.txt
lxml/branch/lxml-2.2/doc/performance.txt
lxml/branch/lxml-2.2/doc/xpathxslt.txt
lxml/branch/lxml-2.2/setup.py
lxml/branch/lxml-2.2/setupinfo.py
lxml/branch/lxml-2.2/src/lxml/cleanup.pxi
lxml/branch/lxml-2.2/src/lxml/cssselect.py
lxml/branch/lxml-2.2/src/lxml/etree_defs.h
lxml/branch/lxml-2.2/src/lxml/extensions.pxi
lxml/branch/lxml-2.2/src/lxml/html/__init__.py
lxml/branch/lxml-2.2/src/lxml/html/tests/test_forms.txt
lxml/branch/lxml-2.2/src/lxml/lxml.objectify.pyx
lxml/branch/lxml-2.2/src/lxml/parser.pxi
lxml/branch/lxml-2.2/src/lxml/tests/test_css.txt
lxml/branch/lxml-2.2/src/lxml/tests/test_etree.py
lxml/branch/lxml-2.2/src/lxml/tests/test_objectify.py
lxml/branch/lxml-2.2/version.txt
Log:
trunk merge of all recent bug fixes
Modified: lxml/branch/lxml-2.2/CHANGES.txt
==============================================================================
--- lxml/branch/lxml-2.2/CHANGES.txt (original)
+++ lxml/branch/lxml-2.2/CHANGES.txt Tue Sep 29 22:51:58 2009
@@ -2,7 +2,7 @@
lxml changelog
==============
-Under development
+2.2.2 (2009-10-??)
==================
Features added
@@ -11,6 +11,38 @@
Bugs fixed
----------
+* Modifying trees that contain parsed entity references could result
+ in an infinite loop.
+
+* ObjectifiedElement.__setattr__ created an empty-string child element when the
+ attribute value was rejected as a non-unicode/non-ascii string
+
+* Syntax errors in ``lxml.cssselect`` could result in misleading error
+ messages.
+
+* Invalid syntax in CSS expressions could lead to an infinite loop in
+ the parser of ``lxml.cssselect``.
+
+* CSS special character escapes were not properly handled in
+ ``lxml.cssselect``.
+
+* CSS Unicode escapes were not properly decoded in ``lxml.cssselect``.
+
+* Select options in HTML forms that had no explicit ``value``
+ attribute were not handled correctly. The HTML standard dictates
+ that their value is defined by their text content. This is now
+ supported by lxml.html.
+
+* XPath raised a TypeError when finding CDATA sections. This is now
+ fully supported.
+
+* Calling ``help(lxml.objectify)`` didn't work at the prompt.
+
+* The ``ElementMaker`` in lxml.objectify no longer defines the default
+ namespaces when annotation is disabled.
+
+* Feed parser failed to honour the 'recover' option on parse errors.
+
* Diverting the error logging to Python's logging system was broken.
Other changes
Modified: lxml/branch/lxml-2.2/doc/build.txt
==============================================================================
--- lxml/branch/lxml-2.2/doc/build.txt (original)
+++ lxml/branch/lxml-2.2/doc/build.txt Tue Sep 29 22:51:58 2009
@@ -16,9 +16,10 @@
2 Subversion
3 Setuptools
4 Running the tests and reporting errors
- 5 Contributing an egg
- 6 Static linking on Windows
- 7 Building Debian packages from SVN sources
+ 5 Building an egg
+ 6 Building lxml on MacOS-X
+ 7 Static linking on Windows
+ 8 Building Debian packages from SVN sources
Cython
@@ -211,6 +212,18 @@
Instead of ``build``, you can use any target, like ``bdist_egg`` if
you want to use setuptools to build an installable egg.
+Note that this also works with EasyInstall_. Since you can't pass
+command line options in this case, you have to use an environment
+variable instead::
+
+ STATIC_DEPS=true easy_install lxml
+
+Some machines may require an additional run with "sudo" to install the
+package into the Python package directory::
+
+ STATIC_DEPS=true sudo easy_install lxml
+
+
Static linking on Windows
-------------------------
Modified: lxml/branch/lxml-2.2/doc/elementsoup.txt
==============================================================================
--- lxml/branch/lxml-2.2/doc/elementsoup.txt (original)
+++ lxml/branch/lxml-2.2/doc/elementsoup.txt Tue Sep 29 22:51:58 2009
@@ -23,6 +23,13 @@
document, and ``convert_tree()`` to convert an existing BeautifulSoup
tree into a list of top-level Elements.
+.. contents::
+..
+ 1 Parsing with the soupparser
+ 2 Entity handling
+ 3 Using soupparser as a fallback
+ 4 Using only the encoding detection
+
Parsing with the soupparser
===========================
@@ -70,6 +77,25 @@
``makeelement`` factory function to ``parse()`` and ``fromstring()``.
By default, this is based on the HTML parser defined in ``lxml.html``.
+For a quick comparison, libxml2 2.6.32 parses the same tag soup as
+follows. The main difference is that libxml2 tries harder to adhere
+to the structure of an HTML document and moves misplaced tags where
+they (likely) belong. Note, however, that the result can vary between
+parser versions.
+
+.. sourcecode:: html
+
+
+
+
+ Hello
+
+
+ Hi all
+
+
+
+
Entity handling
===============
@@ -149,3 +175,27 @@
... ignore = tostring(root, encoding=unicode)
... except UnicodeDecodeError:
... root = lxml.html.soupparser.fromstring(tag_soup)
+
+
+Using only the encoding detection
+=================================
+
+If you prefer a 'real' (and fast) HTML parser instead of the regular
+expression based one in BeautifulSoup, you can still benefit from
+BeautifulSoup's _`support for encoding detection` in the
+``UnicodeDammit`` class.
+
+.. sourcecode:: pycon
+
+ >>> from BeautifulSoup import UnicodeDammit
+
+ >>> def decode_html(html_string):
+ ... converted = UnicodeDammit(html_string, isHTML=True)
+ ... if not converted.unicode:
+ ... raise UnicodeDecodeError(
+ ... "Failed to detect encoding, tried [%s]",
+ ... ', '.join(converted.triedEncodings))
+ ... # print converted.originalEncoding
+ ... return converted.unicode
+
+ >>> root = lxml.html.fromstring(decode_html(tag_soup))
Modified: lxml/branch/lxml-2.2/doc/objectify.txt
==============================================================================
--- lxml/branch/lxml-2.2/doc/objectify.txt (original)
+++ lxml/branch/lxml-2.2/doc/objectify.txt Tue Sep 29 22:51:58 2009
@@ -361,63 +361,79 @@
Namespace handling
------------------
-Namespaces are handled mostly behind the scenes. If you access a child of an
-Element without specifying a namespace, the lookup will use the namespace of
-the parent:
+During tag lookups, namespaces are handled mostly behind the scenes.
+If you access a child of an Element without specifying a namespace,
+the lookup will use the namespace of the parent:
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
- >>> b = etree.SubElement(root, "{ns}b")
- >>> c = etree.SubElement(root, "{other}c")
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b = etree.SubElement(root, "{http://ns/}b")
+ >>> c = etree.SubElement(root, "{http://other/}c")
>>> print(root.b.tag)
- {ns}b
+ {http://ns/}b
+
+Note that the ``SubElement()`` factory of ``lxml.etree`` does not
+inherit any namespaces when creating a new subelement. Element
+creation must be explicit about the namespace, and is simplified
+through the E-factory as described above.
+
+Lookups, however, inherit namespaces implicitly:
+
+.. sourcecode:: pycon
+
+ >>> print(root.b.tag)
+ {http://ns/}b
+
>>> print(root.c)
Traceback (most recent call last):
...
- AttributeError: no such child: {ns}c
+ AttributeError: no such child: {http://ns/}c
-You can access elements with different namespaces via ``getattr()``:
+To access an element in a different namespace than its parent, you can
+use ``getattr()``:
.. sourcecode:: pycon
- >>> print(getattr(root, "{other}c").tag)
- {other}c
+ >>> c = getattr(root, "{http://other/}c")
+ >>> print(c.tag)
+ {http://other/}c
For convenience, there is also a quick way through item access:
.. sourcecode:: pycon
- >>> print(root["{other}c"].tag)
- {other}c
+ >>> c = root["{http://other/}c"]
+ >>> print(c.tag)
+ {http://other/}c
The same approach must be used to access children with tag names that are not
valid Python identifiers:
.. sourcecode:: pycon
- >>> el = etree.SubElement(root, "{ns}tag-name")
+ >>> el = etree.SubElement(root, "{http://ns/}tag-name")
>>> print(root["tag-name"].tag)
- {ns}tag-name
+ {http://ns/}tag-name
- >>> new_el = objectify.Element("{ns}new-element")
- >>> el = etree.SubElement(new_el, "{ns}child")
- >>> el = etree.SubElement(new_el, "{ns}child")
- >>> el = etree.SubElement(new_el, "{ns}child")
+ >>> new_el = objectify.Element("{http://ns/}new-element")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
+ >>> el = etree.SubElement(new_el, "{http://ns/}child")
>>> root["tag-name"] = [ new_el, new_el ]
>>> print(len(root["tag-name"]))
2
>>> print(root["tag-name"].tag)
- {ns}tag-name
+ {http://ns/}tag-name
>>> print(len(root["tag-name"].child))
3
>>> print(root["tag-name"].child.tag)
- {ns}child
+ {http://ns/}child
>>> print(root["tag-name"][1].child.tag)
- {ns}child
+ {http://ns/}child
or for names that have a special meaning in lxml.objectify:
@@ -505,11 +521,11 @@
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
- >>> b1 = etree.SubElement(root, "{ns}b")
- >>> c = etree.SubElement(b1, "{ns}c")
- >>> b2 = etree.SubElement(root, "{ns}b")
- >>> d = etree.SubElement(root, "{other}d")
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b1 = etree.SubElement(root, "{http://ns/}b")
+ >>> c = etree.SubElement(b1, "{http://ns/}c")
+ >>> b2 = etree.SubElement(root, "{http://ns/}b")
+ >>> d = etree.SubElement(root, "{http://other/}d")
>>> path = objectify.ObjectPath("root.b.c")
>>> print(path)
@@ -517,15 +533,15 @@
>>> path.hasattr(root)
True
>>> print(path.find(root).tag)
- {ns}c
+ {http://ns/}c
>>> find = objectify.ObjectPath("root.b.c")
>>> print(find(root).tag)
- {ns}c
+ {http://ns/}c
- >>> find = objectify.ObjectPath("root.{other}d")
+ >>> find = objectify.ObjectPath("root.{http://other/}d")
>>> print(find(root).tag)
- {other}d
+ {http://other/}d
>>> find = objectify.ObjectPath("root.{not}there")
>>> print(find(root).tag)
@@ -537,15 +553,15 @@
>>> print(find(root).tag)
Traceback (most recent call last):
...
- ValueError: root element does not match: need {not}there, got {ns}root
+ ValueError: root element does not match: need {not}there, got {http://ns/}root
>>> find = objectify.ObjectPath("root.b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
- >>> find = objectify.ObjectPath("root.{ns}b[1]")
+ >>> find = objectify.ObjectPath("root.{http://ns/}b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
Apart from strings, ObjectPath also accepts lists of path segments:
@@ -553,11 +569,11 @@
>>> find = objectify.ObjectPath(['root', 'b', 'c'])
>>> print(find(root).tag)
- {ns}c
+ {http://ns/}c
- >>> find = objectify.ObjectPath(['root', '{ns}b[1]'])
+ >>> find = objectify.ObjectPath(['root', '{http://ns/}b[1]'])
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
You can also use relative paths starting with a '.' to ignore the actual root
element and only inherit its namespace:
@@ -566,23 +582,23 @@
>>> find = objectify.ObjectPath(".b[1]")
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
>>> find = objectify.ObjectPath(['', 'b[1]'])
>>> print(find(root).tag)
- {ns}b
+ {http://ns/}b
>>> find = objectify.ObjectPath(".unknown[1]")
>>> print(find(root).tag)
Traceback (most recent call last):
...
- AttributeError: no such child: {ns}unknown
+ AttributeError: no such child: {http://ns/}unknown
- >>> find = objectify.ObjectPath(".{other}unknown[1]")
+ >>> find = objectify.ObjectPath(".{http://other/}unknown[1]")
>>> print(find(root).tag)
Traceback (most recent call last):
...
- AttributeError: no such child: {other}unknown
+ AttributeError: no such child: {http://other/}unknown
For convenience, a single dot represents the empty ObjectPath (identity):
@@ -590,28 +606,28 @@
>>> find = objectify.ObjectPath(".")
>>> print(find(root).tag)
- {ns}root
+ {http://ns/}root
ObjectPath objects can be used to manipulate trees:
.. sourcecode:: pycon
- >>> root = objectify.Element("{ns}root")
+ >>> root = objectify.Element("{http://ns/}root")
- >>> path = objectify.ObjectPath(".some.child.{other}unknown")
+ >>> path = objectify.ObjectPath(".some.child.{http://other/}unknown")
>>> path.hasattr(root)
False
>>> path.find(root)
Traceback (most recent call last):
...
- AttributeError: no such child: {ns}some
+ AttributeError: no such child: {http://ns/}some
>>> path.setattr(root, "my value") # creates children as necessary
>>> path.hasattr(root)
True
>>> print(path.find(root).text)
my value
- >>> print(root.some.child["{other}unknown"].text)
+ >>> print(root.some.child["{http://other/}unknown"].text)
my value
>>> print(len( path.find(root) ))
Modified: lxml/branch/lxml-2.2/doc/performance.txt
==============================================================================
--- lxml/branch/lxml-2.2/doc/performance.txt (original)
+++ lxml/branch/lxml-2.2/doc/performance.txt Tue Sep 29 22:51:58 2009
@@ -252,6 +252,21 @@
.. _`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
+Finally, `xml.com`_ has a couple of publications about XML parser
+performance. Farwick and Hafner have written two interesting articles
+that compare the parser of libxml2 to some major Java based XML
+parsers. One deals with `event-driven parser performance`_, the other
+one presents `benchmark results comparing DOM parsers`_. Both
+comparisons suggest that libxml2's parser performance is largely
+superior to all commonly used Java parsers in almost all cases. Note
+that the C parser benchmark results are based on xmlbench_, which uses
+a simpler setup for libxml2 than lxml does.
+
+.. _`xml.com`: http://www.xml.com/
+.. _`event-driven parser performance`: http://www.xml.com/lpt/a/1702
+.. _`benchmark results comparing DOM parsers`: http://www.xml.com/lpt/a/1703
+.. _xmlbench: http://xmlbench.sourceforge.net/
+
The ElementTree API
===================
Modified: lxml/branch/lxml-2.2/doc/xpathxslt.txt
==============================================================================
--- lxml/branch/lxml-2.2/doc/xpathxslt.txt (original)
+++ lxml/branch/lxml-2.2/doc/xpathxslt.txt Tue Sep 29 22:51:58 2009
@@ -2,8 +2,8 @@
XPath and XSLT with lxml
========================
-lxml supports both XPath and XSLT through libxml2 and libxslt in a standards
-compliant way.
+lxml supports XPath 1.0, XSLT 1.0 and the EXSLT extensions through
+libxml2 and libxslt in a standards compliant way.
.. contents::
..
Modified: lxml/branch/lxml-2.2/setup.py
==============================================================================
--- lxml/branch/lxml-2.2/setup.py (original)
+++ lxml/branch/lxml-2.2/setup.py Tue Sep 29 22:51:58 2009
@@ -1,5 +1,8 @@
import sys, os
+# for command line options and supported environment variables, please
+# see the end of 'setupinfo.py'
+
extra_options = {}
try:
Modified: lxml/branch/lxml-2.2/setupinfo.py
==============================================================================
--- lxml/branch/lxml-2.2/setupinfo.py (original)
+++ lxml/branch/lxml-2.2/setupinfo.py Tue Sep 29 22:51:58 2009
@@ -46,6 +46,7 @@
'libs', 'build/tmp',
static_include_dirs, static_library_dirs,
static_cflags, static_binaries,
+ libiconv_version=OPTION_LIBICONV_VERSION,
libxml2_version=OPTION_LIBXML2_VERSION,
libxslt_version=OPTION_LIBXSLT_VERSION)
if CYTHON_INSTALLED:
@@ -320,7 +321,7 @@
env_val = os.getenv(name.upper().replace('-', '_'))
return env_val
-# pick up any commandline options
+# pick up any commandline options and/or env variables
OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify')
OPTION_WITHOUT_ASSERT = has_option('without-assert')
OPTION_WITHOUT_THREADING = has_option('without-threading')
@@ -337,3 +338,4 @@
OPTION_STATIC = True
OPTION_LIBXML2_VERSION = option_value('libxml2-version')
OPTION_LIBXSLT_VERSION = option_value('libxslt-version')
+OPTION_LIBICONV_VERSION = option_value('libiconv-version')
Modified: lxml/branch/lxml-2.2/src/lxml/cleanup.pxi
==============================================================================
--- lxml/branch/lxml-2.2/src/lxml/cleanup.pxi (original)
+++ lxml/branch/lxml-2.2/src/lxml/cleanup.pxi Tue Sep 29 22:51:58 2009
@@ -4,7 +4,7 @@
u"""cleanup_namespaces(tree_or_element)
Remove all namespace declarations from a subtree that are not used
- by any of the elements in that tree.
+ by any of the elements or attributes in that tree.
"""
cdef _Element element
element = _rootNodeOrRaise(tree_or_element)
Modified: lxml/branch/lxml-2.2/src/lxml/cssselect.py
==============================================================================
--- lxml/branch/lxml-2.2/src/lxml/cssselect.py (original)
+++ lxml/branch/lxml-2.2/src/lxml/cssselect.py Tue Sep 29 22:51:58 2009
@@ -49,9 +49,11 @@
try:
_unicode = unicode
+ _unichr = unichr
except NameError:
# Python 3
_unicode = str
+ _unichr = chr
class _UniToken(_unicode):
def __new__(cls, contents, pos):
@@ -99,7 +101,7 @@
def xpath(self):
sel_xpath = self.selector.xpath()
sel_xpath.add_condition(
- "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_repr(' '+self.class_name+' '))
+ "contains(concat(' ', normalize-space(@class), ' '), %s)" % xpath_literal(' '+self.class_name+' '))
return sel_xpath
class Function(object):
@@ -194,7 +196,7 @@
if isinstance(expr, Element):
expr = expr._format_element()
xpath.add_condition('contains(css:lower-case(string(.)), %s)'
- % xpath_repr(expr.lower()))
+ % xpath_literal(expr.lower()))
# FIXME: Currently case insensitive matching doesn't seem to be happening
return xpath
@@ -349,34 +351,34 @@
path.add_condition(attrib)
elif self.operator == '=':
path.add_condition('%s = %s' % (attrib,
- xpath_repr(value)))
+ xpath_literal(value)))
elif self.operator == '!=':
# FIXME: this seems like a weird hack...
if value:
path.add_condition('not(%s) or %s != %s'
- % (attrib, attrib, xpath_repr(value)))
+ % (attrib, attrib, xpath_literal(value)))
else:
path.add_condition('%s != %s'
- % (attrib, xpath_repr(value)))
- #path.add_condition('%s != %s' % (attrib, xpath_repr(value)))
+ % (attrib, xpath_literal(value)))
+ #path.add_condition('%s != %s' % (attrib, xpath_literal(value)))
elif self.operator == '~=':
- path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
+ path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_literal(' '+value+' ')))
elif self.operator == '|=':
# Weird, but true...
path.add_condition('%s = %s or starts-with(%s, %s)' % (
- attrib, xpath_repr(value),
- attrib, xpath_repr(value + '-')))
+ attrib, xpath_literal(value),
+ attrib, xpath_literal(value + '-')))
elif self.operator == '^=':
path.add_condition('starts-with(%s, %s)' % (
- attrib, xpath_repr(value)))
+ attrib, xpath_literal(value)))
elif self.operator == '$=':
# Oddly there is a starts-with in XPath 1.0, but not ends-with
path.add_condition('substring(%s, string-length(%s)-%s) = %s'
- % (attrib, attrib, len(value)-1, xpath_repr(value)))
+ % (attrib, attrib, len(value)-1, xpath_literal(value)))
elif self.operator == '*=':
# FIXME: case sensitive?
path.add_condition('contains(%s, %s)' % (
- attrib, xpath_repr(value)))
+ attrib, xpath_literal(value)))
else:
assert 0, ("Unknown operator: %r" % self.operator)
return path
@@ -425,7 +427,7 @@
def xpath(self):
path = self.selector.xpath()
- path.add_condition('@id = %s' % xpath_repr(self.id))
+ path.add_condition('@id = %s' % xpath_literal(self.id))
return path
class Or(object):
@@ -501,9 +503,9 @@
##############################
## XPathExpr objects:
-_el_re = re.compile(r'^\w+\s*$')
-_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
-_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
+_el_re = re.compile(r'^\w+\s*$', re.UNICODE)
+_id_re = re.compile(r'^(\w*)#(\w+)\s*$', re.UNICODE)
+_class_re = re.compile(r'^(\w*)\.(\w+)\s*$', re.UNICODE)
def css_to_xpath(css_expr, prefix='descendant-or-self::'):
if isinstance(css_expr, _basestring):
@@ -524,7 +526,7 @@
"Got None for xpath expression from %s" % repr(css_expr))
if prefix:
expr.add_prefix(prefix)
- return str(expr)
+ return _unicode(expr)
class XPathExpr(object):
@@ -539,10 +541,10 @@
def __str__(self):
path = ''
if self.prefix is not None:
- path += str(self.prefix)
+ path += _unicode(self.prefix)
if self.path is not None:
- path += str(self.path)
- path += str(self.element)
+ path += _unicode(self.path)
+ path += _unicode(self.element)
if self.condition:
path += '[%s]' % self.condition
return path
@@ -574,7 +576,7 @@
if self.element == '*':
# We weren't doing a test anyway
return
- self.add_condition("name() = %s" % xpath_repr(self.element))
+ self.add_condition("name() = %s" % xpath_literal(self.element))
self.element = '*'
def add_star_prefix(self):
@@ -589,7 +591,7 @@
self.star_prefix = True
def join(self, combiner, other):
- prefix = str(self)
+ prefix = _unicode(self)
prefix += combiner
path = (other.prefix or '') + (other.path or '')
# We don't need a star prefix if we are joining to this other
@@ -615,16 +617,26 @@
def __str__(self):
prefix = self.prefix or ''
- return ' | '.join([prefix + str(i) for i in self.items])
+ return ' | '.join(["%s%s" % (prefix,i) for i in self.items])
-def xpath_repr(s):
- # FIXME: I don't think this is right, but lacking any reasonable
- # specification on what XPath literals look like (which doesn't seem
- # to be in the XPath specification) it is hard to do 'right'
+split_at_single_quotes = re.compile("('+)").split
+
+def xpath_literal(s):
if isinstance(s, Element):
# This is probably a symbol that looks like an expression...
s = s._format_element()
- return repr(str(s))
+ else:
+ s = _unicode(s)
+ if "'" not in s:
+ s = "'%s'" % s
+ elif '"' not in s:
+ s = '"%s"' % s
+ else:
+ s = "concat(%s)" % ','.join([
+ (("'" in part) and '"%s"' or "'%s'") % part
+ for part in split_at_single_quotes(s) if part
+ ])
+ return s
##############################
## Parsing functions
@@ -637,8 +649,12 @@
except SelectorSyntaxError:
import sys
e = sys.exc_info()[1]
- e.args = tuple(["%s at %s -> %s" % (
- e, stream.used, list(stream))])
+ message = "%s at %s -> %r" % (
+ e, stream.used, stream.peek())
+ e.msg = message
+ if sys.version_info < (2,6):
+ e.message = message
+ e.args = tuple([message])
raise
def parse_selector_group(stream):
@@ -665,7 +681,11 @@
combinator = stream.next()
else:
combinator = ' '
+ consumed = len(stream.used)
next_selector = parse_simple_selector(stream)
+ if consumed == len(stream.used):
+ raise SelectorSyntaxError(
+ "Expected selector, got '%s'" % stream.peek())
result = CombinedSelector(result, combinator, next_selector)
return result
@@ -677,14 +697,14 @@
next = stream.next()
if next != '*' and not isinstance(next, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % next)
+ "Expected symbol, got '%s'" % next)
if stream.peek() == '|':
namespace = next
stream.next()
element = stream.next()
if element != '*' and not isinstance(next, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % next)
+ "Expected symbol, got '%s'" % next)
else:
namespace = '*'
element = next
@@ -711,14 +731,14 @@
next = stream.next()
if not next == ']':
raise SelectorSyntaxError(
- "] expected, got %r" % next)
+ "] expected, got '%s'" % next)
continue
elif peek == ':' or peek == '::':
type = stream.next()
ident = stream.next()
if not isinstance(ident, Symbol):
raise SelectorSyntaxError(
- "Expected symbol, got %r" % ident)
+ "Expected symbol, got '%s'" % ident)
if stream.peek() == '(':
stream.next()
peek = stream.peek()
@@ -732,7 +752,7 @@
next = stream.next()
if not next == ')':
raise SelectorSyntaxError(
- "Expected ), got %r and %r"
+ "Expected ')', got '%s' and '%s'"
% (next, selector))
result = Function(result, type, ident, selector)
else:
@@ -766,11 +786,11 @@
op = stream.next()
if not op in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
raise SelectorSyntaxError(
- "Operator expected, got %r" % op)
+ "Operator expected, got '%s'" % op)
value = stream.next()
if not isinstance(value, (Symbol, String)):
raise SelectorSyntaxError(
- "Expected string or symbol, got %r" % value)
+ "Expected string or symbol, got '%s'" % value)
return Attrib(selector, namespace, attrib, op, value)
def parse_series(s):
@@ -814,9 +834,9 @@
## Tokenizing
############################################################
-_whitespace_re = re.compile(r'\s+')
+_whitespace_re = re.compile(r'\s+', re.UNICODE)
-_comment_re = re.compile(r'/\*.*?\*/', re.S)
+_comment_re = re.compile(r'/\*.*?\*/', re.DOTALL)
_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
@@ -861,6 +881,28 @@
yield Symbol(sym, old_pos)
continue
+split_at_string_escapes = re.compile(r'(\\(?:%s))'
+ % '|'.join(['[A-Fa-f0-9]{1,6}(?:\r\n|\s)?',
+ '[^A-Fa-f0-9]'])).split
+
+def unescape_string_literal(literal):
+ substrings = []
+ for substring in split_at_string_escapes(literal):
+ if not substring:
+ continue
+ elif '\\' in substring:
+ if substring[0] == '\\' and len(substring) > 1:
+ substring = substring[1:]
+ if substring[0] in '0123456789ABCDEFabcdef':
+ # int() correctly ignores the potentially trailing whitespace
+ substring = _unichr(int(substring, 16))
+ else:
+ raise SelectorSyntaxError(
+ "Invalid escape sequence %r in string %r"
+ % (substring.split('\\')[1], literal))
+ substrings.append(substring)
+ return ''.join(substrings)
+
def tokenize_escaped_string(s, pos):
quote = s[pos]
assert quote in ('"', "'")
@@ -873,13 +915,13 @@
"Expected closing %s for string in: %r"
% (quote, s[start:]))
result = s[start:next]
- try:
- result = result.encode('ASCII', 'backslashreplace').decode('unicode_escape')
- except UnicodeDecodeError:
- # Probably a hanging \
+ if result.endswith('\\'):
+ # next quote character is escaped
pos = next+1
- else:
- return result, next+1
+ continue
+ if '\\' in result:
+ result = unescape_string_literal(result)
+ return result, next+1
_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
Modified: lxml/branch/lxml-2.2/src/lxml/etree_defs.h
==============================================================================
--- lxml/branch/lxml-2.2/src/lxml/etree_defs.h (original)
+++ lxml/branch/lxml-2.2/src/lxml/etree_defs.h Tue Sep 29 22:51:58 2009
@@ -221,8 +221,14 @@
#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
{ \
/* walk through children first */ \
- xmlNode* _lx__next = c_node->children; \
- _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ xmlNode* _lx__next = c_node->children; \
+ if (_lx__next != 0) { \
+ if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
+ _lx__next = 0; \
+ } else { \
+ _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ } \
+ } \
if ((_lx__next == 0) && (c_node != c_stop_node)) { \
/* try siblings */ \
_lx__next = c_node->next; \
Modified: lxml/branch/lxml-2.2/src/lxml/extensions.pxi
==============================================================================
--- lxml/branch/lxml-2.2/src/lxml/extensions.pxi (original)
+++ lxml/branch/lxml-2.2/src/lxml/extensions.pxi Tue Sep 29 22:51:58 2009
@@ -543,7 +543,8 @@
results.append(
_fakeDocElementFactory(doc, c_node))
elif c_node.type == tree.XML_TEXT_NODE or \
- c_node.type == tree.XML_ATTRIBUTE_NODE:
+ c_node.type == tree.XML_CDATA_SECTION_NODE or \
+ c_node.type == tree.XML_ATTRIBUTE_NODE:
results.append(
_buildElementStringResult(doc, c_node, smart_string))
elif c_node.type == tree.XML_NAMESPACE_DECL:
@@ -572,7 +573,7 @@
pass
else:
raise NotImplementedError, \
- u"Not yet implemented result node type: %d" % unicode(c_node.type)
+ u"Not yet implemented result node type: %d" % c_node.type
cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj):
u"""Free the XPath object, but *never* free the *content* of node sets.
@@ -642,7 +643,7 @@
tree.xmlFree(s)
c_element = NULL
else:
- #assert c_node.type == tree.XML_TEXT_NODE, "invalid node type"
+ #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
is_attribute = 0
# may be tail text or normal text
value = funicode(c_node.content)
Modified: lxml/branch/lxml-2.2/src/lxml/html/__init__.py
==============================================================================
--- lxml/branch/lxml-2.2/src/lxml/html/__init__.py (original)
+++ lxml/branch/lxml-2.2/src/lxml/html/__init__.py Tue Sep 29 22:51:58 2009
@@ -991,9 +991,12 @@
if self.multiple:
return MultipleSelectOptions(self)
for el in _options_xpath(self):
- if 'selected' in el.attrib:
+ if el.get('selected') is not None:
value = el.get('value')
- # FIXME: If value is None, what to return?, get_text()?
+ if value is None:
+ value = el.text or ''
+ if value:
+ value = value.strip()
return value
return None
@@ -1006,9 +1009,14 @@
self.value.update(value)
return
if value is not None:
+ value = value.strip()
for el in _options_xpath(self):
- # FIXME: also if el.get('value') is None?
- if el.get('value') == value:
+ opt_value = el.get('value')
+ if opt_value is None:
+ opt_value = el.text or ''
+ if opt_value:
+ opt_value = opt_value.strip()
+ if opt_value == value:
checked_option = el
break
else:
@@ -1034,7 +1042,15 @@
All the possible values this select can have (the ``value``
attribute of all the ``