[Lxml-checkins] r44635 - in lxml/branch/html/src/lxml/html: . tests
ianb at codespeak.net
ianb at codespeak.net
Fri Jun 29 23:50:48 CEST 2007
Author: ianb
Date: Fri Jun 29 23:50:47 2007
New Revision: 44635
Added:
lxml/branch/html/src/lxml/html/tests/test_css_select.txt (contents, props changed)
Modified:
lxml/branch/html/src/lxml/html/css.py
lxml/branch/html/src/lxml/html/tests/test_css.py
lxml/branch/html/src/lxml/html/tests/test_css.txt
Log:
improvement to CSS selectors, and more tests
Modified: lxml/branch/html/src/lxml/html/css.py
==============================================================================
--- lxml/branch/html/src/lxml/html/css.py (original)
+++ lxml/branch/html/src/lxml/html/css.py Fri Jun 29 23:50:47 2007
@@ -88,60 +88,52 @@
method = getattr(self, method)
return method(sel_path, self.expr)
- def _xpath_nth_child(self, xpath, expr, last=False):
- if isinstance(expr, int):
- return self._xpath_nth_child_simple(xpath, expr, last)
- if not isinstance(expr, int):
- a, b = parse_series(expr)
- if not a:
- # a=0 means nothing is returned...
- xpath.add_condition('false()')
- return xpath
- if a == 1:
- return self._xpath_nth_child_simple(xpath, expr, last)
- if b > 0:
- b_neg = str(-b)
- else:
- b_neg = '+%s' % (-b)
- expr = '(position() %s) mod %s = 0' % (b_neg, a)
- if b >= 0:
- expr += ' and position() >= %s' % b
- xpath.add_condition(expr)
+ def _xpath_nth_child(self, xpath, expr, last=False,
+ add_name_test=True):
+ a, b = parse_series(expr)
+ if not a:
+ # a=0 means nothing is returned...
+ xpath.add_condition('false() and position() = 0')
return xpath
- # FIXME: handle an+b, odd, even
- # an+b means every-a, plus b, e.g., 2n+1 means odd
- # 0n+b means b
- # n+0 means a=1, i.e., all elements
- # an means every a elements, i.e., 2n means even
- # -n means -1n
- # -1n+6 means elements 6 and previous
-
- def _xpath_nth_child_simple(self, xpath, expr, last=False):
- if isinstance(expr, int):
- expr -= 1
+ if add_name_test:
+ xpath.add_name_test()
+ xpath.add_star_prefix()
+ if a == 1:
if last:
- expr = 'last() - %s' % expr
- xpath = XPath('*/%s' % xpath)
- xpath.add_index(expr)
+ b = 'last() - %s' % b
+ xpath.add_condition('position() = %s' % b)
return xpath
+ if last:
+ # FIXME: I'm not sure if this is right
+ a = -a
+ b = -b
+ if b > 0:
+ b_neg = str(-b)
+ else:
+ b_neg = '+%s' % (-b)
+ expr = '(position() %s) mod %s = 0' % (b_neg, a)
+ if b >= 0:
+ expr += ' and position() >= %s' % b
+ elif b < 0 and last:
+ expr += ' and position() < (last() %s)' % b
+ xpath.add_condition(expr)
+ return xpath
+ # FIXME: handle an+b, odd, even
+ # an+b means every-a, plus b, e.g., 2n+1 means odd
+ # 0n+b means b
+ # n+0 means a=1, i.e., all elements
+ # an means every a elements, i.e., 2n means even
+ # -n means -1n
+ # -1n+6 means elements 6 and previous
def _xpath_nth_last_child(self, xpath, expr):
return self._xpath_nth_child(xpath, expr, last=True)
- def _xpath_nth_of_type(self, xpath, expr, last=False):
- # Like nth-of-type, but only for *this* type
- if isinstance(expr, int):
- expr -= 1
- if last:
- expr = 'last() - %s' % expr
- xpath = XPath('*/%s' % xpath)
- xpath.add_index(expr)
- return xpath
- else:
- raise NotImplementedError
+ def _xpath_nth_of_type(self, xpath, expr):
+ return self._xpath_nth_child(xpath, expr, add_name_test=False)
def _xpath_nth_last_of_type(self, xpath, expr):
- return self._xpath_nth_of_type(xpath, expr, last=True)
+ return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
def _xpath_contains(self, xpath, expr):
# text content, minus tags, must contain expr
@@ -149,6 +141,7 @@
expr = expr._format_element()
xpath.add_condition('contains(css:lower-case(string(.)), %s)'
% xpath_repr(expr.lower()))
+ # FIXME: Currently case insensitive matching doesn't seem to be happening
return xpath
def _xpath_not(self, xpath, expr):
@@ -199,7 +192,8 @@
return el_xpath
def _xpath_checked(self, xpath):
- xpath.add_condition("(@selected or @checked) and (node-name(.) = 'input' or node-name(.) = 'option')")
+ # FIXME: is this really all the elements?
+ xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
return xpath
def _xpath_root(self, xpath):
@@ -207,35 +201,38 @@
raise NotImplementedError
def _xpath_first_child(self, xpath):
- xpath = XPath('*/%s' % xpath)
- xpath.add_condition('position() = 0')
+ xpath.add_star_prefix()
+ xpath.add_name_test()
+ xpath.add_condition('position() = 1')
return xpath
def _xpath_last_child(self, xpath):
- xpath = XPath('*/%s' % xpath)
+ xpath.add_star_prefix()
+ xpath.add_name_test()
xpath.add_condition('position() = last()')
return xpath
def _xpath_first_of_type(self, xpath):
- xpath = XPath('*/%s' % xpath)
- xpath.add_index(0)
+ xpath.add_star_prefix()
+ xpath.add_condition('position() = 1')
return xpath
def _xpath_last_of_type(self, xpath):
- xpath.add_index('last()')
+ xpath.add_star_prefix()
+ xpath.add_condition('position() = last()')
return xpath
def _xpath_only_child(self, xpath):
- xpath.add_condition('count(..) = 1')
+ xpath.add_name_test()
+ xpath.add_condition('last() = 1')
return xpath
def _xpath_only_of_type(self, xpath):
- # FIXME: I doubt this is right
- xpath.add_condition('count(../node-name(.)) = 1')
+ xpath.add_condition('last() = 1')
return xpath
def _xpath_empty(self, xpath):
- xpath.add_condition("count(.) = 0 and string(.) = ''")
+ xpath.add_condition("count(./children::*) = 0 and string(.) = ''")
return xpath
class Attrib(object):
@@ -311,6 +308,7 @@
path.add_condition('substring(%s, string-length(%s)-%s) = %s'
% (attrib, attrib, len(value)-1, xpath_repr(value)))
elif self.operator == '*=':
+ # FIXME: case sensitive?
path.add_condition('contains(%s, %s)' % (
attrib, xpath_repr(value)))
else:
@@ -339,9 +337,11 @@
def xpath(self):
if self.namespace == '*':
- return XPath(self.element.lower())
+ el = self.element.lower()
else:
- return XPath('%s:%s' % (self.namespace, self.element))
+ # FIXME: Should we lowercase here?
+ el = '%s:%s' % (self.namespace, self.element)
+ return XPath(element=el)
class Hash(object):
"""
@@ -359,7 +359,7 @@
def xpath(self):
path = self.selector.xpath()
- path.add_condition('@id=%s' % xpath_repr(self.id))
+ path.add_condition('@id = %s' % xpath_repr(self.id))
return path
class Or(object):
@@ -412,23 +412,25 @@
def _xpath_descendant(self, xpath, sub):
# when sub is a descendant in any way of xpath
- return XPath('%s/descendant::%s' % (xpath, sub.xpath()))
-
+ xpath.join('/descendant::', sub.xpath())
+ return xpath
+
def _xpath_child(self, xpath, sub):
# when sub is an immediate child of xpath
- return XPath(str(xpath) + '/' + str(sub.xpath()))
+ xpath.join('/', sub.xpath())
+ return xpath
def _xpath_direct_adjacent(self, xpath, sub):
# when sub immediately follows xpath
- path = self._xpath_indirect_adjacent(xpath, sub)
- path.add_index(0)
- return path
+ xpath.join('/following-sibling::', sub.xpath())
+ xpath.add_name_test()
+ xpath.add_condition('position() = 1')
+ return xpath
def _xpath_indirect_adjacent(self, xpath, sub):
# when sub comes somewhere after xpath as a sibling
- return XPath('%s/following-sibling::%s' % (
- xpath, sub.xpath()))
-
+ xpath.join('/following-sibling::', sub.xpath())
+ return xpath
##############################
## XPath objects:
@@ -439,11 +441,8 @@
expr = css_expr.xpath()
assert expr is not None, (
"Got None for xpath expression from %s" % repr(css_expr))
- if isinstance(expr, XPathOr):
- for item in expr.items:
- item.element_path = prefix + item.element_path
- else:
- expr.element_path = prefix + expr.element_path
+ if prefix:
+ expr.add_prefix(prefix)
return str(expr)
def run_xpath(doc, xpath):
@@ -455,12 +454,19 @@
class XPath(object):
- def __init__(self, element_path, condition=None):
- self.element_path = element_path
+ def __init__(self, prefix=None, path=None, element='*', condition=None):
+ self.prefix = prefix
+ self.path = path
+ self.element = element
self.condition = condition
def __str__(self):
- path = str(self.element_path)
+ path = ''
+ if self.prefix is not None:
+ path += str(self.prefix)
+ if self.path is not None:
+ path += str(self.path)
+ path += str(self.element)
if self.condition:
path += '[%s]' % self.condition
return path
@@ -475,8 +481,40 @@
else:
self.condition = condition
- def add_index(self, index):
- self.element_path = '%s[%s]' % (self.element_path, index)
+ def add_path(self, part):
+ if self.path is None:
+ self.path = self.element
+ else:
+ self.path += self.element
+ self.element = part
+
+ def add_prefix(self, prefix):
+ if self.prefix:
+ self.prefix = prefix + self.prefix
+ else:
+ self.prefix = prefix
+
+ def add_name_test(self):
+ if self.element == '*':
+ # We weren't doing a test anyway
+ return
+ self.add_condition("name() = %s" % xpath_repr(self.element))
+ self.element = '*'
+
+ def add_star_prefix(self):
+ if self.path:
+ self.path += '*/'
+ else:
+ self.path = '*/'
+
+ def join(self, combiner, other):
+ prefix = str(self)
+ prefix += combiner
+ path = (other.prefix or '') + (other.path or '')
+ self.prefix = prefix
+ self.path = path
+ self.element = other.element
+ self.condition = other.condition
class XPathOr(XPath):
@@ -485,14 +523,15 @@
the union, it's the sum, so duplicate elements will appear.
"""
- def __init__(self, items):
+ def __init__(self, items, prefix=None):
for item in items:
assert item is not None
self.items = items
+ self.prefix = prefix
def __str__(self):
- return ' | '.join(map(str, self.items))
-
+ prefix = self.prefix or ''
+ return ' | '.join([prefix + str(i) for i in self.items])
def xpath_repr(s):
# FIXME: I don't think this is right
@@ -650,6 +689,9 @@
"""
if isinstance(s, Element):
s = s._format_element()
+ if not s or s == '*':
+ # Happens when there's nothing, which CSS thinks of as *
+ return (1, 0)
if isinstance(s, int):
# Happens when you just get a number
return (1, s)
@@ -657,6 +699,8 @@
return (2, 1)
elif s == 'even':
return (2, 0)
+ elif s == 'n':
+ return (1, 0)
if 'n' not in s:
# Just a b
return int(s)
Modified: lxml/branch/html/src/lxml/html/tests/test_css.py
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_css.py (original)
+++ lxml/branch/html/src/lxml/html/tests/test_css.py Fri Jun 29 23:50:47 2007
@@ -107,6 +107,7 @@
def test_suite():
suite = unittest.TestSuite()
- suite.addTests([doctest.DocFileSuite('test_css.txt')])
+ for fn in 'test_css.txt', 'test_css_select.txt':
+ suite.addTests([doctest.DocFileSuite(fn)])
suite.addTests(list(CSSTestCase.all()))
return suite
Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_css.txt (original)
+++ lxml/branch/html/src/lxml/html/tests/test_css.txt Fri Jun 29 23:50:47 2007
@@ -69,33 +69,35 @@
e[@hreflang = 'en' or starts-with(@hreflang, 'en-')]
>>> #xpath('E:root')
>>> xpath('E:nth-child(1)')
- */e[0]
+ */*[name() = 'e' and (position() = 1)]
>>> xpath('E:nth-last-child(1)')
- */e[last() - 0]
+ */*[name() = 'e' and (position() = last() - 1)]
+ >>> xpath('E:nth-last-child(2n+2)')
+ */*[name() = 'e' and ((position() +2) mod -2 = 0 and position() < (last() -2))]
>>> xpath('E:nth-of-type(1)')
- */e[0]
+ */e[position() = 1]
>>> xpath('E:nth-last-of-type(1)')
- */e[last() - 0]
+ */e[position() = last() - 1]
>>> xpath('E:first-child')
- */e[position() = 0]
+ */*[name() = 'e' and (position() = 1)]
>>> xpath('E:last-child')
- */e[position() = last()]
+ */*[name() = 'e' and (position() = last())]
>>> xpath('E:first-of-type')
- */e[0]
+ */e[position() = 1]
>>> xpath('E:last-of-type')
- e[last()]
+ */e[position() = last()]
>>> xpath('E:only-child')
- e[count(..) = 1]
+ *[name() = 'e' and (last() = 1)]
>>> xpath('E:only-of-type')
- e[count(../node-name(.)) = 1]
+ e[last() = 1]
>>> xpath('E:empty')
- e[count(.) = 0 and string(.) = '']
+ e[count(./children::*) = 0 and string(.) = '']
>>> xpath('E:contains("foo")')
e[contains(css:lower-case(string(.)), 'foo')]
>>> xpath('E.warning')
e[contains(concat(' ', normalize-space(@class), ' '), ' warning ')]
>>> xpath('E#myid')
- e[@id='myid']
+ e[@id = 'myid']
>>> xpath('E:not(:contains("foo"))')
e[not(contains(css:lower-case(string(.)), 'foo'))]
>>> xpath('E F')
@@ -103,8 +105,11 @@
>>> xpath('E > F')
e/f
>>> xpath('E + F')
- e/following-sibling::f[0]
+ e/following-sibling::*[name() = 'f' and (position() = 1)]
>>> xpath('E ~ F')
e/following-sibling::f
>>> xpath('div#container p')
- div[@id='container']/descendant::p
+ div[@id = 'container']/descendant::p
+ >>> # FIXME: This isn't right, but I don't know what *is* right
+ >>> xpath('p *:only-of-type')
+ p/descendant::*[last() = 1]
Added: lxml/branch/html/src/lxml/html/tests/test_css_select.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt Fri Jun 29 23:50:47 2007
@@ -0,0 +1,149 @@
+This is a test of CSS selectors. We set up a document we'll use for
+all our selections, and a function to make querying simpler:
+
+ >>> from lxml.html.css import run_css
+ >>> from lxml.html import HTML
+ >>> doc = HTML('''
+ ... <html><head></head><body>
+ ... <div id="outer-div">
+ ... <a id="name-anchor" name="foo"></a>
+ ... <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a>
+ ... <a id="nofollow-anchor" rel="nofollow" href="https://example.org">link</a>
+ ... <ol id="first-ol" class="a b c">
+ ... <li id="first-li">content</li>
+ ... <li id="second-li" lang="en-US">
+ ... <div id="li-div">
+ ... </div>
+ ... </li>
+ ... <li id="third-li" class="ab c"></li>
+ ... <li id="fourth-li" class="ab
+ ... c"></li>
+ ... <li id="fifth-li"></li>
+ ... <li id="sixth-li"></li>
+ ... <li id="seventh-li"></li>
+ ... </ol>
+ ... <p id="paragraph">
+ ... <b id="p-b">hi</b> <em id="p-em">there</em>
+ ... <b id="p-b2">guy</b></p>
+ ... <ol id="second-ol">
+ ... </ol>
+ ... </div>
+ ... <div id="foobar-div" foobar="ab bc
+ ... cde"><span id="foobar-span"></span></div>
+ ... </body></html>''')
+ >>> order = {}
+ >>> for count, el in enumerate(doc.getiterator()):
+ ... order[el] = count
+ >>> def select_ids(selector):
+ ... items = run_css(doc, selector)
+ ... if not items:
+ ... return 'empty'
+ ... items = run_css(doc, selector)
+ ... items.sort(key=lambda el: order[el])
+ ... return ', '.join([el.get('id', 'nil') for el in items])
+ >>> def pcss(main, *selectors):
+ ... result = select_ids(main)
+ ... for selector in selectors:
+ ... sel_result = select_ids(selector)
+ ... if sel_result != result:
+ ... print 'Selector %r returns %s' % (selector, sel_result)
+ ... print result
+
+Now, the tests:
+
+ >>> pcss('*') # doctest: +ELLIPSIS
+ nil, nil, nil, outer-div, ... foobar-span
+ >>> pcss('div')
+ outer-div, li-div, foobar-div
+ >>> pcss('a[name]')
+ name-anchor
+ >>> pcss('a[rel]')
+ tag-anchor, nofollow-anchor
+ >>> pcss('a[rel="tag"]')
+ tag-anchor
+ >>> pcss('a[href*="localhost"]')
+ tag-anchor
+ >>> pcss('a[href^="http"]')
+ tag-anchor, nofollow-anchor
+ >>> pcss('a[href^="http:"]')
+ tag-anchor
+ >>> pcss('a[href$="org"]')
+ nofollow-anchor
+ >>> pcss('div[foobar~="bc"]', 'div[foobar~="cde"]')
+ foobar-div
+ >>> pcss('div[foobar~="cd"]')
+ empty
+ >>> pcss('*[lang|="en"]', '*[lang|="en-US"]')
+ second-li
+ >>> pcss('*[lang|="e"]')
+ empty
+ >>> pcss('li:nth-child(3)')
+ third-li
+ >>> pcss('li:nth-child(10)')
+ empty
+ >>> pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)')
+ second-li, fourth-li, sixth-li
+ >>> pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)')
+ first-li, third-li, fifth-li, seventh-li
+ >>> pcss('li:nth-child(2n+4)')
+ fourth-li, sixth-li
+ >>> # FIXME: I'm not 100% sure this is right:
+ >>> pcss('li:nth-child(3n+1)')
+ first-li, fourth-li, seventh-li
+ >>> # FIXME: I'm not sure if nth-last-child(0) or nth-last-child(1)
+ >>> # should be equivalent to nth-last-child()
+ >>> pcss('li:nth-last-child()', 'li:nth-last-child(0)')
+ seventh-li
+ >>> pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)')
+ second-li, fourth-li, sixth-li
+ >>> pcss('li:nth-last-child(2n+2)')
+ second-li, fourth-li
+ >>> pcss('ol:first-of-type')
+ first-ol
+ >>> pcss('ol:nth-child(1)')
+ empty
+ >>> pcss('ol:nth-of-type(2)')
+ second-ol
+ >>> # FIXME: like above, (1) or (2)?
+ >>> pcss('ol:nth-last-of-type(1)')
+ first-ol
+ >>> pcss('span:only-child')
+ foobar-span
+ >>> pcss('li div:only-child')
+ li-div
+ >>> pcss('div *:only-child')
+ foobar-span
+ >>> pcss('p *:only-of-type')
+ p-em
+ >>> pcss('p:only-of-type')
+ paragraph
+ >>> pcss('a:empty')
+ name-anchor
+ >>> pcss('li:empty')
+ third-li, fourth-li, fifth-li, sixth-li
+ >>> pcss('*:contains("link")')
+ nil, nil, outer-div, tag-anchor, nofollow-anchor
+ >>> pcss('*:contains("E")')
+ nil, nil, outer-div, first-ol, first-li, paragraph, p-em
+ >>> pcss('.a', '.b', '*.a', 'ol.a')
+ first-ol
+ >>> pcss('.c', '*.c')
+ first-ol, third-li, fourth-li
+ >>> pcss('ol *.c', 'ol li.c', 'ol ~ li.c', 'ol > li.c')
+ third-li, fourth-li
+ >>> pcss('#first-li', 'li#first-li', '*#first-li')
+ first-li
+ >>> # Need some tests of :not()
+ >>> pcss('li div', 'li > div', 'div div')
+ li-div
+ >>> pcss('div > div')
+ empty
+ >>> pcss('div + div')
+ foobar-div
+ >>> pcss('a ~ a')
+ tag-anchor, nofollow-anchor
+ >>> pcss('a[rel="tag"] ~ a')
+ nofollow-anchor
+ >>> pcss('ol#first-ol li:last-child', 'ol#first-ol *:last-child')
+ seventh-li
+
\ No newline at end of file
More information about the lxml-checkins
mailing list