[Lxml-checkins] r44635 - in lxml/branch/html/src/lxml/html: . tests

ianb at codespeak.net ianb at codespeak.net
Fri Jun 29 23:50:48 CEST 2007


Author: ianb
Date: Fri Jun 29 23:50:47 2007
New Revision: 44635

Added:
   lxml/branch/html/src/lxml/html/tests/test_css_select.txt   (contents, props changed)
Modified:
   lxml/branch/html/src/lxml/html/css.py
   lxml/branch/html/src/lxml/html/tests/test_css.py
   lxml/branch/html/src/lxml/html/tests/test_css.txt
Log:
improvement to CSS selectors, and more tests

Modified: lxml/branch/html/src/lxml/html/css.py
==============================================================================
--- lxml/branch/html/src/lxml/html/css.py	(original)
+++ lxml/branch/html/src/lxml/html/css.py	Fri Jun 29 23:50:47 2007
@@ -88,60 +88,52 @@
         method = getattr(self, method)
         return method(sel_path, self.expr)
 
-    def _xpath_nth_child(self, xpath, expr, last=False):
-        if isinstance(expr, int):
-            return self._xpath_nth_child_simple(xpath, expr, last)
-        if not isinstance(expr, int):
-            a, b = parse_series(expr)
-            if not a:
-                # a=0 means nothing is returned...
-                xpath.add_condition('false()')
-                return xpath
-            if a == 1:
-                return self._xpath_nth_child_simple(xpath, expr, last)
-            if b > 0:
-                b_neg = str(-b)
-            else:
-                b_neg = '+%s' % (-b)
-            expr = '(position() %s) mod %s = 0' % (b_neg, a)
-            if b >= 0:
-                expr += ' and position() >= %s' % b
-            xpath.add_condition(expr)
+    def _xpath_nth_child(self, xpath, expr, last=False,
+                         add_name_test=True):
+        a, b = parse_series(expr)
+        if not a:
+            # a=0 means nothing is returned...
+            xpath.add_condition('false() and position() = 0')
             return xpath
-            # FIXME: handle an+b, odd, even
-            # an+b means every-a, plus b, e.g., 2n+1 means odd
-            # 0n+b means b
-            # n+0 means a=1, i.e., all elements
-            # an means every a elements, i.e., 2n means even
-            # -n means -1n
-            # -1n+6 means elements 6 and previous
-
-    def _xpath_nth_child_simple(self, xpath, expr, last=False):
-        if isinstance(expr, int):
-            expr -= 1
+        if add_name_test:
+            xpath.add_name_test()
+        xpath.add_star_prefix()
+        if a == 1:
             if last:
-                expr = 'last() - %s' % expr
-            xpath = XPath('*/%s' % xpath)
-            xpath.add_index(expr)
+                b = 'last() - %s' % b
+            xpath.add_condition('position() = %s' % b)
             return xpath
+        if last:
+            # FIXME: I'm not sure if this is right
+            a = -a
+            b = -b
+        if b > 0:
+            b_neg = str(-b)
+        else:
+            b_neg = '+%s' % (-b)
+        expr = '(position() %s) mod %s = 0' % (b_neg, a)
+        if b >= 0:
+            expr += ' and position() >= %s' % b
+        elif b < 0 and last:
+            expr += ' and position() < (last() %s)' % b
+        xpath.add_condition(expr)
+        return xpath
+        # FIXME: handle an+b, odd, even
+        # an+b means every-a, plus b, e.g., 2n+1 means odd
+        # 0n+b means b
+        # n+0 means a=1, i.e., all elements
+        # an means every a elements, i.e., 2n means even
+        # -n means -1n
+        # -1n+6 means elements 6 and previous
 
     def _xpath_nth_last_child(self, xpath, expr):
         return self._xpath_nth_child(xpath, expr, last=True)
 
-    def _xpath_nth_of_type(self, xpath, expr, last=False):
-        # Like nth-of-type, but only for *this* type
-        if isinstance(expr, int):
-            expr -= 1
-            if last:
-                expr = 'last() - %s' % expr
-            xpath = XPath('*/%s' % xpath)
-            xpath.add_index(expr)
-            return xpath
-        else:
-            raise NotImplementedError
+    def _xpath_nth_of_type(self, xpath, expr):
+        return self._xpath_nth_child(xpath, expr, add_name_test=False)
 
     def _xpath_nth_last_of_type(self, xpath, expr):
-        return self._xpath_nth_of_type(xpath, expr, last=True)
+        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
 
     def _xpath_contains(self, xpath, expr):
         # text content, minus tags, must contain expr
@@ -149,6 +141,7 @@
             expr = expr._format_element()
         xpath.add_condition('contains(css:lower-case(string(.)), %s)'
                             % xpath_repr(expr.lower()))
+        # FIXME: Currently case insensitive matching doesn't seem to be happening
         return xpath
 
     def _xpath_not(self, xpath, expr):
@@ -199,7 +192,8 @@
         return el_xpath
 
     def _xpath_checked(self, xpath):
-        xpath.add_condition("(@selected or @checked) and (node-name(.) = 'input' or node-name(.) = 'option')")
+        # FIXME: is this really all the elements?
+        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
         return xpath
 
     def _xpath_root(self, xpath):
@@ -207,35 +201,38 @@
         raise NotImplementedError
 
     def _xpath_first_child(self, xpath):
-        xpath = XPath('*/%s' % xpath)
-        xpath.add_condition('position() = 0')
+        xpath.add_star_prefix()
+        xpath.add_name_test()
+        xpath.add_condition('position() = 1')
         return xpath
 
     def _xpath_last_child(self, xpath):
-        xpath = XPath('*/%s' % xpath)
+        xpath.add_star_prefix()
+        xpath.add_name_test()
         xpath.add_condition('position() = last()')
         return xpath
 
     def _xpath_first_of_type(self, xpath):
-        xpath = XPath('*/%s' % xpath)
-        xpath.add_index(0)
+        xpath.add_star_prefix()
+        xpath.add_condition('position() = 1')
         return xpath
 
     def _xpath_last_of_type(self, xpath):
-        xpath.add_index('last()')
+        xpath.add_star_prefix()
+        xpath.add_condition('position() = last()')
         return xpath
 
     def _xpath_only_child(self, xpath):
-        xpath.add_condition('count(..) = 1')
+        xpath.add_name_test()
+        xpath.add_condition('last() = 1')
         return xpath
 
     def _xpath_only_of_type(self, xpath):
-        # FIXME: I doubt this is right
-        xpath.add_condition('count(../node-name(.)) = 1')
+        xpath.add_condition('last() = 1')
         return xpath
 
     def _xpath_empty(self, xpath):
-        xpath.add_condition("count(.) = 0 and string(.) = ''")
+        xpath.add_condition("count(./children::*) = 0 and string(.) = ''")
         return xpath
 
 class Attrib(object):
@@ -311,6 +308,7 @@
             path.add_condition('substring(%s, string-length(%s)-%s) = %s'
                                % (attrib, attrib, len(value)-1, xpath_repr(value)))
         elif self.operator == '*=':
+            # FIXME: case sensitive?
             path.add_condition('contains(%s, %s)' % (
                 attrib, xpath_repr(value)))
         else:
@@ -339,9 +337,11 @@
 
     def xpath(self):
         if self.namespace == '*':
-            return XPath(self.element.lower())
+            el = self.element.lower()
         else:
-            return XPath('%s:%s' % (self.namespace, self.element))
+            # FIXME: Should we lowercase here?
+            el = '%s:%s' % (self.namespace, self.element)
+        return XPath(element=el)
 
 class Hash(object):
     """
@@ -359,7 +359,7 @@
 
     def xpath(self):
         path = self.selector.xpath()
-        path.add_condition('@id=%s' % xpath_repr(self.id))
+        path.add_condition('@id = %s' % xpath_repr(self.id))
         return path
 
 class Or(object):
@@ -412,23 +412,25 @@
 
     def _xpath_descendant(self, xpath, sub):
         # when sub is a descendant in any way of xpath
-        return XPath('%s/descendant::%s' % (xpath, sub.xpath()))
-
+        xpath.join('/descendant::', sub.xpath())
+        return xpath
+    
     def _xpath_child(self, xpath, sub):
         # when sub is an immediate child of xpath
-        return XPath(str(xpath) + '/' + str(sub.xpath()))
+        xpath.join('/', sub.xpath())
+        return xpath
 
     def _xpath_direct_adjacent(self, xpath, sub):
         # when sub immediately follows xpath
-        path = self._xpath_indirect_adjacent(xpath, sub)
-        path.add_index(0)
-        return path
+        xpath.join('/following-sibling::', sub.xpath())
+        xpath.add_name_test()
+        xpath.add_condition('position() = 1')
+        return xpath
 
     def _xpath_indirect_adjacent(self, xpath, sub):
         # when sub comes somewhere after xpath as a sibling
-        return XPath('%s/following-sibling::%s' % (
-            xpath, sub.xpath()))
-
+        xpath.join('/following-sibling::', sub.xpath())
+        return xpath
 
 ##############################
 ## XPath objects:
@@ -439,11 +441,8 @@
     expr = css_expr.xpath()
     assert expr is not None, (
         "Got None for xpath expression from %s" % repr(css_expr))
-    if isinstance(expr, XPathOr):
-        for item in expr.items:
-            item.element_path = prefix + item.element_path
-    else:
-        expr.element_path = prefix + expr.element_path
+    if prefix:
+        expr.add_prefix(prefix)
     return str(expr)
 
 def run_xpath(doc, xpath):
@@ -455,12 +454,19 @@
 
 class XPath(object):
 
-    def __init__(self, element_path, condition=None):
-        self.element_path = element_path
+    def __init__(self, prefix=None, path=None, element='*', condition=None):
+        self.prefix = prefix
+        self.path = path
+        self.element = element
         self.condition = condition
 
     def __str__(self):
-        path = str(self.element_path)
+        path = ''
+        if self.prefix is not None:
+            path += str(self.prefix)
+        if self.path is not None:
+            path += str(self.path)
+        path += str(self.element)
         if self.condition:
             path += '[%s]' % self.condition
         return path
@@ -475,8 +481,40 @@
         else:
             self.condition = condition
 
-    def add_index(self, index):
-        self.element_path = '%s[%s]' % (self.element_path, index)
+    def add_path(self, part):
+        if self.path is None:
+            self.path = self.element
+        else:
+            self.path += self.element
+        self.element = part
+
+    def add_prefix(self, prefix):
+        if self.prefix:
+            self.prefix = prefix + self.prefix
+        else:
+            self.prefix = prefix
+
+    def add_name_test(self):
+        if self.element == '*':
+            # We weren't doing a test anyway
+            return
+        self.add_condition("name() = %s" % xpath_repr(self.element))
+        self.element = '*'
+
+    def add_star_prefix(self):
+        if self.path:
+            self.path += '*/'
+        else:
+            self.path = '*/'
+
+    def join(self, combiner, other):
+        prefix = str(self)
+        prefix += combiner
+        path = (other.prefix or '') + (other.path or '')
+        self.prefix = prefix
+        self.path = path
+        self.element = other.element
+        self.condition = other.condition
 
 class XPathOr(XPath):
 
@@ -485,14 +523,15 @@
     the union, it's the sum, so duplicate elements will appear.
     """
 
-    def __init__(self, items):
+    def __init__(self, items, prefix=None):
         for item in items:
             assert item is not None
         self.items = items
+        self.prefix = prefix
 
     def __str__(self):
-        return ' | '.join(map(str, self.items))
-
+        prefix = self.prefix or ''
+        return ' | '.join([prefix + str(i) for i in self.items])
 
 def xpath_repr(s):
     # FIXME: I don't think this is right
@@ -650,6 +689,9 @@
     """
     if isinstance(s, Element):
         s = s._format_element()
+    if not s or s == '*':
+        # Happens when there's nothing, which CSS thinks of as *
+        return (1, 0)
     if isinstance(s, int):
         # Happens when you just get a number
         return (1, s)
@@ -657,6 +699,8 @@
         return (2, 1)
     elif s == 'even':
         return (2, 0)
+    elif s == 'n':
+        return (1, 0)
     if 'n' not in s:
         # Just a b
         return int(s)

Modified: lxml/branch/html/src/lxml/html/tests/test_css.py
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_css.py	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_css.py	Fri Jun 29 23:50:47 2007
@@ -107,6 +107,7 @@
         
 def test_suite():
     suite = unittest.TestSuite()
-    suite.addTests([doctest.DocFileSuite('test_css.txt')])
+    for fn in 'test_css.txt', 'test_css_select.txt':
+        suite.addTests([doctest.DocFileSuite(fn)])
     suite.addTests(list(CSSTestCase.all()))
     return suite

Modified: lxml/branch/html/src/lxml/html/tests/test_css.txt
==============================================================================
--- lxml/branch/html/src/lxml/html/tests/test_css.txt	(original)
+++ lxml/branch/html/src/lxml/html/tests/test_css.txt	Fri Jun 29 23:50:47 2007
@@ -69,33 +69,35 @@
     e[@hreflang = 'en' or starts-with(@hreflang, 'en-')]
     >>> #xpath('E:root')
     >>> xpath('E:nth-child(1)')
-    */e[0]
+    */*[name() = 'e' and (position() = 1)]
     >>> xpath('E:nth-last-child(1)')
-    */e[last() - 0]
+    */*[name() = 'e' and (position() = last() - 1)]
+    >>> xpath('E:nth-last-child(2n+2)')
+    */*[name() = 'e' and ((position() +2) mod -2 = 0 and position() < (last() -2))]
     >>> xpath('E:nth-of-type(1)')
-    */e[0]
+    */e[position() = 1]
     >>> xpath('E:nth-last-of-type(1)')
-    */e[last() - 0]
+    */e[position() = last() - 1]
     >>> xpath('E:first-child')
-    */e[position() = 0]
+    */*[name() = 'e' and (position() = 1)]
     >>> xpath('E:last-child')
-    */e[position() = last()]
+    */*[name() = 'e' and (position() = last())]
     >>> xpath('E:first-of-type')
-    */e[0]
+    */e[position() = 1]
     >>> xpath('E:last-of-type')
-    e[last()]
+    */e[position() = last()]
     >>> xpath('E:only-child')
-    e[count(..) = 1]
+    *[name() = 'e' and (last() = 1)]
     >>> xpath('E:only-of-type')
-    e[count(../node-name(.)) = 1]
+    e[last() = 1]
     >>> xpath('E:empty')
-    e[count(.) = 0 and string(.) = '']
+    e[count(./children::*) = 0 and string(.) = '']
     >>> xpath('E:contains("foo")')
     e[contains(css:lower-case(string(.)), 'foo')]
     >>> xpath('E.warning')
     e[contains(concat(' ', normalize-space(@class), ' '), ' warning ')]
     >>> xpath('E#myid')
-    e[@id='myid']
+    e[@id = 'myid']
     >>> xpath('E:not(:contains("foo"))')
     e[not(contains(css:lower-case(string(.)), 'foo'))]
     >>> xpath('E F')
@@ -103,8 +105,11 @@
     >>> xpath('E > F')
     e/f
     >>> xpath('E + F')
-    e/following-sibling::f[0]
+    e/following-sibling::*[name() = 'f' and (position() = 1)]
     >>> xpath('E ~ F')
     e/following-sibling::f
     >>> xpath('div#container p')
-    div[@id='container']/descendant::p
+    div[@id = 'container']/descendant::p
+    >>> # FIXME: This isn't right, but I don't know what *is* right
+    >>> xpath('p *:only-of-type')
+    p/descendant::*[last() = 1]

Added: lxml/branch/html/src/lxml/html/tests/test_css_select.txt
==============================================================================
--- (empty file)
+++ lxml/branch/html/src/lxml/html/tests/test_css_select.txt	Fri Jun 29 23:50:47 2007
@@ -0,0 +1,149 @@
+This is a test of CSS selectors.  We set up a document we'll use for
+all our selections, and a function to make querying simpler:
+
+    >>> from lxml.html.css import run_css
+    >>> from lxml.html import HTML
+    >>> doc = HTML('''
+    ... <html><head></head><body>
+    ... <div id="outer-div">
+    ...  <a id="name-anchor" name="foo"></a>
+    ...  <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a>
+    ...  <a id="nofollow-anchor" rel="nofollow" href="https://example.org">link</a>
+    ...  <ol id="first-ol" class="a b c">
+    ...    <li id="first-li">content</li>
+    ...    <li id="second-li" lang="en-US">
+    ...      <div id="li-div">
+    ...      </div>
+    ...    </li>
+    ...    <li id="third-li" class="ab c"></li>
+    ...    <li id="fourth-li" class="ab
+    ... c"></li>
+    ...    <li id="fifth-li"></li>
+    ...    <li id="sixth-li"></li>
+    ...    <li id="seventh-li"></li>
+    ...  </ol>
+    ...  <p id="paragraph">
+    ...    <b id="p-b">hi</b> <em id="p-em">there</em>
+    ...    <b id="p-b2">guy</b></p>
+    ...  <ol id="second-ol">
+    ...  </ol>
+    ... </div>
+    ... <div id="foobar-div" foobar="ab bc
+    ... cde"><span id="foobar-span"></span></div>
+    ... </body></html>''')
+    >>> order = {}
+    >>> for count, el in enumerate(doc.getiterator()):
+    ...     order[el] = count
+    >>> def select_ids(selector):
+    ...     items = run_css(doc, selector)
+    ...     if not items:
+    ...         return 'empty'
+    ...     items = run_css(doc, selector)
+    ...     items.sort(key=lambda el: order[el])
+    ...     return ', '.join([el.get('id', 'nil') for el in items])
+    >>> def pcss(main, *selectors):
+    ...     result = select_ids(main)
+    ...     for selector in selectors:
+    ...         sel_result = select_ids(selector)
+    ...         if sel_result != result:
+    ...             print 'Selector %r returns %s' % (selector, sel_result)
+    ...     print result
+
+Now, the tests:
+
+    >>> pcss('*') # doctest: +ELLIPSIS
+    nil, nil, nil, outer-div, ... foobar-span
+    >>> pcss('div')
+    outer-div, li-div, foobar-div
+    >>> pcss('a[name]')
+    name-anchor
+    >>> pcss('a[rel]')
+    tag-anchor, nofollow-anchor
+    >>> pcss('a[rel="tag"]')
+    tag-anchor
+    >>> pcss('a[href*="localhost"]')
+    tag-anchor
+    >>> pcss('a[href^="http"]')
+    tag-anchor, nofollow-anchor
+    >>> pcss('a[href^="http:"]')
+    tag-anchor
+    >>> pcss('a[href$="org"]')
+    nofollow-anchor
+    >>> pcss('div[foobar~="bc"]', 'div[foobar~="cde"]')
+    foobar-div
+    >>> pcss('div[foobar~="cd"]')
+    empty
+    >>> pcss('*[lang|="en"]', '*[lang|="en-US"]')
+    second-li
+    >>> pcss('*[lang|="e"]')
+    empty
+    >>> pcss('li:nth-child(3)')
+    third-li
+    >>> pcss('li:nth-child(10)')
+    empty
+    >>> pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)')
+    second-li, fourth-li, sixth-li
+    >>> pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)')
+    first-li, third-li, fifth-li, seventh-li
+    >>> pcss('li:nth-child(2n+4)')
+    fourth-li, sixth-li
+    >>> # FIXME: I'm not 100% sure this is right:
+    >>> pcss('li:nth-child(3n+1)')
+    first-li, fourth-li, seventh-li
+    >>> # FIXME: I'm not sure if nth-last-child(0) or nth-last-child(1)
+    >>> # should be equivalent to nth-last-child()
+    >>> pcss('li:nth-last-child()', 'li:nth-last-child(0)')
+    seventh-li
+    >>> pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)')
+    second-li, fourth-li, sixth-li
+    >>> pcss('li:nth-last-child(2n+2)')
+    second-li, fourth-li
+    >>> pcss('ol:first-of-type')
+    first-ol
+    >>> pcss('ol:nth-child(1)')
+    empty
+    >>> pcss('ol:nth-of-type(2)')
+    second-ol
+    >>> # FIXME: like above, (1) or (2)?
+    >>> pcss('ol:nth-last-of-type(1)')
+    first-ol
+    >>> pcss('span:only-child')
+    foobar-span
+    >>> pcss('li div:only-child')
+    li-div
+    >>> pcss('div *:only-child')
+    foobar-span
+    >>> pcss('p *:only-of-type')
+    p-em
+    >>> pcss('p:only-of-type')
+    paragraph
+    >>> pcss('a:empty')
+    name-anchor
+    >>> pcss('li:empty')
+    third-li, fourth-li, fifth-li, sixth-li
+    >>> pcss('*:contains("link")')
+    nil, nil, outer-div, tag-anchor, nofollow-anchor
+    >>> pcss('*:contains("E")')
+    nil, nil, outer-div, first-ol, first-li, paragraph, p-em
+    >>> pcss('.a', '.b', '*.a', 'ol.a')
+    first-ol
+    >>> pcss('.c', '*.c')
+    first-ol, third-li, fourth-li
+    >>> pcss('ol *.c', 'ol li.c', 'ol ~ li.c', 'ol > li.c')
+    third-li, fourth-li
+    >>> pcss('#first-li', 'li#first-li', '*#first-li')
+    first-li
+    >>> # Need some tests of :not()
+    >>> pcss('li div', 'li > div', 'div div')
+    li-div
+    >>> pcss('div > div')
+    empty
+    >>> pcss('div + div')
+    foobar-div
+    >>> pcss('a ~ a')
+    tag-anchor, nofollow-anchor
+    >>> pcss('a[rel="tag"] ~ a')
+    nofollow-anchor
+    >>> pcss('ol#first-ol li:last-child', 'ol#first-ol *:last-child')
+    seventh-li
+    
\ No newline at end of file


More information about the lxml-checkins mailing list