[Lxml-checkins] r44752 - in lxml/branch/html/src/lxml: . html

scoder at codespeak.net scoder at codespeak.net
Thu Jul 5 21:45:25 CEST 2007


Author: scoder
Date: Thu Jul  5 21:45:24 2007
New Revision: 44752

Added:
   lxml/branch/html/src/lxml/cssselect.py
      - copied unchanged from r44751, lxml/branch/html/src/lxml/html/css.py
Removed:
   lxml/branch/html/src/lxml/html/css.py
Log:
renamed lxml.html.css to lxml.cssselect

Deleted: /lxml/branch/html/src/lxml/html/css.py
==============================================================================
--- /lxml/branch/html/src/lxml/html/css.py	Thu Jul  5 21:45:24 2007
+++ (empty file)
@@ -1,905 +0,0 @@
-import re
-from lxml import etree
-
-__all__ = ['SelectorSyntaxError', 'ExpressionError',
-           'CSSSelector']
-
-class SelectorSyntaxError(SyntaxError):
-    """
-    Raised when a CSS selector string cannot be tokenized or parsed.
-    """
-
-class ExpressionError(RuntimeError):
-    """
-    Raised when a parsed selector cannot be translated to XPath, e.g. for
-    an unsupported or unknown pseudo-class or an unknown combinator.
-    """
-
-class CSSSelector(etree.XPath):
-    """
-    An etree.XPath evaluator built from a CSS selector.
-
-    The selector is translated with css_to_xpath(); the original selector
-    is kept in ``self.css``.
-    """
-
-    def __init__(self, css):
-        etree.XPath.__init__(self, css_to_xpath(css))
-        self.css = css
-
-    def __repr__(self):
-        # Hex form of the object id, without the leading '0x'.
-        ident = hex(abs(id(self)))[2:]
-        return '<%s %s for %r>' % (self.__class__.__name__, ident, self.css)
-
-##############################
-## Token objects:
-
-class _UniToken(unicode):
-    """
-    A unicode string that also carries the position (``pos``) it was
-    created with.
-    """
-
-    def __new__(cls, contents, pos):
-        token = unicode.__new__(cls, contents)
-        token.pos = pos
-        return token
-
-    def __repr__(self):
-        text = unicode.__repr__(self)
-        return '%s(%s, %r)' % (self.__class__.__name__, text, self.pos)
-
-class Symbol(_UniToken):
-    """
-    Token produced by the tokenizer for identifiers and an+b series.
-    """
-
-class String(_UniToken):
-    """
-    Token produced by the tokenizer for quoted strings.
-    """
-
-class Token(_UniToken):
-    """
-    Token produced by the tokenizer for punctuation and operators.
-    """
-
-############################################################
-## Parsing
-############################################################
-
-##############################
-## Syntax objects:
-
-class Class(object):
-    """
-    Syntax node for ``selector.class_name``.
-    """
-
-    def __init__(self, selector, class_name):
-        self.selector, self.class_name = selector, class_name
-
-    def __repr__(self):
-        node = self.__class__.__name__
-        return '%s[%r.%s]' % (node, self.selector, self.class_name)
-
-    def xpath(self):
-        result = self.selector.xpath()
-        # Both @class and the wanted name are padded with spaces so that
-        # only a whole whitespace-separated word can match.
-        padded = xpath_repr(' ' + self.class_name + ' ')
-        result.add_condition(
-            "contains(concat(' ', normalize-space(@class), ' '), %s)" % padded)
-        return result
-
-class Function(object):
-    """
-    Represents selector:name(expr)
-
-    xpath() looks up a method called ``_xpath_<name>`` (dashes in the
-    name become underscores) and calls it with the XPathExpr of the
-    wrapped selector and the parsed argument ``expr``.
-    """
-
-    unsupported = [
-        'target', 'lang', 'enabled', 'disabled',]
-
-    def __init__(self, selector, type, name, expr):
-        self.selector = selector
-        self.type = type
-        self.name = name
-        self.expr = expr
-
-    def __repr__(self):
-        return '%s[%r%s%s(%r)]' % (
-            self.__class__.__name__,
-            self.selector,
-            self.type, self.name, self.expr)
-
-    def xpath(self):
-        """
-        Returns the XPathExpr of the selector with this function applied.
-
-        Raises ExpressionError when the function name is listed in
-        ``unsupported`` or has no ``_xpath_<name>`` method.
-        """
-        sel_path = self.selector.xpath()
-        if self.name in self.unsupported:
-            raise ExpressionError(
-                "The pseudo-class %r is not supported" % self.name)
-        method = '_xpath_' + self.name.replace('-', '_')
-        if not hasattr(self, method):
-            raise ExpressionError(
-                "The pseudo-class %r is unknown" % self.name)
-        method = getattr(self, method)
-        return method(sel_path, self.expr)
-
-    def _xpath_nth_child(self, xpath, expr, last=False,
-                         add_name_test=True):
-        """
-        Adds the positional condition for an ``an+b`` series to ``xpath``.
-
-        ``expr`` is turned into (a, b) by parse_series():
-          an+b means every a-th element, offset by b, e.g. 2n+1 means odd
-          0n+b means just element b
-          n+0 means a=1, i.e. all elements
-          an means every a-th element, i.e. 2n means even
-          -n means -1n
-          -1n+6 means element 6 and the previous ones
-
-        ``last`` counts from the end of the sibling list; with
-        ``add_name_test`` the element name is turned into a name()
-        condition so that position() counts all sibling elements.
-        """
-        a, b = parse_series(expr)
-        if not a and not b and not last:
-            # a=0 means nothing is returned...
-            xpath.add_condition('false() and position() = 0')
-            return xpath
-        if add_name_test:
-            xpath.add_name_test()
-        xpath.add_star_prefix()
-        if a == 0:
-            if last:
-                # Positions are 1-based from the end: b=1 is the last
-                # element, i.e. position() = last() - 0.  b=0 yields
-                # last() - -1, which never matches.
-                b = 'last() - %s' % (b - 1)
-            xpath.add_condition('position() = %s' % b)
-            return xpath
-        if last:
-            # FIXME: I'm not sure if this is right
-            a = -a
-            b = -b
-        if b > 0:
-            b_neg = str(-b)
-        else:
-            b_neg = '+%s' % (-b)
-        if a != 1:
-            expr = ['(position() %s) mod %s = 0' % (b_neg, a)]
-        else:
-            expr = []
-        if b >= 0:
-            expr.append('position() >= %s' % b)
-        elif b < 0 and last:
-            expr.append('position() < (last() %s)' % b)
-        expr = ' and '.join(expr)
-        if expr:
-            xpath.add_condition(expr)
-        return xpath
-
-    def _xpath_nth_last_child(self, xpath, expr):
-        return self._xpath_nth_child(xpath, expr, last=True)
-
-    def _xpath_nth_of_type(self, xpath, expr):
-        if xpath.element == '*':
-            raise NotImplementedError(
-                "*:nth-of-type() is not implemented")
-        return self._xpath_nth_child(xpath, expr, add_name_test=False)
-
-    def _xpath_nth_last_of_type(self, xpath, expr):
-        # Same restriction as nth-of-type: without an element name the
-        # position would be counted over all siblings, not per type.
-        if xpath.element == '*':
-            raise NotImplementedError(
-                "*:nth-last-of-type() is not implemented")
-        return self._xpath_nth_child(xpath, expr, last=True, add_name_test=False)
-
-    def _xpath_contains(self, xpath, expr):
-        # text content, minus tags, must contain expr
-        if isinstance(expr, Element):
-            expr = expr._format_element()
-        xpath.add_condition('contains(css:lower-case(string(.)), %s)'
-                            % xpath_repr(expr.lower()))
-        # FIXME: Currently case insensitive matching doesn't seem to be happening
-        return xpath
-
-    def _xpath_not(self, xpath, expr):
-        # everything for which not expr applies
-        expr = expr.xpath()
-        cond = expr.condition
-        # FIXME: should I do something about element_path?
-        xpath.add_condition('not(%s)' % cond)
-        return xpath
-
-def _make_lower_case(context, s):
-    """
-    Returns ``s`` converted to lower case; ``context`` is not used.
-    """
-    lowered = s.lower()
-    return lowered
-
-# Register _make_lower_case under the name 'lower-case' in a function
-# namespace with the prefix 'css', i.e. css:lower-case() in XPath.
-ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
-ns.prefix = 'css'
-ns['lower-case'] = _make_lower_case
-
-class Pseudo(object):
-    """
-    Represents selector:ident
-
-    xpath() looks up a method called ``_xpath_<ident>`` (dashes in the
-    identifier become underscores) and calls it with the XPathExpr of
-    the wrapped element.
-    """
-
-    unsupported = ['indeterminate', 'first-line', 'first-letter',
-                   'selection', 'before', 'after', 'link', 'visited',
-                   'active', 'focus', 'hover']
-
-    def __init__(self, element, type, ident):
-        self.element = element
-        assert type in (':', '::')
-        self.type = type
-        self.ident = ident
-
-    def __repr__(self):
-        return '%s[%r%s%s]' % (
-            self.__class__.__name__,
-            self.element,
-            self.type, self.ident)
-
-    def xpath(self):
-        """
-        Returns the XPathExpr of the element with this pseudo-class applied.
-
-        Raises ExpressionError when the identifier is listed in
-        ``unsupported`` or has no ``_xpath_<ident>`` method.
-        """
-        el_xpath = self.element.xpath()
-        if self.ident in self.unsupported:
-            raise ExpressionError(
-                "The pseudo-class %r is unsupported" % self.ident)
-        method = '_xpath_' + self.ident.replace('-', '_')
-        if not hasattr(self, method):
-            raise ExpressionError(
-                "The pseudo-class %r is unknown" % self.ident)
-        method = getattr(self, method)
-        el_xpath = method(el_xpath)
-        return el_xpath
-
-    def _xpath_checked(self, xpath):
-        # FIXME: is this really all the elements?
-        xpath.add_condition("(@selected or @checked) and (name(.) = 'input' or name(.) = 'option')")
-        return xpath
-
-    def _xpath_root(self, xpath):
-        # The root element is the only element without a parent element
-        # (its parent is the document node, which parent::* does not match).
-        xpath.add_condition("not(parent::*)")
-        return xpath
-
-    def _xpath_first_child(self, xpath):
-        xpath.add_star_prefix()
-        xpath.add_name_test()
-        xpath.add_condition('position() = 1')
-        return xpath
-
-    def _xpath_last_child(self, xpath):
-        xpath.add_star_prefix()
-        xpath.add_name_test()
-        xpath.add_condition('position() = last()')
-        return xpath
-
-    def _xpath_first_of_type(self, xpath):
-        if xpath.element == '*':
-            raise NotImplementedError(
-                "*:first-of-type is not implemented")
-        xpath.add_star_prefix()
-        xpath.add_condition('position() = 1')
-        return xpath
-
-    def _xpath_last_of_type(self, xpath):
-        if xpath.element == '*':
-            raise NotImplementedError(
-                "*:last-of-type is not implemented")
-        xpath.add_star_prefix()
-        xpath.add_condition('position() = last()')
-        return xpath
-
-    def _xpath_only_child(self, xpath):
-        xpath.add_name_test()
-        xpath.add_star_prefix()
-        xpath.add_condition('last() = 1')
-        return xpath
-
-    def _xpath_only_of_type(self, xpath):
-        if xpath.element == '*':
-            raise NotImplementedError(
-                "*:only-of-type is not implemented")
-        # Like first/last-of-type: the star prefix keeps last() scoped to
-        # the children of a single parent.
-        xpath.add_star_prefix()
-        xpath.add_condition('last() = 1')
-        return xpath
-
-    def _xpath_empty(self, xpath):
-        xpath.add_condition("not(*) and not(normalize-space())")
-        return xpath
-
-class Attrib(object):
-    """
-    Represents selector[namespace|attrib operator value]
-
-    A namespace of '*' means that no namespace was given.  The operator
-    'exists' stands for a bare ``[attrib]`` test without a value.
-    """
-
-    def __init__(self, selector, namespace, attrib, operator, value):
-        self.selector = selector
-        self.namespace = namespace
-        self.attrib = attrib
-        self.operator = operator
-        self.value = value
-
-    def __repr__(self):
-        if self.operator == 'exists':
-            return '%s[%r[%s]]' % (
-                self.__class__.__name__,
-                self.selector,
-                self._format_attrib())
-        else:
-            return '%s[%r[%s %s %r]]' % (
-                self.__class__.__name__,
-                self.selector,
-                self._format_attrib(),
-                self.operator,
-                self.value)
-
-    def _format_attrib(self):
-        if self.namespace == '*':
-            return self.attrib
-        else:
-            return '%s|%s' % (self.namespace, self.attrib)
-
-    def _xpath_attrib(self):
-        # FIXME: if attrib is *?
-        if self.namespace == '*':
-            return '@' + self.attrib
-        else:
-            return '@%s:%s' % (self.namespace, self.attrib)
-
-    def xpath(self):
-        """
-        Returns the XPathExpr of the selector with the attribute test added.
-
-        Raises ExpressionError for an operator that is not one of
-        exists, =, !=, ~=, |=, ^=, $= and *=.
-        """
-        path = self.selector.xpath()
-        attrib = self._xpath_attrib()
-        value = self.value
-        if self.operator == 'exists':
-            assert not value
-            path.add_condition(attrib)
-        elif self.operator == '=':
-            path.add_condition('%s = %s' % (attrib,
-                                            xpath_repr(value)))
-        elif self.operator == '!=':
-            # FIXME: this seems like a weird hack...
-            if value:
-                path.add_condition('not(%s) or %s != %s'
-                                   % (attrib, attrib, xpath_repr(value)))
-            else:
-                path.add_condition('%s != %s'
-                                   % (attrib, xpath_repr(value)))
-        elif self.operator == '~=':
-            path.add_condition("contains(concat(' ', normalize-space(%s), ' '), %s)" % (attrib, xpath_repr(' '+value+' ')))
-        elif self.operator == '|=':
-            # Weird, but true...
-            path.add_condition('%s = %s or starts-with(%s, %s)' % (
-                attrib, xpath_repr(value),
-                attrib, xpath_repr(value + '-')))
-        elif self.operator == '^=':
-            path.add_condition('starts-with(%s, %s)' % (
-                attrib, xpath_repr(value)))
-        elif self.operator == '$=':
-            # Oddly there is a starts-with in XPath 1.0, but not ends-with
-            path.add_condition('substring(%s, string-length(%s)-%s) = %s'
-                               % (attrib, attrib, len(value)-1, xpath_repr(value)))
-        elif self.operator == '*=':
-            # FIXME: case sensitive?
-            path.add_condition('contains(%s, %s)' % (
-                attrib, xpath_repr(value)))
-        else:
-            # Raise instead of assert: asserts are stripped under -O, which
-            # would silently return the path without any attribute test.
-            raise ExpressionError(
-                "Unknown operator: %r" % self.operator)
-        return path
-
-class Element(object):
-    """
-    Syntax node for ``namespace|element``; the namespace '*' is treated
-    as "no namespace given".
-    """
-
-    def __init__(self, namespace, element):
-        self.namespace, self.element = namespace, element
-
-    def __repr__(self):
-        return '%s[%s]' % (self.__class__.__name__, self._format_element())
-
-    def _format_element(self):
-        if self.namespace == '*':
-            return self.element
-        return '%s|%s' % (self.namespace, self.element)
-
-    def xpath(self):
-        if self.namespace == '*':
-            name = self.element.lower()
-        else:
-            # FIXME: Should we lowercase here?
-            name = '%s:%s' % (self.namespace, self.element)
-        return XPathExpr(element=name)
-
-class Hash(object):
-    """
-    Syntax node for ``selector#id``.
-    """
-
-    def __init__(self, selector, id):
-        self.selector, self.id = selector, id
-
-    def __repr__(self):
-        node = self.__class__.__name__
-        return '%s[%r#%s]' % (node, self.selector, self.id)
-
-    def xpath(self):
-        result = self.selector.xpath()
-        id_test = '@id = %s' % xpath_repr(self.id)
-        result.add_condition(id_test)
-        return result
-
-class Or(object):
-    """
-    Syntax node for a comma separated group of selectors.
-    """
-
-    def __init__(self, items):
-        self.items = items
-
-    def __repr__(self):
-        return '%s(%r)' % (self.__class__.__name__, self.items)
-
-    def xpath(self):
-        paths = []
-        for item in self.items:
-            paths.append(item.xpath())
-        return XPathExprOr(paths)
-
-class CombinedSelector(object):
-    """
-    Syntax node for two selectors joined by a combinator
-    (' ', '>', '+' or '~').
-    """
-
-    _method_mapping = {
-        ' ': 'descendant',
-        '>': 'child',
-        '+': 'direct_adjacent',
-        '~': 'indirect_adjacent',
-        }
-
-    def __init__(self, selector, combinator, subselector):
-        assert selector is not None
-        self.selector = selector
-        self.combinator = combinator
-        self.subselector = subselector
-
-    def __repr__(self):
-        comb = self.combinator
-        if comb == ' ':
-            comb = '<followed>'
-        return '%s[%r %s %r]' % (
-            self.__class__.__name__, self.selector, comb, self.subselector)
-
-    def xpath(self):
-        suffix = self._method_mapping.get(self.combinator)
-        if suffix is None:
-            raise ExpressionError(
-                "Unknown combinator: %r" % self.combinator)
-        handler = getattr(self, '_xpath_' + suffix)
-        return handler(self.selector.xpath(), self.subselector)
-
-    def _xpath_descendant(self, xpath, sub):
-        # sub may sit anywhere below xpath
-        sub_path = sub.xpath()
-        xpath.join('/descendant::', sub_path)
-        return xpath
-
-    def _xpath_child(self, xpath, sub):
-        # sub must be an immediate child of xpath
-        sub_path = sub.xpath()
-        xpath.join('/', sub_path)
-        return xpath
-
-    def _xpath_direct_adjacent(self, xpath, sub):
-        # sub must be the very next sibling of xpath
-        sub_path = sub.xpath()
-        xpath.join('/following-sibling::', sub_path)
-        xpath.add_name_test()
-        xpath.add_condition('position() = 1')
-        return xpath
-
-    def _xpath_indirect_adjacent(self, xpath, sub):
-        # sub must be some later sibling of xpath
-        sub_path = sub.xpath()
-        xpath.join('/following-sibling::', sub_path)
-        return xpath
-
-##############################
-## XPathExpr objects:
-
-# Fast paths for the most common selectors: "tag", "tag#id" / "#id" and
-# "tag.class" / ".class".
-_el_re = re.compile(r'^\w+\s*$')
-_id_re = re.compile(r'^(\w*)#(\w+)\s*$')
-_class_re = re.compile(r'^(\w*)\.(\w+)\s*$')
-
-def css_to_xpath(css_expr, prefix='descendant-or-self::'):
-    """
-    Translates a CSS selector into an XPath expression string.
-
-    ``css_expr`` is either a selector string or an already parsed syntax
-    object with an xpath() method.  ``prefix`` is put in front of the
-    generated expression; a false value (None or '') means no prefix.
-    """
-    if isinstance(css_expr, basestring):
-        # The fast paths format the prefix directly, so a missing prefix
-        # is normalised to '' (the general path below skips it as well).
-        str_prefix = prefix or ''
-        match = _el_re.search(css_expr)
-        if match is not None:
-            return '%s%s' % (str_prefix, match.group(0).strip())
-        match = _id_re.search(css_expr)
-        if match is not None:
-            return "%s%s[@id = '%s']" % (
-                str_prefix, match.group(1) or '*', match.group(2))
-        match = _class_re.search(css_expr)
-        if match is not None:
-            return "%s%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % (
-                str_prefix, match.group(1) or '*', match.group(2))
-        css_expr = parse(css_expr)
-    expr = css_expr.xpath()
-    assert expr is not None, (
-        "Got None for xpath expression from %s" % repr(css_expr))
-    if prefix:
-        expr.add_prefix(prefix)
-    return str(expr)
-
-class XPathExpr(object):
-    """
-    An XPath expression under construction, kept as four pieces that are
-    rendered as ``prefix + path + element + [condition]``.
-    """
-
-    def __init__(self, prefix=None, path=None, element='*', condition=None,
-                 star_prefix=False):
-        self.prefix, self.path = prefix, path
-        self.element, self.condition = element, condition
-        self.star_prefix = star_prefix
-
-    def __str__(self):
-        rendered = ''
-        for piece in (self.prefix, self.path):
-            if piece is not None:
-                rendered += str(piece)
-        rendered += str(self.element)
-        if self.condition:
-            rendered += '[%s]' % self.condition
-        return rendered
-
-    def __repr__(self):
-        return '%s[%s]' % (self.__class__.__name__, self)
-
-    def add_condition(self, condition):
-        # Further conditions are and-ed onto the existing one.
-        if not self.condition:
-            self.condition = condition
-        else:
-            self.condition = '%s and (%s)' % (self.condition, condition)
-
-    def add_path(self, part):
-        # The current element moves to the end of the path and ``part``
-        # becomes the new element.
-        if self.path is not None:
-            self.path += self.element
-        else:
-            self.path = self.element
-        self.element = part
-
-    def add_prefix(self, prefix):
-        if not self.prefix:
-            self.prefix = prefix
-        else:
-            self.prefix = prefix + self.prefix
-
-    def add_name_test(self):
-        # Turns "name[...]" into "*[name() = 'name' and ...]".
-        if self.element != '*':
-            self.add_condition("name() = %s" % xpath_repr(self.element))
-            self.element = '*'
-
-    def add_star_prefix(self):
-        """
-        Appends '*/' to the path (or starts the path with it) and records
-        that in ``star_prefix``.  Used when the context has to be
-        constrained to the children of a single parent.
-        """
-        if not self.path:
-            self.path = '*/'
-        else:
-            self.path += '*/'
-        self.star_prefix = True
-
-    def join(self, combiner, other):
-        joined = str(self) + combiner
-        tail = (other.prefix or '') + (other.path or '')
-        # A lone star prefix on the other expression is not needed once it
-        # is attached to this one, so it is dropped.
-        if other.star_prefix and tail == '*/':
-            tail = ''
-        self.prefix, self.path = joined, tail
-        self.element, self.condition = other.element, other.condition
-
-class XPathExprOr(XPathExpr):
-
-    """
-    Represents |'d (or'd) expressions.  Note that unfortunately it isn't
-    the union, it's the sum, so duplicate elements will appear.
-    """
-
-    def __init__(self, items, prefix=None):
-        for entry in items:
-            assert entry is not None
-        self.items, self.prefix = items, prefix
-
-    def __str__(self):
-        # The prefix is repeated in front of every alternative.
-        lead = self.prefix or ''
-        alternatives = []
-        for entry in self.items:
-            alternatives.append(lead + str(entry))
-        return ' | '.join(alternatives)
-
-def xpath_repr(s):
-    """
-    Returns ``s`` as an XPath 1.0 string expression.
-
-    XPath literals have no escape mechanism: a literal is delimited by
-    one kind of quote and may not contain that quote.  Text containing
-    both kinds of quotes is therefore built with concat().
-    """
-    if isinstance(s, Element):
-        # This is probably a symbol that looks like an expression...
-        s = s._format_element()
-    s = str(s)
-    if "'" not in s:
-        return "'%s'" % s
-    if '"' not in s:
-        return '"%s"' % s
-    # Both quote characters occur: split at the single quotes and glue
-    # the pieces back together with a double-quoted "'" in between.
-    pieces = ["'%s'" % piece for piece in s.split("'")]
-    return "concat(%s)" % ", \"'\", ".join(pieces)
-
-##############################
-## Parsing functions
-
-def parse(string):
-    """
-    Parses a CSS selector string into a tree of syntax objects.
-
-    A SelectorSyntaxError raised while parsing is re-raised with the
-    tokens consumed so far and the remaining tokens added to its message.
-    """
-    stream = TokenStream(tokenize(string))
-    stream.source = string
-    try:
-        return parse_selector_group(stream)
-    except SelectorSyntaxError, e:
-        # Copy the consumed tokens first: draining the stream below appends
-        # every remaining token to stream.used as well.
-        used = list(stream.used)
-        remaining = list(stream)
-        e.args = tuple(["%s at %s -> %s" % (e, used, remaining)])
-        raise
-
-def parse_selector_group(stream):
-    """
-    Parses one or more comma separated selectors.  Returns the single
-    selector, or an Or object when there are several.
-    """
-    selectors = [parse_selector(stream)]
-    while stream.peek() == ',':
-        stream.next()
-        selectors.append(parse_selector(stream))
-    if len(selectors) != 1:
-        return Or(selectors)
-    return selectors[0]
-
-def parse_selector(stream):
-    """
-    Parses a chain of simple selectors joined by combinators, stopping at
-    ',', ')' or the end of the stream.
-    """
-    selector = parse_simple_selector(stream)
-    while 1:
-        upcoming = stream.peek()
-        if upcoming is None or upcoming == ',' or upcoming == ')':
-            return selector
-        if upcoming in ('+', '>', '~'):
-            # An explicit combinator
-            combinator = stream.next()
-        else:
-            # No combinator token: descendant combinator
-            combinator = ' '
-        following = parse_simple_selector(stream)
-        selector = CombinedSelector(selector, combinator, following)
-
-def parse_simple_selector(stream):
-    """
-    Parses one simple selector: an optional [namespace|]element followed
-    by any number of #id, .class, [attrib] and :pseudo or :function(arg)
-    parts.  Returns the resulting syntax object.
-
-    Raises SelectorSyntaxError on unexpected tokens.
-    """
-    peek = stream.peek()
-    if peek != '*' and not isinstance(peek, Symbol):
-        element = namespace = '*'
-    else:
-        next = stream.next()
-        if next != '*' and not isinstance(next, Symbol):
-            raise SelectorSyntaxError(
-                "Expected symbol, got %r" % next)
-        if stream.peek() == '|':
-            namespace = next
-            stream.next()
-            element = stream.next()
-            # Validate the element name itself (not the namespace token
-            # that was already checked above).
-            if element != '*' and not isinstance(element, Symbol):
-                raise SelectorSyntaxError(
-                    "Expected symbol, got %r" % element)
-        else:
-            namespace = '*'
-            element = next
-    result = Element(namespace, element)
-    has_hash = False
-    while 1:
-        peek = stream.peek()
-        if peek == '#':
-            if has_hash:
-                # You can't have two hashes
-                # (FIXME: is there some more general rule I'm missing?)
-                break
-            stream.next()
-            result = Hash(result, stream.next())
-            has_hash = True
-            continue
-        elif peek == '.':
-            stream.next()
-            result = Class(result, stream.next())
-            continue
-        elif peek == '[':
-            stream.next()
-            result = parse_attrib(result, stream)
-            next = stream.next()
-            if not next == ']':
-                raise SelectorSyntaxError(
-                    "] expected, got %r" % next)
-            continue
-        elif peek == ':' or peek == '::':
-            type = stream.next()
-            ident = stream.next()
-            if not isinstance(ident, Symbol):
-                raise SelectorSyntaxError(
-                    "Expected symbol, got %r" % ident)
-            if stream.peek() == '(':
-                stream.next()
-                peek = stream.peek()
-                if isinstance(peek, String):
-                    selector = stream.next()
-                elif isinstance(peek, Symbol) and is_int(peek):
-                    selector = int(stream.next())
-                else:
-                    # FIXME: parse_simple_selector, or selector, or...?
-                    selector = parse_simple_selector(stream)
-                # The closing parenthesis has to be consumed for every
-                # kind of argument; otherwise it stays in the stream and
-                # ends the parse of the whole selector early.
-                next = stream.next()
-                if not next == ')':
-                    raise SelectorSyntaxError(
-                        "Expected ), got %r and %r"
-                        % (next, selector))
-                result = Function(result, type, ident, selector)
-            else:
-                result = Pseudo(result, type, ident)
-            continue
-        else:
-            break
-        # FIXME: not sure what "negation" is
-    return result
-
-def is_int(v):
-    """
-    Returns True if int(v) succeeds, False if it raises ValueError.
-    """
-    try:
-        int(v)
-        return True
-    except ValueError:
-        return False
-
-def parse_attrib(selector, stream):
-    """
-    Parses the inside of an attribute selector, i.e.
-    ``[namespace|]attrib [operator value]``, and returns an Attrib object.
-    The closing ']' is left in the stream.
-    """
-    name = stream.next()
-    namespace = '*'
-    if stream.peek() == '|':
-        namespace = name
-        stream.next()
-        name = stream.next()
-    if stream.peek() == ']':
-        # Bare [attrib]: only test for existence
-        return Attrib(selector, namespace, name, 'exists', None)
-    op = stream.next()
-    if op not in ('^=', '$=', '*=', '=', '~=', '|=', '!='):
-        raise SelectorSyntaxError(
-            "Operator expected, got %r" % op)
-    value = stream.next()
-    if not isinstance(value, (Symbol, String)):
-        raise SelectorSyntaxError(
-            "Expected string or symbol, got %r" % value)
-    return Attrib(selector, namespace, name, op, value)
-
-def parse_series(s):
-    """
-    Parses things like '1n+2', or 'an+b' generally, returning (a, b)
-    """
-    def coefficient(text, default):
-        # '' -> default, a bare sign -> +1 / -1, anything else -> int()
-        if not text:
-            return default
-        if text == '-' or text == '+':
-            return int(text + '1')
-        return int(text)
-
-    if isinstance(s, Element):
-        s = s._format_element()
-    if not s or s == '*':
-        # Happens when there's nothing, which the CSS parser thinks of as *
-        return (0, 0)
-    if isinstance(s, int):
-        # Happens when you just get a number
-        return (0, s)
-    if s == 'odd':
-        return (2, 1)
-    if s == 'even':
-        return (2, 0)
-    if s == 'n':
-        return (1, 0)
-    if 'n' not in s:
-        # Just a b
-        return (0, int(s))
-    head, tail = s.split('n', 1)
-    a = coefficient(head, 1)
-    b = coefficient(tail, 0)
-    return (a, b)
-    
-
-############################################################
-## Tokenizing
-############################################################
-
-# A run of whitespace.
-_whitespace_re = re.compile(r'\s+')
-
-# A /* ... */ comment; re.S lets it span several lines.
-_comment_re = re.compile(r'/\*.*?\*/', re.S)
-
-# An an+b series such as 2n+1, -n+6 or 3n.
-_count_re = re.compile(r'[+-]?\d*n(?:[+-]\d+)?')
-
-def tokenize(s):
-    """
-    Generator that splits the selector string ``s`` into Symbol, Token
-    and String objects, each created with its position in the string
-    after comments have been removed.  Whitespace is skipped.
-    """
-    pos = 0
-    # Positions refer to the string with the comments already removed.
-    s = _comment_re.sub('', s)
-    while 1:
-        match = _whitespace_re.match(s, pos=pos)
-        if match:
-            pos = match.end()
-        if pos >= len(s):
-            return
-        # An an+b series becomes one Symbol.  A match that is exactly 'n'
-        # is skipped and handled further down like any other text.
-        match = _count_re.match(s, pos=pos)
-        if match and match.group() != 'n':
-            sym = s[pos:match.end()]
-            yield Symbol(sym, pos)
-            pos = match.end()
-            continue
-        c = s[pos]
-        c2 = s[pos:pos+2]
-        # Two character operators are tested before single characters.
-        if c2 in ('~=', '|=', '^=', '$=', '*=', '::', '!='):
-            yield Token(c2, pos)
-            pos += 2
-            continue
-        if c in '>+~,.*=[]()|:#':
-            yield Token(c, pos)
-            pos += 1
-            continue
-        if c == '"' or c == "'":
-            # Quoted string
-            old_pos = pos
-            sym, pos = tokenize_escaped_string(s, pos)
-            yield String(sym, old_pos)
-            continue
-        # Anything else is read as a symbol.
-        old_pos = pos
-        sym, pos = tokenize_symbol(s, pos)
-        yield Symbol(sym, old_pos)
-        continue
-
-def tokenize_escaped_string(s, pos):
-    """
-    Reads the quoted string that starts at ``s[pos]``.  Returns the
-    decoded contents and the position just after the closing quote.
-
-    Raises SelectorSyntaxError if no closing quote is found.
-    """
-    delimiter = s[pos]
-    assert delimiter in ('"', "'")
-    start = search_from = pos + 1
-    while 1:
-        end = s.find(delimiter, search_from)
-        if end == -1:
-            raise SelectorSyntaxError(
-                "Expected closing %s for string in: %r"
-                % (delimiter, s[start:]))
-        try:
-            decoded = s[start:end].decode('unicode_escape')
-        except UnicodeDecodeError:
-            # Probably a hanging \ (an escaped quote): look for the next
-            # quote character instead
-            search_from = end + 1
-            continue
-        return decoded, end + 1
-    
-# Any character that cannot be part of a symbol: everything except word
-# characters, backslash and dash.
-_illegal_symbol = re.compile(r'[^\w\\-]', re.UNICODE)
-
-def tokenize_symbol(s, pos):
-    """
-    Reads the symbol that starts at ``s[pos]``.  Returns the symbol and
-    the position just after it.
-
-    Raises SelectorSyntaxError if ``s[pos]`` cannot start a symbol or if
-    the symbol contains a bad escape sequence.
-    """
-    start = pos
-    match = _illegal_symbol.search(s, pos=pos)
-    if not match:
-        # Goes to end of s
-        return s[start:], len(s)
-    if match.start() == pos:
-        # Raise instead of assert: with asserts stripped (-O) an empty
-        # symbol would be returned without advancing the position.
-        raise SelectorSyntaxError(
-            "Unexpected symbol: %r at %s" % (s[pos], pos))
-    result = s[start:match.start()]
-    pos = match.start()
-    try:
-        result = result.decode('unicode_escape')
-    except UnicodeDecodeError, e:
-        raise SelectorSyntaxError(
-            "Bad symbol %r: %s" % (result, e))
-    return result, pos
-
-class TokenStream(object):
-    """
-    Wraps a token iterator with one token of lookahead.  Tokens handed
-    out by next() are recorded in ``used``; the end of the stream is
-    signalled by None.
-    """
-
-    def __init__(self, tokens, source=None):
-        self.used = []
-        self.tokens = iter(tokens)
-        self.source = source
-        self.peeked = None
-        self._peeking = False
-
-    def next(self):
-        if not self._peeking:
-            try:
-                token = self.tokens.next()
-            except StopIteration:
-                return None
-            self.used.append(token)
-            return token
-        # Hand out the token that peek() fetched earlier.
-        self._peeking = False
-        self.used.append(self.peeked)
-        return self.peeked
-
-    def __iter__(self):
-        return iter(self.next, None)
-
-    def peek(self):
-        if self._peeking:
-            return self.peeked
-        try:
-            self.peeked = self.tokens.next()
-        except StopIteration:
-            return None
-        self._peeking = True
-        return self.peeked


More information about the lxml-checkins mailing list