[py-svn] r35303 - in py/dist/py/apigen/source: . testing

guido at codespeak.net guido at codespeak.net
Tue Dec 5 15:41:47 CET 2006


Author: guido
Date: Tue Dec  5 15:41:44 2006
New Revision: 35303

Added:
   py/dist/py/apigen/source/color.py
   py/dist/py/apigen/source/testing/test_color.py
Log:
Added a simple tokenizer for syntax coloring. The tokenizer keeps state between
calls, so it supports constructs that span several lines, such as multi-line strings.


Added: py/dist/py/apigen/source/color.py
==============================================================================
--- (empty file)
+++ py/dist/py/apigen/source/color.py	Tue Dec  5 15:41:44 2006
@@ -0,0 +1,150 @@
+""" simple Python syntax coloring """
+
+import re
+
+class PythonSchema(object):
+    """ contains information for syntax coloring """
+    comment = [('#', '\n')]
+    multiline_string = ['"""', "'''"]
+    string = ['"""', "'''", '"', "'"]
+    # XXX not complete
+    keyword = ['for', 'if', 'not', 'then', 'else', 'while', 'from', 'import',
+               'try', 'except', 'finally', 'raise', 'print', 'exec', 'eval',
+               'break', 'in', 'assert', 'None']
+    alt_keyword = ['def', 'class', 'return']
+
+class Token(object):
+    data = None
+    type = 'unknown'
+
+    def __init__(self, data, type='unknown'):
+        self.data = data
+        self.type = type
+
+    def __repr__(self):
+        return '<Token type="%s" %r>' % (self.type, self.data)
+
+    def __eq__(self, other):
+        return self.data == other.data and self.type == other.type
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+class Tokenizer(object):
+    """ when fed strings (e.g. one line at a time), it will return tokens with type info
+    
+        very simple tokenizer, state is recorded for multi-line strings, etc.
+    """
+
+    _re_word = re.compile('[\w_]+')
+    _re_space = re.compile('\s+')
+    _re_number = re.compile('[\d\.]*\d+')
+    _re_rest = re.compile('[^\w\s\d]+')
+
+    # these will be filled using the schema
+    _re_strings_full = None
+    _re_strings_multiline = None
+    _re_strings_comments = None
+
+    def __init__(self, schema):
+        self.schema = schema
+        self._inside_multiline = False
+        
+        self._re_strings_full = []
+        self._re_strings_multiline = []
+        for d in schema.string + schema.multiline_string:
+            self._re_strings_full.append(re.compile('%s.*?%s' % (d, d)))
+        for d in schema.multiline_string:
+            self._re_strings_multiline.append((re.compile('%s.*' % (d,), re.S),
+                                               re.compile('.*?%s' % (d,))))
+        # no multi-line comments in Python... phew :)
+        self._re_comments = []
+        for start, end in schema.comment:
+            self._re_comments.append(re.compile('%s.*?%s' % (start, end)))
+
+    def tokenize(self, data):
+        if self._inside_multiline:
+            m = self._inside_multiline.match(data)
+            if not m:
+                yield Token(data, 'string')
+                data = ''
+            else:
+                s = m.group(0)
+                data = data[len(s):]
+                self._inside_multiline = False
+                yield Token(s, 'string')
+        while data:
+            for f in [self._check_multiline_strings, self._check_full_strings,
+                      self._check_comments, self._check_word,
+                      self._check_space, self._check_number, self._check_rest]:
+                data, t = f(data)
+                if t:
+                    yield t
+                    break
+            else:
+                raise ValueError(
+                        'no token found in %r (bug in tokenizer)' % (data,))
+                
+    def _check_full_strings(self, data):
+        token = None
+        for r in self._re_strings_full:
+            m = r.match(data)
+            if m:
+                s = m.group(0)
+                data = data[len(s):]
+                token = Token(s, type='string')
+                break
+        return data, token
+
+    def _check_multiline_strings(self, data):
+        token = None
+        for start, end in self._re_strings_multiline:
+            m = start.match(data)
+            if m:
+                s = m.group(0)
+                data = ''
+                self._inside_multiline = end
+                token = Token(s, 'string')
+                break
+        return data, token
+
+    def _check_comments(self, data):
+        # fortunately we don't have to deal with multi-line comments
+        token = None
+        for r in self._re_comments:
+            m = r.match(data)
+            if m:
+                s = m.group(0)
+                data = data[len(s):]
+                token = Token(s, 'comment')
+                break
+        return data, token
+
+    def _check_word(self, data):
+        m = self._re_word.match(data)
+        if m:
+            s = m.group(0)
+            return data[len(s):], Token(s, 'word')
+        return data, None
+
+    def _check_space(self, data):
+        m = self._re_space.match(data)
+        if m:
+            s = m.group(0)
+            return data[len(s):], Token(s, 'whitespace')
+        return data, None
+
+    def _check_number(self, data):
+        m = self._re_number.match(data)
+        if m:
+            s = m.group(0)
+            return data[len(s):], Token(s, 'number')
+        return data, None
+
+    def _check_rest(self, data):
+        m = self._re_rest.match(data)
+        if m:
+            s = m.group(0)
+            return data[len(s):], Token(s, 'unknown')
+        return data, None
+

Added: py/dist/py/apigen/source/testing/test_color.py
==============================================================================
--- (empty file)
+++ py/dist/py/apigen/source/testing/test_color.py	Tue Dec  5 15:41:44 2006
@@ -0,0 +1,47 @@
+import py
+from py.__.apigen.source.color import Tokenizer, Token, PythonSchema
+
+class TestTokenizer(object):
+    def tokens(self, data):
+        t = Tokenizer(PythonSchema)
+        return list(t.tokenize(data))
+
+    def test_word(self):
+        assert self.tokens('foo') == [Token('foo', type='word')]
+        assert self.tokens('_1_word') == [Token('_1_word', type='word')]
+
+    def test_space(self):
+        assert self.tokens(' ') == [Token(' ', type='whitespace')]
+        assert self.tokens(' \n') == [Token(' \n', type='whitespace')]
+
+    def test_printable(self):
+        assert self.tokens('.') == [Token('.', 'unknown')]
+        assert self.tokens(';#$@\n') == [Token(';#$@', type='unknown'),
+                                         Token('\n', type='whitespace')]
+
+    def test_comment(self):
+        assert self.tokens('# foo\n') == [Token('# foo\n', type='comment')]
+        assert self.tokens('foo # bar\n') == [Token('foo', type='word'),
+                                              Token(' ', type='whitespace'),
+                                              Token('# bar\n', type='comment')]
+
+    def test_string_simple(self):
+        assert self.tokens('"foo"') == [Token('"foo"', type='string')]
+        assert self.tokens('"foo"\'bar\'') == [Token('"foo"', type='string'),
+                                               Token("'bar'", type='string')]
+
+    def test_string_escape(self):
+        py.test.skip('not yet implemented')
+        assert self.tokens('"foo \\" bar"') == [Token('"foo \\" bar"',
+                                                      type='string')]
+    def test_string_multiline(self):
+        t = Tokenizer(PythonSchema)
+        res = list(t.tokenize('"""foo\n'))
+        assert res == [Token('"""foo\n', type='string')]
+        res = list(t.tokenize('bar\n'))
+        print res
+        assert res == [Token('bar\n', type='string')]
+        res = list(t.tokenize('"""\n'))
+        assert res == [Token('"""', type='string'),
+                       Token('\n', type='whitespace')]
+


More information about the py-svn mailing list