[py-svn] r35303 - in py/dist/py/apigen/source: . testing
guido at codespeak.net
guido at codespeak.net
Tue Dec 5 15:41:47 CET 2006
Author: guido
Date: Tue Dec 5 15:41:44 2006
New Revision: 35303
Added:
py/dist/py/apigen/source/color.py
py/dist/py/apigen/source/testing/test_color.py
Log:
Added simple tokenizer for syntax coloring. Tokenizer has support for stuff
like multiline strings.
Added: py/dist/py/apigen/source/color.py
==============================================================================
--- (empty file)
+++ py/dist/py/apigen/source/color.py Tue Dec 5 15:41:44 2006
@@ -0,0 +1,150 @@
+""" simple Python syntax coloring """
+
+import re
+
+class PythonSchema(object):
+ """ contains information for syntax coloring """
+ comment = [('#', '\n')]
+ multiline_string = ['"""', "'''"]
+ string = ['"""', "'''", '"', "'"]
+ # XXX not complete
+ keyword = ['for', 'if', 'not', 'then', 'else', 'while', 'from', 'import',
+ 'try', 'except', 'finally', 'raise', 'print', 'exec', 'eval',
+ 'break', 'in', 'assert', 'None']
+ alt_keyword = ['def', 'class', 'return']
+
+class Token(object):
+ data = None
+ type = 'unknown'
+
+ def __init__(self, data, type='unknown'):
+ self.data = data
+ self.type = type
+
+ def __repr__(self):
+ return '<Token type="%s" %r>' % (self.type, self.data)
+
+ def __eq__(self, other):
+ return self.data == other.data and self.type == other.type
+
+ def __ne__(self, other):
+ return not self.__eq__(other)
+
+class Tokenizer(object):
+ """ when fed lists strings, it will return tokens with type info
+
+ very simple tokenizer, state is recorded for multi-line strings, etc.
+ """
+
+ _re_word = re.compile('[\w_]+')
+ _re_space = re.compile('\s+')
+ _re_number = re.compile('[\d\.]*\d+')
+ _re_rest = re.compile('[^\w\s\d]+')
+
+ # these will be filled using the schema
+ _re_strings_full = None
+ _re_strings_multiline = None
+ _re_strings_comments = None
+
+ def __init__(self, schema):
+ self.schema = schema
+ self._inside_multiline = False
+
+ self._re_strings_full = []
+ self._re_strings_multiline = []
+ for d in schema.string + schema.multiline_string:
+ self._re_strings_full.append(re.compile('%s.*?%s' % (d, d)))
+ for d in schema.multiline_string:
+ self._re_strings_multiline.append((re.compile('%s.*' % (d,), re.S),
+ re.compile('.*?%s' % (d,))))
+ # no multi-line comments in Python... phew :)
+ self._re_comments = []
+ for start, end in schema.comment:
+ self._re_comments.append(re.compile('%s.*?%s' % (start, end)))
+
+ def tokenize(self, data):
+ if self._inside_multiline:
+ m = self._inside_multiline.match(data)
+ if not m:
+ yield Token(data, 'string')
+ data = ''
+ else:
+ s = m.group(0)
+ data = data[len(s):]
+ self._inside_multiline = False
+ yield Token(s, 'string')
+ while data:
+ for f in [self._check_multiline_strings, self._check_full_strings,
+ self._check_comments, self._check_word,
+ self._check_space, self._check_number, self._check_rest]:
+ data, t = f(data)
+ if t:
+ yield t
+ break
+ else:
+ raise ValueError(
+ 'no token found in %r (bug in tokenizer)' % (data,))
+
+ def _check_full_strings(self, data):
+ token = None
+ for r in self._re_strings_full:
+ m = r.match(data)
+ if m:
+ s = m.group(0)
+ data = data[len(s):]
+ token = Token(s, type='string')
+ break
+ return data, token
+
+ def _check_multiline_strings(self, data):
+ token = None
+ for start, end in self._re_strings_multiline:
+ m = start.match(data)
+ if m:
+ s = m.group(0)
+ data = ''
+ self._inside_multiline = end
+ token = Token(s, 'string')
+ break
+ return data, token
+
+ def _check_comments(self, data):
+ # fortunately we don't have to deal with multi-line comments
+ token = None
+ for r in self._re_comments:
+ m = r.match(data)
+ if m:
+ s = m.group(0)
+ data = data[len(s):]
+ token = Token(s, 'comment')
+ break
+ return data, token
+
+ def _check_word(self, data):
+ m = self._re_word.match(data)
+ if m:
+ s = m.group(0)
+ return data[len(s):], Token(s, 'word')
+ return data, None
+
+ def _check_space(self, data):
+ m = self._re_space.match(data)
+ if m:
+ s = m.group(0)
+ return data[len(s):], Token(s, 'whitespace')
+ return data, None
+
+ def _check_number(self, data):
+ m = self._re_number.match(data)
+ if m:
+ s = m.group(0)
+ return data[len(s):], Token(s, 'number')
+ return data, None
+
+ def _check_rest(self, data):
+ m = self._re_rest.match(data)
+ if m:
+ s = m.group(0)
+ return data[len(s):], Token(s, 'unknown')
+ return data, None
+
Added: py/dist/py/apigen/source/testing/test_color.py
==============================================================================
--- (empty file)
+++ py/dist/py/apigen/source/testing/test_color.py Tue Dec 5 15:41:44 2006
@@ -0,0 +1,47 @@
+import py
+from py.__.apigen.source.color import Tokenizer, Token, PythonSchema
+
+class TestTokenizer(object):
+ def tokens(self, data):
+ t = Tokenizer(PythonSchema)
+ return list(t.tokenize(data))
+
+ def test_word(self):
+ assert self.tokens('foo') == [Token('foo', type='word')]
+ assert self.tokens('_1_word') == [Token('_1_word', type='word')]
+
+ def test_space(self):
+ assert self.tokens(' ') == [Token(' ', type='whitespace')]
+ assert self.tokens(' \n') == [Token(' \n', type='whitespace')]
+
+ def test_printable(self):
+ assert self.tokens('.') == [Token('.', 'unknown')]
+ assert self.tokens(';#$@\n') == [Token(';#$@', type='unknown'),
+ Token('\n', type='whitespace')]
+
+ def test_comment(self):
+ assert self.tokens('# foo\n') == [Token('# foo\n', type='comment')]
+ assert self.tokens('foo # bar\n') == [Token('foo', type='word'),
+ Token(' ', type='whitespace'),
+ Token('# bar\n', type='comment')]
+
+ def test_string_simple(self):
+ assert self.tokens('"foo"') == [Token('"foo"', type='string')]
+ assert self.tokens('"foo"\'bar\'') == [Token('"foo"', type='string'),
+ Token("'bar'", type='string')]
+
+ def test_string_escape(self):
+ py.test.skip('not yet implemented')
+ assert self.tokens('"foo \\" bar"') == [Token('"foo \\" bar"',
+ type='string')]
+ def test_string_multiline(self):
+ t = Tokenizer(PythonSchema)
+ res = list(t.tokenize('"""foo\n'))
+ assert res == [Token('"""foo\n', type='string')]
+ res = list(t.tokenize('bar\n'))
+ print res
+ assert res == [Token('bar\n', type='string')]
+ res = list(t.tokenize('"""\n'))
+ assert res == [Token('"""', type='string'),
+ Token('\n', type='whitespace')]
+
More information about the py-svn
mailing list