[pypy-svn] r50258 - pypy/dist/pypy/rlib/parsing/test

cfbolz at codespeak.net cfbolz at codespeak.net
Wed Jan 2 15:22:07 CET 2008


Author: cfbolz
Date: Wed Jan  2 15:22:06 2008
New Revision: 50258

Added:
   pypy/dist/pypy/rlib/parsing/test/test_pythonlexer.py   (contents, props changed)
Log:
Today's train experiment: an rlib.parsing-based Python lexer. It should work;
what is missing is mapping this to what our parser expects, plus some corner cases.


Added: pypy/dist/pypy/rlib/parsing/test/test_pythonlexer.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/rlib/parsing/test/test_pythonlexer.py	Wed Jan  2 15:22:06 2008
@@ -0,0 +1,239 @@
+import py
+from pypy.rlib.parsing.regexparse import parse_regex, make_runner
+from pypy.rlib.parsing.lexer import Lexer
+
+# attempts at writing a Python-lexer
+
+def group(*choices):
+    return '(' + '|'.join(choices) + ')'
+def any(*choices):
+    return group(*choices) + '*'
+def maybe(*choices):
+    return group(*choices) + '?'
+
+#____________________________________________________________
+# Numbers
+
+# All names in this section are plain regex source strings, combined by
+# concatenation and by the group()/maybe() helpers above.
+
+# "0x"/"0X", then hex digits, then an optional long suffix.
+# NOTE(review): the digit class uses '*', so a bare "0x" also matches.
+Hexnumber = r'0[xX][0-9a-fA-F]*[lL]?'
+# A leading 0 followed by octal digits (this also covers a plain "0").
+Octnumber = r'0[0-7]*[lL]?'
+# A non-zero digit followed by any digits, with an optional long suffix.
+Decnumber = r'[1-9][0-9]*[lL]?'
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
+# 'e' or 'E', an optional sign, and at least one digit.
+Exponent = r'[eE][\-\+]?[0-9]+'
+# Digits with a decimal point on either side, optionally followed by an exponent.
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+# Digits with a mandatory exponent and no decimal point.
+Expfloat = r'[0-9]+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+# A decimal integer or a float, followed by 'j' or 'J'.
+Imagnumber = group(r'(0|[1-9][0-9]*)[jJ]', Floatnumber + r'[jJ]')
+# Any numeric literal; this is the regex looked up for the "Number" token name.
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+#____________________________________________________________
+# Strings
+
+# Optional string prefix: at most one 'u'/'U' followed by at most one 'r'/'R'.
+_stringheader = r"[uU]?[rR]?"
+
+# ' or " string.
+def make_single_string(delim):
+    normal_chars = r"[^\n\%s]*" % (delim, )
+    return "".join([_stringheader, delim, normal_chars,
+                    any(r"\\." + normal_chars), delim])
+
+# triple-quoted-strings
+def make_triple_string(delim):
+    harmless = r"[^\%s]" % (delim, )
+    anyharmless = harmless + "*"
+    atleastoneharmless = harmless + "+"
+    normal_chars = anyharmless + any(group(delim, 2 * delim) +
+                                     atleastoneharmless)
+    triple = delim * 3
+    return "".join([_stringheader, triple, normal_chars,
+                    any(r"\\." + normal_chars), triple])
+
+def test_triple_regex():
+    delim = '"'
+    harmless = r"[^\%s]" % (delim, )
+    anyharmless = harmless + "*"
+    atleastoneharmless = harmless + "+"
+    normal_chars = anyharmless + any(group(delim, 2 * delim) +
+                                     atleastoneharmless)
+    runner = make_runner(normal_chars)
+    assert runner.recognize('""a""a""a""a')
+    assert not runner.recognize('""a""a"""a""a')
+
+# Strings delimited by a single ' or a single ".
+SingleString = group(make_single_string("'"),
+                     make_single_string('"'))
+
+# Strings delimited by ''' or """.
+TripleString = group(make_triple_string("'"),
+                     make_triple_string('"'))
+
+# Any string literal; this is the regex looked up for the "String" token name.
+String = group(SingleString, TripleString)
+
+#____________________________________________________________
+# Ignored
+
+# Spaces, form feeds and tabs.
+# NOTE(review): the pattern already ends in '*', so it also matches the
+# empty string; the any(Whitespace) wrappers below repeat it again.
+Whitespace = r'[ \f\t]*'
+# A line end: an optional carriage return followed by a line feed.
+Newline = r'\r?\n'
+# A backslash directly before a line end, plus the whitespace that follows.
+Linecontinue = r'\\' + Newline + any(Whitespace)
+# '#' and everything after it up to the end of the line.
+Comment = r'#[^\r\n]*'
+# A line end together with the whitespace that begins the next line;
+# postprocess() measures that whitespace to tell indents from dedents.
+Indent = Newline + any(Whitespace)
+# Horizontal whitespace, optionally followed by a comment or a line continuation.
+Simpleignore = Whitespace + any(Whitespace) + maybe(group(Comment, Linecontinue))
+# The regex looked up for the "Ignore" token name; postprocess() drops these tokens.
+Ignore = group(Linecontinue, Comment, Simpleignore)
+
+#____________________________________________________________
+
+# Single-character punctuation: : ; . , ` @
+Special = r'[\:\;\.\,\`\@]'
+# A letter or underscore followed by letters, digits or underscores.
+Name = r'[a-zA-Z_][a-zA-Z0-9_]*'
+
+# Operator spellings. Alternatives ending in '=?' also cover the augmented
+# assignment form (e.g. '**' and '**='); '<>', '!=' and '~' have no such form.
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
+                 r"//=?",
+                 r"[\+\-\*\/\%\&\|\^\=\<\>]=?",
+                 r"~")
+
+# Brackets get their own token names so that postprocess() can track the
+# nesting depth; it renames both kinds to "Operator" afterwards.
+OpenBracket = r'[\[\(\{]'
+CloseBracket = r'[\]\)\}]'
+
+#____________________________________________________________
+# all tokens
+
+tokens = ["Number", "String", "Name", "Ignore", "Special", "Indent", 
+          "OpenBracket", "CloseBracket", "Operator"]
+
+def make_lexer():
+    return Lexer([parse_regex(globals()[r]) for r in tokens], tokens[:])
+    
+pythonlexer = make_lexer()
+
+def postprocess(tokens):
+    parenthesis_level = 0
+    indentation_levels = [0]
+    output_tokens = []
+    for token in tokens:
+        if token.name == "OpenBracket":
+            parenthesis_level += 1
+            token.name = "Operator"
+            output_tokens.append(token)
+        elif token.name == "CloseBracket":
+            parenthesis_level -= 1
+            if parenthesis_level < 0:
+                XXX
+            token.name = "Operator"
+            output_tokens.append(token)
+        elif token.name == "Indent":
+            if parenthesis_level == 0:
+                s = token.source
+                length = len(s)
+                pos = 1
+                column = 0
+                while pos < length:  # measure leading whitespace
+                    c = s[pos]
+                    if c == ' ':
+                        column = column + 1
+                    elif c == '\t':
+                        column = (column // tabsize + 1) * tabsize
+                    elif c == '\f':
+                        column = 0
+                    else:
+                        break
+                    pos = pos + 1
+                if column > indentation_levels[-1]: # count indents or dedents
+                    indentation_levels.append(column)
+                    token.name = "Indent"
+                while column < indentation_levels[-1]:
+                    indentation_levels.pop()
+                    token.name = "Dedent"
+                output_tokens.append(token)
+            else:
+                pass # implicit line-continuations within parenthesis
+        elif token.name == "Ignore":
+            pass
+        else:
+            output_tokens.append(token)
+    return output_tokens
+
+def pythonlex(s):
+    return postprocess(pythonlexer.tokenize(s))
+
+
+def test_number():
+    for num in ['1.231e-4', '1j', '0J', '123J'
+                ]:
+        tokens = pythonlexer.tokenize(num)
+        token, = tokens
+        assert token.name == 'Number'
+    for intnum in ['1', '0', '0xABFfaf1928375']:
+        for suffix in ['', 'l', 'L']:
+            tokens = pythonlexer.tokenize(intnum + suffix)
+            token, = tokens
+            assert token.name == 'Number'
+
+def test_single_quoted_string():
+    for s in ["""u'abc'""",
+              """ur'ab"c'""",
+              """UR'ab\\'c'""",
+              """'ab\\\nc'"""]:
+        tokens = pythonlexer.tokenize(s)
+        token, = tokens
+        assert token.name == 'String'
+
+def test_triple_quoted_string():
+    for s in ["""'''abc'''""",
+              """'''a'b'c''d'f'''""",
+              """uR'''a\\''''""",
+              """'''\na\nk\n\"\"\"'''"""]:
+        tokens = pythonlexer.tokenize(s)
+        token, = tokens
+        assert token.name == 'String'
+
+def test_name():
+    for s in ["abc",
+              "_",
+              "a_0",
+              "_0",
+              ]:
+        tokens = pythonlexer.tokenize(s)
+        token, = tokens
+        assert token.name == 'Name'
+
+def test_long():
+    for s, numtoken in [
+            ("if x:\n    print x", 8),
+            ("if x:#foo\n    x *= 17", 11),
+            ("1 + \\\n 2", 5)]:
+        tokens = pythonlexer.tokenize(s)
+        assert len(tokens) == numtoken
+        print tokens
+
+def test_complex_quoting():
+    s = '''"""u'abc'""",
+           """ur'ab"c'""",
+           """UR'ab\\'c'""",
+           """'ab\\\nc'"""'''
+    tokens = pythonlexer.tokenize(s)
+    assert len(tokens) == 10
+    for i in range(4):
+        assert tokens[i * 3].name == 'String'
+
+def test_self():
+    s = py.magic.autopath().read()
+    tokens = pythonlexer.tokenize(s)
+    print tokens
+
+def test_indentation():
+    s = """a
+b
+    c
+        d
+    e"""
+    tokens = pythonlex(s)
+    assert [t.name for t in tokens] == ["Name", "Indent", "Name", "Indent",
+                                        "Name", "Indent", "Name", "Dedent",
+                                        "Name"]
+
+def test_linecont():
+    s = "a + \\\n     b"
+    tokens = pythonlex(s)
+    assert [t.name for t in tokens] == ["Name", "Operator", "Name"]
+
+def test_parenthesis():
+    s = "(a + \n     b)"
+    tokens = pythonlex(s)
+    assert [t.name for t in tokens] == ["Operator", "Name", "Operator", "Name",
+                                        "Operator"]


More information about the pypy-svn mailing list