"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee '
__credits__ = (
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro')

import string
import re
from token import *

import token
# Export every public name of the token module plus this module's additions.
__all__ = [x for x in dir(token) if x[0] != '_'] + [
    "COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
# The list comprehension above leaves its loop variable behind at module
# level; drop it, and the temporary module reference, to keep the namespace
# clean.
del x
del token

# Two extra token types, numbered directly after the ones token.py defines.
COMMENT = N_TOKENS
NL = COMMENT + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS += 2


def group(*choices):
    """Return the regex alternation of *choices* inside one (...) group."""
    return '(%s)' % '|'.join(choices)


def any(*choices):
    """Return a regex matching zero or more occurrences of the choices."""
    alternation = group(*choices)
    return alternation + '*'


def maybe(*choices):
    """Return a regex matching at most one occurrence of the choices."""
    alternation = group(*choices)
    return alternation + '?'


# Building blocks for the token patterns.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Skippable text: whitespace, backslash-continued line ends, then an
# optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals, each with an optional long suffix.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
# Numeric literals.  Number tries the imaginary form first, then floats,
# then plain integers.
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group(
    "[uU]?[rR]?'''",
    '[uU]?[rR]?"""',
)
# Single-line ' or " string.
String = group(
    r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"[+\-*/%&|^=<>]=?",
    r"~",
)

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

# A complete token: skippable text followed by one plain token.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Maps a string opener to the compiled pattern that finds the string's end.
# The bare prefix letters map to None so that a lookup chain such as
# ``endprogs[a] or endprogs[b] or endprogs[c]`` skips over them.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Membership tables for every spelling of a triple-quoted string opener.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
# ... and of a single-quoted string opener.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

# Column width a tab advances to when measuring indentation.
tabsize = 8


class TokenError(Exception):
    """Raised when input ends inside a multi-line string or statement."""
    pass


class StopTokenizing(Exception):
    """Caught by tokenize_obj() to end tokenization quietly."""
    pass


def printtoken(type, token, start, end, line):  # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)``.

    type  -- numeric token type, looked up in tok_name
    token -- the token text
    start -- (row, column) pair where the token starts
    end   -- (row, column) pair where the token ends
    line  -- the source line (accepted for the callback protocol, unused)
    """
    # The positions are unpacked here rather than in the parameter list:
    # tuple parameters were removed from the language (PEP 3113), and the
    # callers pass the same five positional arguments either way.
    srow, scol = start
    erow, ecol = end
    # A single parenthesised argument prints identically whether print is a
    # statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))


def tokenize_loop_obj(readline, tokeneater):
    """Call tokeneater(type, token, start, end, line) for every token.

    The tokens come from a GenToken built on *readline*.  Exceptions raised
    while tokenizing or by *tokeneater* propagate to the caller.
    """
    genobj = GenToken(readline)
    for token_info in genobj:
        tokeneater(*token_info)


def tokenize_obj(readline, tokeneater=printtoken):
    """Like tokenize_loop_obj(), but a StopTokenizing raised during the loop
    ends the run silently instead of propagating."""
    try:
        tokenize_loop_obj(readline, tokeneater)
    except StopTokenizing:
        pass


import psyco


class GenToken(psyco.compact):
    """Tokenizer that keeps all of its scanning state as instance attributes.

    Iterating over an instance tokenizes everything *readline* supplies and
    yields the collected 5-tuples
    ``(type, token, (srow, scol), (erow, ecol), line)``.
    """

    def __init__(self, readline):
        """Remember *readline*, the callable that returns the next source
        line ('' or StopIteration signals end of input)."""
        self.readline = readline
        self.result = []

    def __iter__(self):
        """Tokenize the whole input eagerly, then iterate over the tokens.

        The result list is emptied in place first, so tokens left over from
        an earlier iteration are discarded.
        """
        del self.result[:]
        self.generate_tokens()
        return self.result.__iter__()

    def push(self, *args):
        """Append one token, given as its five fields, to self.result."""
        self.result.append(args)

    def generate_tokens(self):
        """Scan the input line by line, push()ing every token found.

        Raises TokenError when the input ends inside a multi-line string or
        a continued statement, and IndentationError when a dedent does not
        return to an enclosing indentation level.
        """
        self.lnum = self.parenlev = self.continued = 0
        self.namechars = string.ascii_letters + '_'
        self.numchars = '0123456789'
        self.contstr, self.needcont = '', 0
        self.contline = None
        self.indents = [0]

        while 1:                                # loop over lines in stream
            try:
                self.line = self.readline()
            except StopIteration:
                self.line = ''
            self.lnum = self.lnum + 1
            self.pos, self.max = 0, len(self.line)

            if self.contstr:                    # continued string
                if not self.line:
                    raise TokenError("EOF in multi-line string",
                                     self.strstart)
                self.endmatch = self.endprog.match(self.line)
                if self.endmatch:
                    # The string ends on this line; the rest of the line is
                    # scanned by the token loop below.
                    self.pos = self.end = self.endmatch.end(0)
                    self.push(STRING, self.contstr + self.line[:self.end],
                              self.strstart, (self.lnum, self.end),
                              self.contline + self.line)
                    self.contstr, self.needcont = '', 0
                    self.contline = None
                elif (self.needcont and self.line[-2:] != '\\\n'
                      and self.line[-3:] != '\\\r\n'):
                    # A single-quoted string continued by a backslash must
                    # either end or be continued again on this line.
                    self.push(ERRORTOKEN, self.contstr + self.line,
                              self.strstart, (self.lnum, len(self.line)),
                              self.contline)
                    self.contstr = ''
                    self.contline = None
                    continue
                else:
                    self.contstr = self.contstr + self.line
                    self.contline = self.contline + self.line
                    continue

            elif self.parenlev == 0 and not self.continued:  # new statement
                if not self.line:
                    break
                self.column = 0
                while self.pos < self.max:      # measure leading whitespace
                    if self.line[self.pos] == ' ':
                        self.column = self.column + 1
                    elif self.line[self.pos] == '\t':
                        # Advance to the next tab stop.  Floor division
                        # keeps the column an integer.
                        self.column = (self.column // tabsize + 1) * tabsize
                    elif self.line[self.pos] == '\f':
                        self.column = 0
                    else:
                        break
                    self.pos = self.pos + 1
                if self.pos == self.max:
                    break

                if self.line[self.pos] in '#\r\n':
                    # Comment-only or blank line: emit it as one token and
                    # leave the indentation stack alone.
                    self.push((NL, COMMENT)[self.line[self.pos] == '#'],
                              self.line[self.pos:],
                              (self.lnum, self.pos),
                              (self.lnum, len(self.line)), self.line)
                    continue

                if self.column > self.indents[-1]:   # count indents or dedents
                    self.indents.append(self.column)
                    self.push(INDENT, self.line[:self.pos],
                              (self.lnum, 0), (self.lnum, self.pos),
                              self.line)
                while self.column < self.indents[-1]:
                    if self.column not in self.indents:
                        raise IndentationError(
                            "unindent does not match any outer indentation level",
                            ("", self.lnum, self.pos, self.line))
                    self.indents = self.indents[:-1]
                    self.push(DEDENT, '',
                              (self.lnum, self.pos), (self.lnum, self.pos),
                              self.line)

            else:                               # continued statement
                if not self.line:
                    raise TokenError("EOF in multi-line statement",
                                     (self.lnum, 0))
                self.continued = 0

            while self.pos < self.max:
                self.pseudomatch = pseudoprog.match(self.line, self.pos)
                if self.pseudomatch:            # scan for tokens
                    self.start, self.end = self.pseudomatch.span(1)
                    self.spos = (self.lnum, self.start)
                    self.epos = (self.lnum, self.end)
                    self.pos = self.end
                    self.token = self.line[self.start:self.end]
                    self.initial = self.line[self.start]

                    if self.initial in self.numchars or \
                       (self.initial == '.' and self.token != '.'):
                        # ordinary number
                        self.push(NUMBER, self.token,
                                  self.spos, self.epos, self.line)
                    elif self.initial in '\r\n':
                        # Inside brackets a line end does not end the
                        # statement.  NL is non-zero, so and/or is safe.
                        self.push(self.parenlev > 0 and NL or NEWLINE,
                                  self.token, self.spos, self.epos,
                                  self.line)
                    elif self.initial == '#':
                        self.push(COMMENT, self.token,
                                  self.spos, self.epos, self.line)
                    elif self.token in triple_quoted:
                        self.endprog = endprogs[self.token]
                        self.endmatch = self.endprog.match(self.line,
                                                           self.pos)
                        if self.endmatch:       # all on one line
                            self.pos = self.endmatch.end(0)
                            self.token = self.line[self.start:self.pos]
                            self.push(STRING, self.token, self.spos,
                                      (self.lnum, self.pos), self.line)
                        else:                   # multiple lines
                            self.strstart = (self.lnum, self.start)
                            self.contstr = self.line[self.start:]
                            self.contline = self.line
                            break
                    elif self.initial in single_quoted or \
                        self.token[:2] in single_quoted or \
                        self.token[:3] in single_quoted:
                        if self.token[-1] == '\n':   # continued string
                            self.strstart = (self.lnum, self.start)
                            # Prefix letters map to None, so this selects
                            # the pattern for the actual quote character.
                            self.endprog = (endprogs[self.initial] or
                                            endprogs[self.token[1]] or
                                            endprogs[self.token[2]])
                            self.contstr = self.line[self.start:]
                            self.needcont = 1
                            self.contline = self.line
                            break
                        else:                   # ordinary string
                            self.push(STRING, self.token,
                                      self.spos, self.epos, self.line)
                    elif self.initial in self.namechars:   # ordinary name
                        self.push(NAME, self.token,
                                  self.spos, self.epos, self.line)
                    elif self.initial == '\\':  # continued stmt
                        self.continued = 1
                    else:
                        if self.initial in '([{':
                            self.parenlev = self.parenlev + 1
                        elif self.initial in ')]}':
                            self.parenlev = self.parenlev - 1
                        self.push(OP, self.token,
                                  self.spos, self.epos, self.line)
                else:
                    # Nothing matches here: emit the single character as an
                    # error token and move on.
                    self.push(ERRORTOKEN, self.line[self.pos],
                              (self.lnum, self.pos),
                              (self.lnum, self.pos + 1), self.line)
                    self.pos = self.pos + 1

        for self.indent in self.indents[1:]:    # pop remaining indent levels
            self.push(DEDENT, '', (self.lnum, 0), (self.lnum, 0), '')
        self.push(ENDMARKER, '', (self.lnum, 0), (self.lnum, 0), '')