[pypy-svn] r52024 - in pypy/dist/pypy: interpreter/pyparser rlib/parsing rlib/parsing/test
jared.grubb at codespeak.net
jared.grubb at codespeak.net
Sun Mar 2 02:15:51 CET 2008
Author: jared.grubb
Date: Sun Mar 2 02:15:50 2008
New Revision: 52024
Modified:
pypy/dist/pypy/interpreter/pyparser/parsestring.py
pypy/dist/pypy/rlib/parsing/regexparse.py
pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py
Log:
parsestring: simplify the octal parsing a bit
regexparse: simplify the unescape() function
test_pcre_regtest: preprocesses the testoutput1 file and runs most of the tests
Modified: pypy/dist/pypy/interpreter/pyparser/parsestring.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/parsestring.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/parsestring.py Sun Mar 2 02:15:50 2008
@@ -155,15 +155,13 @@
lis.append('\013') # VT
elif ch == 'a':
lis.append('\007') # BEL, not classic C
- elif '0' <= ch <= '7':
- c = ord(s[ps - 1]) - ord('0')
- if ps < end and '0' <= s[ps] <= '7':
- c = (c << 3) + ord(s[ps]) - ord('0')
- ps += 1
- if ps < end and '0' <= s[ps] <= '7':
- c = (c << 3) + ord(s[ps]) - ord('0')
- ps += 1
- lis.append(chr(c))
+ elif ch in '01234567':
+ # Look for up to two more octal digits
+ span = ps
+ span += (span < end) and (s[span] in '01234567')
+ span += (span < end) and (s[span] in '01234567')
+ lis.append(chr(int(s[ps - 1 : span], 8)))
+ ps = span
elif ch == 'x':
if ps+2 <= end and isxdigit(s[ps]) and isxdigit(s[ps + 1]):
lis.append(chr(int(s[ps : ps + 2], 16)))
Modified: pypy/dist/pypy/rlib/parsing/regexparse.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/regexparse.py (original)
+++ pypy/dist/pypy/rlib/parsing/regexparse.py Sun Mar 2 02:15:50 2008
@@ -9,72 +9,59 @@
set = py.builtin.set
ESCAPES = {
- "\\a": "\a",
- "\\b": "\b",
- "\\e": "\x1b",
- "\\f": "\f",
- "\\n": "\n",
- "\\r": "\r",
- "\\t": "\t",
- "\\v": "\v",
- "\\": "\\"
+ "a": "\a",
+ "b": "\b",
+ "e": "\x1b",
+ "f": "\f",
+ "n": "\n",
+ "r": "\r",
+ "t": "\t",
+ "v": "\v",
}
for i in range(256):
- if chr(i) not in 'x01234567sSwWdD':
- # 'x' and numbers are reserved for hexadecimal/octal escapes
- escaped = "\\" + chr(i)
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(i)
-
- # Three digit octals
- escaped = "\\%03o" % i
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(i)
-
- if 0 <= i <= 077:
- # Two digit octal digs are ok too
- escaped = "\\%02o" % i
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(i)
-
# Add the ctrl-x types:
# Rule, according to PCRE:
# if x is a lower case letter, it is converted to upper case.
# Then bit 6 of the character (hex 40) is inverted.
- # Thus, \cz => 0x1A, but \c{ => 0x3B, while \c; => 0x7B.
- escaped = "\\c%s" % chr(i)
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(ord(chr(i).upper()) ^ 0x40)
+ # Thus, \cz => 0x1A, \c{ => 0x3B, \c; => 0x7B.
+ escaped = "c%s" % chr(i)
+ ESCAPES[escaped] = chr(ord(chr(i).upper()) ^ 0x40)
+
+def unescape_muncher(string):
+ """Return a tuple, representing the first character of the string
+ (appropriately unescaped) and the rest of the string that wasn't
+ handled."""
+ if string[0] != '\\':
+ # Not an escape character
+ return string[0], string[1:]
+ if string[1] == 'x':
+ # Hex char, must have two hex digits
+ char = chr(int(string[2:4], 16))
+ return char, string[4:]
+ if string[1] in '01234567':
+ # Octal number, up to three digits long
+ span = 2
+ span += (span < len(string)) and (string[span] in '01234567')
+ span += (span < len(string)) and (string[span] in '01234567')
+ char = chr(int(string[1:span], 8))
+ return char, string[span:]
+ if string[1] == 'c':
+ # Special \cx types
+ return ESCAPES['c'+string[2]], string[3:]
+ if string[1] in ESCAPES:
+ # Special escapes are in ESCAPES
+ return ESCAPES[string[1]], string[2:]
+ # Otherwise, it's just the character it's meant to be (e.g., '\.')
+ return string[1], string[2:]
-
-for a in "0123456789ABCDEFabcdef":
- for b in "0123456789ABCDEFabcdef":
- escaped = "\\x%s%s" % (a, b)
- if escaped not in ESCAPES:
- ESCAPES[escaped] = chr(int("%s%s" % (a, b), 16))
-
+
def unescape(s):
+ """Unescape a whole string."""
result = []
- i = 0
- while i < len(s):
- if s[i] != "\\":
- result.append(s[i])
- i += 1
- continue
- if s[i + 1] == "x":
- escaped = s[i: i + 4]
- i += 4
- elif s[i + 1] in "01234567":
- escaped = s[i: i + 4]
- i += 4
- else:
- escaped = s[i: i + 2]
- i += 2
- if escaped not in ESCAPES:
- raise ValueError("escape %r unknown" % (escaped, ))
- else:
- result.append(ESCAPES[escaped])
+ while s:
+ char, s = unescape_muncher(s)
+ result.append(char)
return "".join(result)
syntax = r"""
@@ -184,20 +171,15 @@
charclass:
'\' 'd'
return { set([chr(c) for c in range(ord('0'), ord('9')+1)]) }
- | '\'
- 's'
+ | '\' 's'
return { set(['\t', '\n', '\f', '\r', ' ']) }
- | '\'
- 'w'
+ | '\' 'w'
return { set([chr(c) for c in range(ord('a'), ord('z')+1)] + [chr(c) for c in range(ord('A'), ord('Z')+1)] + [chr(c) for c in range(ord('0'), ord('9')+1)] + ['_']) }
- | '\'
- 'D'
+ | '\' 'D'
return { set([chr(c) for c in range(256)]) - set([chr(c) for c in range(ord('0'), ord('9')+1)]) }
- | '\'
- 'S'
+ | '\' 'S'
return { set([chr(c) for c in range(256)]) - set(['\t', '\n', '\f', '\r', ' ']) }
- | '\'
- 'W'
+ | '\' 'W'
return { set([chr(c) for c in range(256)]) - set([chr(c) for c in range(ord('a'), ord('z')+1)] + [chr(c) for c in range(ord('A'), ord('Z')+1)] + [chr(c) for c in range(ord('0'), ord('9')+1)] + ['_'])};
NUM:
@@ -1862,6 +1844,9 @@
+
+
+
def test_generate():
f = py.magic.autopath()
oldcontent = f.read()
Modified: pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py (original)
+++ pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py Sun Mar 2 02:15:50 2008
@@ -12,143 +12,121 @@
py.test.skip("In Progress...")
-def get_simult_lines(tests, results, test_line_num=0):
- """Returns a line from the input/output, ensuring that
- we are sync'd up between the two."""
- test = tests.pop(0)
- result = results.pop(0)
+def read_file(file):
+ lines = [line for line in file.readlines()]
- test_line_num += 1
-
- if test != result:
- raise Exception("Lost sync between files at input line %d.\n INPUT: %s\n OUTPUT: %s" % (test_line_num, test, result))
-
- return test
-
-def create_regex_iterator(tests, results):
- """Gets a test definition line, formatted per the PCRE spec. This is a
- generator that returns each regex test."""
- while tests:
+ # Look for things to skip...
+ no_escape = r'(^|[^\\])(\\\\)*' # Make sure there's no escaping \
+ greedy_ops = re.compile(no_escape + r'[*?+}\(]\?') # Look for *? +? }? (?
+ back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1') # find a \1
+
+ # suite = [
+ # [regex, flags, [(test,result),(test,result),...]]
+ # [regex, flags, [(test,result),(test,result),...]]
+ # ]
+ suite = []
+ while lines:
delim = None
regex = ''
-
# A line is marked by a start-delimiter and an end-delimiter.
# The delimiter is non-alphanumeric
# If a backslash follows the delimiter, then the backslash should
# be appended to the end. (Otherwise, \ + delim would not be a
# delim anymore!)
while 1:
- regex += get_simult_lines(tests, results)
-
- if delim is None:
- delim = regex[0]
+ regex += lines.pop(0)
+ if not delim:
+ if not regex.strip(): # Suppress blank lines before delim
+ regex = ''
+ continue
+ delim = regex.strip()[0]
assert delim in (set(string.printable) - set(string.letters) - set(string.digits))
- test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)(.*)' % {'delim': delim})
+ test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)' % {'delim': delim})
# last two groups are an optional backslash and optional flags
matches = test_re.findall(regex)
if matches:
break
- assert len(matches)==1
+ assert len(matches)==1 # check to make sure we matched right
regex = matches[0][0]
regex += matches[0][-2] # Add the backslash, if we gotta
flags = matches[0][-1] # Get the flags for the regex
- yield regex, flags
+ tests = []
-def create_result_iterator(tests, results):
- """Gets the expected return sets for each regular expression."""
- # Second line is the test to run against the regex
- # ' TEXT'
- while 1:
- test = get_simult_lines(tests, results)
- if not test:
- raise StopIteration
- if not test.startswith(' '):
- raise Exception("Input & output match, but I don't understand. (Got %r)" % test)
- if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
- assert not test.endswith('\\\\') # make sure there are no \\ at end
- test = test[:-1]
- test = unescape(test[4:])
-
- # Third line in the OUTPUT is the result, either:
- # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
- # 'No match' for no match
- result = results.pop(0)
- result = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), result)
- if result == 'No match':
+ if greedy_ops.search(regex) or back_refs.search(regex):
+ # Suppress complex features we can't do
+ pass
+ elif flags:
+ # Suppress any test that requires PCRE flags
pass
- elif result.startswith(' 0:'):
- # Now we need to eat any further lines like:
- # ' 1: ....' a subgroup match
- while results[0]:
- if results[0][2] == ':':
- results.pop(0)
- else:
- break
else:
- raise Exception("Lost sync in output.")
- yield test, result
-
-class SkipException(Exception):
- pass
-
+ # In any other case, we're going to add the test
+ # All the cases above fall through and DON'T get appended
+ suite.append([regex, flags, tests])
+
+ # Now find the test and expected result
+ while lines:
+ test = lines.pop(0).strip()
+ if not test:
+ break # blank line ends the set
+ if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
+ assert not test.endswith('\\\\\\') # Make sure not three \'s. otherwise this check will get ridiculous
+ if not test.endswith('\\\\'): # Two \'s means a real \
+ test = test[:-1]
+ test = unescape(test)
+
+ # Third line in the OUTPUT is the result, either:
+ # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
+ # 'No match' for no match
+ match = lines.pop(0).rstrip('\r\n')
+ match = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), match)
+ if match.startswith('No match'):
+ pass
+ elif match.startswith(' 0:'):
+ # Now we need to eat any further lines like:
+ # ' 1: ....' a subgroup match
+ while lines[0].strip():
+ # ' 0+ ...' is also possible here
+ if lines[0][2] in [':','+']:
+ lines.pop(0)
+ else:
+ break
+ else:
+ print " *** %r ***" % match
+ raise Exception("Lost sync in output.")
+ tests.append((test,match))
+ return suite
+
def test_file():
"""Open the PCRE tests and run them."""
- tests = [line.rstrip() for line in open('testinput1','r').readlines()]
- results = [line.rstrip() for line in open('testoutput1','r').readlines()]
-
- regex_flag_mapping = { '': lambda s: s,
- 'i': lambda s: s.upper()
- }
-
- regex_set = create_regex_iterator(tests, results)
- import pdb
- for regex, regex_flags in regex_set:
- try:
- print '%r' % regex
-
- # Create an iterator to grab the test/results for this regex
- result_set = create_result_iterator(tests, results)
-
- # Handle the flags:
- if regex_flags in regex_flag_mapping:
- text_prepare = regex_flag_mapping[regex_flags]
- elif 'x' in regex_flags:
- raise SkipException("Cant do extended PRCE expressions")
- else:
- print "UNKNOWN FLAGS: %s" % regex_flags
- continue
-
- skipped = any([op in regex for op in ['*?', '??', '+?', '}?', '(?']])
- if skipped:
- raise SkipException("Cant do non-greedy operators or '(?' constructions)")
-
- regex_to_use = text_prepare(regex)
+ suite = read_file(open('testoutput1','r'))
- anchor_left = regex_to_use.startswith('^')
- anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
- if anchor_left:
- regex_to_use = regex_to_use[1:] # chop the ^ if it's there
- if anchor_right:
- regex_to_use = regex_to_use[:-1] # chop the $ if it's there
-
- if not regex_to_use:
- raise SkipException("Cant do blank regex")
- except SkipException, e:
- print " SKIPPED (%s)" % e.message
- # now burn all the tests for this regex
- for _ in result_set:
- pass
+ import pdb
+ while suite:
+ regex, flags, tests = suite.pop(0)
+ print '/%r/%s' % (regex, flags)
+
+ regex_to_use = regex
+
+ anchor_left = regex_to_use.startswith('^')
+ anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
+ if anchor_left:
+ regex_to_use = regex_to_use[1:] # chop the ^ if it's there
+ if anchor_right:
+ regex_to_use = regex_to_use[:-1] # chop the $ if it's there
+
+ if not regex_to_use:
+ print " SKIPPED (Cant do blank regex)"
continue
# Finally, we make the pypy regex runner
runner = make_runner(regex_to_use)
# Now run the test expressions against the Regex
- for test, result in result_set:
+ for test, match in tests:
# Create possible subsequences that we should test
if anchor_left:
start_range = [0]
@@ -163,21 +141,23 @@
# Search the possibilities for a match...
for start, end in subseq_gen:
- attempt = text_prepare(test[start:end])
+ attempt = test[start:end]
matched = runner.recognize(attempt)
if matched:
break
# Did we get what we expected?
- if result == 'No match':
+ if match == 'No match':
if matched:
print " FALSE MATCH: regex==%r test==%r" % (regex, test)
else:
- print " pass: regex==%r test==%r" % (regex, test)
- elif result.startswith(' 0: '):
+ pass
+ #print " pass: regex==%r test==%r" % (regex, test)
+ elif match.startswith(' 0: '):
if not matched:
print " MISSED: regex==%r test==%r" % (regex, test)
- elif not attempt==text_prepare(result[4:]):
- print " BAD MATCH: regex==%r test==%r found==%r expect==%r" % (regex, test, attempt, result[4:])
+ elif not attempt==match[4:]:
+ print " BAD MATCH: regex==%r test==%r found==%r expect==%r" % (regex, test, attempt, match[4:])
else:
- print " pass: regex==%r test==%r" % (regex, test)
+ pass
+ #print " pass: regex==%r test==%r" % (regex, test)
More information about the pypy-svn
mailing list