[pypy-svn] r52024 - in pypy/dist/pypy: interpreter/pyparser rlib/parsing rlib/parsing/test

jared.grubb at codespeak.net jared.grubb at codespeak.net
Sun Mar 2 02:15:51 CET 2008


Author: jared.grubb
Date: Sun Mar  2 02:15:50 2008
New Revision: 52024

Modified:
   pypy/dist/pypy/interpreter/pyparser/parsestring.py
   pypy/dist/pypy/rlib/parsing/regexparse.py
   pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py
Log:
parsestring: simplify the octal parsing a bit
regexparse: simplify the unescape() function
test_pcre_regtest: preprocesses the testoutput1 file and runs most of the tests

Modified: pypy/dist/pypy/interpreter/pyparser/parsestring.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/parsestring.py	(original)
+++ pypy/dist/pypy/interpreter/pyparser/parsestring.py	Sun Mar  2 02:15:50 2008
@@ -155,15 +155,13 @@
             lis.append('\013') # VT
         elif ch == 'a':
             lis.append('\007') # BEL, not classic C
-        elif '0' <= ch <= '7':
-            c = ord(s[ps - 1]) - ord('0')
-            if ps < end and '0' <= s[ps] <= '7':
-                c = (c << 3) + ord(s[ps]) - ord('0')
-                ps += 1
-                if ps < end and '0' <= s[ps] <= '7':
-                    c = (c << 3) + ord(s[ps]) - ord('0')
-                    ps += 1
-            lis.append(chr(c))
+        elif ch in '01234567':
+            # Look for up to two more octal digits
+            span = ps
+            span += (span < end) and (s[span] in '01234567')
+            span += (span < end) and (s[span] in '01234567')
+            lis.append(chr(int(s[ps - 1 : span], 8)))
+            ps = span
         elif ch == 'x':
             if ps+2 <= end and isxdigit(s[ps]) and isxdigit(s[ps + 1]):
                 lis.append(chr(int(s[ps : ps + 2], 16)))

Modified: pypy/dist/pypy/rlib/parsing/regexparse.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/regexparse.py	(original)
+++ pypy/dist/pypy/rlib/parsing/regexparse.py	Sun Mar  2 02:15:50 2008
@@ -9,72 +9,59 @@
 set = py.builtin.set
 
 ESCAPES = {
-    "\\a": "\a",
-    "\\b": "\b",
-    "\\e": "\x1b",
-    "\\f": "\f",
-    "\\n": "\n",
-    "\\r": "\r",
-    "\\t": "\t",
-    "\\v": "\v",
-    "\\":  "\\"
+    "a": "\a",
+    "b": "\b",
+    "e": "\x1b",
+    "f": "\f",
+    "n": "\n",
+    "r": "\r",
+    "t": "\t",
+    "v": "\v",
 }
 
 for i in range(256):
-    if chr(i) not in 'x01234567sSwWdD':
-        # 'x' and numbers are reserved for hexadecimal/octal escapes
-        escaped = "\\" + chr(i)
-        if escaped not in ESCAPES:
-            ESCAPES[escaped] = chr(i)
-
-    # Three digit octals
-    escaped = "\\%03o" % i
-    if escaped not in ESCAPES:
-        ESCAPES[escaped] = chr(i)
-
-    if 0 <= i <= 077:
-        # Two digit octal digs are ok too
-        escaped = "\\%02o" % i
-        if escaped not in ESCAPES:
-            ESCAPES[escaped] = chr(i)
-    
     # Add the ctrl-x types:
     #   Rule, according to PCRE:
     #     if x is a lower case letter, it is converted to upper case. 
     #     Then bit 6 of the character (hex 40) is inverted.   
-    #     Thus, \cz => 0x1A, but \c{ => 0x3B, while \c; => 0x7B.
-    escaped = "\\c%s" % chr(i)
-    if escaped not in ESCAPES:
-        ESCAPES[escaped] = chr(ord(chr(i).upper()) ^ 0x40)
+    #     Thus, \cz => 0x1A, \c{ => 0x3B, \c; => 0x7B.
+    escaped = "c%s" % chr(i)
+    ESCAPES[escaped] = chr(ord(chr(i).upper()) ^ 0x40)
+
+def unescape_muncher(string):
+    """Return a tuple, representing the first character of the string
+    (appropriately unescaped) and the rest of the string that wasn't
+    handled."""
+    if string[0] != '\\':
+        # Not an escape character
+        return string[0], string[1:]
+    if string[1] == 'x':
+        # Hex char, must have two hex digits
+        char = chr(int(string[2:4], 16))
+        return char, string[4:]
+    if string[1] in '01234567':
+        # Octal number, up to three digits long
+        span = 2
+        span += (span < len(string)) and (string[span] in '01234567')
+        span += (span < len(string)) and (string[span] in '01234567')
+        char = chr(int(string[1:span], 8))
+        return char, string[span:]
+    if string[1] == 'c':
+        # Special \cx types
+        return ESCAPES['c'+string[2]], string[3:]
+    if string[1] in ESCAPES:
+        # Special escapes are in ESCAPES
+        return ESCAPES[string[1]], string[2:]
+    # Otherwise, it's just the character it's meant to be (e.g., '\.')
+    return string[1], string[2:]
     
-
-for a in "0123456789ABCDEFabcdef":
-    for b in "0123456789ABCDEFabcdef":
-        escaped = "\\x%s%s" % (a, b)
-        if escaped not in ESCAPES:
-            ESCAPES[escaped] = chr(int("%s%s" % (a, b), 16))
-
+        
 def unescape(s):
+    """Unescape a whole string."""
     result = []
-    i = 0
-    while i < len(s):
-        if s[i] != "\\":
-            result.append(s[i])
-            i += 1
-            continue
-        if s[i + 1] == "x":
-            escaped = s[i: i + 4]
-            i += 4
-        elif s[i + 1] in "01234567":
-            escaped = s[i: i + 4]
-            i += 4
-        else:
-            escaped = s[i: i + 2]
-            i += 2
-        if escaped not in ESCAPES:
-            raise ValueError("escape %r unknown" % (escaped, ))
-        else:
-            result.append(ESCAPES[escaped])
+    while s:
+        char, s = unescape_muncher(s)
+        result.append(char)
     return "".join(result)
 
 syntax =  r"""
@@ -184,20 +171,15 @@
 charclass:
     '\' 'd'
     return { set([chr(c) for c in range(ord('0'), ord('9')+1)]) }
-  | '\' 
-    's'
+  | '\' 's'
     return { set(['\t', '\n', '\f', '\r', ' ']) }
-  | '\' 
-    'w'
+  | '\' 'w'
     return { set([chr(c) for c in range(ord('a'), ord('z')+1)] + [chr(c) for c in range(ord('A'), ord('Z')+1)] + [chr(c) for c in range(ord('0'), ord('9')+1)] + ['_']) }
-  | '\' 
-    'D'
+  | '\' 'D'
     return { set([chr(c) for c in range(256)]) - set([chr(c) for c in range(ord('0'), ord('9')+1)]) }
-  | '\' 
-    'S'
+  | '\' 'S'
     return { set([chr(c) for c in range(256)]) - set(['\t', '\n', '\f', '\r', ' ']) }
-  | '\' 
-    'W'
+  | '\' 'W'
     return { set([chr(c) for c in range(256)]) - set([chr(c) for c in range(ord('a'), ord('z')+1)] + [chr(c) for c in range(ord('A'), ord('Z')+1)] + [chr(c) for c in range(ord('0'), ord('9')+1)] + ['_'])};
 
 NUM:
@@ -1862,6 +1844,9 @@
 
 
 
+
+
+
 def test_generate():
     f = py.magic.autopath()
     oldcontent = f.read()

Modified: pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py
==============================================================================
--- pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py	(original)
+++ pypy/dist/pypy/rlib/parsing/test/test_pcre_regtest.py	Sun Mar  2 02:15:50 2008
@@ -12,143 +12,121 @@
 
 py.test.skip("In Progress...")
 
-def get_simult_lines(tests, results, test_line_num=0):
-    """Returns a line from the input/output, ensuring that
-    we are sync'd up between the two."""
-    test = tests.pop(0)
-    result = results.pop(0)
+def read_file(file):
+    lines = [line for line in file.readlines()]
     
-    test_line_num += 1
-    
-    if test != result:
-        raise Exception("Lost sync between files at input line %d.\n  INPUT: %s\n  OUTPUT: %s" % (test_line_num, test, result))
-        
-    return test
-    
-def create_regex_iterator(tests, results):
-    """Gets a test definition line, formatted per the PCRE spec. This is a 
-    generator that returns each regex test."""
-    while tests:
+    # Look for things to skip...
+    no_escape = r'(^|[^\\])(\\\\)*'                   # Make sure there's no escaping \
+    greedy_ops = re.compile(no_escape + r'[*?+}\(]\?')  # Look for *? +? }? (?
+    back_refs  = re.compile(no_escape + r'\(.*' + no_escape + r'\\1') # find a \1
+    
+    # suite = [ 
+    #            [regex, flags, [(test,result),(test,result),...]]
+    #            [regex, flags, [(test,result),(test,result),...]]
+    #         ]
+    suite = []
+    while lines:
         delim = None
         regex = ''
-    
         # A line is marked by a start-delimeter and an end-delimeter.
         # The delimeter is non-alphanumeric
         # If a backslash follows the delimiter, then the backslash should
         #   be appended to the end. (Otherwise, \ + delim would not be a
         #   delim anymore!)
         while 1:
-            regex += get_simult_lines(tests, results)
-    
-            if delim is None:
-                delim = regex[0]
+            regex += lines.pop(0)
+            if not delim:
+                if not regex.strip():   # Suppress blank lines before delim
+                    regex = ''
+                    continue
+                delim = regex.strip()[0]
                 assert delim in (set(string.printable) - set(string.letters) - set(string.digits))
-                test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)(.*)' % {'delim': delim})
+                test_re = re.compile(r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)' % {'delim': delim})
                 # last two groups are an optional backslash and optional flags
             
             matches = test_re.findall(regex)
             if matches:
                 break
 
-        assert len(matches)==1
+        assert len(matches)==1  # check to make sure we matched right
     
         regex = matches[0][0]
         regex += matches[0][-2] # Add the backslash, if we gotta
         flags = matches[0][-1] # Get the flags for the regex
 
-        yield regex, flags
+        tests = []
 
-def create_result_iterator(tests, results):
-    """Gets the expected return sets for each regular expression."""
-    # Second line is the test to run against the regex
-    # '    TEXT'
-    while 1:
-        test = get_simult_lines(tests, results)
-        if not test:
-            raise StopIteration
-        if not test.startswith('    '):
-            raise Exception("Input & output match, but I don't understand. (Got %r)" % test)
-        if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
-            assert not test.endswith('\\\\')    # make sure there are no \\ at end
-            test = test[:-1]
-        test = unescape(test[4:])
-    
-        # Third line in the OUTPUT is the result, either:
-        # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
-        # 'No match' for no match
-        result = results.pop(0)
-        result = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), result)
-        if result == 'No match':
+        if greedy_ops.search(regex) or back_refs.search(regex):
+            # Suppress complex features we can't do
+            pass
+        elif flags:
+            # Suppress any test that requires PCRE flags
             pass
-        elif result.startswith(' 0:'):
-            # Now we need to eat any further lines like:
-            # ' 1: ....' a subgroup match
-            while results[0]:
-                if results[0][2] == ':':
-                    results.pop(0)
-                else:
-                    break
         else:
-            raise Exception("Lost sync in output.")
-        yield test, result
-    
-class SkipException(Exception):
-    pass
-    
+            # In any other case, we're going to add the test
+            # All the above cases fall through and DON'T get appended
+            suite.append([regex, flags, tests]) 
+            
+        # Now find the test and expected result
+        while lines:
+            test = lines.pop(0).strip()
+            if not test:
+                break   # blank line ends the set
+            if test.endswith('\\'): # Tests that end in \ expect the \ to be chopped off
+                assert not test.endswith('\\\\\\') # Make sure not three \'s. otherwise this check will get ridiculous
+                if not test.endswith('\\\\'): # Two \'s means a real \
+                    test = test[:-1]
+            test = unescape(test)
+
+            # Third line in the OUTPUT is the result, either:
+            # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
+            # 'No match' for no match
+            match = lines.pop(0).rstrip('\r\n')
+            match = re.sub(r'\\x([0-9a-fA-F]{2})', lambda m: chr(int(m.group(1),16)), match)
+            if match.startswith('No match'):
+                pass
+            elif match.startswith(' 0:'):
+                # Now we need to eat any further lines like:
+                # ' 1: ....' a subgroup match
+                while lines[0].strip():
+                    # ' 0+ ...' is also possible here
+                    if lines[0][2] in [':','+']:
+                        lines.pop(0)
+                    else:
+                        break
+            else:
+                print " *** %r ***" % match
+                raise Exception("Lost sync in output.")
+            tests.append((test,match))
+    return suite
+
 def test_file():
     """Open the PCRE tests and run them."""
-    tests = [line.rstrip() for line in open('testinput1','r').readlines()]
-    results = [line.rstrip() for line in open('testoutput1','r').readlines()]
-    
-    regex_flag_mapping = { '': lambda s: s, 
-                           'i': lambda s: s.upper()
-                         }
-    
-    regex_set = create_regex_iterator(tests, results)    
-    import pdb
-    for regex, regex_flags in regex_set:
-        try:
-            print '%r' % regex
-
-            # Create an iterator to grab the test/results for this regex
-            result_set = create_result_iterator(tests, results)
-
-            # Handle the flags:
-            if regex_flags in regex_flag_mapping:
-                text_prepare = regex_flag_mapping[regex_flags]
-            elif 'x' in regex_flags:
-                raise SkipException("Cant do extended PRCE expressions")            
-            else:
-                print "UNKNOWN FLAGS: %s" % regex_flags
-                continue
-        
-            skipped = any([op in regex for op in ['*?', '??', '+?', '}?', '(?']])        
-            if skipped:
-                raise SkipException("Cant do non-greedy operators or '(?' constructions)")
-                
-            regex_to_use = text_prepare(regex)
+    suite = read_file(open('testoutput1','r'))
         
-            anchor_left = regex_to_use.startswith('^')
-            anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
-            if anchor_left:
-                regex_to_use = regex_to_use[1:]   # chop the ^ if it's there
-            if anchor_right:
-                regex_to_use = regex_to_use[:-1]  # chop the $ if it's there
-        
-            if not regex_to_use:
-                raise SkipException("Cant do blank regex")
-        except SkipException, e:
-            print "  SKIPPED (%s)" % e.message
-            # now burn all the tests for this regex
-            for _ in result_set:
-                pass
+    import pdb
+    while suite:
+        regex, flags, tests = suite.pop(0)
+        print '/%r/%s' % (regex, flags)
+    
+        regex_to_use = regex
+    
+        anchor_left = regex_to_use.startswith('^')
+        anchor_right = regex_to_use.endswith('$') and not regex_to_use.endswith('\\$')
+        if anchor_left:
+            regex_to_use = regex_to_use[1:]   # chop the ^ if it's there
+        if anchor_right:
+            regex_to_use = regex_to_use[:-1]  # chop the $ if it's there
+    
+        if not regex_to_use:
+            print "  SKIPPED (Cant do blank regex)"
             continue
             
         # Finally, we make the pypy regex runner
         runner = make_runner(regex_to_use)
         
         # Now run the test expressions against the Regex
-        for test, result in result_set:
+        for test, match in tests:
             # Create possible subsequences that we should test
             if anchor_left:
                 start_range = [0]
@@ -163,21 +141,23 @@
 
             # Search the possibilities for a match...
             for start, end in subseq_gen:
-                attempt = text_prepare(test[start:end])
+                attempt = test[start:end]
                 matched = runner.recognize(attempt)
                 if matched: 
                     break
             
             # Did we get what we expected?
-            if result == 'No match':
+            if match == 'No match':
                 if matched:
                     print "  FALSE MATCH: regex==%r test==%r" % (regex, test)
                 else:
-                    print "  pass:        regex==%r test==%r" % (regex, test)
-            elif result.startswith(' 0: '):
+                    pass
+                    #print "  pass:        regex==%r test==%r" % (regex, test)
+            elif match.startswith(' 0: '):
                 if not matched:
                     print "  MISSED:      regex==%r test==%r" % (regex, test)
-                elif not attempt==text_prepare(result[4:]):
-                    print "  BAD MATCH:   regex==%r test==%r found==%r expect==%r" % (regex, test, attempt, result[4:])
+                elif not attempt==match[4:]:
+                    print "  BAD MATCH:   regex==%r test==%r found==%r expect==%r" % (regex, test, attempt, match[4:])
                 else:
-                    print "  pass:        regex==%r test==%r" % (regex, test)
+                    pass
+                    #print "  pass:        regex==%r test==%r" % (regex, test)


More information about the pypy-svn mailing list