"""Regular expression tests specific to _sre.py and accumulated during TDD.""" import locale, sys, unittest import re, _sre, sre_constants from sre_constants import ATCODES, OPCODES, CHCODES # constants for testing various locale and unicode stuff UPPER_AE = "\xc4" LOWER_AE = "\xe4" UPPER_PI = u"\u03a0" LOWER_PI = u"\u03c0" INDIAN_DIGIT = u"\u0966" # a unicode digit EM_SPACE = u"\u2001" # a unicode space LINE_SEP = u"\u2028" # a unicode linebreak # get types in a way independent of sre.c/sre.py SRE_Pattern = type(re.compile("b")) SRE_Match = type(re.match("b", "b")) py23 = sys.version_info[:2] == (2, 3) class SrePyTest(unittest.TestCase): def test_magic(self): self.assertEquals(sre_constants.MAGIC, _sre.MAGIC) def test_codesize(self): self.assertEquals(_sre.getcodesize(), _sre.CODESIZE) class SrePatternTest(unittest.TestCase): def get_instance(self): return re.compile("b") def test_copy(self): # copy support is disabled by default in _sre.c p = self.get_instance() self.assertRaises(TypeError, p.__copy__) self.assertRaises(TypeError, p.__deepcopy__) def test_creation_attributes(self): pattern_string = "(b)l(?Pa)" p = re.compile(pattern_string, re.I | re.M) self.assertEquals(SRE_Pattern, type(p)) self.assertEquals(pattern_string, p.pattern) self.assertEquals(re.I | re.M, p.flags) self.assertEquals(2, p.groups) self.assertEquals({"g": 2}, p.groupindex) def test_match_none(self): p = re.compile("bla") none_matches = ["b", "bl", "blub", "jupidu"] for string in none_matches: self.assertEquals(None, p.match(string)) def test_pos_endpos(self): # XXX maybe fancier tests here p = re.compile("bl(a)") tests = [("abla", 0, 4), ("abla", 1, 4), ("ablaa", 1, 4)] for string, pos, endpos in tests: self.assert_(p.search(string, pos, endpos)) tests = [("abla", 0, 3), ("abla", 2, 4)] for string, pos, endpos in tests: self.failIf(p.search(string, pos, endpos)) def test_findall(self): self.assertEquals(["b"], re.findall("b", "bla")) self.assertEquals(["a", "u"], re.findall("b(.)", "abalbus")) self.assertEquals([("a", "l"), ("u", "s")], re.findall("b(.)(.)", "abalbus")) self.assertEquals([("a", ""), ("s", "s")], re.findall("b(a|(s))", "babs")) def test_finditer(self): it = re.finditer("b(.)", "brabbel") self.assertEquals("br", it.next().group(0)) self.assertEquals("bb", it.next().group(0)) self.assertRaises(StopIteration, it.next) def test_split(self): self.assertEquals(["a", "o", "u", ""], re.split("b", "abobub")) self.assertEquals(["a", "o", "ub"], re.split("b", "abobub", 2)) self.assertEquals(['', 'a', 'l', 'a', 'lla'], re.split("b(a)", "balballa")) self.assertEquals(['', 'a', None, 'l', 'u', None, 'lla'], re.split("b([ua]|(s))", "balbulla")) class SreMatchTest(unittest.TestCase): def test_copy(self): # copy support is disabled by default in _sre.c m = re.match("bla", "bla") self.assertRaises(TypeError, m.__copy__) self.assertRaises(TypeError, m.__deepcopy__) def test_match_attributes(self): c = re.compile("bla") m = c.match("blastring") self.assertEquals(SRE_Match, type(m)) self.assertEquals("blastring", m.string) self.assertEquals(c, m.re) self.assertEquals(0, m.pos) self.assertEquals(9, m.endpos) self.assertEquals(None, m.lastindex) self.assertEquals(None, m.lastgroup) self.assertEquals(((0, 3),), m.regs) def test_match_attributes_with_groups(self): m = re.search("a(b)(?Pc)", "aabcd") self.assertEquals(0, m.pos) self.assertEquals(5, m.endpos) self.assertEquals(2, m.lastindex) self.assertEquals("name", m.lastgroup) self.assertEquals(((1, 4), (2, 3), (3, 4)), m.regs) def test_regs_overlapping_groups(self): m = re.match("a((b)c)", "abc") self.assertEquals(((0, 3), (1, 3), (1, 2)), m.regs) def test_start_end_span(self): m = re.search("a((b)c)", "aabcd") self.assertEquals((1, 4), (m.start(), m.end())) self.assertEquals((1, 4), m.span()) self.assertEquals((2, 4), (m.start(1), m.end(1))) self.assertEquals((2, 4), m.span(1)) self.assertEquals((2, 3), (m.start(2), m.end(2))) self.assertEquals((2, 3), m.span(2)) self.assertRaises(IndexError, m.start, 3) self.assertRaises(IndexError, m.end, 3) self.assertRaises(IndexError, m.span, 3) self.assertRaises(IndexError, m.start, -1) def test_groups(self): m = re.search("a((.).)", "aabcd") self.assertEquals(("ab", "a"), m.groups()) self.assertEquals(("ab", "a"), m.groups(True)) m = re.search("a((\d)|(\s))", "aa1b") self.assertEquals(("1", "1", None), m.groups()) self.assertEquals(("1", "1", True), m.groups(True)) m = re.search("a((\d)|(\s))", "a ") self.assertEquals((" ", None, " "), m.groups()) m = re.match("(a)", "a") self.assertEquals(("a",), m.groups()) def test_groupdict(self): m = re.search("a((.).)", "aabcd") self.assertEquals({}, m.groupdict()) m = re.search("a((?P.).)", "aabcd") self.assertEquals({"first": "a"}, m.groupdict()) m = re.search("a((?P\d)|(?P\s))", "aa1b") self.assertEquals({"first": "1", "second": None}, m.groupdict()) self.assertEquals({"first": "1", "second": True}, m.groupdict(True)) def test_group(self): m = re.search("a((?P\d)|(?P\s))", "aa1b") self.assertEquals("a1", m.group()) self.assertEquals(("1", "1", None), m.group(1, 2, 3)) self.assertEquals(("1", None), m.group("first", "second")) self.assertRaises(IndexError, m.group, 1, 4) def test_expand(self): m = re.search("a(..)(?P..)", "ab1bc") self.assertEquals("b1bcbc", m.expand(r"\1\g\2")) def test_sub(self): self.assertEquals("bbbbb", re.sub("a", "b", "ababa")) self.assertEquals(("bbbbb", 3), re.subn("a", "b", "ababa")) self.assertEquals("dddd", re.sub("[abc]", "d", "abcd")) self.assertEquals(("dddd", 3), re.subn("[abc]", "d", "abcd")) self.assertEquals("rbd\nbr\n", re.sub("a(.)", r"b\1\n", "radar")) self.assertEquals(("rbd\nbr\n", 2), re.subn("a(.)", r"b\1\n", "radar")) self.assertEquals(("bbbba", 2), re.subn("a", "b", "ababa", 2)) def test_sub_callable(self): def call_me(match): ret = "" for char in match.group(): ret += chr(ord(char) + 1) return ret self.assertEquals(("bbbbb", 3), re.subn("a", call_me, "ababa")) class SreScannerTest(unittest.TestCase): def test_scanner_attributes(self): p = re.compile("bla") s = p.scanner("blablubla") self.assertEquals(p, s.pattern) def test_scanner_match(self): p = re.compile(".").scanner("bla") self.assertEquals(("b", "l", "a"), (p.match().group(0), p.match().group(0), p.match().group(0))) self.assertEquals(None, p.match()) def test_scanner_search(self): p = re.compile("\d").scanner("bla23c5a") self.assertEquals(("2", "3", "5"), (p.search().group(0), p.search().group(0), p.search().group(0))) self.assertEquals(None, p.search()) def test_scanner_zero_width_match(self): if py23: return p = re.compile(".*").scanner("bla") self.assertEquals(("bla", ""), (p.search().group(0), p.search().group(0))) self.assertEquals(None, p.search()) class GetlowerTest(unittest.TestCase): def setUp(self): locale.setlocale(locale.LC_ALL, (None, None)) def tearDown(self): locale.setlocale(locale.LC_ALL, (None, None)) def assertLowerEqual(self, tests, flags): for arg, expected in tests: self.assertEquals(ord(expected), _sre.getlower(ord(arg), flags)) def test_getlower_no_flags(self): self.assertLowerEqual([("a", "a"), ("A", "a"), (UPPER_AE, UPPER_AE), (u"\u00c4", u"\u00c4"), (u"\u4444", u"\u4444")], 0) def test_getlower_locale(self): try: locale.setlocale(locale.LC_ALL, "de_DE") self.assertLowerEqual([("a", "a"), ("A", "a"), (UPPER_AE, LOWER_AE), (u"\u00c4", u"\u00e4"), (UPPER_PI, UPPER_PI)], sre_constants.SRE_FLAG_LOCALE) except locale.Error: # skip test pass def test_getlower_unicode(self): self.assertLowerEqual([("a", "a"), ("A", "a"), (UPPER_AE, LOWER_AE), (u"\u00c4", u"\u00e4"), (UPPER_PI, LOWER_PI), (u"\u4444", u"\u4444")], sre_constants.SRE_FLAG_UNICODE) class SimpleSearchesTest(unittest.TestCase): def test_search_simple_literal(self): self.assert_(re.search("bla", "bla")) self.assert_(re.search("bla", "blab")) self.failIf(re.search("bla", "blu")) def test_search_simple_ats(self): self.assert_(re.search("^bla", "bla")) self.assert_(re.search("^bla", "blab")) self.failIf(re.search("^bla", "bbla")) self.assert_(re.search("bla$", "abla")) self.assert_(re.search("bla$", "bla\n")) self.failIf(re.search("bla$", "blaa")) def test_search_simple_boundaries(self): self.assert_(re.search(r"bla\b", "bla")) self.assert_(re.search(r"bla\b", "bla ja")) self.assert_(re.search(r"bla\b", u"bla%s" % UPPER_PI)) self.failIf(re.search(r"bla\b", "blano")) self.failIf(re.search(r"bla\b", u"bla%s" % UPPER_PI, re.UNICODE)) def test_search_simple_categories(self): self.assert_(re.search(r"bla\d\s\w", "bla3 b")) self.assert_(re.search(r"b\d", u"b%s" % INDIAN_DIGIT, re.UNICODE)) self.failIf(re.search(r"b\D", u"b%s" % INDIAN_DIGIT, re.UNICODE)) self.assert_(re.search(r"b\s", u"b%s" % EM_SPACE, re.UNICODE)) self.failIf(re.search(r"b\S", u"b%s" % EM_SPACE, re.UNICODE)) self.assert_(re.search(r"b\w", u"b%s" % LOWER_PI, re.UNICODE)) self.failIf(re.search(r"b\W", u"b%s" % LOWER_PI, re.UNICODE)) self.assert_(re.search(r"b\w", "b%s" % LOWER_AE, re.UNICODE)) def test_search_simple_any(self): self.assert_(re.search(r"b..a", "jboaas")) self.failIf(re.search(r"b..a", "jbo\nas")) self.assert_(re.search(r"b..a", "jbo\nas", re.DOTALL)) def test_search_simple_in(self): self.assert_(re.search(r"b[\da-z]a", "bb1a")) self.assert_(re.search(r"b[\da-z]a", "bbsa")) self.failIf(re.search(r"b[\da-z]a", "bbSa")) self.assert_(re.search(r"b[^okd]a", "bsa")) self.failIf(re.search(r"b[^okd]a", "bda")) self.assert_(re.search(u"b[%s%s%s]a" % (LOWER_PI, UPPER_PI, EM_SPACE), u"b%sa" % UPPER_PI)) # bigcharset self.assert_(re.search(u"b[%s%s%s]a" % (LOWER_PI, UPPER_PI, EM_SPACE), u"b%sa" % EM_SPACE)) self.failIf(re.search(u"b[%s%s%s]a" % (LOWER_PI, UPPER_PI, EM_SPACE), u"b%sa" % LINE_SEP)) def test_search_simple_literal_ignore(self): self.assert_(re.search(r"ba", "ba", re.IGNORECASE)) self.assert_(re.search(r"ba", "BA", re.IGNORECASE)) self.assert_(re.search(u"b%s" % UPPER_PI, u"B%s" % LOWER_PI, re.IGNORECASE | re.UNICODE)) def test_search_simple_in_ignore(self): self.assert_(re.search(r"ba[A-C]", "bac", re.IGNORECASE)) self.assert_(re.search(r"ba[a-c]", "baB", re.IGNORECASE)) self.assert_(re.search(u"ba[%s]" % UPPER_PI, "ba%s" % LOWER_PI, re.IGNORECASE | re.UNICODE)) self.assert_(re.search(r"ba[^A-C]", "bar", re.IGNORECASE)) self.failIf(re.search(r"ba[^A-C]", "baA", re.IGNORECASE)) self.failIf(re.search(r"ba[^A-C]", "baa", re.IGNORECASE)) def test_search_simple_branch(self): self.assert_(re.search(r"a(bb|d[ef])b", "adeb")) self.assert_(re.search(r"a(bb|d[ef])b", "abbb")) def test_search_simple_repeat_one(self): self.assert_(re.search(r"aa+", "aa")) # empty tail self.assert_(re.search(r"aa+ab", "aaaab")) # backtracking self.assert_(re.search(r"aa*ab", "aab")) # empty match self.assert_(re.search(r"a[bc]+", "abbccb")) self.assertEquals("abbcb", re.search(r"a.+b", "abbcb\nb").group()) self.assertEquals("abbcb\nb", re.search(r"a.+b", "abbcb\nb", re.DOTALL).group()) self.assert_(re.search(r"ab+c", "aBbBbBc", re.IGNORECASE)) self.failIf(re.search(r"aa{2,3}", "aa")) # string too short self.failIf(re.search(r"aa{2,3}b", "aab")) # too few repetitions self.failIf(re.search(r"aa+b", "aaaac")) # tail doesn't match def test_search_simple_min_repeat_one(self): self.assert_(re.search(r"aa+?", "aa")) # empty tail self.assert_(re.search(r"aa+?ab", "aaaab")) # forward tracking self.assert_(re.search(r"a[bc]+?", "abbccb")) self.assertEquals("abb", re.search(r"a.+?b", "abbcb\nb").group()) self.assertEquals("a\nbb", re.search(r"a.+b", "a\nbbc", re.DOTALL).group()) self.assert_(re.search(r"ab+?c", "aBbBbBc", re.IGNORECASE)) self.failIf(re.search(r"aa+?", "a")) # string too short self.failIf(re.search(r"aa{2,3}?b", "aab")) # too few repetitions self.failIf(re.search(r"aa+?b", "aaaac")) # tail doesn't match self.assertEquals(re.match(".*?cd", "abcabcde").end(0), 7) def test_search_simple_repeat_maximizing(self): self.failIf(re.search(r"(ab){3,5}", "abab")) self.failIf(re.search(r"(ab){3,5}", "ababa")) self.assert_(re.search(r"(ab){3,5}", "ababab")) self.assertEquals(re.search(r"(ab){3,5}", "abababababab").end(0), 10) self.assertEquals("ad", re.search(r"(a.)*", "abacad").group(1)) self.assertEquals(("abcg", "cg"), re.search(r"(ab(c.)*)+", "ababcecfabcg").groups()) self.assertEquals(("cg", "cg"), re.search(r"(ab|(c.))+", "abcg").groups()) self.assertEquals(("ab", "cf"), re.search(r"((c.)|ab)+", "cfab").groups()) self.assert_(re.search(r".*", "")) def test_search_simple_repeat_minimizing(self): self.failIf(re.search(r"(ab){3,5}?", "abab")) self.assert_(re.search(r"(ab){3,5}?", "ababab")) self.assert_(re.search(r"b(a){3,5}?b", "baaaaab")) self.failIf(re.search(r"b(a){3,5}?b", "baaaaaab")) self.assert_(re.search(r"a(b(.)+?)*", "abdbebb")) def test_search_simple_groupref(self): self.assert_(re.match(r"((ab)+)c\1", "ababcabab")) self.failIf(re.match(r"((ab)+)c\1", "ababcab")) self.failIf(re.search(r"(a|(b))\2", "aa")) self.assert_(re.match(r"((ab)+)c\1", "aBAbcAbaB", re.IGNORECASE)) self.assert_(re.match(r"((a.)+)c\1", u"a%sca%s" % (UPPER_PI, LOWER_PI), re.IGNORECASE | re.UNICODE)) def test_search_simple_groupref_exists(self): if not py23: self.assert_(re.search(r"(<)?bla(?(1)>)", "")) self.assert_(re.search(r"(<)?bla(?(1)>)", "bla")) self.failIf(re.match(r"(<)?bla(?(1)>)", "|u)", "blau")) def test_search_simple_assert(self): self.assert_(re.search(r"b(?=\d\d).{3,}", "b23a")) self.failIf(re.search(r"b(?=\d\d).{3,}", "b2aa")) self.assert_(re.search(r"b(?<=\d.)a", "2ba")) self.failIf(re.search(r"b(?<=\d.)a", "ba")) def test_search_simple_assert_not(self): self.assert_(re.search(r"b(?