#
# Search for identical lines in stdin.  Lines of the form "N*text"
# (a decimal count, a "*", then the text) are counted as "text"
# repeated N times.
# The stdout report is formatted according to sys.argv[1], a
# %-style format string applied to the tuple (line, count).
#

import sys, re

# Matches "<count>*<text>" at the start of a line.
re1 = re.compile(r"([0-9]+)[*](.*)")

# Number of input lines between two progress messages on stderr.
_PROGRESS_INTERVAL = 5000


def samelines(infile, outfile, format, verbose=1, minimum=2):
    """Count identical lines of `infile` and report them to `outfile`.

    infile  -- iterable of text lines; one trailing '\\n' per line is
               stripped before comparison.
    outfile -- object with a write() method receiving the report.
    format  -- %-style format string applied to (line, count); one
               formatted line is written per reported entry.
    verbose -- number of lines to read before the first progress
               check; afterwards a progress message goes to
               sys.stderr every 5000 lines.  0 disables progress
               messages.
    minimum -- only lines whose total count is >= minimum are reported.

    A line of the form "N*text" adds N to the count of "text"; any
    other line adds 1 to its own count.  The report is written from a
    `finally` clause, so whatever was counted so far is still reported
    if reading is interrupted by an exception.
    """
    lines = {}
    total = 0
    # Separate countdown so the `verbose` argument itself is not mutated.
    countdown = verbose
    try:
        for line in infile:
            countdown -= 1
            # Exact comparison with 0: a countdown that starts at 0 goes
            # negative immediately and never triggers a progress message.
            if countdown == 0:
                if total:
                    sys.stderr.write(
                        '%d lines, %d without duplicates...\n'
                        % (total, len(lines)))
                total += _PROGRESS_INTERVAL
                countdown = _PROGRESS_INTERVAL
            if line.endswith('\n'):
                line = line[:-1]
            match = re1.match(line)
            if match:
                count = int(match.group(1))
                line = match.group(2)
            else:
                count = 1
            lines[line] = lines.get(line, 0) + count
    finally:
        # items() and write() behave the same on Python 2 and Python 3,
        # unlike iteritems() and the "print >>" statement.
        for line, count in lines.items():
            if count >= minimum:
                outfile.write(format % (line, count) + '\n')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s FORMAT < input\n' % sys.argv[0])
        sys.exit(2)
    format = sys.argv[1]
    samelines(sys.stdin, sys.stdout, format)