"""A disk-based hash table. Works like a (typed) dictionary. Both for the keys and the values, the type must be specified as a format string as in the 'struct' module. The keys correspond to single-character format strings, so they are simple objects like integers or strings of fixed length. The values on the other hand are always tuples, corresponding to a format string that is not limited to one character. """ import struct, mmap, os sig_bytes = struct.calcsize("!ii") class MMapTable(object): SIGNATURE = ~0xb2b2 fd = None def __init__(self, filename, nodesize, init_n_nodes=8): assert is_p2(init_n_nodes) assert nodesize >= sig_bytes self.filename = filename self.nodesize = nodesize self.init_n_nodes = init_n_nodes def close(self, _os_close=os.close): fd = self.fd if fd is not None: if 'mmap' in self.__dict__: mmap = self.mmap del self.mmap mmap.close() self.fd = None _os_close(fd) __del__ = close def flush(self): if 'mmap' in self.__dict__: self.mmap.flush() def map_file(self): self.close() try: fd = os.open(self.filename, os.O_RDWR) self.fd = fd except OSError: fd = os.open(self.filename, os.O_RDWR|os.O_CREAT|os.O_EXCL|os.O_TRUNC, 0644) self.fd = fd used = 0 writeall(fd, struct.pack("!ii", self.SIGNATURE, used)) totalsize = self.init_n_nodes * self.nodesize os.ftruncate(fd, totalsize) else: sig, used = struct.unpack("!ii", readall(fd, sig_bytes)) assert sig == self.SIGNATURE totalsize = os.lseek(fd, 0, 2) n_nodes = totalsize // self.nodesize assert is_p2(n_nodes) os.lseek(fd, 0, 0) self.mmap = mmap.mmap(fd, n_nodes * self.nodesize) self.n_nodes = n_nodes self.used = used def __getattr__(self, attr): if attr in ('mmap', 'n_nodes', 'used'): self.map_file() return getattr(self, attr) else: raise AttributeError("%s object has no attribute %r" % ( self.__class__.__name__, attr)) def fileno(self): if self.fd is None: self.map_file() return self.fd def __len__(self): return self.n_nodes def __getitem__(self, n): p = n * self.nodesize return self.mmap[p : p + self.nodesize] def __setitem__(self, n, value): p = n * self.nodesize self.mmap[p : p + self.nodesize] = value def iterfrom(self, n0): nodesize = self.nodesize mmap = self.mmap for p in xrange(n0 * nodesize, len(self) * nodesize, nodesize): yield mmap[p : p + nodesize] def update_used(self, delta): self.used += delta self.mmap[:sig_bytes] = struct.pack("!ii", self.SIGNATURE, self.used) def is_p2(n): return n > 0 and not (n & (n-1)) def p2_round_down(n): result = 1 while result + result <= n: result += result return result def p2_round_up(n): result = 1 while result < n: result += result return result def readall(fd, size): buf = '' while size > 0: t = os.read(fd, size) if not t: raise EOFError buf += t size -= len(t) return buf def writeall(fd, data): while data: count = os.write(fd, data) data = data[count:] class HashTable(object): MMapTable = MMapTable def __init__(self, filename, keyfmt, valuefmt, hashfn=int): # note: the key "all-zeroes" is reserved for empty buckets self.filename = filename self.keyfmt = "!" + keyfmt self.nodefmt = "!" + keyfmt + valuefmt self.size_id_bytes = struct.calcsize(self.keyfmt) self.zero_id_bytes = '\x00' * self.size_id_bytes self.hashfn = hashfn nodesize = struct.calcsize(self.nodefmt) self.table = self.MMapTable(filename, nodesize) def clone_fresh_table(self, newfilename): return HashTable(newfilename, self.keyfmt.lstrip('!'), self.nodefmt[len(self.keyfmt):], self.hashfn) def close(self): if self.table is not None: self.table.close() self.table = None def flush(self): self.table.flush() def fileno(self): return self.table.fileno() def outdated(self, bucket, index=None): return False def bucketkey(self, bucket): return struct.unpack(self.nodefmt, bucket)[0] def bucketvalues(self, bucket): return struct.unpack(self.nodefmt, bucket)[1:] def bucketitems(self, bucket): flat = struct.unpack(self.nodefmt, bucket) return flat[0], flat[1:] def overwritebucket(self, index, newbucket): assert (self.table[index][:self.size_id_bytes] == newbucket[:self.size_id_bytes]) self.table[index] = newbucket def lookup(self, id, _noforce=True): hmask = len(self.table) - 1 index = self.hashfn(id) & hmask lookstring = struct.pack(self.keyfmt, id) stride = 1 firstoutdated = None while True: if index: bucket = self.table[index] if bucket.startswith(lookstring): return index, bucket # found if bucket.startswith(self.zero_id_bytes): return firstoutdated or (index, None) # empty if firstoutdated is None and self.outdated(bucket, _noforce and index): firstoutdated = index, bucket index = (index + stride) & hmask stride += 1 #def __len__(self): # return self.table.used def __getitem__(self, id): index, bucket = self.lookup(id) if bucket is None or self.outdated(bucket, index): raise KeyError(id) return struct.unpack(self.nodefmt, bucket)[1:] def __contains__(self, id): index, bucket = self.lookup(id) return bucket is not None and not self.outdated(bucket, index) def __setitem__(self, id, values): index, prevbucket = self.lookup(id) bucket = struct.pack(self.nodefmt, id, *values) if prevbucket is None: self.table.update_used(+1) self.table[index] = bucket if self.table.used * 3 > self.table.n_nodes * 2: self.rehash() def __iter__(self, builder=bucketkey): zero_id_bytes = self.zero_id_bytes for bucket in self.table.iterfrom(1): if not (bucket.startswith(zero_id_bytes) or self.outdated(bucket)): yield builder(self, bucket) def iterkeys(self): return self.__iter__(HashTable.bucketkey) def itervalues(self): return self.__iter__(HashTable.bucketvalues) def iteritems(self): return self.__iter__(HashTable.bucketitems) def rehash(self): newsize = p2_round_up(self.table.used * 2) #print 'rehash to %d...' % newsize nodesize = self.table.nodesize altname = self.filename + '~' if os.path.exists(altname): os.unlink(altname) oldtable = self.table newtable = self.MMapTable(altname, nodesize, init_n_nodes=newsize) self.table = newtable newused = 0 zero_id_bytes = self.zero_id_bytes size_id_bytes = self.size_id_bytes for bucket in oldtable.iterfrom(1): if not (bucket.startswith(zero_id_bytes) or self.outdated(bucket)): id, = struct.unpack(self.keyfmt, bucket[:size_id_bytes]) index, prevbucket = self.lookup(id, None) assert prevbucket is None newtable[index] = bucket newused += 1 newtable.update_used(newused) oldtable.close() newtable.close() os.rename(altname, self.filename) self.table = self.MMapTable(self.filename, nodesize) #print 'done' if self.table.used * 4 + 100 < len(self.table): self.rehash() # many outdated items died, re-rehash to shrink def hashhead(s): return struct.unpack("!i", s[:4])[0] def ChecksumHashTable(filename, keylength, valuefmt): assert keylength >= 4 return HashTable(filename, "%ds" % (keylength,), valuefmt, hashhead)