"""Checksum-indexed file database.

Data is either *linked* (an external file is recorded by path, together
with the size and mtime it had at that time) or *imported* (the bytes are
appended to a single data file inside the database directory).  In both
cases the MD5 and SHA digests of the data are stored in two lookup tables
that map a digest to an ``(id, start, length)`` triple.
"""

import os
import md5
import sha

from hashtable import ChecksumHashTable
from valuetree import ValueTree

# Chunk size used when streaming file contents.
_BLOCKSIZE = 32768


class FilesDB:
    """Database of linked and imported data, searchable by MD5 or SHA."""

    def __init__(self, dbpath):
        """Open the tables stored under the directory *dbpath*.

        The files used are ``md5``, ``sha``, ``link`` and ``data`` inside
        *dbpath*.
        """
        self.dbpath = dbpath
        fn_md5 = os.path.join(dbpath, 'md5')
        fn_sha = os.path.join(dbpath, 'sha')
        fn_link = os.path.join(dbpath, 'link')
        fn_data = os.path.join(dbpath, 'data')
        # "iqq" / "qd" are presumably struct-style layouts for the stored
        # values: (id, start, length) and (size, mtime) -- confirm against
        # ChecksumHashTable / ValueTree.
        self.md5table = ChecksumHashTable(fn_md5, md5.digest_size, "iqq")
        self.shatable = ChecksumHashTable(fn_sha, sha.digest_size, "iqq")
        self.linktree = ValueTree(fn_link, "qd")
        self.fn_data = fn_data

    def _getfilename(self, id):
        """Return the path of the file holding the data of entry *id*.

        Non-positive ids denote imported data (only -1 is expected) and
        map to the database's own data file; any other id is resolved
        through the link tree.
        """
        if id <= 0:
            assert id == -1
            return self.fn_data
        key, _ = self.linktree.load_node(id)
        return key

    def _openwrite(self):
        """Open the data file for appending at an 8-byte aligned offset.

        Returns ``(f, -1, pos)``: the open file, the id used for imported
        data, and the offset at which the next write will land.  The
        caller is responsible for closing *f*.
        """
        f = open(self.fn_data, 'ab')
        try:
            # Do not rely on tell() reporting the end of file right after
            # opening in append mode; position there explicitly.
            f.seek(0, 2)
            pos = f.tell()
            if pos & 7:
                # Pad with NUL bytes up to the next multiple of 8.
                newpos = (pos + 7) & ~7
                f.write('\x00' * (newpos - pos))
                pos = newpos
        except:
            # Do not leak the handle if positioning or padding fails.
            f.close()
            raise
        return f, -1, pos

    def get_entry(self, filename, start=0, length=None):
        """Return the FileEntry for an already linked *filename*.

        *length* defaults to the current size of the file.  Whatever
        ``linktree.locate_key`` raises for an unknown file propagates
        (``add_link`` expects a KeyError from it).

        NOTE(review): the default length is the whole file size even when
        *start* is non-zero -- confirm whether "size - start" was meant.
        """
        filename = os.path.normpath(os.path.abspath(filename))
        if length is None:
            length = os.stat(filename).st_size
        fileid = self.linktree.locate_key(filename)
        return FileEntry(self, fileid, start, length)

    def add_link(self, filename, start=0, length=None, reload=False):
        """Record *filename* (or a section of it) as linked data.

        When the file is already linked and its size and mtime are
        unchanged, the existing link is reused without re-reading the
        file, unless *reload* is true.  Otherwise the section is
        checksummed and registered in the link tree and both digest
        tables.

        Raises EOFError if fewer than *length* bytes can be read starting
        at *start*.

        NOTE(review): the default length is the whole file size even when
        *start* is non-zero -- confirm whether "size - start" was meant.
        """
        filename = os.path.normpath(os.path.abspath(filename))
        st = os.stat(filename)
        if length is None:
            length = st.st_size
        if not reload and start + length <= st.st_size:
            try:
                fileid = self.linktree.locate_key(filename)
            except KeyError:
                pass    # not linked yet
            else:
                _, (stored_size, stored_mtime) = self.linktree.load_node(fileid)
                if stored_size == st.st_size and stored_mtime == st.st_mtime:
                    return FileEntry(self, fileid, start, length)
        md5sum, shasum = checksums(filename, start, length)
        self.linktree[filename] = st.st_size, st.st_mtime
        fileid = self.linktree.locate_key(filename)
        self.md5table[md5sum] = fileid, start, length
        self.shatable[shasum] = fileid, start, length
        return FileEntry(self, fileid, start, length, md5sum, shasum)

    def import_data(self, buffer):
        """Append the string *buffer* to the data file and index it.

        If data with the same SHA digest has already been imported, the
        existing entry is returned and nothing is written.
        """
        shasum = sha.new(buffer).digest()
        try:
            entry = self.find_sha(shasum)
        except KeyError:
            pass
        else:
            if entry.is_imported():
                return entry    # already imported
        md5sum = md5.new(buffer).digest()
        length = len(buffer)
        f, fileid, start = self._openwrite()
        try:
            try:
                f.write(buffer)
                self.md5table[md5sum] = fileid, start, length
                self.shatable[shasum] = fileid, start, length
            except:
                # Roll back the partial write on any failure, then
                # re-raise the original exception.
                f.truncate(start)
                raise
        finally:
            f.close()
        return FileEntry(self, fileid, start, length, md5sum, shasum)

    def import_file(self, filename):
        """Copy the content of *filename* into the data file and index it.

        If data with the same SHA digest has already been imported, the
        existing entry is returned and nothing is copied.

        Raises EOFError if the file gets shorter between checksumming and
        copying.
        """
        g = open(filename, 'rb')
        try:
            md5sum, shasum, length = fchecksums(g)
            try:
                entry = self.find_sha(shasum)
            except KeyError:
                pass
            else:
                if entry.is_imported():
                    return entry    # already imported
            g.seek(0)
            f, fileid, start = self._openwrite()
            try:
                try:
                    copysize = length
                    while copysize > 0:
                        n = min(_BLOCKSIZE, copysize)
                        block = g.read(n)
                        if len(block) != n:
                            raise EOFError("file shrunk during access")
                        f.write(block)
                        copysize -= n
                    self.md5table[md5sum] = fileid, start, length
                    self.shatable[shasum] = fileid, start, length
                except:
                    # Roll back the partial copy on any failure, then
                    # re-raise the original exception.
                    f.truncate(start)
                    raise
            finally:
                f.close()
        finally:
            g.close()
        return FileEntry(self, fileid, start, length, md5sum, shasum)

    def find_md5(self, md5sum):
        """Return the FileEntry registered under the MD5 digest *md5sum*.

        Lookup failures from the table propagate to the caller.
        """
        fileid, start, length = self.md5table[md5sum]
        return FileEntry(self, fileid, start, length, md5sum=md5sum)

    def find_sha(self, shasum):
        """Return the FileEntry registered under the SHA digest *shasum*.

        Lookup failures from the table propagate to the caller (the
        import methods expect a KeyError for an unknown digest).
        """
        fileid, start, length = self.shatable[shasum]
        return FileEntry(self, fileid, start, length, shasum=shasum)


class FileEntry:
    """A section (start, length) of a linked file or of the data file."""

    def __init__(self, srv, fileid, start, length, md5sum=None, shasum=None):
        """Describe *length* bytes at offset *start* of file *fileid*.

        *srv* is the owning FilesDB.  The digests may be omitted; they are
        then computed on demand by ``checksums()``.
        """
        self.srv = srv
        self._fileid = fileid
        self.start = start
        self.length = length
        self._md5sum = md5sum
        self._shasum = shasum

    def getfilename(self):
        """Return the path of the file that holds this entry's data."""
        return self.srv._getfilename(self._fileid)

    def is_imported(self):
        """Tell whether the data lives in the database's own data file."""
        return self._fileid < 0

    def open(self):
        """Return a read-only FileSection over this entry's bytes."""
        return FileSection(self.getfilename(), self.start, self.length)

    def get_md5(self):
        """Return the MD5 digest, computing it from the file if needed."""
        if self._md5sum is None:
            self.checksums()
        return self._md5sum

    def get_sha(self):
        """Return the SHA digest, computing it from the file if needed."""
        if self._shasum is None:
            self.checksums()
        return self._shasum

    def checksums(self):
        """Return ``(md5sum, shasum)``, reading the file if one is missing.

        When one digest is already known and the file has to be read to
        obtain the other, the known digest is verified against the data;
        a mismatch raises ConsistencyError.
        """
        if self._md5sum is None or self._shasum is None:
            md5sum, shasum = checksums(self.getfilename(),
                                       self.start, self.length)
            if self._md5sum is not None and self._md5sum != md5sum:
                raise ConsistencyError("data doesn't match any more")
            if self._shasum is not None and self._shasum != shasum:
                raise ConsistencyError("data doesn't match any more")
            self._md5sum, self._shasum = md5sum, shasum
        return self._md5sum, self._shasum

    def __repr__(self):
        # Built on __str__, so this may read the file to obtain the SHA.
        return '<FileEntry %s>' % (self,)

    def __str__(self):
        # NOTE(review): imported entries show an empty file name; this may
        # have been a placeholder string originally -- confirm.
        if self.is_imported():
            filename = ''
        else:
            filename = self.getfilename()
        if self.start:
            filename = '%s (offset %d)' % (filename, self.start)
        return '%s %11d %s' % (self.get_sha().encode('hex'),
                               self.length, filename)

    def __eq__(self, other):
        if isinstance(other, FileEntry):
            return self.__key() == other.__key()
        return NotImplemented

    def __ne__(self, other):
        if isinstance(other, FileEntry):
            return self.__key() != other.__key()
        return NotImplemented

    def __hash__(self):
        return hash(self.__key())

    def __key(self):
        """Return the tuple that defines this entry's identity."""
        return (self.srv, self._fileid, self.start, self.length)


class ConsistencyError(Exception):
    """Stored data no longer matches its recorded checksum."""
    pass


def checksums(filename, start, length):
    """Return ``(md5sum, shasum)`` of *length* bytes at offset *start*.

    Raises EOFError if fewer than *length* bytes could be read.
    """
    f = open(filename, 'rb')
    try:
        f.seek(start)
        md5sum, shasum, total = fchecksums(f, length)
    finally:
        # Close the file even when seeking or reading fails.
        f.close()
    if total != length:
        raise EOFError("file shrunk during access")
    return md5sum, shasum


def fchecksums(f, length=1 << 64):
    """Checksum up to *length* bytes read from the open file *f*.

    Reading starts at the current position of *f* and stops at *length*
    bytes or at end of file, whichever comes first.  Returns
    ``(md5sum, shasum, total)`` where *total* is the number of bytes that
    were actually read.
    """
    md5sum = md5.new()
    shasum = sha.new()
    total = 0
    while total < length:
        block = f.read(min(_BLOCKSIZE, length - total))
        if not block:
            break    # end of file
        md5sum.update(block)
        shasum.update(block)
        total += len(block)
    return md5sum.digest(), shasum.digest(), total


class FileSection(object):
    """Read-only file-like view of *length* bytes at offset *start*.

    Positions reported and accepted by this object are relative to the
    start of the section.
    """

    def __init__(self, filename, start, length):
        self.f = open(filename, 'rb')
        self.start = start
        self.length = length
        self.f.seek(start, 0)
        self._pos = 0    # current offset inside the section

    def __repr__(self):
        return '<FileSection %r start=%d length=%d>' % (
            self.f.name, self.start, self.length)

    def close(self):
        """Close the underlying file."""
        self.f.close()

    def isatty(self):
        return False

    def read(self, n=-1):
        """Read at most *n* bytes, never going past the section end.

        A negative *n* reads everything up to the end of the section.
        """
        readmax = self.length - self._pos
        if readmax <= 0:
            return ''
        if n < 0 or n > readmax:
            n = readmax
        buf = self.f.read(n)
        self._pos += len(buf)
        return buf

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop at the section end or at EOF."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def readline(self, size=-1):
        """Read one line, limited to *size* bytes and to the section end."""
        readmax = self.length - self._pos
        if 0 <= size < readmax:
            readmax = size
        if readmax <= 0:
            return ''
        line = self.f.readline(readmax)
        # Keep the section-relative position in sync with the file.
        self._pos += len(line)
        return line

    def readlines(self, size=-1):
        """Return the remaining lines of the section (*size* is ignored)."""
        return self.read().splitlines(True)

    def seek(self, offset, whence=0):
        """Move to *offset*, interpreted relative to the section.

        *whence* is 0 for the section start, 1 for the current position
        and 2 for the section end.  Raises IOError if the resulting
        position would lie before the start of the section.
        """
        if whence == 1:
            offset += self._pos
        elif whence == 2:
            offset += self.length
        if offset < 0:
            raise IOError("negative seek position")
        # file.seek() returns None on Python 2, so the new position is
        # taken from the computed offset rather than from its result.
        self.f.seek(self.start + offset)
        self._pos = offset

    def tell(self):
        """Return the current offset inside the section."""
        return self._pos