From 3a7e1a506479eec99fbcff31f54c42d674ea6150 Mon Sep 17 00:00:00 2001 From: Eduardo Robles Elvira Date: Fri, 9 Aug 2013 12:08:48 +0200 Subject: [PATCH] adding support for compressed index files --- deltatar/deltatar.py | 98 +++++++++++++++++++++++++++++++++++++-------- deltatar/tarfile.py | 57 +++++++++++++++++--------- testing/test_deltatar.py | 2 +- 3 files changed, 118 insertions(+), 39 deletions(-) diff --git a/deltatar/deltatar.py b/deltatar/deltatar.py index 47b4a29..5bf5eb3 100644 --- a/deltatar/deltatar.py +++ b/deltatar/deltatar.py @@ -73,9 +73,10 @@ class DeltaTar(object): # python logger object. logger = None - # whether the index is encrypted or not. Only makes sense to set it as True - # if mode includes aes128 or aes256. - index_encrypted = None + # specifies the index mode in the same format as @param mode, but without + # the ':', '|' or '#' at the beginning. It doesn't make sense to specify + # that the index is encrypted if no password is given in the constructor. + index_mode = None # current time for this backup. Used for file names and file creation checks current_time = None @@ -94,6 +95,15 @@ class DeltaTar(object): '#gz.aes256': '.gz.aes256' } + # valid index modes and their corresponding default file extension + __index_extensions_dict = { + '': '', + 'gz': '.gz', + 'bz2': '.bz2', + 'gz.aes128': '.gz.aes128', + 'gz.aes256': '.gz.aes256' + } + # valid path prefixes __path_prefix_list = [ u'snapshot://', @@ -103,8 +113,7 @@ class DeltaTar(object): def __init__(self, excluded_files=[], included_files=[], filter_func=None, mode="", password=None, - logger=None, - index_encrypted=False, index_name_func=None, + logger=None, index_mode=None, index_name_func=None, volume_name_func=None): ''' Constructor. Configures the diff engine. @@ -143,8 +152,18 @@ class DeltaTar(object): - logger: python logger object. Optional. - - index_encrypted: whether the index is encrypted or not. 
Only makes - sense to set it as True if mode includes aes128 or aes256. + - index_mode: specifies the index mode in the same format as @param + mode, but without the ':', '|' or '#' at the beginning. It doesn't + make sense to specify that the index is encrypted if no password + is given in the constructor. This is an optional parameter that will + automatically mimic @param mode by default if not provided. Valid + modes are: + + '' open uncompressed + 'gz' open with gzip compression + 'bz2' open with bzip2 compression + 'gz.aes128' open an aes128 encrypted stream of gzip compressed tar blocks + 'gz.aes256' open an aes256 encrypted stream of gzip compressed tar blocks - index_name_func: function that sets a custom name for the index file. This function receives the backup_path and if it's a full backup as @@ -168,7 +187,20 @@ class DeltaTar(object): self.logger.addHandler(logger) self.mode = mode self.password = password - self.index_encrypted = index_encrypted + + # generate index_mode + if index_mode is None: + index_mode = '' + if 'gz.aes' in mode: + index_mode = mode[1:] + elif 'gz' in mode: + index_mode = "gz" + elif 'bz2' in mode: + index_mode = "bz2" + elif mode not in self.__index_extensions_dict: + raise Exception('Unrecognized extension') + + self.index_mode = index_mode self.current_time = datetime.datetime.now() if index_name_func is not None: @@ -186,10 +218,7 @@ class DeltaTar(object): ''' prefix = "bfull" if is_full else "bdiff" date_str = self.current_time.strftime("%y-%m-%d-%H%M") - extension = '' - - if self.index_encrypted and 'aes' in self.mode: - extension = self.__file_extensions_dict[self.mode] + extension = self.__index_extensions_dict[self.index_mode] return "%s-%s.index%s" % (prefix, date_str, extension) @@ -405,6 +434,35 @@ class DeltaTar(object): return path[len(prefix):] return path + def open_index(self, path, mode='r'): + ''' + Given the specified configuration, opens the index for reading or + writing. 
It transparently handles if the index is encrypted and/or + compressed, returning a file object ready to use. + ''' + filemode = None + + if self.index_mode.startswith('gz'): + comptype = 'gz' + elif self.index_mode.startswith('bz2'): + comptype = 'bz2' + else: + comptype = 'tar' + + enctype = '' + if 'aes' in self.index_mode: + enctype = 'aes' + + key_length = 128 + if 'aes256' in self.index_mode: + key_length = 256 + + return tarfile._Stream(name=path, mode=mode, comptype=comptype, + bufsize=tarfile.RECORDSIZE, fileobj=None, + enctype=enctype, password=self.password, + key_length=key_length) + + def create_full_backup(self, source_path, backup_path, max_volume_size=None): ''' @@ -466,8 +524,7 @@ class DeltaTar(object): # init index index_name = self.index_name_func(True) index_path = os.path.join(backup_path, index_name) - # TODO: encrypt or compress it if necessary - index_fd = open(index_path, 'w') + index_fd = self.open_index(index_path, 'w') cwd = os.getcwd() @@ -548,6 +605,9 @@ class DeltaTar(object): which files changed since then. - max_volume_size: maximum volume size in megabytes (MB). Used to split the backup in volumes. Optional (won't split in volumes by default). + + NOTE: previous index is assumed to follow exactly the same format as + the index_mode setup in the constructor. 
''' # check/sanitize input if not isinstance(source_path, basestring): @@ -608,8 +668,7 @@ class DeltaTar(object): # init index index_name = self.index_name_func(True) index_path = os.path.join(backup_path, index_name) - # TODO: encrypt or compress it if necessary - index_fd = open(index_path, 'w') + index_fd = self.open_index(index_path, 'w') cwd = os.getcwd() @@ -732,7 +791,7 @@ class DeltaTar(object): def iterate_index_path(self, index_path): # open - f = open(index_path, 'r') + f = self.open_index(index_path, 'r') # check index header j, l_no = self._parse_json_line(f, 0) if j.get("type", '') != 'python-delta-tar-index' or\ @@ -867,9 +926,12 @@ class DeltaTar(object): using any file index. If it's a multivol tarfile, volume_name_func will be called. - Note: If you want to use an index to restore a backup, this function + NOTE: If you want to use an index to restore a backup, this function only supports to do so when the tarfile mode is either uncompressed or uses concat compress mode, because otherwise it would be very slow. + + NOTE: Indices are assumed to follow the same format as the index_mode + specified in the constructor. ''' # check/sanitize input if not isinstance(target_path, basestring): diff --git a/deltatar/tarfile.py b/deltatar/tarfile.py index 9981760..237e841 100644 --- a/deltatar/tarfile.py +++ b/deltatar/tarfile.py @@ -438,6 +438,7 @@ class _Stream: self.key_length = key_length self.password = password self.last_block_offset = 0L + self.dbuf = "" if comptype == "gz": try: @@ -611,7 +612,6 @@ class _Stream: """Initialize for reading a gzip compressed fileobj. 
""" self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) - self.dbuf = "" # if aes, we decrypt before the compression if self.enctype == 'aes': @@ -684,35 +684,52 @@ class _Stream: self.pos += len(buf) return buf + def readline(self): + """Reads just one line, new line character included + """ + buf = [] + pos = 0 + while True: + chunk = self._read(self.bufsize) + + if not chunk: + return ''.join(buf) + + buf.append(chunk) + if '\n' in chunk: + dbuf = ''.join(buf) + pos = dbuf.index('\n') + 1 + self.dbuf = dbuf[pos:] + return dbuf[:pos] + def _read(self, size): """Return size bytes from the stream. """ - if self.comptype == "tar": - return self.__read(size) - c = len(self.dbuf) t = [self.dbuf] while c < size: buf = self.__read(self.bufsize) if not buf: break - try: - buf = self.cmp.decompress(buf) - except IOError: - raise ReadError("invalid compressed data") - if self.comptype == "gz" and hasattr(self, "crc"): - self.crc = self.zlib.crc32(buf, self.crc) & 0xffffffffL - if self.concat_stream and len(self.cmp.unused_data) != 0: - self.buf = self.cmp.unused_data + self.buf - self.close(close_fileobj=False) + if self.comptype != "tar": try: - self._init_read_gz() - except: - # happens at the end of the file - pass - self.crc = self.zlib.crc32("") & 0xffffffffL - self.closed = False + buf = self.cmp.decompress(buf) + except IOError: + raise ReadError("invalid compressed data") + + if self.comptype == "gz" and hasattr(self, "crc"): + self.crc = self.zlib.crc32(buf, self.crc) & 0xffffffffL + if self.concat_stream and len(self.cmp.unused_data) != 0: + self.buf = self.cmp.unused_data + self.buf + self.close(close_fileobj=False) + try: + self._init_read_gz() + except: + # happens at the end of the file + pass + self.crc = self.zlib.crc32("") & 0xffffffffL + self.closed = False t.append(buf) c += len(buf) t = "".join(t) diff --git a/testing/test_deltatar.py b/testing/test_deltatar.py index 903e75e..95c0b85 100644 --- a/testing/test_deltatar.py +++ 
b/testing/test_deltatar.py @@ -205,7 +205,7 @@ class DeltaTarTest(BaseTest): index_path = os.path.join("backup_dir", index_filename) # this should automatically restore the huge file - f = open(index_path, 'r') + f = deltatar.open_index(index_path, mode='r') for l in f.readline(): data = json.loads(f.readline()) if data.get('type', '') == 'file' and\ -- 1.7.1