From: Eduardo Robles Elvira Date: Sat, 14 Sep 2013 06:42:49 +0000 (+0200) Subject: fixing multiple errors in deltatar at once: X-Git-Tag: v2.2~105 X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=df86af81861cab98435063a1dfea4cf10573c5ac;p=python-delta-tar fixing multiple errors in deltatar at once: * naming of the generated backups was incorrect because they were always named as if they were full backups, even if it was a diff one. This has been changed both at backup creation and at backup finding. Related to that is the backup volume "finder": it was trying to find volumes with current date which is not what we want, so now it guesses the date of the volume based on a prefix and a suffix inside the backup directory. * changing the ordering of the files in _recursive_walk_dir to follow an ascending naming sorted Breadth First Search algorithm. This has been fixed without having to retrieve the full list and then apply a sort to be efficient, which was something Daniel applied as a fix and got reverted for memory efficiency. * fixing index iterator also removing the need to have the full index in memory and allowing it to be used with the "with" python statement. * fixing collate_iterators sorting because when the second index element was less than the first one, we were not taking into account that in a BFS algorithm files nearer to the root in depth go first regardless of the sorting. * some optimizations and fixes related to the deletion of files in restore_backup for diff backups have been done --- diff --git a/deltatar/deltatar.py b/deltatar/deltatar.py index 4601e1d..ab0643e 100644 --- a/deltatar/deltatar.py +++ b/deltatar/deltatar.py @@ -222,18 +222,33 @@ class DeltaTar(object): return "%s-%s.index%s" % (prefix, date_str, extension) - def volume_name_func(self, backup_path, is_full, volume_number): + def volume_name_func(self, backup_path, is_full, volume_number, guess_name=False): ''' function that defines the name of tar volumes. It receives the backup_path, if it's a full backup and the volume number, and must return the name for the corresponding volume name. Optional, DeltaTar has default names for tar volumes. + + If guess_name is activated, the file is intended not to be created but + to be found, and thus the date will be guessed. ''' prefix = "bfull" if is_full else "bdiff" - date_str = self.current_time.strftime("%y-%m-%d-%H%M") extension = self.__file_extensions_dict[self.mode] - return "%s-%s-%03d.tar%s" % (prefix, date_str, volume_number + 1, extension) + if not guess_name: + date_str = self.current_time.strftime("%y-%m-%d-%H%M") + return "%s-%s-%03d.tar%s" % (prefix, date_str, volume_number + 1, extension) + else: + prefix = prefix + "-" + postfix = "-%03d.tar%s" % (volume_number + 1, extension) + try: + for f in os.listdir(backup_path): + if f.startswith(prefix) and f.endswith(postfix): + return f + except Exception, e: + import ipdb; ipdb.set_trace() + raise Exception("volume not found") + def filter_path(self, path, source_path="", is_dir=None): ''' @@ -347,7 +362,7 @@ class DeltaTar(object): queue = [source_path] while queue: - cur_path = queue.pop() + cur_path = queue.pop(0) # it might have been removed in the mean time if not os.path.exists(cur_path): @@ -658,11 +673,12 @@ class DeltaTar(object): self.vol_no = 0 # generate the first volume name - vol_name = self.volume_name_func(backup_path, True, 0) + vol_name = self.volume_name_func(backup_path, is_full=False, + volume_number=0) tarfile_path = os.path.join(backup_path, vol_name) # init index - index_name = self.index_name_func(True) + index_name = self.index_name_func(is_full=False) index_path = os.path.join(backup_path, index_name) index_fd = self.open_index(index_path, 'w') @@ -672,7 +688,8 @@ class DeltaTar(object): ''' Handles the new volumes ''' - volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) + volume_name = deltarobj.volume_name_func(backup_path, is_full=False, + volume_number=volume_number) volume_path = os.path.join(backup_path, volume_name) deltarobj.vol_no = volume_number @@ -712,14 +729,18 @@ class DeltaTar(object): dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) - index_it = sorted(index_it, key=lambda x: x[0]['path']) - dir_path_it = sorted(dir_path_it, key=lambda x: x['path']) + def pr(path): + if not path: + return "None" + else: + return path["path"] # for each file to be in the backup, do: - for ipath, dpath, l_no in self.collate_iterators(iter(index_it), iter(dir_path_it)): + for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): action = None # if file is not in the index, it means it's a new file, so we have # to take a snapshot + if not ipath: action = 'snapshot' # if the file is not in the directory iterator, it means that it has @@ -785,54 +806,87 @@ class DeltaTar(object): index_fd.write('{"type": "file-list-checksum", "checksum": %d}\n' %\ crc) index_fd.close() + index_it.release() os.chdir(cwd) tarobj.close() def iterate_index_path(self, index_path): - # open - f = self.open_index(index_path, 'r') - # check index header - j, l_no = self._parse_json_line(f, 0) - if j.get("type", '') != 'python-delta-tar-index' or\ - j.get('version', -1) != 1: - raise Exception("invalid index file format: %s" % json.dumps(j)) - - # find BEGIN-FILE-LIST, ignore other headers - while True: - j, l_no = self._parse_json_line(f, l_no) - if j.get('type', '') == 'BEGIN-FILE-LIST': - break + ''' + Returns an index iterator. Internally, it uses a classic iterator class. + We do that instead of just yielding so that the iterator object can have + an additional function to close the file descriptor that is opened in + the constructor. + ''' + class IndexPathIterator(object): + def __init__(self, delta_tar, index_path): + self.delta_tar = delta_tar + self.index_path = index_path + self.f = None + self.__enter__() - # current volume number - curr_vol_no = None - # current volume file - vol_fd = None - offset = -1 - tarobj = None + def __iter__(self): + return self - # read each file in the index and process it to do the retore - while True: - try: - j, l_no = self._parse_json_line(f, l_no) - except Exception, e: - f.close() - raise e + def release(self): + if self.f: + self.f.close() + + def __enter__(self): + ''' + Allows this iterator to be used with the "with" statement + ''' + if self.f is None: + self.f = self.delta_tar.open_index(self.index_path, 'r') + # check index header + j, l_no = self.delta_tar._parse_json_line(self.f, 0) + if j.get("type", '') != 'python-delta-tar-index' or\ + j.get('version', -1) != 1: + raise Exception("invalid index file format: %s" % json.dumps(j)) + + # find BEGIN-FILE-LIST, ignore other headers + while True: + j, l_no = self.delta_tar._parse_json_line(self.f, l_no) + if j.get('type', '') == 'BEGIN-FILE-LIST': + break + return self + + def __exit__(self, type, value, tb): + ''' + Allows this iterator to be used with the "with" statement + ''' + self.f.close() + self.f = None - op_type = j.get('type', '') + def next(self): + # read each file in the index and process it to do the retore + j = {} + l_no = -1 + try: + j, l_no = self.delta_tar._parse_json_line(self.f, l_no) + except Exception, e: + if self.f: + self.f.close() + raise e - # when we detect the end of the list, break the loop - if op_type == 'END-FILE-LIST': - f.close() - break + op_type = j.get('type', '') - # check input - if op_type not in ['directory', 'file', 'link']: - self.logger.warn('unrecognized type to be ' - 'restored: %s, line %d' % (op_type, l_no)) - continue + # when we detect the end of the list, break the loop + if op_type == 'END-FILE-LIST': + if self.f: + self.f.close() + raise StopIteration + + # check input + if op_type not in ['directory', 'file', 'link']: + self.delta_tar.logger.warn('unrecognized type to be ' + 'restored: %s, line %d' % (op_type, l_no)) + # iterate again + return self.next() + + return j, l_no - yield j, l_no + return IndexPathIterator(self, index_path) def jsonize_path_iterator(self, iter, strip=0): ''' @@ -903,9 +957,12 @@ class DeltaTar(object): yield (elem1, elem2, l_no) elem1, elem2 = None, None else: - # index2 is less - yield (None, elem2, l_no) - elem2 = None + if index2.count('/') > index1.count('/'): + yield (elem1, None, l_no) + elem1 = None + else: + yield (None, elem2, l_no) + elem2 = None def restore_backup(self, target_path, backup_indexes_paths=[], backup_tar_path=None): @@ -987,7 +1044,8 @@ class DeltaTar(object): ''' Handles the new volumes ''' - volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) + volume_name = deltarobj.volume_name_func(backup_path, True, + volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed @@ -996,6 +1054,8 @@ class DeltaTar(object): tarobj.open_volume(volume_path) backup_path = os.path.dirname(backup_tar_path) + if not os.path.isabs(backup_path): + backup_path = os.path.join(cwd, backup_path) new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) tarobj = tarfile.TarFile.open(backup_tar_path, mode='r' + self.mode, @@ -1029,16 +1089,12 @@ class DeltaTar(object): os.chdir(target_path) helper = RestoreHelper(self, cwd, backup_indexes_paths) - index_it = helper._data[0]['iterator'] + index_it = self.iterate_index_path(helper._data[0]["path"]) dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) - index_it = sorted(index_it, key=lambda x: x[0]['path']) - dir_path_it = sorted(dir_path_it, key=lambda x: x['path']) - helper._data[0]['iterator'] = iter(index_it) - # for each file to be in the backup, do: - for ipath, dpath, l_no in self.collate_iterators(iter(index_it), iter(dir_path_it)): + for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): if not ipath: upath = dpath['path'] op_type = dpath['type'] @@ -1054,6 +1110,10 @@ class DeltaTar(object): # if file not found in dpath, we can directly restore from index if not dpath: + # if the file doesn't exist and it needs to be deleted, it + # means that work is already done + if ipath['path'].startswith('delete://'): + continue helper.restore(ipath, l_no) continue @@ -1062,12 +1122,18 @@ class DeltaTar(object): continue # we have to restore the file, but first we need to delete the - # current existing file - helper.delete(upath) + # current existing file. + # we don't delete the file if it's a directory, because it might + # just have changed mtime, so it's quite improductive to remove + # it + if ipath and (ipath['type'] != 'directory' or + ipath['path'].startswith('delete://')): + helper.delete(upath) if ipath: helper.restore(ipath, l_no) helper.restore_directories_permissions() + index_it.release() os.chdir(cwd) helper.cleanup() @@ -1111,8 +1177,11 @@ class RestoreHelper(object): self._directories = [] self._deltatar = deltatar self._cwd = cwd + self._index_list = index_list for index in index_list: + is_full = (index == index_list[-1]) + # make paths absolute to avoid cwd problems if not os.path.isabs(index): index = os.path.join(cwd, index) @@ -1123,9 +1192,9 @@ class RestoreHelper(object): offset = -1, tarobj = None, path = index, - iterator = deltatar.iterate_index_path(index), + is_full = is_full, new_volume_handler = partial(self.new_volume_handler, - self._deltatar, self._cwd, index) + self._deltatar, self._cwd, is_full, os.path.dirname(index)) ) self._data.append(s) @@ -1137,8 +1206,6 @@ class RestoreHelper(object): if data['tarobj']: data['tarobj'].close() data['tarobj'] = None - # TODO: ad a way to close the iterator fd - data['iterator'] def delete(self, path): ''' @@ -1157,16 +1224,19 @@ class RestoreHelper(object): Restore the path from the appropiate backup. Receives the current path from the first index iterator. itpath must be not null. ''' - data = self._data[0] path = itpath['path'] - upath = self._deltatar.unprefixed(path) - # if path is found in the first index as to be deleted or snapshotted, - # deal with it and finish if path.startswith('delete://'): - self.delete(self._deltatar.unprefixed(path)) + # the file has previously been deleted already in restore_backup in + # all cases so we just need to finish return - elif path.startswith('snapshot://'): + + data = self._data[0] + upath = self._deltatar.unprefixed(path) + + # if path is found in the first index as to be snapshotted, deal with it + # and finish + if path.startswith('snapshot://'): self.restore_file(itpath, data, path, l_no, self._deltatar.unprefixed(path)) return @@ -1175,48 +1245,49 @@ class RestoreHelper(object): cur_index = 1 while cur_index < len(self._data): data = self._data[cur_index] - listit = list(data['iterator']) - it = iter(listit) - data['iterator'] = iter(listit) - - # find the path in the index - d = None - l_no = None - dpath = None - while True: - try: - d, l_no = it.next() - except StopIteration: - break + # NOTE: we restart the iterator each time instead of reusing it + # because the iterator can be walked over completely multiple times, + # for example if one path if not found in one index and we have to + # go to the next index. + with self._deltatar.iterate_index_path(data["path"]) as it: + # find the path in the index + d = None + l_no = None + dpath = None + while True: + try: + d, l_no = it.next() + except StopIteration: + break - dpath = self._deltatar.unprefixed(d.get('path', '')) + dpath = self._deltatar.unprefixed(d.get('path', '')) - if upath == dpath: - break + if upath == dpath: + break - if not d: - # file not found, so it's not in the index, so it must be - # removed - if cur_index == 0: - self.delete(path) + if not d: + # file not found, so it's not in the index, so it must be + # removed + if cur_index == 0: + self.delete(path) + return + # this means that the path was found in the first index but + # not in a previous one, so something wrong happened. + else: + self._deltatar.logger.warn('Error restoring file %s from ' + 'index, not found in index %s' % (path, data['path'])) + return + + if d.get('path', '').startswith('delete://'): + self._deltatar.logger.warn(('Strange thing happened, file ' + '%s was listed in first index but deleted by another ' + 'one. Path was ignored and untouched.') % path) return - # this means that the path was found in the first index but - # not in a previous one, so something wrong happened. - else: - self._deltatar.logger.warn('Error restoring file %s from ' - 'index, not found in index %s' % (path, data['path'])) + elif d.get('path', '').startswith('snapshot://'): + self.restore_file(d, data, path, l_no, dpath) return - - if d.get('path', '').startswith('delete://'): - self._deltatar.logger.warn(('Strange thing happened, file ' - '%s was listed in first index but deleted by another ' - 'one. Path was ignored and untouched.') % path) - return - elif d.get('path', '').startswith('snapshot://'): - self.restore_file(d, data, path, l_no, dpath) - return - elif d.get('path', '').startswith('list://'): - continue + elif d.get('path', '').startswith('list://'): + continue self._deltatar.logger.warn(('Error restoring file %s from index, ' 'snapshot not found in any index') % path) @@ -1239,12 +1310,13 @@ class RestoreHelper(object): except tarfile.ExtractError, e: self._deltatar.logger.warn('tarfile: %s' % e) - @classmethod - def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number): + @staticmethod + def new_volume_handler(deltarobj, cwd, is_full, backup_path, tarobj, base_name, volume_number): ''' Handles the new volumes ''' - volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) + volume_name = deltarobj.volume_name_func(backup_path, is_full, + volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed @@ -1268,7 +1340,8 @@ class RestoreHelper(object): if index_data['curr_vol_no'] != vol_no: index_data['curr_vol_no'] = vol_no backup_path = os.path.dirname(index_data['path']) - vol_name = self._deltatar.volume_name_func(backup_path, True, vol_no) + vol_name = self._deltatar.volume_name_func(backup_path, + index_data['is_full'], vol_no, guess_name=True) vol_path = os.path.join(backup_path, vol_name) if index_data['vol_fd']: index_data['vol_fd'].close() @@ -1308,5 +1381,10 @@ class RestoreHelper(object): member = copy.copy(member) member.mode = 0700 + # if it's an existing directory, we then don't need to recreate it + # just set the right permissions, mtime and that kind of stuff + if os.path.exists(member.path): + return + # finally, restore the file index_data['tarobj'].extract(member) diff --git a/runtests.py b/runtests.py index b9e6c03..19b3740 100644 --- a/runtests.py +++ b/runtests.py @@ -23,7 +23,7 @@ from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest from testing.test_concat_compress import ConcatCompressTest from testing.test_rescue_tar import RescueTarTest from testing.test_encryption import EncryptionTest -from testing.test_deltatar import * +from testing.test_deltatar import DeltaTarTest if __name__ == "__main__": unittest.main() diff --git a/testing/test_deltatar.py b/testing/test_deltatar.py index fc8f765..8bd8a52 100644 --- a/testing/test_deltatar.py +++ b/testing/test_deltatar.py @@ -864,11 +864,11 @@ class DeltaTarTest(BaseTest): (u'./small', u'./small'), (u'./test', u'./test'), (None, u'./zzzz'), + (None, u'./bigdir/a'), + (None, u'./bigdir/b'), (u'./test/huge', u'./test/huge'), (u'./test/huge2', u'./test/huge2'), (u'./test/test2', u'./test/test2'), - (None, u'./bigdir/a'), - (None, u'./bigdir/b') ] os.chdir(cwd) @@ -891,7 +891,8 @@ class DeltaTarTest(BaseTest): prev_index_path) # check index items - index_path = os.path.join("backup_dir2", prev_index_filename) + index_path = os.path.join("backup_dir2", + deltatar.index_name_func(is_full=False)) index_it = deltatar.iterate_index_path(index_path) n = 0 for i in index_it: @@ -904,7 +905,8 @@ class DeltaTarTest(BaseTest): assert os.path.exists("backup_dir2") shutil.rmtree("source_dir") - tar_filename = deltatar.volume_name_func('backup_dir2', True, 0) + tar_filename = deltatar.volume_name_func('backup_dir2', + is_full=False, volume_number=0) tar_path = os.path.join("backup_dir2", tar_filename) # no file restored, because the diff was empty @@ -940,21 +942,22 @@ class DeltaTarTest(BaseTest): prev_index_path) # check index items - index_path = os.path.join("backup_dir2", prev_index_filename) + index_path = os.path.join("backup_dir2", deltatar.index_name_func(is_full=False)) index_it = deltatar.iterate_index_path(index_path) - l = sorted([i[0]['path'] for i in index_it]) - assert l == sorted([ + l = [i[0]['path'] for i in index_it] + + assert l == [ 'list://./big', 'snapshot://./bigdir', 'delete://./small', 'list://./test', 'snapshot://./zzzz', + 'snapshot://./bigdir/a', + 'snapshot://./bigdir/b', 'list://./test/huge', 'list://./test/huge2', 'list://./test/test2', - 'snapshot://./bigdir/a', - 'snapshot://./bigdir/b' - ]) + ] # check the tar file assert os.path.exists("backup_dir2") @@ -965,7 +968,8 @@ class DeltaTarTest(BaseTest): os.mkdir("source_dir") open("source_dir/small", 'w').close() - tar_filename = deltatar.volume_name_func('backup_dir2', True, 0) + tar_filename = deltatar.volume_name_func('backup_dir2', + is_full=False, volume_number=0) tar_path = os.path.join("backup_dir2", tar_filename) # restore the backup, this will create only the new files @@ -1006,7 +1010,7 @@ class DeltaTarTest(BaseTest): prev_index_path) # apply diff backup in target_dir - index_filename = deltatar.index_name_func(is_full=True) + index_filename = deltatar.index_name_func(is_full=False) index_path = os.path.join("backup_dir2", index_filename) deltatar.restore_backup("target_dir", backup_indexes_paths=[index_path, prev_index_path]) @@ -1048,12 +1052,12 @@ class DeltaTarTest(BaseTest): prev_index_path) # first restore initial backup in target_dir - tar_filename = deltatar.volume_name_func('backup_dir', True, 0) + tar_filename = deltatar.volume_name_func('backup_dir', is_full=True, volume_number=0) tar_path = os.path.join("backup_dir", tar_filename) deltatar.restore_backup("target_dir", backup_tar_path=tar_path) # then apply diff backup in target_dir - index_filename = deltatar.index_name_func(is_full=True) + index_filename = deltatar.index_name_func(is_full=False) index_path = os.path.join("backup_dir2", index_filename) deltatar.restore_backup("target_dir", backup_indexes_paths=[index_path, prev_index_path]) @@ -1106,7 +1110,7 @@ class DeltaTarTest(BaseTest): prev_index_path) # first restore initial backup in target_dir - tar_filename = deltatar.volume_name_func('backup_dir', True, 0) + tar_filename = deltatar.volume_name_func('backup_dir', is_full=True, volume_number=0) tar_path = os.path.join("backup_dir", tar_filename) deltatar.restore_backup("target_dir", backup_tar_path=tar_path) @@ -1115,7 +1119,7 @@ class DeltaTarTest(BaseTest): self.check_equal_dirs('source_dir', 'target_dir', deltatar) # then apply diff backup in target_dir - index_filename = deltatar.index_name_func(is_full=True) + index_filename = deltatar.index_name_func(is_full=False) index_path = os.path.join("backup_dir2", index_filename) deltatar.restore_backup("target_dir", backup_indexes_paths=[index_path, prev_index_path]) @@ -1187,7 +1191,7 @@ class DeltaTarTest(BaseTest): self.check_equal_dirs('source_dir', 'target_dir', deltatar) # then apply diff backup in target_dir - index_filename = deltatar.index_name_func(is_full=True) + index_filename = deltatar.index_name_func(is_full=False) index_path = os.path.join("backup_dir2", index_filename) deltatar.restore_backup("target_dir", backup_indexes_paths=[index_path, prev_index_path])