fixing multiple errors in deltatar at once:
authorEduardo Robles Elvira <edulix@wadobo.com>
Sat, 14 Sep 2013 06:42:49 +0000 (08:42 +0200)
committerEduardo Robles Elvira <edulix@wadobo.com>
Sat, 14 Sep 2013 06:42:49 +0000 (08:42 +0200)
* naming of the generated backups was incorrect because they were
always named as if they were full backups, even if it was a diff
one. This has been changed both at backup creation and at backup
finding.

Related to that is the backup volume "finder": it was trying to
find volumes with current date which is not what we want, so now
it guesses the date of the volume based on a prefix and a suffix
inside the backup directory.

* changing the ordering of the files in _recursive_walk_dir to
follow an ascending naming sorted Breadth First Search algorithm.
This has been fixed without having to retrieve the full list and
then apply a sort to be efficient, which was something Daniel
applied as a fix and got reverted for memory efficiency.

* fixing index iterator also removing the need to have the full
index in memory and allowing it to be used with the "with"
python statement.

* fixing collate_iterators sorting because when the second index
element was less than the first one, we were not taking into
account that in a BFS algorithm files nearer to the root in depth
go first regardless of the sorting.

* some optimizations and fixes related to the deletion of files
in restore_backup for diff backups have been done

deltatar/deltatar.py
runtests.py
testing/test_deltatar.py

index 4601e1d..ab0643e 100644 (file)
@@ -222,18 +222,33 @@ class DeltaTar(object):
 
         return "%s-%s.index%s" % (prefix, date_str, extension)
 
-    def volume_name_func(self, backup_path, is_full, volume_number):
+    def volume_name_func(self, backup_path, is_full, volume_number, guess_name=False):
         '''
         function that defines the name of tar volumes. It receives the
         backup_path, if it's a full backup and the volume number, and must return
         the name for the corresponding volume name. Optional, DeltaTar has default
         names for tar volumes.
+
+        If guess_name is activated, the file is intended not to be created but
+        to be found, and thus the date will be guessed.
         '''
         prefix = "bfull" if is_full else "bdiff"
-        date_str = self.current_time.strftime("%y-%m-%d-%H%M")
         extension = self.__file_extensions_dict[self.mode]
 
-        return "%s-%s-%03d.tar%s" % (prefix, date_str, volume_number + 1, extension)
+        if not guess_name:
+            date_str = self.current_time.strftime("%y-%m-%d-%H%M")
+            return "%s-%s-%03d.tar%s" % (prefix, date_str, volume_number + 1, extension)
+        else:
+            prefix = prefix + "-"
+            postfix = "-%03d.tar%s" % (volume_number + 1, extension)
+            try:
+                for f in os.listdir(backup_path):
+                    if f.startswith(prefix) and f.endswith(postfix):
+                        return f
+            except Exception, e:
+                import ipdb; ipdb.set_trace()
+            raise Exception("volume not found")
+
 
     def filter_path(self, path, source_path="", is_dir=None):
         '''
@@ -347,7 +362,7 @@ class DeltaTar(object):
         queue = [source_path]
 
         while queue:
-            cur_path = queue.pop()
+            cur_path = queue.pop(0)
 
             # it might have been removed in the mean time
             if not os.path.exists(cur_path):
@@ -658,11 +673,12 @@ class DeltaTar(object):
         self.vol_no = 0
 
         # generate the first volume name
-        vol_name = self.volume_name_func(backup_path, True, 0)
+        vol_name = self.volume_name_func(backup_path, is_full=False,
+                                         volume_number=0)
         tarfile_path = os.path.join(backup_path, vol_name)
 
         # init index
-        index_name = self.index_name_func(True)
+        index_name = self.index_name_func(is_full=False)
         index_path = os.path.join(backup_path, index_name)
         index_fd = self.open_index(index_path, 'w')
 
@@ -672,7 +688,8 @@ class DeltaTar(object):
             '''
             Handles the new volumes
             '''
-            volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
+            volume_name = deltarobj.volume_name_func(backup_path, is_full=False,
+                volume_number=volume_number)
             volume_path = os.path.join(backup_path, volume_name)
             deltarobj.vol_no = volume_number
 
@@ -712,14 +729,18 @@ class DeltaTar(object):
         dir_it = self._recursive_walk_dir('.')
         dir_path_it = self.jsonize_path_iterator(dir_it)
 
-        index_it = sorted(index_it, key=lambda x: x[0]['path'])
-        dir_path_it = sorted(dir_path_it, key=lambda x: x['path'])
+        def pr(path):
+            if not path:
+                return "None"
+            else:
+                return path["path"]
 
         # for each file to be in the backup, do:
-        for ipath, dpath, l_no in self.collate_iterators(iter(index_it), iter(dir_path_it)):
+        for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
             action = None
             # if file is not in the index, it means it's a new file, so we have
             # to take a snapshot
+
             if not ipath:
                 action = 'snapshot'
             # if the file is not in the directory iterator, it means that it has
@@ -785,54 +806,87 @@ class DeltaTar(object):
         index_fd.write('{"type": "file-list-checksum", "checksum": %d}\n' %\
                         crc)
         index_fd.close()
+        index_it.release()
         os.chdir(cwd)
         tarobj.close()
 
     def iterate_index_path(self, index_path):
-        # open
-        f = self.open_index(index_path, 'r')
-        # check index header
-        j, l_no = self._parse_json_line(f, 0)
-        if j.get("type", '') != 'python-delta-tar-index' or\
-                j.get('version', -1) != 1:
-            raise Exception("invalid index file format: %s" % json.dumps(j))
-
-        # find BEGIN-FILE-LIST, ignore other headers
-        while True:
-            j, l_no = self._parse_json_line(f, l_no)
-            if j.get('type', '') == 'BEGIN-FILE-LIST':
-                break
+        '''
+        Returns an index iterator. Internally, it uses a classic iterator class.
+        We do that instead of just yielding so that the iterator object can have
+        an additional function to close the file descriptor that is opened in
+        the constructor.
+        '''
 
+        class IndexPathIterator(object):
+            def __init__(self, delta_tar, index_path):
+                self.delta_tar = delta_tar
+                self.index_path = index_path
+                self.f = None
+                self.__enter__()
 
-        # current volume number
-        curr_vol_no = None
-        # current volume file
-        vol_fd = None
-        offset = -1
-        tarobj = None
+            def __iter__(self):
+                return self
 
-        # read each file in the index and process it to do the retore
-        while True:
-            try:
-                j, l_no = self._parse_json_line(f, l_no)
-            except Exception, e:
-                f.close()
-                raise e
+            def release(self):
+                if self.f:
+                    self.f.close()
+
+            def __enter__(self):
+                '''
+                Allows this iterator to be used with the "with" statement
+                '''
+                if self.f is None:
+                    self.f = self.delta_tar.open_index(self.index_path, 'r')
+                    # check index header
+                    j, l_no = self.delta_tar._parse_json_line(self.f, 0)
+                    if j.get("type", '') != 'python-delta-tar-index' or\
+                            j.get('version', -1) != 1:
+                        raise Exception("invalid index file format: %s" % json.dumps(j))
+
+                    # find BEGIN-FILE-LIST, ignore other headers
+                    while True:
+                        j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
+                        if j.get('type', '') == 'BEGIN-FILE-LIST':
+                            break
+                return self
+
+            def __exit__(self, type, value, tb):
+                '''
+                Allows this iterator to be used with the "with" statement
+                '''
+                self.f.close()
+                self.f = None
 
-            op_type = j.get('type', '')
+            def next(self):
+                # read each file in the index and process it to do the retore
+                j = {}
+                l_no = -1
+                try:
+                    j, l_no = self.delta_tar._parse_json_line(self.f, l_no)
+                except Exception, e:
+                    if self.f:
+                        self.f.close()
+                    raise e
 
-            # when we detect the end of the list, break the loop
-            if op_type == 'END-FILE-LIST':
-                f.close()
-                break
+                op_type = j.get('type', '')
 
-            # check input
-            if op_type not in ['directory', 'file', 'link']:
-                self.logger.warn('unrecognized type to be '
-                                    'restored: %s, line %d' % (op_type, l_no))
-                continue
+                # when we detect the end of the list, break the loop
+                if op_type == 'END-FILE-LIST':
+                    if self.f:
+                        self.f.close()
+                    raise StopIteration
+
+                # check input
+                if op_type not in ['directory', 'file', 'link']:
+                    self.delta_tar.logger.warn('unrecognized type to be '
+                                        'restored: %s, line %d' % (op_type, l_no))
+                    # iterate again
+                    return self.next()
+
+                return j, l_no
 
-            yield j, l_no
+        return IndexPathIterator(self, index_path)
 
     def jsonize_path_iterator(self, iter, strip=0):
         '''
@@ -903,9 +957,12 @@ class DeltaTar(object):
                 yield (elem1, elem2, l_no)
                 elem1, elem2 = None, None
             else:
-                # index2 is less
-                yield (None, elem2, l_no)
-                elem2 = None
+                if index2.count('/') > index1.count('/'):
+                    yield (elem1, None, l_no)
+                    elem1 = None
+                else:
+                    yield (None, elem2, l_no)
+                    elem2 = None
 
     def restore_backup(self, target_path, backup_indexes_paths=[],
                        backup_tar_path=None):
@@ -987,7 +1044,8 @@ class DeltaTar(object):
                 '''
                 Handles the new volumes
                 '''
-                volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
+                volume_name = deltarobj.volume_name_func(backup_path, True,
+                    volume_number, guess_name=True)
                 volume_path = os.path.join(backup_path, volume_name)
 
                 # we convert relative paths into absolute because CWD is changed
@@ -996,6 +1054,8 @@ class DeltaTar(object):
                 tarobj.open_volume(volume_path)
 
             backup_path = os.path.dirname(backup_tar_path)
+            if not os.path.isabs(backup_path):
+                backup_path = os.path.join(cwd, backup_path)
             new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
             tarobj = tarfile.TarFile.open(backup_tar_path,
                                 mode='r' + self.mode,
@@ -1029,16 +1089,12 @@ class DeltaTar(object):
             os.chdir(target_path)
             helper = RestoreHelper(self, cwd, backup_indexes_paths)
 
-            index_it = helper._data[0]['iterator']
+            index_it = self.iterate_index_path(helper._data[0]["path"])
             dir_it = self._recursive_walk_dir('.')
             dir_path_it = self.jsonize_path_iterator(dir_it)
 
-            index_it = sorted(index_it, key=lambda x: x[0]['path'])
-            dir_path_it = sorted(dir_path_it, key=lambda x: x['path'])
-            helper._data[0]['iterator'] = iter(index_it)
-
             # for each file to be in the backup, do:
-            for ipath, dpath, l_no in self.collate_iterators(iter(index_it), iter(dir_path_it)):
+            for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it):
                 if not ipath:
                     upath = dpath['path']
                     op_type = dpath['type']
@@ -1054,6 +1110,10 @@ class DeltaTar(object):
 
                 # if file not found in dpath, we can directly restore from index
                 if not dpath:
+                    # if the file doesn't exist and it needs to be deleted, it
+                    # means that work is already done
+                    if ipath['path'].startswith('delete://'):
+                        continue
                     helper.restore(ipath, l_no)
                     continue
 
@@ -1062,12 +1122,18 @@ class DeltaTar(object):
                     continue
 
                 # we have to restore the file, but first we need to delete the
-                # current existing file
-                helper.delete(upath)
+                # current existing file.
+                # we don't delete the file if it's a directory, because it might
+                # just have changed mtime, so it's quite improductive to remove
+                # it
+                if ipath and (ipath['type'] != 'directory' or
+                        ipath['path'].startswith('delete://')):
+                    helper.delete(upath)
                 if ipath:
                     helper.restore(ipath, l_no)
 
             helper.restore_directories_permissions()
+            index_it.release()
             os.chdir(cwd)
             helper.cleanup()
 
@@ -1111,8 +1177,11 @@ class RestoreHelper(object):
         self._directories = []
         self._deltatar = deltatar
         self._cwd = cwd
+        self._index_list = index_list
 
         for index in index_list:
+            is_full = (index == index_list[-1])
+
             # make paths absolute to avoid cwd problems
             if not os.path.isabs(index):
                 index = os.path.join(cwd, index)
@@ -1123,9 +1192,9 @@ class RestoreHelper(object):
                 offset = -1,
                 tarobj = None,
                 path = index,
-                iterator = deltatar.iterate_index_path(index),
+                is_full = is_full,
                 new_volume_handler = partial(self.new_volume_handler,
-                                     self._deltatar, self._cwd, index)
+                    self._deltatar, self._cwd, is_full, os.path.dirname(index))
             )
             self._data.append(s)
 
@@ -1137,8 +1206,6 @@ class RestoreHelper(object):
             if data['tarobj']:
                 data['tarobj'].close()
                 data['tarobj'] = None
-            # TODO: ad a way to close the iterator fd
-            data['iterator']
 
     def delete(self, path):
         '''
@@ -1157,16 +1224,19 @@ class RestoreHelper(object):
         Restore the path from the appropiate backup. Receives the current path
         from the first index iterator. itpath must be not null.
         '''
-        data = self._data[0]
         path = itpath['path']
-        upath = self._deltatar.unprefixed(path)
 
-        # if path is found in the first index as to be deleted or snapshotted,
-        # deal with it and finish
         if path.startswith('delete://'):
-            self.delete(self._deltatar.unprefixed(path))
+            # the file has previously been deleted already in restore_backup in
+            # all cases so we just need to finish
             return
-        elif path.startswith('snapshot://'):
+
+        data = self._data[0]
+        upath = self._deltatar.unprefixed(path)
+
+        # if path is found in the first index as to be snapshotted, deal with it
+        # and finish
+        if path.startswith('snapshot://'):
             self.restore_file(itpath, data, path, l_no, self._deltatar.unprefixed(path))
             return
 
@@ -1175,48 +1245,49 @@ class RestoreHelper(object):
         cur_index = 1
         while cur_index < len(self._data):
             data = self._data[cur_index]
-            listit = list(data['iterator'])
-            it = iter(listit)
-            data['iterator'] = iter(listit)
-
-            # find the path in the index
-            d = None
-            l_no = None
-            dpath = None
-            while True:
-                try:
-                    d, l_no = it.next()
-                except StopIteration:
-                    break
+            # NOTE: we restart the iterator each time instead of reusing it
+            # because the iterator can be walked over completely multiple times,
+            # for example if one path if not found in one index and we have to
+            # go to the next index.
+            with self._deltatar.iterate_index_path(data["path"]) as it:
+                # find the path in the index
+                d = None
+                l_no = None
+                dpath = None
+                while True:
+                    try:
+                        d, l_no = it.next()
+                    except StopIteration:
+                        break
 
-                dpath = self._deltatar.unprefixed(d.get('path', ''))
+                    dpath = self._deltatar.unprefixed(d.get('path', ''))
 
-                if upath == dpath:
-                    break
+                    if upath == dpath:
+                        break
 
-            if not d:
-                # file not found, so it's not in the index, so it must be
-                # removed
-                if cur_index == 0:
-                    self.delete(path)
+                if not d:
+                    # file not found, so it's not in the index, so it must be
+                    # removed
+                    if cur_index == 0:
+                        self.delete(path)
+                        return
+                    # this means that the path was found in the first index but
+                    # not in a previous one, so something wrong happened.
+                    else:
+                        self._deltatar.logger.warn('Error restoring file %s from '
+                            'index, not found in index %s' % (path, data['path']))
+                        return
+
+                if d.get('path', '').startswith('delete://'):
+                    self._deltatar.logger.warn(('Strange thing happened, file '
+                        '%s was listed in first index but deleted by another '
+                        'one. Path was ignored and untouched.') % path)
                     return
-                # this means that the path was found in the first index but
-                # not in a previous one, so something wrong happened.
-                else:
-                    self._deltatar.logger.warn('Error restoring file %s from '
-                        'index, not found in index %s' % (path, data['path']))
+                elif d.get('path', '').startswith('snapshot://'):
+                    self.restore_file(d, data, path, l_no, dpath)
                     return
-
-            if d.get('path', '').startswith('delete://'):
-                self._deltatar.logger.warn(('Strange thing happened, file '
-                    '%s was listed in first index but deleted by another '
-                    'one. Path was ignored and untouched.') % path)
-                return
-            elif d.get('path', '').startswith('snapshot://'):
-                self.restore_file(d, data, path, l_no, dpath)
-                return
-            elif d.get('path', '').startswith('list://'):
-                continue
+                elif d.get('path', '').startswith('list://'):
+                    continue
 
         self._deltatar.logger.warn(('Error restoring file %s from index, '
                                     'snapshot not found in any index') % path)
@@ -1239,12 +1310,13 @@ class RestoreHelper(object):
             except tarfile.ExtractError, e:
                 self._deltatar.logger.warn('tarfile: %s' % e)
 
-    @classmethod
-    def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
+    @staticmethod
+    def new_volume_handler(deltarobj, cwd, is_full, backup_path, tarobj, base_name, volume_number):
         '''
         Handles the new volumes
         '''
-        volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
+        volume_name = deltarobj.volume_name_func(backup_path, is_full,
+            volume_number, guess_name=True)
         volume_path = os.path.join(backup_path, volume_name)
 
         # we convert relative paths into absolute because CWD is changed
@@ -1268,7 +1340,8 @@ class RestoreHelper(object):
         if index_data['curr_vol_no'] != vol_no:
             index_data['curr_vol_no'] = vol_no
             backup_path = os.path.dirname(index_data['path'])
-            vol_name = self._deltatar.volume_name_func(backup_path, True, vol_no)
+            vol_name = self._deltatar.volume_name_func(backup_path,
+                index_data['is_full'], vol_no, guess_name=True)
             vol_path = os.path.join(backup_path, vol_name)
             if index_data['vol_fd']:
                 index_data['vol_fd'].close()
@@ -1308,5 +1381,10 @@ class RestoreHelper(object):
             member = copy.copy(member)
             member.mode = 0700
 
+            # if it's an existing directory, we then don't need to recreate it
+            # just set the right permissions, mtime and that kind of stuff
+            if os.path.exists(member.path):
+                return
+
         # finally, restore the file
         index_data['tarobj'].extract(member)
index b9e6c03..19b3740 100644 (file)
@@ -23,7 +23,7 @@ from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest
 from testing.test_concat_compress import ConcatCompressTest
 from testing.test_rescue_tar import RescueTarTest
 from testing.test_encryption import EncryptionTest
-from testing.test_deltatar import *
+from testing.test_deltatar import DeltaTarTest
 
 if __name__ == "__main__":
     unittest.main()
index fc8f765..8bd8a52 100644 (file)
@@ -864,11 +864,11 @@ class DeltaTarTest(BaseTest):
                 (u'./small', u'./small'),
                 (u'./test', u'./test'),
                 (None, u'./zzzz'),
+                (None, u'./bigdir/a'),
+                (None, u'./bigdir/b'),
                 (u'./test/huge', u'./test/huge'),
                 (u'./test/huge2', u'./test/huge2'),
                 (u'./test/test2', u'./test/test2'),
-                (None, u'./bigdir/a'),
-                (None, u'./bigdir/b')
             ]
             os.chdir(cwd)
 
@@ -891,7 +891,8 @@ class DeltaTarTest(BaseTest):
                                     prev_index_path)
 
         # check index items
-        index_path = os.path.join("backup_dir2", prev_index_filename)
+        index_path = os.path.join("backup_dir2",
+            deltatar.index_name_func(is_full=False))
         index_it = deltatar.iterate_index_path(index_path)
         n = 0
         for i in index_it:
@@ -904,7 +905,8 @@ class DeltaTarTest(BaseTest):
         assert os.path.exists("backup_dir2")
         shutil.rmtree("source_dir")
 
-        tar_filename = deltatar.volume_name_func('backup_dir2', True, 0)
+        tar_filename = deltatar.volume_name_func('backup_dir2',
+            is_full=False, volume_number=0)
         tar_path = os.path.join("backup_dir2", tar_filename)
 
         # no file restored, because the diff was empty
@@ -940,21 +942,22 @@ class DeltaTarTest(BaseTest):
                                     prev_index_path)
 
         # check index items
-        index_path = os.path.join("backup_dir2", prev_index_filename)
+        index_path = os.path.join("backup_dir2", deltatar.index_name_func(is_full=False))
         index_it = deltatar.iterate_index_path(index_path)
-        l = sorted([i[0]['path'] for i in index_it])
-        assert l == sorted([
+        l = [i[0]['path'] for i in index_it]
+
+        assert l == [
             'list://./big',
             'snapshot://./bigdir',
             'delete://./small',
             'list://./test',
             'snapshot://./zzzz',
+            'snapshot://./bigdir/a',
+            'snapshot://./bigdir/b',
             'list://./test/huge',
             'list://./test/huge2',
             'list://./test/test2',
-            'snapshot://./bigdir/a',
-            'snapshot://./bigdir/b'
-        ])
+        ]
 
         # check the tar file
         assert os.path.exists("backup_dir2")
@@ -965,7 +968,8 @@ class DeltaTarTest(BaseTest):
         os.mkdir("source_dir")
         open("source_dir/small", 'w').close()
 
-        tar_filename = deltatar.volume_name_func('backup_dir2', True, 0)
+        tar_filename = deltatar.volume_name_func('backup_dir2',
+            is_full=False, volume_number=0)
         tar_path = os.path.join("backup_dir2", tar_filename)
 
         # restore the backup, this will create only the new files
@@ -1006,7 +1010,7 @@ class DeltaTarTest(BaseTest):
                                     prev_index_path)
 
         # apply diff backup in target_dir
-        index_filename = deltatar.index_name_func(is_full=True)
+        index_filename = deltatar.index_name_func(is_full=False)
         index_path = os.path.join("backup_dir2", index_filename)
         deltatar.restore_backup("target_dir",
             backup_indexes_paths=[index_path, prev_index_path])
@@ -1048,12 +1052,12 @@ class DeltaTarTest(BaseTest):
                                     prev_index_path)
 
         # first restore initial backup in target_dir
-        tar_filename = deltatar.volume_name_func('backup_dir', True, 0)
+        tar_filename = deltatar.volume_name_func('backup_dir', is_full=True, volume_number=0)
         tar_path = os.path.join("backup_dir", tar_filename)
         deltatar.restore_backup("target_dir", backup_tar_path=tar_path)
 
         # then apply diff backup in target_dir
-        index_filename = deltatar.index_name_func(is_full=True)
+        index_filename = deltatar.index_name_func(is_full=False)
         index_path = os.path.join("backup_dir2", index_filename)
         deltatar.restore_backup("target_dir",
             backup_indexes_paths=[index_path, prev_index_path])
@@ -1106,7 +1110,7 @@ class DeltaTarTest(BaseTest):
                                     prev_index_path)
 
         # first restore initial backup in target_dir
-        tar_filename = deltatar.volume_name_func('backup_dir', True, 0)
+        tar_filename = deltatar.volume_name_func('backup_dir', is_full=True, volume_number=0)
         tar_path = os.path.join("backup_dir", tar_filename)
         deltatar.restore_backup("target_dir", backup_tar_path=tar_path)
 
@@ -1115,7 +1119,7 @@ class DeltaTarTest(BaseTest):
         self.check_equal_dirs('source_dir', 'target_dir', deltatar)
 
         # then apply diff backup in target_dir
-        index_filename = deltatar.index_name_func(is_full=True)
+        index_filename = deltatar.index_name_func(is_full=False)
         index_path = os.path.join("backup_dir2", index_filename)
         deltatar.restore_backup("target_dir",
             backup_indexes_paths=[index_path, prev_index_path])
@@ -1187,7 +1191,7 @@ class DeltaTarTest(BaseTest):
         self.check_equal_dirs('source_dir', 'target_dir', deltatar)
 
         # then apply diff backup in target_dir
-        index_filename = deltatar.index_name_func(is_full=True)
+        index_filename = deltatar.index_name_func(is_full=False)
         index_path = os.path.join("backup_dir2", index_filename)
         deltatar.restore_backup("target_dir",
             backup_indexes_paths=[index_path, prev_index_path])