fixing sorting and improving dramatically restore diff backup:
authorEduardo Robles Elvira <edulix@wadobo.com>
Thu, 19 Sep 2013 15:37:40 +0000 (17:37 +0200)
committerEduardo Robles Elvira <edulix@wadobo.com>
Thu, 19 Sep 2013 15:37:40 +0000 (17:37 +0200)
* sorting was not correct because the directory separator was being taken
  into account as part of the sorting, and thus if a directory contained
  a character like '-' it would affect sorting when compared with '/'.
  Now we sort directory by directory in a path, so the dir separator is
  not sorted itself.

* diffs backups where very slow because we were using a very simple
  algorithm that was also very slow, were for each file we would reopen
  and go across the index until found. Now we go across the index only
  once in the whole restore operation, so the speed of restoring a
  backup has increased dramatically.

deltatar/deltatar.py

index 4a722e8..8527671 100644 (file)
@@ -937,7 +937,6 @@ class DeltaTar(object):
                             elem2 = elem2[0]
                         yield (None, elem2, l_no)
                     break
-                index1 = self.unprefixed(elem1['path'])
             if not elem2:
                 try:
                     elem2 = it2.next()
@@ -949,28 +948,43 @@ class DeltaTar(object):
                     for elem1, l_no in it1:
                         yield (elem1, None, l_no)
                     break
-                index2 = self.unprefixed(elem2['path'])
-
-            if index1 < index2:
-                # if the number of dirs in index1 is greater than in index2,
-                # it means that there's a new parent directory in index2, so
-                # it goes first
-                if index1.count('/') > index2.count('/'):
-                    yield (None, elem2, l_no)
-                    elem2 = None
-                else:
-                    yield (elem1, None, l_no)
-                    elem1 = None
-            elif index1 == index2:
-                yield (elem1, elem2, l_no)
-                elem1, elem2 = None, None
-            else:
-                if index2.count('/') > index1.count('/'):
-                    yield (elem1, None, l_no)
-                    elem1 = None
-                else:
-                    yield (None, elem2, l_no)
-                    elem2 = None
+
+            index1 = self.unprefixed(elem1['path'])
+            index2 = self.unprefixed(elem2['path'])
+            i1, i2 = self.compare_indexes(index1, index2)
+
+            yield1 = yield2 = None
+            if i1 is not None:
+                yield1 = elem1
+                elem1 = None
+            if i2 is not None:
+                yield2 = elem2
+                elem2 = None
+            yield (yield1, yield2, l_no)
+
+    def compare_indexes(self, index1, index2):
+        '''
+        Compare iterator indexes and return a tuple in the following form:
+        if index1 < index2, returns (index1, None)
+        if index1 == index2 returns (index1, index2)
+        else: returns (None, index2)
+        '''
+        l1 = index1.split('/')
+        l2 = index2.split('/')
+        length = len(l2) - len(l1)
+
+        if length > 0:
+            return (index1, None)
+        elif length < 0:
+            return (None, index2)
+
+        for i1, i2 in zip(l1, l2):
+            if i1 < i2:
+                return (index1, None)
+            elif i1 > i2:
+                return (None, index2)
+
+        return (index1, index2)
 
     def restore_backup(self, target_path, backup_indexes_paths=[],
                        backup_tar_path=None):
@@ -1200,6 +1214,9 @@ class RestoreHelper(object):
                 tarobj = None,
                 path = index,
                 is_full = is_full,
+                iterator = None,
+                last_itelement = None,
+                last_lno = 0,
                 new_volume_handler = partial(self.new_volume_handler,
                     self._deltatar, self._cwd, is_full, os.path.dirname(index))
             )
@@ -1256,49 +1273,70 @@ class RestoreHelper(object):
             # because the iterator can be walked over completely multiple times,
             # for example if one path if not found in one index and we have to
             # go to the next index.
-            with self._deltatar.iterate_index_path(data["path"]) as it:
-                # find the path in the index
-                d = None
-                l_no = None
-                dpath = None
-                while True:
-                    try:
-                        d, l_no = it.next()
-                    except StopIteration:
-                        break
-
-                    dpath = self._deltatar.unprefixed(d.get('path', ''))
-
-                    if upath == dpath:
-                        break
-
-                if not d:
-                    # file not found, so it's not in the index, so it must be
-                    # removed
-                    if cur_index == 0:
-                        self.delete(path)
-                        return
-                    # this means that the path was found in the first index but
-                    # not in a previous one, so something wrong happened.
-                    else:
-                        self._deltatar.logger.warn('Error restoring file %s from '
-                            'index, not found in index %s' % (path, data['path']))
-                        return
-
-                if d.get('path', '').startswith('delete://'):
-                    self._deltatar.logger.warn(('Strange thing happened, file '
-                        '%s was listed in first index but deleted by another '
-                        'one. Path was ignored and untouched.') % path)
+            d, l_no, dpath = self.find_path_in_index(data, upath)
+            if not d:
+                # file not found, so it's not in the index, so it must be
+                # removed
+                if cur_index == 0:
+                    self.delete(path)
                     return
-                elif d.get('path', '').startswith('snapshot://'):
-                    self.restore_file(d, data, path, l_no, dpath)
+                # this means that the path was found in the first index as listed
+                # not in a previous one, so something wrong happened.
+                else:
+                    self._deltatar.logger.warn('Error restoring file %s from '
+                        'index, not found in index %s' % (path, data['path']))
                     return
-                elif d.get('path', '').startswith('list://'):
-                    continue
+
+            if d.get('path', '').startswith('delete://'):
+                self._deltatar.logger.warn(('Strange thing happened, file '
+                    '%s was listed in first index but deleted by another '
+                    'one. Path was ignored and untouched.') % path)
+                return
+            elif d.get('path', '').startswith('snapshot://'):
+                self.restore_file(d, data, path, l_no, dpath)
+                return
+            elif d.get('path', '').startswith('list://'):
+                continue
 
         self._deltatar.logger.warn(('Error restoring file %s from index, '
                                     'snapshot not found in any index') % path)
 
+    def find_path_in_index(self, data, upath):
+        # NOTE: we restart the iterator sometimes because the iterator can be
+        # walked over completely multiple times, for example if one path if not
+        # found in one index and we have to go to the next index.
+        if data['iterator'] is None:
+            it = data['iterator'] = self._deltatar.iterate_index_path(data["path"])
+            d, l_no = it.next()
+        else:
+            it = data['iterator']
+            d = data['last_itelement']
+            l_no = data['last_lno']
+
+        dpath = self._deltatar.unprefixed(d.get('path', ''))
+
+        while True:
+            if upath == dpath:
+                data['last_itelement'] = d
+                data['last_lno'] = l_no
+                return d, l_no, dpath
+
+            up, dp = self._deltatar.compare_indexes(upath, dpath)
+            # any time upath should have appeared before current dpath, it means
+            # upath is just not in this index and we should stop
+            if dp is None:
+                data['last_itelement'] = d
+                data['last_lno'] = l_no
+                return None, 0, ''
+
+            try:
+                d, l_no = it.next()
+            except StopIteration:
+                data['last_itelement'] = d
+                data['last_lno'] = l_no
+                return None, 0, ''
+            dpath = self._deltatar.unprefixed(d.get('path', ''))
+
     def restore_directories_permissions(self):
         '''
         Restore directory permissions when everything have been restored