writing iterators for directory diffs and unit testing them
author     Eduardo Robles Elvira <edulix@wadobo.com>
Sun, 4 Aug 2013 10:37:46 +0000 (12:37 +0200)
committer  Eduardo Robles Elvira <edulix@wadobo.com>
Sun, 4 Aug 2013 10:37:46 +0000 (12:37 +0200)
deltatar/deltatar.py
testing/test_deltatar.py

diff --git a/deltatar/deltatar.py b/deltatar/deltatar.py
index 87092e8..06473da 100644
--- a/deltatar/deltatar.py
+++ b/deltatar/deltatar.py
@@ -286,7 +286,6 @@ class DeltaTar(object):
 
         return match
 
-
     def _recursive_walk_dir(self, source_path):
         '''
         Walk a directory recursively, yielding each file/directory
@@ -296,7 +295,7 @@ class DeltaTar(object):
             '''
             Walk a directory, yielding each file/directory
             '''
-            for filename in os.listdir(dir_path):
+            for filename in sorted(os.listdir(dir_path)):
                 file_path = os.path.join(dir_path, filename)
                 is_dir = os.path.isdir(file_path)
                 if self.filter_path(file_path, source_path, is_dir) == NO_MATCH:
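
Note: switching to sorted(os.listdir(...)) is what makes the traversal
deterministic. The collation code added further down walks two of these
iterators in lockstep and assumes both yield paths in the same stable order,
which raw os.listdir() does not guarantee.
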
@@ -306,37 +305,20 @@ class DeltaTar(object):
                     continue
                 yield file_path
 
-        diryield_stack = [walk_dir(source_path)]
-        delayed_path_stack = []
-
-        while diryield_stack:
-            try:
-                cur_path = diryield_stack[-1].next()
-                is_dir = os.path.isdir(cur_path)
-                status = self.filter_path(cur_path, source_path, is_dir)
-            except StopIteration:
-                diryield_stack.pop()
-                if delayed_path_stack:
-                    delayed_path_stack.pop()
-                continue
+        queue = [source_path]
 
-            if status == MATCH:
-                if delayed_path_stack:
-                    for delayed_path in delayed_path_stack:
-                        is_dir = os.path.isdir(delayed_path)
-                        if self.filter_path(delayed_path, source_path, is_dir) == NO_MATCH:
-                            continue
-                        yield delayed_path
-                    del delayed_path_stack[:]
+        while queue:
+            cur_path = queue.pop()
 
-                yield cur_path
+            for child in walk_dir(cur_path):
+                is_dir = os.path.isdir(child)
+                status = self.filter_path(child, source_path, is_dir)
 
-                if os.path.isdir(cur_path):
-                    diryield_stack.append(walk_dir(cur_path))
+                if status == MATCH:
+                    yield child
 
-            elif status == PARENT_MATCH:
-                delayed_path_stack.append(cur_path)
-                diryield_stack.append(walk_dir(cur_path))
+                if is_dir and (status == MATCH or status == PARENT_MATCH):
+                    queue.append(child)
 
     def _stat_dict(self, path):
         '''
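
The rewritten _recursive_walk_dir above trades the generator stack and the
delayed-path bookkeeping for a single explicit queue. A standalone sketch of
the same pattern, with filter_path stubbed out (everything below is
illustrative, not code from the project):

    import os

    MATCH, PARENT_MATCH, NO_MATCH = 0, 1, 2

    def filter_path(path, source_path, is_dir):
        # illustrative stub: match everything
        return MATCH

    def recursive_walk_dir(source_path):
        # yield every matching path; descend into a directory when it
        # matches or may contain matches deeper down (PARENT_MATCH)
        queue = [source_path]
        while queue:
            cur_path = queue.pop()
            for filename in sorted(os.listdir(cur_path)):
                child = os.path.join(cur_path, filename)
                is_dir = os.path.isdir(child)
                status = filter_path(child, source_path, is_dir)
                if status == MATCH:
                    yield child
                if is_dir and status in (MATCH, PARENT_MATCH):
                    queue.append(child)
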
@@ -347,24 +329,35 @@ class DeltaTar(object):
 
         ptype = None
         if stat.S_ISDIR(mode):
-            ptype = 'directory'
+            ptype = u'directory'
         elif stat.S_ISREG(mode):
-            ptype = 'file'
+            ptype = u'file'
         elif stat.S_ISLNK(mode):
-            ptype = 'link'
+            ptype = u'link'
 
         return {
-            'type': ptype,
-            'path': path,
-            'mode': mode,
-            'mtime': stinfo.st_mtime,
-            'ctime': stinfo.st_ctime,
-            'uid': stinfo.st_uid,
-            'gid': stinfo.st_gid,
-            'inode': stinfo.st_ino,
-            'size': stinfo.st_size
+            u'type': ptype,
+            u'path': unicode(path),
+            u'mode': mode,
+            u'mtime': stinfo.st_mtime,
+            u'ctime': stinfo.st_ctime,
+            u'uid': stinfo.st_uid,
+            u'gid': stinfo.st_gid,
+            u'inode': stinfo.st_ino,
+            u'size': stinfo.st_size
         }
 
+    def _equal_stat_dicts(self, d1, d2):
+        '''
+        Return True if the two dicts are equal for all the stat keys.
+        '''
+        keys = [u'gid', u'type', u'mode', u'mtime', u'path', u'size', u'inode',
+                u'ctime', u'uid']
+        for key in keys:
+            if d1.get(key, -1) != d2.get(key, -2):
+                return False
+        return True
+
     def create_full_backup(self, source_path, backup_path,
                            max_volume_size=None):
         '''
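
One detail of _equal_stat_dicts worth spelling out: the asymmetric defaults
(-1 vs. -2) guarantee that a key missing from either dict makes the
comparison fail, rather than two missing keys silently comparing equal. For
example (hypothetical dicts):

    d1 = {u'path': u'./a', u'size': 100}
    d2 = {u'path': u'./a', u'size': 100}
    # u'mtime' is missing from both, so the loop compares
    # d1.get(u'mtime', -1) == -1 against d2.get(u'mtime', -2) == -2
    # and _equal_stat_dicts(d1, d2) returns False
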
@@ -388,8 +381,9 @@ class DeltaTar(object):
             raise Exception('Source path "%s" does not exist or is not a '\
                             'directory' % source_path)
 
-        if max_volume_size != None and not isinstance(max_volume_size, int):
-            raise Exception('max_volume_size must be an integer')
+        if max_volume_size != None and (not isinstance(max_volume_size, int) or\
+            max_volume_size < 1):
+            raise Exception('max_volume_size must be a positive integer')
         if max_volume_size != None:
             max_volume_size = max_volume_size*1024*1024
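
As the checks above enforce, max_volume_size must be a positive integer
number of megabytes, and it is converted to bytes right above. A
hypothetical call capping each volume at 50 MiB:

    deltatar.create_full_backup('source_dir', 'backup_dir',
                                max_volume_size=50)  # 50*1024*1024 bytes
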
 
@@ -506,7 +500,211 @@ class DeltaTar(object):
         - max_volume_size: maximum volume size in megabytes (MB). Used to split
           the backup in volumes. Optional (won't split in volumes by default).
         '''
-        pass
+        # check/sanitize input
+        if not isinstance(source_path, basestring):
+            raise Exception('Source path must be a string')
+
+        if not isinstance(backup_path, basestring):
+            raise Exception('Backup path must be a string')
+
+        if not os.path.exists(source_path) or not os.path.isdir(source_path):
+            raise Exception('Source path "%s" does not exist or is not a '\
+                            'directory' % source_path)
+
+        if not os.access(source_path, os.R_OK):
+            raise Exception('Source path "%s" is not readable' % source_path)
+
+        if max_volume_size != None and (not isinstance(max_volume_size, int) or\
+            max_volume_size < 1):
+            raise Exception('max_volume_size must be a positive integer')
+        if max_volume_size != None:
+            max_volume_size = max_volume_size*1024*1024
+
+        if not isinstance(previous_index_path, basestring):
+            raise Exception('previous_index_path must be a string')
+
+        if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path):
+            raise Exception('Index path "%s" does not exist or is not a '\
+                            'file' % previous_index_path)
+
+        if not os.access(previous_index_path, os.R_OK):
+            raise Exception('Index path "%s" is not readable' % previous_index_path)
+
+        # try to create backup path if needed
+        if not os.path.exists(backup_path):
+            os.makedirs(backup_path)
+
+        if not os.access(backup_path, os.W_OK):
+            raise Exception('Backup path "%s" is not writeable' % backup_path)
+
+        if source_path.endswith('/'):
+            source_path = source_path[:-1]
+
+        if backup_path.endswith('/'):
+            backup_path = backup_path[:-1]
+
+        # update current time
+        self.current_time = datetime.datetime.now()
+
+        if self.mode not in self.__file_extensions_dict:
+            raise Exception('Unrecognized extension')
+
+        # some initialization
+        self.vol_no = 0
+
+        # generate the first volume name
+        vol_name = self.volume_name_func(backup_path, True, 0)
+        tarfile_path = os.path.join(backup_path, vol_name)
+
+        # init index
+        index_name = self.index_name_func(True)
+        index_path = os.path.join(backup_path, index_name)
+        # TODO: encrypt or compress it if necessary
+        index_fd = open(index_path, 'w')
+
+        cwd = os.getcwd()
+
+        def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number):
+            '''
+            Handles the new volumes
+            '''
+            volume_name = deltarobj.volume_name_func(backup_path, True, volume_number)
+            volume_path = os.path.join(backup_path, volume_name)
+            deltarobj.vol_no = volume_number
+
+            # we convert relative paths into absolute because CWD is changed
+            if not os.path.isabs(volume_path):
+                volume_path = os.path.join(cwd, volume_path)
+
+            tarobj.open_volume(volume_path)
+
+        # wraps some args from context into the handler
+        new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
+
+        index_fd.write('{"type": "python-delta-tar-index", "version": 1 }\n')
+
+        s = '{"type": "BEGIN-FILE-LIST"}\n'
+        # calculate checksum and write into the stream
+        crc = binascii.crc32(s)
+        index_fd.write(s)
+
+        # start creating the tarfile
+        tarobj = tarfile.TarFile.open(tarfile_path,
+                              mode='w' + self.mode,
+                              format=tarfile.GNU_FORMAT,
+                              concat_compression='#gz' in self.mode,
+                              password=self.password,
+                              max_volume_size=max_volume_size,
+                              new_volume_handler=new_volume_handler)
+
+        os.chdir(source_path)
+
+        # for each file to be in the backup, do:
+        for path in self._recursive_walk_dir('.'):
+            pass
+
+    def iterate_index_path(self, index_path):
+        # open
+        f = open(index_path, 'r')
+        # check index header
+        j, l_no = self._parse_json_line(f, -1)
+        if j.get("type", '') != 'python-delta-tar-index' or\
+                j.get('version', -1) != 1:
+            raise Exception("invalid index file format: %s" % json.dumps(j))
+
+        # find BEGIN-FILE-LIST, ignore other headers
+        while True:
+            j, l_no = self._parse_json_line(f, -1)
+            if j.get('type', '') == 'BEGIN-FILE-LIST':
+                break
+
+        # current volume number
+        curr_vol_no = None
+        # current volume file
+        vol_fd = None
+        offset = -1
+        tarobj = None
+
+        # read each file in the index and process it to do the restore
+        while True:
+            try:
+                j, l_no = self._parse_json_line(f, -1)
+            except Exception, e:
+                f.close()
+                raise e
+
+            op_type = j.get('type', '')
+
+            # when we detect the end of the list, break the loop
+            if op_type == 'END-FILE-LIST':
+                f.close()
+                break
+
+            # check input
+            if op_type not in ['directory', 'file', 'link']:
+                self.logger.warn('unrecognized type to be '
+                                 'restored: %s, line %d' % (op_type, l_no))
+                continue
+
+            yield j, l_no
+
+    def jsonize_path_iterator(self, path_iter):
+        '''
+        Converts the paths yielded by an iterator into stat dicts, ready to
+        be dumped as JSON index lines.
+        '''
+        while True:
+            try:
+                path = path_iter.next()
+                yield self._stat_dict(path)
+            except StopIteration:
+                break
+
+    def collate_iterators(self, it1, it2):
+        '''
+        Collate two iterators, yielding pairs of matching items, or
+        (None, elem2) / (elem1, None) when an item has no counterpart in the
+        other iterator.
+
+        It assumes that both iterators yield their items in the same order.
+        '''
+        elem1, elem2 = None, None
+        while True:
+            if not elem1:
+                try:
+                    elem1 = it1.next()
+                    if isinstance(elem1, tuple):
+                        elem1 = elem1[0]
+                except StopIteration:
+                    if elem2:
+                        yield (None, elem2)
+                    for elem2 in it2:
+                        yield (None, elem2)
+                    break
+                index1 = elem1['path']
+            if not elem2:
+                try:
+                    elem2 = it2.next()
+                    if isinstance(elem2, tuple):
+                        elem2 = elem2[0]
+                except StopIteration:
+                    if elem1:
+                        yield (elem1, None)
+                    for elem1 in it1:
+                        yield (elem1, None)
+                    break
+                index2 = elem2['path']
+
+            if index1 < index2:
+                yield (elem1, None)
+                elem1 = None
+            elif index1 == index2:
+                yield (elem1, elem2)
+                elem1, elem2 = None, None
+            else:
+                # index2 is less
+                yield (None, elem2)
+                elem2 = None
 
     def restore_backup(self, target_path, backup_indexes_paths=[],
                        backup_tar_path=None):
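
Taken together, the new iterators are the building blocks for computing a
directory diff, roughly along these lines. A sketch of the intended usage,
given a DeltaTar instance deltatar, the path index_path of an existing
full-backup index, and the working directory set to the source dir (names
are illustrative):

    index_it = deltatar.iterate_index_path(index_path)
    dir_it = deltatar._recursive_walk_dir('.')
    path_it = deltatar.jsonize_path_iterator(dir_it)

    for entry1, entry2 in deltatar.collate_iterators(index_it, path_it):
        if entry1 is None:
            pass   # only on disk: created since the backup
        elif entry2 is None:
            pass   # only in the index: deleted since the backup
        elif not deltatar._equal_stat_dicts(entry1, entry2):
            pass   # present in both, but its stat info changed
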
@@ -618,21 +816,7 @@ class DeltaTar(object):
             backup_path = os.path.dirname(backup_index_path)
             new_volume_handler = partial(new_volume_handler, self, cwd, backup_path)
 
-            # open
-            f = open(backup_index_path, 'r')
-
-            # check index header
-            j, l_no = self._parse_json_line(f, -1)
-            if j.get("type", '') != 'python-delta-tar-index' or\
-                    j.get('version', -1) != 1:
-                raise Exception("invalid index file format: %s" % json.dumps(j))
-
-            # find BEGIN-FILE-LIST, ignore other headers
-            while True:
-                j, l_no = self._parse_json_line(f, -1)
-                if j.get('type', '') == 'BEGIN-FILE-LIST':
-                    break
-
+            # some initialization:
 
             # current volume number
             curr_vol_no = None
@@ -641,26 +825,16 @@ class DeltaTar(object):
             offset = -1
             tarobj = None
 
-            # read each file in the index and process it to do the retore
-            while True:
-                j, l_no = self._parse_json_line(f, -1)
+            # iterate through the index entries
+            for j, l_no in self.iterate_index_path(backup_index_path):
                 op_type = j.get('type', '')
-
-                # when we detect the end of the list, break the loop
-                if op_type == 'END-FILE-LIST':
-                    break
-
-                # check input
-                if op_type not in ['directory', 'file', 'link']:
-                    self.logger.warn('unrecognized type to be '
-                                     'restored: %s, line %d' % (op_type, l_no))
-                    continue
-
-                # filtering paths
                 op_path  = j.get('path', '')
+
+                # filter paths
                 if self.filter_path(op_path, '.', op_type == 'directory') == NO_MATCH:
                     continue
 
+                # check volume number
                 vol_no = j.get('volume', -1)
                 if not isinstance(vol_no, int) or vol_no < 0:
                     self.logger.warn('unrecognized type to be '
@@ -689,6 +863,7 @@ class DeltaTar(object):
                         tarobj.close()
                         tarobj = None
 
+                # open the tarfile if needed
                 if not tarobj:
                     vol_fd.seek(offset)
                     tarobj = tarfile.open(mode="r" + self.mode, fileobj=vol_fd,
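
_parse_json_line itself is not part of this diff; judging from the call
sites, it reads the next line of the open index file and returns the decoded
JSON object together with a line counter. An assumed reconstruction, not the
project's actual code:

    import json

    def _parse_json_line(self, fd, l_no):
        # assumed behavior, inferred from the call sites above
        l_no += 1
        line = fd.readline()
        if not line:
            raise Exception('unexpected end of index file, line %d' % l_no)
        return json.loads(line), l_no
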
diff --git a/testing/test_deltatar.py b/testing/test_deltatar.py
index d2056c0..9cd0aba 100644
--- a/testing/test_deltatar.py
+++ b/testing/test_deltatar.py
@@ -711,6 +711,72 @@ class DeltaTarTest(BaseTest):
         assert not os.path.exists('source_dir/big')
         assert not os.path.exists('source_dir/small')
 
+    def test_collate_iterators(self):
+        '''
+        Tests the collate iterators functionality with two identical trees:
+        an index iterator from a backup, and the unchanged source directory.
+        '''
+        deltatar = DeltaTar(mode=self.MODE, password=self.PASSWORD,
+                            logger=self.consoleLogger)
+
+        # create first backup
+        deltatar.create_full_backup(
+            source_path="source_dir",
+            backup_path="backup_dir")
+
+        assert os.path.exists("backup_dir")
+
+        cwd = os.getcwd()
+        index_filename = deltatar.index_name_func(is_full=True)
+        index_path = os.path.join(cwd, "backup_dir", index_filename)
+        index_it = deltatar.iterate_index_path(index_path)
+
+        os.chdir('source_dir')
+        dir_it = deltatar._recursive_walk_dir('.')
+        path_it = deltatar.jsonize_path_iterator(dir_it)
+
+        try:
+            for path1, path2 in deltatar.collate_iterators(index_it, path_it):
+                assert deltatar._equal_stat_dicts(path1, path2)
+        finally:
+            os.chdir(cwd)
+
+    def test_collate_iterators_diffdirs(self):
+        '''
+        Use the collate iterators functionality with two directories that
+        differ: a file created after the backup must show up unpaired.
+        '''
+        self.hash["source_dir/zzzz"]  = self.create_file("source_dir/zzzz", 100)
+
+        deltatar = DeltaTar(mode=self.MODE, password=self.PASSWORD,
+                            logger=self.consoleLogger)
+
+        # create first backup
+        deltatar.create_full_backup(
+            source_path="source_dir",
+            backup_path="backup_dir")
+
+        assert os.path.exists("backup_dir")
+        self.hash["source_dir/z"]  = self.create_file("source_dir/z", 100)
+
+        cwd = os.getcwd()
+        index_filename = deltatar.index_name_func(is_full=True)
+        index_path = os.path.join(cwd, "backup_dir", index_filename)
+        index_it = deltatar.iterate_index_path(index_path)
+
+        os.chdir('source_dir')
+        dir_it = deltatar._recursive_walk_dir('.')
+        path_it = deltatar.jsonize_path_iterator(dir_it)
+
+        try:
+            for path1, path2 in deltatar.collate_iterators(index_it, path_it):
+                if path2['path'] == './z':
+                    assert not path1
+                else:
+                    assert deltatar._equal_stat_dicts(path1, path2)
+        finally:
+            os.chdir(cwd)
+
 
 class DeltaTar2Test(DeltaTarTest):
     '''
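
Assuming the suite is driven by the standard unittest runner (the project may
wire this up differently), the new tests can be exercised on their own with
something like:

    python -m unittest testing.test_deltatar.DeltaTarTest.test_collate_iterators
    python -m unittest testing.test_deltatar.DeltaTarTest.test_collate_iterators_diffdirs
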