From d07c8065460ea6a23382c21ef55e6dcc3a3114f8 Mon Sep 17 00:00:00 2001 From: Eduardo Robles Elvira Date: Sun, 4 Aug 2013 12:37:46 +0200 Subject: [PATCH] writing iterators for directory diffs and unit testing them --- deltatar/deltatar.py | 323 +++++++++++++++++++++++++++++++++++----------- testing/test_deltatar.py | 66 ++++++++++ 2 files changed, 315 insertions(+), 74 deletions(-) diff --git a/deltatar/deltatar.py b/deltatar/deltatar.py index 87092e8..06473da 100644 --- a/deltatar/deltatar.py +++ b/deltatar/deltatar.py @@ -286,7 +286,6 @@ class DeltaTar(object): return match - def _recursive_walk_dir(self, source_path): ''' Walk a directory recursively, yielding each file/directory @@ -296,7 +295,7 @@ class DeltaTar(object): ''' Walk a directory, yielding each file/directory ''' - for filename in os.listdir(dir_path): + for filename in sorted(os.listdir(dir_path)): file_path = os.path.join(dir_path, filename) is_dir = os.path.isdir(file_path) if self.filter_path(file_path, source_path, is_dir) == NO_MATCH: @@ -306,37 +305,20 @@ class DeltaTar(object): continue yield file_path - diryield_stack = [walk_dir(source_path)] - delayed_path_stack = [] - - while diryield_stack: - try: - cur_path = diryield_stack[-1].next() - is_dir = os.path.isdir(cur_path) - status = self.filter_path(cur_path, source_path, is_dir) - except StopIteration: - diryield_stack.pop() - if delayed_path_stack: - delayed_path_stack.pop() - continue + queue = [source_path] - if status == MATCH: - if delayed_path_stack: - for delayed_path in delayed_path_stack: - is_dir = os.path.isdir(delayed_path) - if self.filter_path(delayed_path, source_path, is_dir) == NO_MATCH: - continue - yield delayed_path - del delayed_path_stack[:] + while queue: + cur_path = queue.pop() - yield cur_path + for child in walk_dir(cur_path): + is_dir = os.path.isdir(child) + status = self.filter_path(child, source_path, is_dir) - if os.path.isdir(cur_path): - diryield_stack.append(walk_dir(cur_path)) + if status == MATCH: + yield child - elif status == PARENT_MATCH: - delayed_path_stack.append(cur_path) - diryield_stack.append(walk_dir(cur_path)) + if is_dir and (status == MATCH or status == PARENT_MATCH): + queue.append(child) def _stat_dict(self, path): ''' @@ -347,24 +329,35 @@ class DeltaTar(object): ptype = None if stat.S_ISDIR(mode): - ptype = 'directory' + ptype = u'directory' elif stat.S_ISREG(mode): - ptype = 'file' + ptype = u'file' elif stat.S_ISLNK(mode): - ptype = 'link' + ptype = u'link' return { - 'type': ptype, - 'path': path, - 'mode': mode, - 'mtime': stinfo.st_mtime, - 'ctime': stinfo.st_ctime, - 'uid': stinfo.st_uid, - 'gid': stinfo.st_gid, - 'inode': stinfo.st_ino, - 'size': stinfo.st_size + u'type': ptype, + u'path': unicode(path), + u'mode': mode, + u'mtime': stinfo.st_mtime, + u'ctime': stinfo.st_ctime, + u'uid': stinfo.st_uid, + u'gid': stinfo.st_gid, + u'inode': stinfo.st_ino, + u'size': stinfo.st_size } + def _equal_stat_dicts(self, d1, d2): + ''' + Return if the dicts are equal in the stat keys + ''' + keys = [u'gid', u'type', u'mode', u'mtime', u'path', u'size', u'inode', + u'ctime', u'uid'] + for key in keys: + if d1.get(key, -1) != d2.get(key, -2): + return False + return True + def create_full_backup(self, source_path, backup_path, max_volume_size=None): ''' @@ -388,8 +381,9 @@ class DeltaTar(object): raise Exception('Source path "%s" does not exist or is not a '\ 'directory' % source_path) - if max_volume_size != None and not isinstance(max_volume_size, int): - raise Exception('max_volume_size must be an integer') + 
if max_volume_size != None and (not isinstance(max_volume_size, int) or\ + max_volume_size < 1): + raise Exception('max_volume_size must be a positive integer') if max_volume_size != None: max_volume_size = max_volume_size*1024*1024 @@ -506,7 +500,211 @@ class DeltaTar(object): - max_volume_size: maximum volume size in megabytes (MB). Used to split the backup in volumes. Optional (won't split in volumes by default). ''' - pass + # check/sanitize input + if not isinstance(source_path, basestring): + raise Exception('Source path must be a string') + + if not isinstance(backup_path, basestring): + raise Exception('Backup path must be a string') + + if not os.path.exists(source_path) or not os.path.isdir(source_path): + raise Exception('Source path "%s" does not exist or is not a '\ + 'directory' % source_path) + + if not os.access(source_path, os.R_OK): + raise Exception('Source path "%s" is not readable' % source_path) + + if max_volume_size != None and (not isinstance(max_volume_size, int) or\ + max_volume_size < 1): + raise Exception('max_volume_size must be a positive integer') + if max_volume_size != None: + max_volume_size = max_volume_size*1024*1024 + + if not isinstance(previous_index_path, basestring): + raise Exception('previous_index_path must be A string') + + if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path): + raise Exception('Index path "%s" does not exist or is not a '\ + 'file' % previous_index_path) + + if not os.access(previous_index_path, os.R_OK): + raise Exception('Index path "%s" is not readable' % previous_index_path) + + # try to create backup path if needed + if not os.path.exists(backup_path): + os.makedirs(backup_path) + + if not os.access(backup_path, os.W_OK): + raise Exception('Backup path "%s" is not writeable' % backup_path) + + if source_path.endswith('/'): + source_path = source_path[:-1] + + if backup_path.endswith('/'): + backup_path = backup_path[:-1] + + # update current time + self.current_time = datetime.datetime.now() + + if self.mode not in self.__file_extensions_dict: + raise Exception('Unrecognized extension') + + # some initialization + self.vol_no = 0 + + # generate the first volume name + vol_name = self.volume_name_func(backup_path, True, 0) + tarfile_path = os.path.join(backup_path, vol_name) + + # init index + index_name = self.index_name_func(True) + index_path = os.path.join(backup_path, index_name) + # TODO: encrypt or compress it if necessary + index_fd = open(index_path, 'w') + + cwd = os.getcwd() + + def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number): + ''' + Handles the new volumes + ''' + volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) + volume_path = os.path.join(backup_path, volume_name) + deltarobj.vol_no = volume_number + + # we convert relative paths into absolute because CWD is changed + if not os.path.isabs(volume_path): + volume_path = os.path.join(cwd, volume_path) + + tarobj.open_volume(volume_path) + + # wraps some args from context into the handler + new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) + + index_fd.write('{"type": "python-delta-tar-index", "version": 1 }\n') + + s = '{"type": "BEGIN-FILE-LIST"}\n' + # calculate checksum and write into the stream + crc = binascii.crc32(s) + index_fd.write(s) + + # start creating the tarfile + tarobj = tarfile.TarFile.open(tarfile_path, + mode='w' + self.mode, + format=tarfile.GNU_FORMAT, + concat_compression='#gz' in self.mode, + password=self.password, + 
max_volume_size=max_volume_size, + new_volume_handler=new_volume_handler) + + os.chdir(source_path) + + # for each file to be in the backup, do: + for path in self._recursive_walk_dir('.'): + pass + + def iterate_index_path(self, index_path): + # open + f = open(index_path, 'r') + # check index header + j, l_no = self._parse_json_line(f, -1) + if j.get("type", '') != 'python-delta-tar-index' or\ + j.get('version', -1) != 1: + raise Exception("invalid index file format: %s" % json.dumps(j)) + + # find BEGIN-FILE-LIST, ignore other headers + while True: + j, l_no = self._parse_json_line(f, -1) + if j.get('type', '') == 'BEGIN-FILE-LIST': + break + + + # current volume number + curr_vol_no = None + # current volume file + vol_fd = None + offset = -1 + tarobj = None + + # read each file in the index and process it to do the retore + while True: + try: + j, l_no = self._parse_json_line(f, -1) + except Exception, e: + f.close() + raise e + + op_type = j.get('type', '') + + # when we detect the end of the list, break the loop + if op_type == 'END-FILE-LIST': + f.close() + break + + # check input + if op_type not in ['directory', 'file', 'link']: + self.logger.warn('unrecognized type to be ' + 'restored: %s, line %d' % (op_type, l_no)) + continue + + yield j, l_no + + def jsonize_path_iterator(self, iter): + ''' + converts the yielded items of an iterator into json path lines. + ''' + while True: + try: + path = iter.next() + yield self._stat_dict(path) + except StopIteration: + break + + def collate_iterators(self, it1, it2): + ''' + Collate two iterators, so that it returns pairs of the items of each + iterator (if the items are the same), or (None, elem2) or (elem1, None) + when there's no match for the items in the other iterator. + + It assumes that the items in both lists are ordered in the same way. 
+ ''' + elem1, elem2 = None, None + while True: + if not elem1: + try: + elem1 = it1.next()[0] + if isinstance(elem1, tuple): + elem1 = elem1[0] + except StopIteration: + if elem2: + yield (None, elem2) + for elem2 in it2: + yield (None, elem2) + break + index1 = elem1['path'] + if not elem2: + try: + elem2 = it2.next() + if isinstance(elem2, tuple): + elem2 = elem2[0] + except StopIteration: + if elem1: + yield (elem1, None) + for elem1 in it1: + yield (elem1, None) + break + index2 = elem2['path'] + + if index1 < index2: + yield (elem1, None) + elem1 = None + elif index1 == index2: + yield (elem1, elem2) + elem1, elem2 = None, None + else: + # index2 is less + yield (None, elem2) + elem2 = None def restore_backup(self, target_path, backup_indexes_paths=[], backup_tar_path=None): @@ -618,21 +816,7 @@ class DeltaTar(object): backup_path = os.path.dirname(backup_index_path) new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) - # open - f = open(backup_index_path, 'r') - - # check index header - j, l_no = self._parse_json_line(f, -1) - if j.get("type", '') != 'python-delta-tar-index' or\ - j.get('version', -1) != 1: - raise Exception("invalid index file format: %s" % json.dumps(j)) - - # find BEGIN-FILE-LIST, ignore other headers - while True: - j, l_no = self._parse_json_line(f, -1) - if j.get('type', '') == 'BEGIN-FILE-LIST': - break - + # some initialization: # current volume number curr_vol_no = None @@ -641,26 +825,16 @@ class DeltaTar(object): offset = -1 tarobj = None - # read each file in the index and process it to do the retore - while True: - j, l_no = self._parse_json_line(f, -1) + # iterate through the + for j, l_no in self.iterate_index_path(backup_index_path): op_type = j.get('type', '') - - # when we detect the end of the list, break the loop - if op_type == 'END-FILE-LIST': - break - - # check input - if op_type not in ['directory', 'file', 'link']: - self.logger.warn('unrecognized type to be ' - 'restored: %s, line %d' % (op_type, l_no)) - continue - - # filtering paths op_path = j.get('path', '') + + # filter paths if self.filter_path(op_path, '.', op_type == 'directory') == NO_MATCH: continue + # check volume number vol_no = j.get('volume', -1) if not isinstance(vol_no, int) or vol_no < 0: self.logger.warn('unrecognized type to be ' @@ -689,6 +863,7 @@ class DeltaTar(object): tarobj.close() tarobj = None + # open the tarfile if needed if not tarobj: vol_fd.seek(offset) tarobj = tarfile.open(mode="r" + self.mode, fileobj=vol_fd, diff --git a/testing/test_deltatar.py b/testing/test_deltatar.py index d2056c0..9cd0aba 100644 --- a/testing/test_deltatar.py +++ b/testing/test_deltatar.py @@ -711,6 +711,72 @@ class DeltaTarTest(BaseTest): assert not os.path.exists('source_dir/big') assert not os.path.exists('source_dir/small') + def test_collate_iterators(self): + ''' + Tests the collate iterators functionality with two exact directories, + using an index iterator from a backup and the exact same source dir. 
+ ''' + deltatar = DeltaTar(mode=self.MODE, password=self.PASSWORD, + logger=self.consoleLogger) + + # create first backup + deltatar.create_full_backup( + source_path="source_dir", + backup_path="backup_dir") + + assert os.path.exists("backup_dir") + + cwd = os.getcwd() + index_filename = deltatar.index_name_func(is_full=True) + index_path = os.path.join(cwd, "backup_dir", index_filename) + index_it = deltatar.iterate_index_path(index_path) + + os.chdir('source_dir') + dir_it = deltatar._recursive_walk_dir('.') + path_it = deltatar.jsonize_path_iterator(dir_it) + + try: + for path1, path2 in deltatar.collate_iterators(index_it, path_it): + assert deltatar._equal_stat_dicts(path1, path2) + finally: + os.chdir(cwd) + + def test_collate_iterators_diffdirs(self): + ''' + Use the collate iterators functionality with two different directories. + It must behave in an expected way. + ''' + self.hash["source_dir/zzzz"] = self.create_file("source_dir/zzzz", 100) + + deltatar = DeltaTar(mode=self.MODE, password=self.PASSWORD, + logger=self.consoleLogger) + + # create first backup + deltatar.create_full_backup( + source_path="source_dir", + backup_path="backup_dir") + + assert os.path.exists("backup_dir") + self.hash["source_dir/z"] = self.create_file("source_dir/z", 100) + + cwd = os.getcwd() + index_filename = deltatar.index_name_func(is_full=True) + index_path = os.path.join(cwd, "backup_dir", index_filename) + index_it = deltatar.iterate_index_path(index_path) + + os.chdir('source_dir') + dir_it = deltatar._recursive_walk_dir('.') + path_it = deltatar.jsonize_path_iterator(dir_it) + + try: + for path1, path2 in deltatar.collate_iterators(index_it, path_it): + if path2['path'] == './z': + assert not path1 + else: + assert deltatar._equal_stat_dicts(path1, path2) + finally: + os.chdir(cwd) + class DeltaTar2Test(DeltaTarTest): ''' -- 1.7.1
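A few of the techniques introduced by this patch are easier to see in isolation, so the short Python sketches below restate them outside the DeltaTar class. They are illustrative only: names such as walk_filtered are invented for the sketch, and NO_MATCH / MATCH / PARENT_MATCH are stand-ins for the filter constants that deltatar.py already defines. The first sketch shows the traversal strategy of the rewritten _recursive_walk_dir(): a plain list used as a stack instead of the old stack of generators, with each directory listing sorted so that two walks of the same tree yield paths in the same order.

    import os

    # Stand-ins for the result codes of deltatar's filter_path();
    # the real constants live in deltatar.py.
    NO_MATCH, MATCH, PARENT_MATCH = 0, 1, 2

    def walk_filtered(source_path, filter_path):
        '''Yield every path under source_path accepted by filter_path,
        listing each directory in sorted order.'''
        queue = [source_path]
        while queue:
            cur_path = queue.pop()
            for filename in sorted(os.listdir(cur_path)):
                child = os.path.join(cur_path, filename)
                is_dir = os.path.isdir(child)
                status = filter_path(child, source_path, is_dir)
                if status == MATCH:
                    yield child
                # a PARENT_MATCH directory is not yielded itself, but it is
                # still descended into because a deeper path may match
                if is_dir and status in (MATCH, PARENT_MATCH):
                    queue.append(child)

    # accept everything: behaves like a plain recursive listing
    for path in walk_filtered('.', lambda path, source, is_dir: MATCH):
        print(path)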
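collate_iterators() is a merge join over two iterators that are already sorted by path: paths present on both sides come out as a pair, and a path present on only one side comes out with None in the other slot, which is what a diff backup needs in order to classify entries as unchanged, added or deleted. A minimal version of the same idea, without the tuple unwrapping that the deltatar iterators require (collate_sorted and its key argument are names made up for the sketch):

    def collate_sorted(it1, it2, key=lambda entry: entry['path']):
        '''Merge-join two iterators sorted by the same key.'''
        elem1 = next(it1, None)
        elem2 = next(it2, None)
        while elem1 is not None and elem2 is not None:
            k1, k2 = key(elem1), key(elem2)
            if k1 < k2:
                yield (elem1, None)
                elem1 = next(it1, None)
            elif k1 == k2:
                yield (elem1, elem2)
                elem1, elem2 = next(it1, None), next(it2, None)
            else:
                yield (None, elem2)
                elem2 = next(it2, None)
        # drain whichever side still has items
        while elem1 is not None:
            yield (elem1, None)
            elem1 = next(it1, None)
        while elem2 is not None:
            yield (None, elem2)
            elem2 = next(it2, None)

    # example: the right-hand side has one extra entry
    left = iter([{'path': 'a'}, {'path': 'b'}])
    right = iter([{'path': 'a'}, {'path': 'b'}, {'path': 'c'}])
    assert list(collate_sorted(left, right))[-1] == (None, {'path': 'c'})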
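The index layout that iterate_index_path() expects is a stream of JSON documents, one per line: a header identifying the format and version, a BEGIN-FILE-LIST marker, one entry per directory, file or link, and an END-FILE-LIST marker that closes the list. A stripped-down reader for that layout, which ignores index encryption/compression and whatever checksum handling _parse_json_line() performs in the real code, could look like this (read_index_entries is a name invented for the sketch):

    import json

    def read_index_entries(index_path):
        '''Yield the per-path entries of a python-delta-tar index file.'''
        with open(index_path, 'r') as f:
            header = json.loads(f.readline())
            if header.get('type') != 'python-delta-tar-index' \
                    or header.get('version') != 1:
                raise ValueError('unsupported index file: %r' % header)
            # skip any remaining headers until the file list starts
            while json.loads(f.readline()).get('type') != 'BEGIN-FILE-LIST':
                pass
            for line in f:
                entry = json.loads(line)
                if entry.get('type') == 'END-FILE-LIST':
                    break
                if entry.get('type') in ('directory', 'file', 'link'):
                    yield entry

    # example: count the regular files recorded in an index
    # n_files = sum(1 for e in read_index_entries(index_path) if e[u'type'] == u'file')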
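One small detail of the new _equal_stat_dicts() helper is easy to read past: it compares each recorded field with asymmetric fallback defaults, d1.get(key, -1) against d2.get(key, -2), so a field that is missing from both dicts still registers as a mismatch instead of silently comparing equal. A short illustration (the dict literals are made up; equal_stat_dicts mirrors the helper from the patch):

    def equal_stat_dicts(d1, d2):
        '''Return True when both stat dicts agree on every recorded field.'''
        keys = [u'gid', u'type', u'mode', u'mtime', u'path', u'size',
                u'inode', u'ctime', u'uid']
        for key in keys:
            # -1 vs -2: a key absent from both sides counts as a difference
            if d1.get(key, -1) != d2.get(key, -2):
                return False
        return True

    entry = {u'path': u'./a', u'type': u'file', u'size': 100}
    print(equal_stat_dicts(entry, dict(entry)))   # False: mode, mtime, ... missing from both
    full = dict(entry, mode=0o100644, mtime=0, ctime=0, uid=0, gid=0, inode=1)
    print(equal_stat_dicts(full, dict(full)))     # True: every recorded field matches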