From: Christian Herdtweck
Date: Wed, 20 Jul 2016 15:58:39 +0000 (+0200)
Subject: created new test for volume splitting with compressed (and some day encrypted) files
X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=dd3108589597f0eae8be11507ec9c29ec364a45a;p=python-delta-tar

created new test for volume splitting with compressed (and some day encrypted) files
---

diff --git a/testing/test_volume_split.py b/testing/test_volume_split.py
new file mode 100755
index 0000000..43d6121
--- /dev/null
+++ b/testing/test_volume_split.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+""" Test splitting into volumes with compression and/or encryption
+
+Adds big files to a tarfile that almost fill a volume, then small files of
+varying size to check robustness of the volume split
+
+More precisely:
+- create a new tar archive
+- create a big source file, add it to the archive
+- if a 2nd volume is needed: done
+- create a small source file, add it to the archive
+- repeat the last step until a 2nd volume is needed
+- extract the volumes
+- compare the files
+- repeat the whole procedure with differently-sized big and small source files
+
+.. codeauthor:: Intra2net AG
+"""
+
+import os
+import sys
+import random
+from math import log2
+from hashlib import md5 as hash_type
+from tempfile import TemporaryDirectory, NamedTemporaryFile
+from time import time
+from traceback import format_exc
+from os.path import dirname, abspath
+
+# try to import the tarfile from source, not the globally installed one
+source_base = dirname(dirname(abspath(__file__)))
+print('adding {} to python path'.format(source_base))
+if os.path.isdir(source_base):
+    sys.path.insert(0, source_base)
+import inspect
+from deltatar.tarfile import TarFile, BLOCKSIZE, RECORDSIZE, _Stream
+print('using TarFile from ' + dirname(inspect.getsourcefile(TarFile)))
+
+
+#: number of blocks in a record
+N_BLOCKS_PER_RECORD = RECORDSIZE // BLOCKSIZE
+
+#: number of blocks per tar volume file
+MAX_VOLUME_BLOCKS = N_BLOCKS_PER_RECORD + 1
+
+#: size of big file
+BIG_SIZE = MAX_VOLUME_BLOCKS * BLOCKSIZE
+
+#: max size of small files
+SMALL_MAX_SIZE = 2 * BLOCKSIZE
+
+#: number of bits used for seeding
+SEED_BITS = int(log2(sys.maxsize + 1))
+
+#: buffer size for reading file for hashing
+HASH_BUF_SIZE = 4096
+
+
+def create_file(file_size, temp_dir):
+    """ create random file of given size in given dir
+
+    uses :py:func:`fill_file_repetitive` or :py:func:`fill_file_random` to
+    create contents of file
+
+    returned hash must be compatible with :py:func:`hash_file`
+    """
+    hash_obj = hash_type()
+    with NamedTemporaryFile(dir=temp_dir, delete=False) as temp_file:
+        if random.getrandbits(1) == 1:
+            fill_file_repetitive(temp_file, file_size, hash_obj)
+        else:
+            fill_file_random(temp_file, file_size, hash_obj)
+
+    return temp_file.name, hash_obj.hexdigest()
+
+
+def fill_file_repetitive(temp_file, file_size, hash_obj):
+    """ fills file with repetitive data """
+    bytes_written = 0
+    data = bytes(range(256))
+    # write full 256-byte chunks while there is room for them
+    while file_size - bytes_written >= 256:
+        temp_file.write(data)
+        hash_obj.update(data)
+        bytes_written += 256
+
+    if file_size - bytes_written > 0:
+        temp_file.write(data[:file_size - bytes_written])
+        hash_obj.update(data[:file_size - bytes_written])
+
+
+def fill_file_random(temp_file, file_size, hash_obj):
+    """ fills file with randomized data """
+    bytes_written = 0
+    data = bytearray(range(256))
+    # write full 255-byte chunks while there is room for them
+    while file_size - bytes_written >= 255:
+        random.shuffle(data)
+        temp_file.write(data[:-1])    # write all but last to make a difference
+        hash_obj.update(data[:-1])    # between files of same size
+        bytes_written += 255
+
+    if file_size - bytes_written > 0:
+        random.shuffle(data)
+        temp_file.write(data[:file_size - bytes_written])
+        hash_obj.update(data[:file_size - bytes_written])
+
+
+def hash_file(file_name):
+    """ calculate hash of file contents """
+    hash_obj = hash_type()
+    with open(file_name, 'rb') as file_handle:
+        while True:
+            data = file_handle.read(HASH_BUF_SIZE)
+            hash_obj.update(data)
+            if len(data) < HASH_BUF_SIZE:
+                break
+    return hash_obj.hexdigest()
+
+
+def do_test(seed, tar_mode, temp_dir, print_everything=False):
+    """ a single test run; returns True if everything went ok """
+
+    # output is not printed immediately but remembered and only printed
+    # at the end if necessary
+    output = []
+    dprnt = output.append
+    everything_ok = False
+
+    # seed random number generator
+    dprnt('using seed {}'.format(seed))
+    random.seed(seed)
+
+    # remember number of files in temp dir
+    n_files_at_start = len(os.listdir(temp_dir))
+
+    # create tar archive
+    temp_file = None
+    try:
+        everything_ok = True
+        temp_file = NamedTemporaryFile(dir=temp_dir, suffix='.' + tar_mode[2:],
+                                       delete=False, mode='wb')
+        files = {}
+
+        # define local volume handler so it can read/write
+        # volume_handler_called
+        volume_handler_called = False
+
+        def new_volume_handler(tarobj, base_name, volume_number):
+            """ called from tarobj when creating a new volume """
+            nonlocal volume_handler_called
+            volume_handler_called = True
+            volume_path = "%s.%d" % (base_name, volume_number)
+            tarobj.open_volume(volume_path)
+
+        dprnt('creating archive {}'.format(temp_file.name))
+        with TarFile.open(mode=tar_mode, fileobj=temp_file,
+                          max_volume_size=MAX_VOLUME_BLOCKS * BLOCKSIZE,
+                          new_volume_handler=new_volume_handler) as tarobj:
+
+            if isinstance(tarobj.fileobj, _Stream):
+                size_left_func = tarobj._size_left_stream
+            else:
+                size_left_func = tarobj._size_left_file
+
+            # add big file
+            big_name, big_hash = create_file(BIG_SIZE, temp_dir)
+            files[big_name] = big_hash
+            dprnt('adding big file {} of size {} with hash {}'
+                  .format(big_name, BIG_SIZE, big_hash))
+            tarobj.add(big_name)
+            dprnt('now offset={}, size_left={}'
+                  .format(tarobj.offset, size_left_func()))
+
+            # loop: add small files until a new volume was started
+            while not volume_handler_called:
+                small_size = random.randint(0, SMALL_MAX_SIZE)
+                small_name, small_hash = create_file(small_size, temp_dir)
+                files[small_name] = small_hash
+                dprnt('adding small file {} of size {} with hash {}'
+                      .format(small_name, small_size, small_hash))
+                tarobj.add(small_name)
+                dprnt('now offset={}, size_left={}'
+                      .format(tarobj.offset, size_left_func()))
+
+        # close tarobj -- happens in __exit__ of TarFile context
+        dprnt('closing tar file')
+        # now tarobj should be closed
+
+        # remember size of first volume (2nd should always be RECORDSIZE)
+        dprnt('size of first volume file: {}'
+              .format(os.stat(temp_file.name).st_size))
+        if os.stat(temp_file.name + ".1").st_size != RECORDSIZE:
+            everything_ok = False
+            dprnt('strange size of 2nd volume: {}'
+                  .format(os.stat(temp_file.name + ".1").st_size))
+
+        # delete added files
+        dprnt('deleting {} original files'.format(len(files)))
+        for file_name in files:
+            os.unlink(file_name)
+
+        # extract
+        with TarFile.open(mode='r' + tar_mode[1:], name=temp_file.name,
+                          new_volume_handler=new_volume_handler) as tarobj:
+            tarobj.extractall(path=temp_dir)
+
+        # compare files
+        if len(os.listdir(temp_dir)) != len(files) + 2 + n_files_at_start:
+            everything_ok = False
+            dprnt('wrong number of files: found {} but expected {}!'
+                  .format(len(os.listdir(temp_dir)),
+                          len(files) + 2 + n_files_at_start))
+            for file_name in os.listdir(temp_dir):
+                dprnt('listdir: {}'.format(file_name))
+
+        for file_name, file_hash in files.items():
+            if not os.path.exists(file_name):
+                everything_ok = False
+                dprnt('failed to find file {} after extraction'
+                      .format(file_name))
+            elif hash_file(file_name) != file_hash:
+                everything_ok = False
+                dprnt('wrong hash for file {} after extraction: {} != {}'
+                      .format(file_name, hash_file(file_name), file_hash))
+
+    except Exception:
+        everything_ok = False
+        dprnt('caught exception:')
+        for line in format_exc().splitlines():
+            dprnt('exc: {}'.format(line))
+    finally:
+        # close file and delete it
+        if temp_file:
+            temp_file.close()
+            try:
+                os.unlink(temp_file.name)
+                os.unlink(temp_file.name + ".1")
+            except FileNotFoundError:
+                pass
+
+    if print_everything or not everything_ok:
+        prefix = '{:9d}: '.format(seed)
+        for line in output:
+            print(prefix + line)
+
+    return everything_ok
+
+
+def create_seed():
+    """ create a seed for seeding module :py:mod:`random`
+
+    uses :py:func:`random.getrandbits`, so is also dependent on the current
+    state of module :py:mod:`random`
+    """
+    return random.getrandbits(SEED_BITS)
+
+
+def test_forever():
+    """ Main function, called when running file as script
+
+    runs do_test in infinite loop
+    """
+
+    # more params
+    fast_fail = True
+    print_everything = False
+    modes = 'w:tar', 'w:gz', 'w:bz2', 'w|tar', 'w|gz', 'w|bz2', 'w#tar', \
+            'w#gz',  # 'w#gz.aes128', 'w#gz.aes256', 'w#aes128', 'w#aes256'
+
+    # seed properly
+    random.seed()
+
+    # preparations
+    n_runs = 0
+    error_seeds = []
+    do_stop = False
+
+    # create temp dir
+    with TemporaryDirectory(prefix='deltatar_test_') as temp_dir:
+
+        try:
+            start_time = time()
+            while not do_stop:
+                for mode in modes:
+                    seed = create_seed()
+                    if not do_test(seed, mode, temp_dir,
+                                   print_everything=print_everything):
+                        error_seeds.append(seed)
+                        if fast_fail:
+                            print('stopping because fast_fail is set')
+                            do_stop = True
+                            break
+                    n_runs += 1
+                    if n_runs % 100 == 0:
+                        print('at run {} ({:.3f}s per run)'
+                              .format(n_runs, (time() - start_time) / n_runs))
+        except KeyboardInterrupt:
+            print('Stopped by user')
+
+        # summarize
+        print('')
+        print('-' * 72)
+        n_errs = len(error_seeds)
+        duration = time() - start_time
+        if n_runs == 0:
+            print('summary: no test run has finished')
+        else:
+            print('summary: {} runs in {}s ({:.3f}s per run); '
+                  '{} with errs ({:.2f}%)'
+                  .format(n_runs, duration, duration / n_runs, n_errs,
+                          100.0 * float(n_errs) / float(n_runs)))
+        print('seeds that created errors: {}'.format(error_seeds))
+
+
+if __name__ == '__main__':
+    test_forever()
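
For readers skimming the diff: the multivolume API this test exercises boils
down to the following minimal sketch, distilled from the code above. The
archive and file names are hypothetical, and max_volume_size is assumed to be
a multiple of the tar BLOCKSIZE, as the constants in the test are.

    from deltatar.tarfile import TarFile

    def new_volume_handler(tarobj, base_name, volume_number):
        # called by TarFile whenever the current volume fills up;
        # the handler picks the next volume's path and opens it
        tarobj.open_volume("%s.%d" % (base_name, volume_number))

    # write: start a new volume once max_volume_size bytes are reached
    with TarFile.open("archive.tar", mode="w:tar",
                      max_volume_size=1024 * 1024,
                      new_volume_handler=new_volume_handler) as tarobj:
        tarobj.add("some_file")

    # read: the same handler opens the follow-up volumes during extraction
    with TarFile.open(name="archive.tar", mode="r:tar",
                      new_volume_handler=new_volume_handler) as tarobj:
        tarobj.extractall(path="extracted")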