created new test for volume splitting with compressed (and some day encrypted) files
author Christian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 20 Jul 2016 15:58:39 +0000 (17:58 +0200)
committer Christian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 12 Nov 2020 14:04:34 +0000 (15:04 +0100)
testing/test_volume_split.py [new file with mode: 0755]

diff --git a/testing/test_volume_split.py b/testing/test_volume_split.py
new file mode 100755 (executable)
index 0000000..43d6121
--- /dev/null
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+
+""" Test splitting into volumes with compression and/or encryption
+
+Adds a big file to the tar archive that almost fills a volume, then small
+files of varying size, to check the robustness of the volume split
+
+More precisely:
+- create a new tar archive
+- create a big source file and add it to the archive
+- if that alone requires a 2nd volume: done
+- otherwise create a small source file and add it to the archive
+- repeat the previous step until a 2nd volume is needed
+- extract the volumes
+- compare the extracted files with the originals
+- repeat the whole procedure with differently-sized big and small source files
+
+.. codeauthor:: Intra2net AG <info@intra2net>
+"""
+
+import os
+import sys
+import random
+from math import log2
+from hashlib import md5 as hash_type
+from tempfile import TemporaryDirectory, NamedTemporaryFile
+from time import time
+from traceback import format_exc
+from os.path import dirname, abspath
+
+# try to import the tarfile from source, not the globally installed one
+source_base = dirname(dirname(abspath(__file__)))
+if os.path.isdir(source_base):
+    print('adding {} to python path'.format(source_base))
+    sys.path.insert(0, source_base)
+import inspect
+from deltatar.tarfile import TarFile, BLOCKSIZE, RECORDSIZE, _Stream
+print('using TarFile from ' + dirname(inspect.getsourcefile(TarFile)))
+
+
+#: number of blocks in a record
+N_BLOCKS_PER_RECORD = RECORDSIZE // BLOCKSIZE
+
+#: maximum number of blocks per tar volume file
+MAX_VOLUME_BLOCKS = N_BLOCKS_PER_RECORD + 1
+
+#: size of big file
+BIG_SIZE = MAX_VOLUME_BLOCKS * BLOCKSIZE
+
+#: max size of small files
+SMALL_MAX_SIZE = 2 * BLOCKSIZE
+
+#: number of bits used for seeding
+SEED_BITS = int(log2(sys.maxsize+1))
+
+#: buffer size for reading file for hashing
+HASH_BUF_SIZE = 4096
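+
+# For illustration, assuming the stock tar constants (BLOCKSIZE = 512,
+# RECORDSIZE = 20 * BLOCKSIZE = 10240): a volume holds 21 blocks, i.e.
+# 10752 bytes, which the big file fills on its own.  Uncompressed, its
+# 512-byte tar header then already forces a second volume; under
+# compression the split point depends on how well the (repetitive or
+# random) contents compress.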
+
+
+def create_file(file_size, temp_dir):
+    """ create random file of given size in given dir
+
+    uses :py:func:`fill_file_repetitive` or :py:func:`fill_file_random` to
+    create contents of file
+
+    returned hash must be compatible with :py:func:`hash_file`
+    """
+    hash_obj = hash_type()
+    with NamedTemporaryFile(dir=temp_dir, delete=False) as temp_file:
+        if random.getrandbits(1) == 1:
+            fill_file_repetitive(temp_file, file_size, hash_obj)
+        else:
+            fill_file_random(temp_file, file_size, hash_obj)
+
+    return temp_file.name, hash_obj.hexdigest()
+
+
+def fill_file_repetitive(temp_file, file_size, hash_obj):
+    """ fills file with repetitive data """
+    bytes_written = 0
+    data = bytes(range(256))
+    while file_size - bytes_written >= 256:
+        temp_file.write(data)
+        hash_obj.update(data)
+        bytes_written += 256
+
+    if file_size - bytes_written > 0:
+        temp_file.write(data[:file_size-bytes_written])
+        hash_obj.update(data[:file_size-bytes_written])
+
+
+def fill_file_random(temp_file, file_size, hash_obj):
+    """ fills file with randomized data """
+    bytes_written = 0
+    data = bytearray(range(256))
+    while file_size - bytes_written >= 255:
+        random.shuffle(data)
+        temp_file.write(data[:-1])   # write all but last to make a difference
+        hash_obj.update(data[:-1])   # between files of same size
+        bytes_written += 255
+
+    if file_size - bytes_written > 0:
+        random.shuffle(data)
+        temp_file.write(data[:file_size-bytes_written])
+        hash_obj.update(data[:file_size-bytes_written])
+
+
+def hash_file(file_name):
+    """ calculate hash of file contents """
+    hash_obj = hash_type()
+    with open(file_name, 'rb') as file_handle:
+        while True:
+            data = file_handle.read(HASH_BUF_SIZE)
+            hash_obj.update(data)
+            if len(data) < HASH_BUF_SIZE:
+                break
+    return hash_obj.hexdigest()
+
+
+def do_test(seed, tar_mode, temp_dir, print_everything=False):
+    """ a single test run; returns True if everything went ok """
+
+    # output is not printed immediately but remembered, and only printed
+    # in the end if necessary
+    output = []
+
+    def dprnt(*args):
+        """ remember an output line for deferred printing """
+        output.append(' '.join(str(arg) for arg in args))
+
+    everything_ok = False
+
+    # seed random number generator
+    dprnt('using seed {}'.format(seed))
+    random.seed(seed)
+
+    # remember number of files in temp dir
+    n_files_at_start = len(os.listdir(temp_dir))
+
+    # create tar archive
+    temp_file = None
+    try:
+        everything_ok = True
+        temp_file = NamedTemporaryFile(dir=temp_dir, suffix='.' + tar_mode[2:],
+                                       delete=False, mode='wb')
+        files = {}
+
+        # define the volume handler locally so it can set volume_handler_called
+        volume_handler_called = False
+        def new_volume_handler(tarobj, base_name, volume_number):
+            """ called from tarobj when creating a new volume """
+            nonlocal volume_handler_called
+            volume_handler_called = True
+            volume_path = "%s.%d" % (base_name, volume_number)
+            tarobj.open_volume(volume_path)
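+            # volumes beyond the first thus end up as <base>.1, <base>.2,
+            # ... (volume_number is assumed to start at 1), which the size
+            # check and the cleanup code below rely on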
+
+        dprnt('creating archive {}'.format(temp_file.name))
+        with TarFile.open(mode=tar_mode, fileobj=temp_file,
+                          max_volume_size=MAX_VOLUME_BLOCKS * BLOCKSIZE,
+                          new_volume_handler=new_volume_handler) as tarobj:
+
+            if isinstance(tarobj.fileobj, _Stream):
+                size_left_func = tarobj._size_left_stream
+            else:
+                size_left_func = tarobj._size_left_file
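+            # (assumption about tarfile internals: _Stream wraps the file
+            # object for the streamed '|' and '#' modes, where the bytes
+            # hitting the disk differ from the archive offset, so the
+            # remaining volume space must be computed differently)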
+
+            # add big file
+            big_name, big_hash = create_file(BIG_SIZE, temp_dir)
+            files[big_name] = big_hash
+            dprnt('adding big file {} of size {} with hash {}'
+                  .format(big_name, BIG_SIZE, big_hash))
+            tarobj.add(big_name)
+            dprnt('now offset={}, size_left={}'
+                  .format(tarobj.offset, size_left_func()))
+
+            # loop
+            while not volume_handler_called:
+                # add small file
+                small_size = random.randint(0, SMALL_MAX_SIZE)
+                small_name, small_hash = create_file(small_size, temp_dir)
+                files[small_name] = small_hash
+                dprnt('adding small file {} of size {} with hash {}'
+                      .format(small_name, small_size, small_hash))
+                tarobj.add(small_name)
+                dprnt('now offset={}, size_left={}'
+                      .format(tarobj.offset, size_left_func()))
+
+            # close tarobj -- happens in __exit__ of TarFile context
+            dprnt('closing tar file')
+        # now tarobj should be closed
+
+        # log size of first volume (the 2nd should always be exactly RECORDSIZE)
+        dprnt('size of first volume file: {}'
+              .format(os.stat(temp_file.name).st_size))
+        if os.stat(temp_file.name + ".1").st_size != RECORDSIZE:
+            everything_ok = False
+            dprnt('strange size of 2nd volume: {}'
+                  .format(os.stat(temp_file.name + ".1").st_size))
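+        # (the closing code pads the last volume to a full record, which
+        # is why exactly RECORDSIZE is expected above)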
+
+        # delete added files
+        dprnt('deleting {} original files'.format(len(files)))
+        for file_name in files:
+            os.unlink(file_name)
+
+        # extract
+        with TarFile.open(mode='r' + tar_mode[1:], name=temp_file.name,
+                          new_volume_handler=new_volume_handler) as tarobj:
+            tarobj.extractall(path=temp_dir)
+
+        # compare files
+        # expect the extracted files plus the 2 volume files
+        if len(os.listdir(temp_dir)) != len(files)+2 + n_files_at_start:
+            everything_ok = False
+            dprnt('wrong number of files: found {} but expected {}!'
+                  .format(len(os.listdir(temp_dir)),
+                          len(files)+2+n_files_at_start))
+            for file_name in os.listdir(temp_dir):
+                dprnt('listdir: {}'.format(file_name))
+
+        for file_name, file_hash in files.items():
+            if not os.path.exists(file_name):
+                everything_ok = False
+                dprnt('failed to find file {} after extraction'
+                      .format(file_name))
+                continue   # hash_file would raise on a missing file
+            if hash_file(file_name) != file_hash:
+                everything_ok = False
+                dprnt('wrong hash for file {} after extraction: {} != {}'
+                      .format(file_name, hash_file(file_name), file_hash))
+
+    except Exception:
+        everything_ok = False
+        dprnt('caught exception:')
+        for line in format_exc().splitlines():
+            dprnt('exc: {}'.format(line))
+    finally:
+        # close file and delete it
+        if temp_file:
+            temp_file.close()
+            try:
+                os.unlink(temp_file.name)
+                os.unlink(temp_file.name + ".1")
+            except FileNotFoundError:
+                pass
+
+    if print_everything or not everything_ok:
+        prefix = '{:9d}: '.format(seed)
+        for line in output:
+            print(prefix + line)
+
+    return everything_ok
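+
+# A failing seed from the summary can be replayed by hand because do_test
+# re-seeds module random; a minimal sketch (the seed value is made up):
+#
+#     with TemporaryDirectory(prefix='deltatar_test_') as temp_dir:
+#         do_test(123456789, 'w:gz', temp_dir, print_everything=True)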
+
+
+def create_seed():
+    """ create a seed for seeding module :py:mod:`random`
+
+    uses random.getrandbits, so is also dependent on the current state of
+    module :py:mod:`random`
+    """
+    return random.getrandbits(SEED_BITS)
+
+
+def test_forever():
+    """ Main function, called when running file as script
+
+    runs :py:func:`do_test` in an infinite loop
+    """
+
+    # more params
+    fast_fail = True
+    print_everything = False
+    modes = ('w:tar', 'w:gz', 'w:bz2', 'w|tar', 'w|gz', 'w|bz2', 'w#tar',
+             'w#gz')
+    # some day also: 'w#gz.aes128', 'w#gz.aes256', 'w#aes128', 'w#aes256'
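+    # mode notation follows deltatar.tarfile: ':' writes to a regular
+    # seekable file, '|' to a non-seekable stream, and '#' is the
+    # deltatar-specific stream mode that the encrypted variants above
+    # will belong to some day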
+
+    # seed properly
+    random.seed()
+
+    # preparations
+    n_runs = 0
+    error_seeds = []
+    do_stop = False
+
+    # create temp dir
+    with TemporaryDirectory(prefix='deltatar_test_') as temp_dir:
+
+        try:
+            start_time = time()
+            while not do_stop:
+                for mode in modes:
+                    seed = create_seed()
+                    if not do_test(seed, mode, temp_dir):
+                        error_seeds.append(seed)
+                        if fast_fail:
+                            print('stopping because fast_fail is set')
+                            do_stop = True
+                            break
+                    n_runs += 1
+                    if n_runs % 100 == 0:
+                        print('at run {} ({:.3f}s per run)'
+                              .format(n_runs, (time()-start_time)/n_runs))
+        except KeyboardInterrupt:
+            print('Stopped by user')
+
+    # summarize
+    print('')
+    print('-'*72)
+    n_errs = len(error_seeds)
+    duration = time() - start_time
+    if n_runs == 0:
+        print('summary: no test run has finished')
+    else:
+        print('summary: {} runs in {:.1f}s ({:.3f}s per run); '
+              '{} with errors ({:.2f}%)'
+              .format(n_runs, duration, duration/n_runs, n_errs,
+                      100.0 * float(n_errs)/float(n_runs)))
+        print('seeds that created errors: {}'.format(error_seeds))
+
+
+if __name__ == '__main__':
+    test_forever()