created another test for multivolume compression size
authorChristian Herdtweck <christian.herdtweck@intra2net.com>
Mon, 13 Jun 2016 11:07:08 +0000 (13:07 +0200)
committerChristian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 15 Jun 2016 11:18:03 +0000 (13:18 +0200)
testing/test_multivol_compression_sizes.py [new file with mode: 0644]

diff --git a/testing/test_multivol_compression_sizes.py b/testing/test_multivol_compression_sizes.py
new file mode 100644 (file)
index 0000000..70895a9
--- /dev/null
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+
+""" Test size of volumes when using multiple volumes and compression is on
+
+Uses random files from disc as input. That is not very time-efficient but
+provides a realistic setting for the nature of input data (file sizes,
+randomness of data, ...)
+
+Not a unittest, will probably take too long
+"""
+
+from tempfile import mkstemp, mkdtemp
+from shutil import rmtree
+import random
+import os
+from os.path import isdir, dirname, abspath, join as pjoin
+from glob import iglob
+import stat
+import sys
+from traceback import print_exc
+
if __name__ == '__main__':
    # ensure we are importing the "right" deltatar: prefer the copy in the
    # parent directory of this test file over any installed version
    parent_dir = dirname(dirname(abspath(__file__)))
    sys.path.insert(0, parent_dir)
    print('prepended {} to sys path'.format(parent_dir))
+import deltatar
+from deltatar.tarfile import TarFile, BLOCKSIZE
+
+
#: tolerances of volume sizes; a volume whose size deviates from the target
#: by more than this is counted as "wrong size" in test()
SIZE_TOLERANCE_GZ = 32*1024   # 32 KiB due to Gzip/Bzip2 compression object buffer
SIZE_TOLERANCE_BZ2 = 1024*1024   # 1MB!
SIZE_TOLERANCE_XZ = 72*1024   # empirically chosen -- TODO confirm
SIZE_TOLERANCE_NONE = 5*BLOCKSIZE   # a few tar blocks of slack when uncompressed

#: variables for find_random_files
DIR_RETURN_MIN_REC = 5   # min recursion depth before a dir itself may be returned
DIR_RETURN_PROB = 0.0  # disabled  (probability of returning a dir, not a file)
DIR_MAX_REC = 20   # give up and return None below this recursion depth
START_DIR = '/'   # root of the random file search

#: subdirs of START_DIR that might contain volatile or network-mounted data
EXCLUDE_DIRS = 'var', 'proc', 'dev', 'tmp', 'media', 'mnt', 'sys'

# stat-mode predicates accepted by find_random_files (regular files, dirs,
# fifos, symlinks, char and block devices -- i.e. no sockets)
OK_MODES = stat.S_ISREG, stat.S_ISDIR, stat.S_ISFIFO, stat.S_ISLNK, \
           stat.S_ISCHR, stat.S_ISBLK
+
+
def _get_random_file(dir_name, rec_level):
    """Recursive helper for find_random_files.

    Picks one random entry in *dir_name*; descends into subdirectories
    (up to DIR_MAX_REC levels) and returns a path or None.
    """
    # depth limit reached -- give up on this branch
    if rec_level > DIR_MAX_REC:
        return None

    entries = os.listdir(dir_name)
    if not entries:
        return None

    chosen = pjoin(dir_name, random.choice(entries))

    if not isdir(chosen):
        return chosen

    # with a small probability return the directory itself instead of
    # descending further (disabled while DIR_RETURN_PROB is 0.0)
    if rec_level > DIR_RETURN_MIN_REC and random.random() < DIR_RETURN_PROB:
        return chosen
    return _get_random_file(chosen, rec_level + 1)
+
+
def find_random_files():
    """Generator over random file names.

    Checks that files are readable by the user (os.access) and excludes
    dirs with mostly volatile files and mounts; will still yield links
    or -- with a small probability -- names of dirs with many parents
    (no '/usr' but maybe /usr/local/lib/python/site-packages/deltatar)
    """

    # candidate top-level entries: everything in START_DIR except EXCLUDE_DIRS
    candidates = [pjoin(START_DIR, name) for name in os.listdir(START_DIR)]
    for excluded in EXCLUDE_DIRS:
        excluded_path = pjoin(START_DIR, excluded)
        if excluded_path in candidates:
            candidates.remove(excluded_path)

    # infinite main loop: keep drawing until a usable entry comes up
    while True:
        candidate = random.choice(candidates)
        result = _get_random_file(candidate, 1) if isdir(candidate) \
            else candidate
        if result is None:
            continue          # dead end, draw again
        if not os.access(result, os.R_OK):
            continue          # not readable by this user
        mode = os.stat(result).st_mode
        if not any(check(mode) for check in OK_MODES):
            continue          # unsupported file type (e.g. socket)
        yield result
+
+
def new_volume_handler(tarobj, base_name, volume_number,
                       prefix='', debug_level=0):
    """Volume-change callback used by TarFile.addfile.

    Closes the file object of the just-finished volume and opens the next
    volume, whose name is derived from *base_name* by replacing the last
    ``.0.`` marker with ``.<volume_number>.``.
    """
    if debug_level:
        print(prefix + 'new volume handler called with {} and new vol {}'
                       .format(base_name, volume_number))

    # the previous volume is complete -- release its file object
    tarobj.fileobj.close()

    # splice the new volume number into the name at the '.0.' marker
    marker = base_name.rindex('.0.')
    next_volume_path = '{}.{}.{}'.format(base_name[:marker], volume_number,
                                         base_name[marker + 3:])
    tarobj.open_volume(next_volume_path)
+
+
def test(volume_size, input_size_factor, mode, temp_dir, prefix='',
         clean_up_if_error=False, debug_level=0):
    """ create TarFile with given vol_size, add vol_size*input_size

    Adds random files from disc until roughly
    volume_size * input_size_factor MB were added, then checks that all
    volumes except possibly the last have the expected size (within a
    per-compression tolerance) and that extraction reproduces every added
    file with its original size.

    :param volume_size: target size of a single volume, in MB
    :param int input_size_factor: multiple of volume_size to add as input
    :param str mode: compression mode for TarFile's mode argument
    :param str temp_dir: directory that receives volumes and extracted files
    :param str prefix: optional output prefix
    :param bool clean_up_if_error: True will ensure there are no files left;
                                   False (default): leave volumes if error
    :param int debug_level: 0-3 where 0=no debug output, 3=lots of debug output
                            (forwarded to TarFile constructor)
    :returns: True if test failed (some size wrong, file missing, ...)
    """

    input_size = volume_size * input_size_factor * 1e6   # in bytes
    something_strange = False

    # pick volume-file suffix and size tolerance matching the compression mode
    if 'gz' in mode:
        suffix = 'tgz'
        size_tolerance = SIZE_TOLERANCE_GZ
    elif 'bz' in mode:
        suffix = 'tbz'
        size_tolerance = SIZE_TOLERANCE_BZ2
    elif 'xz' in mode:
        suffix = 'txz'
        size_tolerance = SIZE_TOLERANCE_XZ
    else:
        suffix = 'tar'
        size_tolerance = SIZE_TOLERANCE_NONE

    temp_name = None
    file_handle = None
    base_name = None
    try:
        # create temp file for the first volume
        file_handle, temp_name = mkstemp(dir=temp_dir, suffix='.0.' + suffix)
        os.close(file_handle)
        file_handle = None

        # preparations
        base_name = temp_name.replace('.0.' + suffix, '')
        if debug_level:
            print(prefix + 'tarfile: ' + temp_name)

        volume_prefix = prefix + 'vol={}MB, in=*{}, mode={}: ' \
                                 .format(volume_size, input_size_factor, mode)
        def vol_handler(a, b, c):
            return new_volume_handler(a, b, c, volume_prefix, debug_level)

        # create tar object
        tarobj = TarFile.open(temp_name, mode=mode,
                              max_volume_size=volume_size*1.e6,
                              new_volume_handler=vol_handler,
                              password='test1234', debug=debug_level)

        # add data
        added_size = 0
        new_size = 0
        files_added = []
        for count, file_name in enumerate(find_random_files()):
            if file_name.startswith(base_name):
                continue    # do not accidentally add self
            new_size = os.lstat(file_name).st_size
            # add at most one volume_size too much; compare bytes to bytes
            # (the original compared bytes against the MB count volume_size)
            if new_size > max(volume_size*1e6, input_size-added_size):
                continue
            # encode the original size into the arcname so the extraction
            # check below can recover it from the file name alone
            new_name = '{}_{:04d}_{}_{:09d}' \
                       .format(base_name, count,
                               file_name.replace('/','_')[:200],
                               new_size)
            tarobj.add(file_name, arcname=new_name)
            files_added.append(new_name)
            added_size += new_size
            if debug_level > 2:
                print('{}vol={}MB, in=*{}, mode={}: added {:.1f}MB/{:.1f}MB'
                      .format(prefix, volume_size, input_size_factor, mode,
                              added_size/1e6, input_size/1e6))
            if added_size > input_size:
                break
        tarobj.close()

        # check volume files; all but the last should be close to volume_size
        n_wrong_size = 0
        n_volumes = 0
        volume_size_sum = 0
        for file_name in iglob(pjoin(temp_dir, base_name + '*')):
            n_volumes += 1
            vol_size = os.lstat(file_name).st_size
            volume_size_sum += vol_size
            if debug_level:
                print('{} - {}: {:.3f}'.format(prefix, file_name,
                                               vol_size/1.e6))
            if abs(vol_size - volume_size*1e6) > size_tolerance:
                n_wrong_size += 1

        # guard against division by zero when no volume was created at all
        if debug_level and volume_size_sum:
            print(prefix + 'compression ratio (input/compressed size): {:.1f}'
                           .format(added_size/volume_size_sum))

        if n_wrong_size > 1:   # the last volume may legitimately be smaller
            print(prefix + 'wrong size!')
            something_strange = True
        if n_volumes == 0:
            print(prefix + 'no volumes!')
            something_strange = True

        # extract data
        if debug_level:
            print(prefix + 'extracting:')
        tarobj = TarFile.open(temp_name, mode=mode.replace('w', 'r'),
                              new_volume_handler=new_volume_handler,
                              password='test1234')
        tarobj.extractall(path='/')
        tarobj.close()

        # check whether all original files are accounted for
        n_files_found = 0
        files_found = [False for _ in files_added]

        for file_name in iglob(pjoin(temp_dir, base_name + '_*')):
            n_files_found += 1
            # original size was encoded into the last 9 chars of the arcname
            orig_size = int(file_name[-9:])
            if os.lstat(file_name).st_size != orig_size:
                print(prefix + 'wrong size: {} instead of {} for {}!'
                               .format(os.lstat(file_name).st_size, orig_size,
                                       file_name))
                something_strange = True
            try:
                idx = files_added.index(file_name)
            except ValueError:
                print(prefix + 'extracted file that was not added: '
                      + file_name)
                something_strange = True
            else:
                files_found[idx] = True

        not_found = [file_name
                     for file_name, found in zip(files_added, files_found)
                     if not found]

        for file_name in not_found:
            print(prefix + 'original file not found: ' + file_name)
            something_strange = True

        if n_files_found != len(files_added):
            print(prefix + 'added {} files but extracted {}!'
                           .format(len(files_added), n_files_found))
            something_strange = True
    except Exception as exc:
        print('caught exception {}'.format(exc))
        print_exc()
        something_strange = True
    finally:
        # fd 0 is falsy but valid, so test against None explicitly
        if file_handle is not None:
            os.close(file_handle)

        # clean up; on error keep the volume files unless told otherwise
        if base_name:
            for file_name in iglob(base_name + '*'):
                if clean_up_if_error:
                    os.unlink(file_name)
                elif something_strange and file_name.endswith('.' + suffix):
                    continue   # skip
                else:
                    os.unlink(file_name)   # remove
            if debug_level and something_strange and not clean_up_if_error:
                print(prefix + 'leaving volume files ' + base_name
                      + '.*.'+suffix)

    # summarize
    if something_strange:
        print('{}test with volume_size={}, input_factor={}, mode={} failed!'
              .format(prefix, volume_size, input_size_factor, mode))
    elif debug_level:
        print(prefix + 'test succeeded')

    return something_strange
+
+
def test_lots(fast_fail=False):
    """ Tests a lot of combinations of volume_size, input_size and mode

    :param bool fast_fail: set to True to stop after first error
    :returns: number of failed tests
    """

    # volume sizes in MB
    volume_sizes = 10, 100

    # input size factor (multiplied with volume size)
    input_size_factors = 3, 10, 30

    # compression modes (including uncompressed as comparison)
    modes = 'w|gz', 'w|bz2', 'w|xz', 'w#gz', 'w#gz.aes128', 'w#gz.aes256', \
            'w#aes128'

    debug_level = 2
    clean_up_if_error = False

    # create a temp dir for all input and output data
    temp_dir = mkdtemp(prefix='deltatar_cmprs_tst_')
    n_errs = 0
    n_tests = len(volume_sizes) * len(input_size_factors) * len(modes)
    test_idx = 0
    stop_now = False
    for volume_size in volume_sizes:
        if stop_now:
            break
        for input_size_factor in input_size_factors:
            if stop_now:
                break
            for mode in modes:
                test_idx += 1
                prefix = 'test{:d}: '.format(test_idx)
                # forward the local config variable instead of hard-coding
                # False (the original ignored clean_up_if_error here)
                something_strange = test(volume_size, input_size_factor, mode,
                                         temp_dir, prefix,
                                         clean_up_if_error=clean_up_if_error,
                                         debug_level=debug_level)
                if something_strange:
                    n_errs += 1
                    if fast_fail:
                        stop_now = True
                        break
                print('after running test {:3d}/{} have {} errs'
                      .format(test_idx, n_tests, n_errs))
    if n_errs == 0:
        print('removing temp dir {}'.format(temp_dir))
        rmtree(temp_dir)
    else:
        print('leaving temp dir {}'.format(temp_dir))

    return n_errs
+
+
if __name__ == '__main__':
    # run all tests and report the number of errors via the exit status
    sys.exit(test_lots())