#!/usr/bin/env python3

# Origin: python-delta-tar, commit ca33c4ba013e59d6a3dcbb9027a1baf65e28a7ff
#   "created another test for multivolume compression size"
#   From: Christian Herdtweck
#   Date: Mon, 13 Jun 2016 11:07:08 +0000 (+0200)
#   X-Git-Tag: v2.2~35^2~7
#   File: testing/test_multivol_compression_sizes.py

""" Test size of volumes when using multiple volumes and compression is on

Uses random files from disc as input. That is not very time-efficient but
provides a realistic setting for the nature of input data (file sizes,
randomness of data, ...)

Not a unittest, will probably take too long
"""

from tempfile import mkstemp, mkdtemp
from shutil import rmtree
import random
import os
from os.path import isdir, dirname, abspath, join as pjoin
from glob import iglob
from itertools import product
import stat
import sys
from traceback import print_exc

if __name__ == '__main__':
    # ensure we are importing the "right" deltatar
    parent_dir = dirname(dirname(abspath(__file__)))
    sys.path.insert(0, parent_dir)
    print('pre-prended {} to sys path'.format(parent_dir))
import deltatar
from deltatar.tarfile import TarFile, BLOCKSIZE


#: tolerances of volume sizes
SIZE_TOLERANCE_GZ = 32*1024   # 32 KiB due to Gzip/Bzip2 compression object buffer
SIZE_TOLERANCE_BZ2 = 1024*1024   # 1MB!
SIZE_TOLERANCE_XZ = 72*1024
SIZE_TOLERANCE_NONE = 5*BLOCKSIZE

#: variables for find_random_files
DIR_RETURN_MIN_REC = 5
DIR_RETURN_PROB = 0.0   # disabled
DIR_MAX_REC = 20
START_DIR = '/'

#: subdirs of START_DIR that might contain volatile or network-mounted data
EXCLUDE_DIRS = 'var', 'proc', 'dev', 'tmp', 'media', 'mnt', 'sys'

#: stat mode tests; a file is accepted if any of them matches.
#: NOTE: find_random_files uses os.stat, which follows symlinks, so S_ISLNK
#: never matches there (harmless, the link target's type is tested instead)
OK_MODES = stat.S_ISREG, stat.S_ISDIR, stat.S_ISFIFO, stat.S_ISLNK, \
           stat.S_ISCHR, stat.S_ISBLK


def _get_random_file(dir_name, rec_level):
    """ recursive helper for find_random_files

    Picks a random entry of `dir_name` and descends into it while it is a
    directory.

    :param str dir_name: directory to pick an entry from
    :param int rec_level: current recursion depth (caller starts with 1)
    :returns: path of a random entry or None if nothing usable was found
              (recursion limit DIR_MAX_REC exceeded, directory empty or
              directory not listable)
    """

    if rec_level > DIR_MAX_REC:
        return None

    try:
        contents = os.listdir(dir_name)
    except OSError:
        # e.g. PermissionError for dirs the user may not read, or the dir
        # vanished meanwhile; the caller just tries another random path
        return None
    if not contents:
        return None

    entry = pjoin(dir_name, random.choice(contents))
    if not isdir(entry):
        return entry

    # with a small probability return a dir that is deep enough
    if rec_level > DIR_RETURN_MIN_REC and random.random() < DIR_RETURN_PROB:
        return entry
    return _get_random_file(entry, rec_level + 1)


def find_random_files():
    """ generator over random file names

    Checks if files are readable by user (os.access) and excludes dirs with
    most volatile files and mounts; will still yield links or -- with a small
    probability -- names of dirs with many parents (no '/usr' but maybe
    /usr/local/lib/python/site-packages/deltatar)

    The generator never terminates on its own; the consumer has to stop
    iterating.
    """

    # prepare list of dirs in START_DIR that are not EXCLUDE_DIRS
    excluded = set(pjoin(START_DIR, name) for name in EXCLUDE_DIRS)
    start_contents = [path
                      for path in (pjoin(START_DIR, name)
                                   for name in os.listdir(START_DIR))
                      if path not in excluded]

    # infinite main loop
    while True:
        entry = random.choice(start_contents)
        if isdir(entry):
            candidate = _get_random_file(entry, 1)
        else:
            candidate = entry   # found non-dir in START_DIR

        if candidate is None:
            continue   # dead end, try next
        if not os.access(candidate, os.R_OK):
            continue   # cannot read, try next
        try:
            mode = os.stat(candidate).st_mode
        except OSError:
            continue   # vanished or became inaccessible since os.access
        if not any(mode_test(mode) for mode_test in OK_MODES):
            continue   # mode not accepted, try next
        yield candidate


def new_volume_handler(tarobj, base_name, volume_number,
                       prefix='', debug_level=0):
    """ called when creating a new volume from TarFile.addfile

    Closes the current volume and opens the next one, whose name is derived
    from `base_name` by replacing the last '.0.' with '.<volume_number>.'

    :param tarobj: the TarFile that needs a new volume
    :param str base_name: name of a volume file; must contain '.0.'
    :param int volume_number: number of the volume to open
    :param str prefix: optional output prefix
    :param int debug_level: 0 = no output, >0 = announce new volume
    :raises ValueError: if `base_name` does not contain '.0.'
    """

    if debug_level:
        print(prefix + 'new volume handler called with {} and new vol {}'
                       .format(base_name, volume_number))

    # close current volume file object
    tarobj.fileobj.close()

    # create name for next volume file
    idx = base_name.rindex('.0.')
    new_vol_path = '{}.{}.{}'.format(base_name[:idx], volume_number,
                                     base_name[idx+3:])

    tarobj.open_volume(new_vol_path)


def _mode_settings(mode):
    """ return (file suffix, volume size tolerance in bytes) for a tar mode """

    if 'gz' in mode:
        return 'tgz', SIZE_TOLERANCE_GZ
    if 'bz' in mode:
        return 'tbz', SIZE_TOLERANCE_BZ2
    if 'xz' in mode:
        return 'txz', SIZE_TOLERANCE_XZ
    return 'tar', SIZE_TOLERANCE_NONE


def test(volume_size, input_size_factor, mode, temp_dir, prefix='',
         clean_up_if_error=False, debug_level=0):
    """ create TarFile with given vol_size, add vol_size*input_size_factor

    Adds random files until the requested amount of input is reached, checks
    the sizes of the created volumes, extracts everything again and checks
    that exactly the added files (with their original sizes) come back.

    :param volume_size: in MB
    :param input_size_factor: amount of input data as multiple of volume_size
    :param str mode: compression mode for TarFile's mode argument
    :param str temp_dir: directory for volumes and extracted files
    :param str prefix: optional output prefix
    :param bool clean_up_if_error: True will ensure there are no files left;
                                   False (default): leave volumes if error
    :param int debug_level: 0-3 where 0=no debug output, 3=lots of debug output
                            (forwarded to TarFile constructor)
    :returns: True if test failed (some size wrong, file missing, ...)
    """

    volume_bytes = volume_size * 1e6
    input_size = volume_size * input_size_factor * 1e6
    something_strange = False
    suffix, size_tolerance = _mode_settings(mode)

    base_name = None
    tarobj = None
    try:
        # create temp file; only its name is needed, so close it right away
        file_handle, temp_name = mkstemp(dir=temp_dir, suffix='.0.' + suffix)
        os.close(file_handle)

        # preparations
        base_name = temp_name.replace('.0.' + suffix, '')
        if debug_level:
            print(prefix + 'tarfile: ' + temp_name)

        volume_prefix = prefix + 'vol={}MB, in=*{}, mode={}: ' \
                                 .format(volume_size, input_size_factor, mode)

        def vol_handler(tar, name, number):
            return new_volume_handler(tar, name, number, volume_prefix,
                                      debug_level)

        # create tar object
        tarobj = TarFile.open(temp_name, mode=mode,
                              max_volume_size=volume_size*1.e6,
                              new_volume_handler=vol_handler,
                              password='test1234', debug=debug_level)

        # add data
        added_size = 0
        files_added = []
        for count, file_name in enumerate(find_random_files()):
            if file_name.startswith(base_name):
                continue    # do not accidentally add self
            new_size = os.lstat(file_name).st_size
            # add at most one volume size too much; both operands of max()
            # are in bytes (volume_size itself is given in MB)
            if new_size > max(volume_bytes, input_size-added_size):
                continue
            new_name = '{}_{:04d}_{}_{:09d}' \
                       .format(base_name, count,
                               file_name.replace('/', '_')[:200],
                               new_size)
            tarobj.add(file_name, arcname=new_name)
            files_added.append(new_name)
            added_size += new_size
            if debug_level > 2:
                print('{}vol={}MB, in=*{}, mode={}: added {:.1f}MB/{:.1f}MB'
                      .format(prefix, volume_size, input_size_factor, mode,
                              added_size/1e6, input_size/1e6))
            if added_size > input_size:
                break
        tarobj.close()
        tarobj = None

        # check volume files
        n_wrong_size = 0
        n_volumes = 0
        volume_size_sum = 0
        for file_name in iglob(pjoin(temp_dir, base_name + '*')):
            n_volumes += 1
            vol_size = os.lstat(file_name).st_size
            volume_size_sum += vol_size
            if debug_level:
                print('{} - {}: {:.3f}'.format(prefix, file_name,
                                               vol_size/1.e6))
            if abs(vol_size - volume_bytes) > size_tolerance:
                n_wrong_size += 1

        # guard against division by zero if nothing was written at all
        if debug_level and volume_size_sum:
            print(prefix + 'compression ratio (input/compressed size): {:.1f}'
                           .format(added_size/volume_size_sum))

        # one volume may deviate; presumably this allows for the last
        # volume, which is usually not full
        if n_wrong_size > 1:
            print(prefix + 'wrong size!')
            something_strange = True
        if n_volumes == 0:
            print(prefix + 'no volumes!')
            something_strange = True

        # extract data
        # NOTE(review): members were added under absolute names starting with
        # base_name; extracting relative to '/' presumably recreates them
        # right next to the volumes -- the check below relies on that
        if debug_level:
            print(prefix + 'extracting:')
        tarobj = TarFile.open(temp_name, mode=mode.replace('w', 'r'),
                              new_volume_handler=new_volume_handler,
                              password='test1234')
        tarobj.extractall(path='/')
        tarobj.close()
        tarobj = None

        # check whether all original files are accounted for
        expected = set(files_added)
        found = set()
        n_files_found = 0
        for file_name in iglob(pjoin(temp_dir, base_name + '_*')):
            n_files_found += 1
            # size is the last '_'-separated field of the name; it is padded
            # to 9 digits but may have more for files of 1GB and above
            orig_size = int(file_name.rsplit('_', 1)[1])
            actual_size = os.lstat(file_name).st_size
            if actual_size != orig_size:
                print(prefix + 'wrong size: {} instead of {} for {}!'
                               .format(actual_size, orig_size, file_name))
                something_strange = True
            if file_name in expected:
                found.add(file_name)
            else:
                print(prefix + 'extracted file that was not added: '
                             + file_name)
                something_strange = True

        for file_name in files_added:
            if file_name not in found:
                print(prefix + 'original file not found: ' + file_name)
                something_strange = True

        if n_files_found != len(files_added):
            print(prefix + 'added {} files but extracted {}!'
                           .format(len(files_added), n_files_found))
            something_strange = True
    except Exception as exc:
        # top-level boundary of a single test: report and count as failure
        print('caught exception {}'.format(exc))
        print_exc()
        something_strange = True
    finally:
        # only reached with an open tarobj if something went wrong above;
        # close it (best effort) so file descriptors do not pile up over
        # the many tests run by test_lots
        if tarobj is not None:
            try:
                tarobj.close()
            except Exception:
                print_exc()

        # clean up
        if base_name:
            keep_volumes = something_strange and not clean_up_if_error
            for file_name in iglob(base_name + '*'):
                if keep_volumes and file_name.endswith('.' + suffix):
                    continue   # leave volume for inspection
                os.unlink(file_name)
            if debug_level and keep_volumes:
                print(prefix + 'leaving volume files ' + base_name
                             + '.*.'+suffix)

    # summarize
    if something_strange:
        print('{}test with volume_size={}, input_factor={}, mode={} failed!'
              .format(prefix, volume_size, input_size_factor, mode))
    elif debug_level:
        print(prefix + 'test succeeded')

    return something_strange


def test_lots(fast_fail=False):
    """ Tests a lot of combinations of volume_size, input_size and mode

    :param bool fast_fail: set to True to stop after first error
    :returns: number of failed tests
    """

    # volume sizes in MB
    volume_sizes = 10, 100

    # input size factor (multiplied with volume size)
    input_size_factors = 3, 10, 30

    # compression modes (including uncompressed as comparison)
    modes = 'w|gz', 'w|bz2', 'w|xz', 'w#gz', 'w#gz.aes128', 'w#gz.aes256', \
            'w#aes128'

    debug_level = 2
    clean_up_if_error = False

    # create a temp dir for all input and output data
    temp_dir = mkdtemp(prefix='deltatar_cmprs_tst_')
    n_errs = 0
    n_tests = len(volume_sizes) * len(input_size_factors) * len(modes)

    # product() iterates like the nested loops volume_size > factor > mode
    combinations = product(volume_sizes, input_size_factors, modes)
    for test_idx, combination in enumerate(combinations, 1):
        volume_size, input_size_factor, mode = combination
        prefix = 'test{:d}: '.format(test_idx)
        something_strange = test(volume_size, input_size_factor, mode,
                                 temp_dir, prefix,
                                 clean_up_if_error=clean_up_if_error,
                                 debug_level=debug_level)
        if something_strange:
            n_errs += 1
            if fast_fail:
                break
        print('after running test {:3d}/{} have {} errs'
              .format(test_idx, n_tests, n_errs))

    if n_errs == 0:
        print('removing temp dir {}'.format(temp_dir))
        rmtree(temp_dir)
    else:
        print('leaving temp dir {}'.format(temp_dir))

    return n_errs


if __name__ == '__main__':
    # run test
    n_errs = test_lots()

    # forward number of errors to shell
    sys.exit(n_errs)