From: Eduardo Robles Elvira Date: Wed, 10 Jul 2013 11:00:06 +0000 (+0200) Subject: working in rescue tar utility X-Git-Tag: v2.2~168 X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=0112ba0d01be5f55873d05bacf1d7fbd81c53a6c;p=python-delta-tar working in rescue tar utility --- diff --git a/filesplit.py b/filesplit.py index 8c4a7a2..db1c476 100644 --- a/filesplit.py +++ b/filesplit.py @@ -4,7 +4,7 @@ import argparse BUFSIZE = 16 * 1024 -def split_file(separator, prefix, input_file): +def split_file(separator, prefix, input_file, new_file_func=None): ''' splits a file when it finds a regexp, including the regexp in the begining of the new file @@ -16,7 +16,10 @@ def split_file(separator, prefix, input_file): if sep_len == 0: raise Exception("empty separator") - output = open(prefix + str(i), 'w') + if new_file_func is None: + new_file_func = lambda prefix, i: open(prefix + str(i), 'w') + + output = new_file_func(prefix, i) # buffered search. we try not to have the while input file in memory, as # it's not needed @@ -34,7 +37,7 @@ def split_file(separator, prefix, input_file): output.write(buf[0:idx]) output.close() i += 1 - output = open(prefix + str(i), 'w') + output = new_file_func(prefix, i) output.write(buf[idx:idx + sep_len]) else: output.write(buf[0:sep_len]) diff --git a/rescue_tar.py b/rescue_tar.py new file mode 100644 index 0000000..ce028fd --- /dev/null +++ b/rescue_tar.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +import argparse +import os +import tempfile +from functools import partial + +from deltatar import tarfile +import filesplit + +def rescue(tar_files, rescue_dir=None): + ''' + Rescues a multivolume tarfile. Checks file name extension to detect + format (compression, etc). Assumes it to be multivolume tar. + ''' + # setup rescue_dir + if isinstance(tar_files, basestring): + tar_files = [tar_files] + + if not isinstance(tar_files, list): + raise Exception("tar_files must be a list") + + for f in tar_files: + if not isinstance(f, basestring): + raise Exception("tar_files must be a list of strings") + if not os.path.exists(f): + raise Exception("tar file '%s' doesn't exist" % f) + + if rescue_dir is None: + rescue_dir = os.path.dirname(tar_files[0]) + elif rescue_dir is None: + rescue_dir = tempfile.mkdtemp() + + # autodetect file type by extension + first_tar_file = tar_files[0] + if first_tar_file.endswith(".tar.gz"): + mode = "r#gz" + elif first_tar_file.endswith(".tar"): + mode = "r" + + base_name = os.path.basename(first_tar_file) + extract_files = tar_files + + # num the number of files used in rescue mode. Used to name those files + # when creating them. We put num in an object so that it can be referenced + # instead of copied inside new_gz partial + context = dict(num=0) + + # divide in compressed tar block files if it's r#gz + if mode == "r#gz": + extract_files = [] + # function used to create each chunk file + def new_gz(context, extract_files, prefix, i): + path = "%s.%d" %(prefix, context['num']) + extract_files.append(path) + context['num'] += 1 + return open(path, 'w') + new_gz = partial(new_gz, context, extract_files) + + # split in compressed chunks + for f in tar_files: + filesplit.split_file('\x1f\x8b', + os.path.join(rescue_dir, base_name), f, new_gz) + + # includes volumes already extracted with new_volume_handler + already_extracted_vols = [] + + def new_volume_handler(already_extracted_vols, next_num, tarobj, base_name, volume_number): + ''' + Handles the new volumes when extracting + ''' + + # handle the special case where the first file is whatever.tar.gz and + # the second is whatever.tar.gz.0 + base_name_split = base_name.split('.') + next_num = 0 + try: + next_num = int(base_name_split[-1]) + 1 + base_name = ".".join(base_name_split[:-1]) + except ValueError as e: + pass + + volume_path = "%s.%d" % (base_name, next_num) + already_extracted_vols.append(volume_path) + tarobj.open_volume(volume_path) + + new_volume_handler = partial(new_volume_handler, already_extracted_vols) + + # extract files, as much as possible + for f in extract_files: + if f in already_extracted_vols: + continue + try: + tarobj = tarfile.TarFile.open(f, mode=mode, + new_volume_handler=new_volume_handler) + tarobj.extractall() + tarobj.close() + except: + pass + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--rescue_dir", help="directory where rescue files " + "should be created. /tmp by default") + parser.add_argument("tar_files", nargs="+", help="list of files of a " + "multitar file to rescue. Assumes format first.extension " + "second.extension.0 third.extension.1 ...") + + args = parser.parse_args() + rescue(tar_files=args.tar_files, rescue_dir=args.rescue_dir) diff --git a/runtests.py b/runtests.py index 9e5f8bb..856aace 100644 --- a/runtests.py +++ b/runtests.py @@ -4,6 +4,7 @@ import unittest from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest from testing.test_concat_compress import ConcatCompressTest +from testing.test_rescue_tar import RescueTarTest if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/testing/__init__.py b/testing/__init__.py index e69de29..d47b054 100644 --- a/testing/__init__.py +++ b/testing/__init__.py @@ -0,0 +1,37 @@ +import os, unittest, hashlib, string +import random + +class BaseTest(unittest.TestCase): + """ + Test concatenated compression in tarfiles + """ + + def tearDown(self): + ''' + Remove temporal files created by unit tests + ''' + os.system("rm -rf big small small2 sample.*") + + def create_file(self, path, length): + ''' + Creates a file with some gibberish inside, returning the md5sum of that + file. File path and length are specified as function arguments. + ''' + f = open(path, 'w') + s = string.lowercase + string.digits + "\n" + if len(s) < length: + s += s*(length/len(s)) + data = s[:length] + f.write(data) + f.close() + return self.md5sum(path) + + def md5sum(self, filename): + ''' + Returns the md5sum of a file specified by its filename/path + ''' + md5 = hashlib.md5() + with open(filename,'rb') as f: + for chunk in iter(lambda: f.read(128*md5.block_size), b''): + md5.update(chunk) + return md5.hexdigest() \ No newline at end of file diff --git a/testing/test_concat_compress.py b/testing/test_concat_compress.py index 409ba72..2659792 100644 --- a/testing/test_concat_compress.py +++ b/testing/test_concat_compress.py @@ -1,44 +1,15 @@ import os, unittest, hashlib, string -from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE +from deltatar.tarfile import TarFile, GNU_FORMAT import filesplit +from . import BaseTest -class ConcatCompressTest(unittest.TestCase): +class ConcatCompressTest(BaseTest): """ Test concatenated compression in tarfiles """ - def tearDown(self): - ''' - Remove temporal files created by unit tests - ''' - os.system("rm -rf big small small2 sample.*") - - def create_file(self, path, length): - ''' - Creates a file with some gibberish inside, returning the md5sum of that - file. File path and length are specified as function arguments. - ''' - f = open(path, 'w') - s = string.lowercase + string.digits + "\n" - if len(s) < length: - s += s*(length/len(s)) - data = s[:length] - f.write(data) - f.close() - return self.md5sum(path) - - def md5sum(self, filename): - ''' - Returns the md5sum of a file specified by its filename/path - ''' - md5 = hashlib.md5() - with open(filename,'rb') as f: - for chunk in iter(lambda: f.read(128*md5.block_size), b''): - md5.update(chunk) - return md5.hexdigest() - def test_zcat_extract_concat(self): """ Create a tar file with only one file inside, using concat compression diff --git a/testing/test_multivol.py b/testing/test_multivol.py index 60e6f45..cdb2d5b 100644 --- a/testing/test_multivol.py +++ b/testing/test_multivol.py @@ -1,6 +1,7 @@ import os, unittest, hashlib, string from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE +from . import BaseTest def new_volume_handler(tarobj, base_name, volume_number): ''' @@ -10,7 +11,7 @@ def new_volume_handler(tarobj, base_name, volume_number): tarobj.open_volume(volume_path) -class MultivolGnuFormatTest(unittest.TestCase): +class MultivolGnuFormatTest(BaseTest): """ Test multivolume support in tarfile. Tar Format is specified at class level. """ @@ -34,36 +35,6 @@ class MultivolGnuFormatTest(unittest.TestCase): # case of GNU format this is the same as tarfile_overhead. tarvol_overhead = 3*BLOCKSIZE - def tearDown(self): - ''' - Remove temporal files created by unit tests - ''' - os.system("rm -rf big small small2 sample.tar*") - - def create_file(self, path, length): - ''' - Creates a file with some gibberish inside, returning the md5sum of that - file. File path and length are specified as function arguments. - ''' - f = open(path, 'w') - s = string.lowercase + string.digits + "\n" - if len(s) < length: - s += s*(length/len(s)) - data = s[:length] - f.write(data) - f.close() - return self.md5sum(path) - - def md5sum(self, filename): - ''' - Returns the md5sum of a file specified by its filename/path - ''' - md5 = hashlib.md5() - with open(filename,'rb') as f: - for chunk in iter(lambda: f.read(128*md5.block_size), b''): - md5.update(chunk) - return md5.hexdigest() - def test_no_volume(self): """ Create a tar file with only one file inside and no extra volumes diff --git a/testing/test_rescue_tar.py b/testing/test_rescue_tar.py new file mode 100644 index 0000000..829b158 --- /dev/null +++ b/testing/test_rescue_tar.py @@ -0,0 +1,97 @@ +import os, unittest, hashlib, string + +from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE +from . import BaseTest +import rescue_tar + +def new_volume_handler(tarobj, base_name, volume_number): + ''' + Handles the new volumes + ''' + volume_path = "%s.%d" % (base_name, volume_number) + tarobj.open_volume(volume_path) + +class RescueTarTest(BaseTest): + def test_rescue_ok(self): + ''' + Test rescue_tar when no file is broken, without using multivol tars. + ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["big2"] = self.create_file("big2", 10200) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("big2") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + os.unlink("big") + os.unlink("big2") + os.unlink("small") + os.unlink("small2") + + # extract + rescue_tar.rescue("sample.tar.gz") + + # check output + for key, value in hash.iteritems(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_rescue_broken(self): + ''' + Use rescue_tar utility to split the file in compressed tar blocks that + individually decompressed and "untarred", thanks to be using the + concat gzip tar format. In this case, we simulate that one of the files + is corrupted. The rest will decompress just fine. + ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["big2"] = self.create_file("big2", 10200) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("big2") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + # overwrite stuff in the middle of the big file + f = open('sample.tar.gz', 'r+b') + f.seek(100) + f.write("breaking things") + f.close() + + os.unlink("big") + os.unlink("big2") + os.unlink("small") + os.unlink("small2") + + # extract + rescue_tar.rescue("sample.tar.gz") + + # check output + for key, value in hash.iteritems(): + if key == "big": + continue + assert os.path.exists(key) + assert value == self.md5sum(key)