BUFSIZE = 16 * 1024
-def split_file(separator, prefix, input_file):
+def split_file(separator, prefix, input_file, new_file_func=None):
'''
-    splits a file when it finds a regexp, including the regexp in the begining
-    of the new file
+    splits a file each time it finds the separator, including the separator
+    at the beginning of the new file
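+
+    Example (a sketch; 'out.' and 'input.gz' are hypothetical names, and
+    new_file_func is the optional factory added here, called as
+    new_file_func(prefix, i) for every new chunk):
+
+        split_file('\x1f\x8b', 'out.', 'input.gz',
+                   lambda prefix, i: open(prefix + str(i), 'wb'))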
if sep_len == 0:
raise Exception("empty separator")
- output = open(prefix + str(i), 'w')
+ if new_file_func is None:
+ new_file_func = lambda prefix, i: open(prefix + str(i), 'w')
+
+ output = new_file_func(prefix, i)
-    # buffered search. we try not to have the while input file in memory, as
-    # it's not needed
+    # buffered search: we avoid keeping the whole input file in memory, as
+    # that's not needed
output.write(buf[0:idx])
output.close()
i += 1
- output = open(prefix + str(i), 'w')
+ output = new_file_func(prefix, i)
output.write(buf[idx:idx + sep_len])
else:
output.write(buf[0:sep_len])
--- /dev/null
+#!/usr/bin/env python
+
+import argparse
+import os
+import tempfile
+from functools import partial
+
+from deltatar import tarfile
+import filesplit
+
+def rescue(tar_files, rescue_dir=None):
+ '''
+    Rescues a multivolume tarfile. The format (compression, etc.) is
+    detected from the file name extension.
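+
+    Usage sketch (hypothetical file names):
+
+        rescue(["backup.tar.gz", "backup.tar.gz.0"], rescue_dir="/tmp/rescue")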
+ '''
+    # input validation
+ if isinstance(tar_files, basestring):
+ tar_files = [tar_files]
+
+ if not isinstance(tar_files, list):
+ raise Exception("tar_files must be a list")
+
+ for f in tar_files:
+ if not isinstance(f, basestring):
+ raise Exception("tar_files must be a list of strings")
+ if not os.path.exists(f):
+ raise Exception("tar file '%s' doesn't exist" % f)
+
+    if rescue_dir is None:
+        rescue_dir = os.path.dirname(tar_files[0])
+        # fall back to a temporary directory when the first path has no
+        # directory component
+        if not rescue_dir:
+            rescue_dir = tempfile.mkdtemp()
+
+ # autodetect file type by extension
+ first_tar_file = tar_files[0]
+    if first_tar_file.endswith(".tar.gz"):
+        mode = "r#gz"
+    elif first_tar_file.endswith(".tar"):
+        mode = "r"
+    else:
+        raise Exception("cannot detect format of '%s' from its extension"
+                        % first_tar_file)
+
+ base_name = os.path.basename(first_tar_file)
+ extract_files = tar_files
+
+    # num counts the chunk files created so far and is used to name them.
+    # We keep it inside a dict so that new_gz can update it in place
+    # through the partial, instead of rebinding a copied integer
+ context = dict(num=0)
+
+ # divide in compressed tar block files if it's r#gz
+ if mode == "r#gz":
+ extract_files = []
+ # function used to create each chunk file
+        def new_gz(context, extract_files, prefix, i):
+            path = "%s.%d" % (prefix, context['num'])
+            extract_files.append(path)
+            context['num'] += 1
+            # chunks carry binary gzip data, so open them in binary mode
+            return open(path, 'wb')
+ new_gz = partial(new_gz, context, extract_files)
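+        # the partial pre-binds context and extract_files, leaving the
+        # (prefix, i) signature that split_file expects of new_file_func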
+
+        # split each tar file into chunks at every occurrence of the gzip
+        # magic bytes '\x1f\x8b', which mark the start of a gzip member
+ for f in tar_files:
+ filesplit.split_file('\x1f\x8b',
+ os.path.join(rescue_dir, base_name), f, new_gz)
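+        # every chunk now starts at a gzip header, so each one is an
+        # independently decompressible stream even if a neighbour is corrupt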
+
+    # volumes already opened by new_volume_handler, so that the loop below
+    # doesn't extract them a second time
+ already_extracted_vols = []
+
+    def new_volume_handler(already_extracted_vols, tarobj, base_name, volume_number):
+ '''
+ Handles the new volumes when extracting
+ '''
+
+ # handle the special case where the first file is whatever.tar.gz and
+ # the second is whatever.tar.gz.0
+ base_name_split = base_name.split('.')
+ next_num = 0
+ try:
+ next_num = int(base_name_split[-1]) + 1
+ base_name = ".".join(base_name_split[:-1])
+        except ValueError:
+ pass
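+        # e.g. base_name "sample.tar.gz.0" yields next volume
+        # "sample.tar.gz.1", while plain "sample.tar.gz" yields
+        # "sample.tar.gz.0"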
+
+ volume_path = "%s.%d" % (base_name, next_num)
+ already_extracted_vols.append(volume_path)
+ tarobj.open_volume(volume_path)
+
+ new_volume_handler = partial(new_volume_handler, already_extracted_vols)
+
+    # extract as much as possible from each file
+ for f in extract_files:
+ if f in already_extracted_vols:
+ continue
+ try:
+ tarobj = tarfile.TarFile.open(f, mode=mode,
+ new_volume_handler=new_volume_handler)
+ tarobj.extractall()
+ tarobj.close()
+        except Exception:
+            # the chunk is corrupt or truncated: skip it and keep going
+            # with the remaining ones
+            pass
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--rescue_dir", help="directory where rescue files "
+ "should be created. /tmp by default")
+ parser.add_argument("tar_files", nargs="+", help="list of files of a "
+ "multitar file to rescue. Assumes format first.extension "
+ "second.extension.0 third.extension.1 ...")
+
+ args = parser.parse_args()
+ rescue(tar_files=args.tar_files, rescue_dir=args.rescue_dir)
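+
+# Example invocation (a sketch; file names are hypothetical):
+#   python rescue_tar.py --rescue_dir /tmp/rescue backup.tar.gz backup.tar.gz.0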
from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest
from testing.test_concat_compress import ConcatCompressTest
+from testing.test_rescue_tar import RescueTarTest
if __name__ == "__main__":
unittest.main()
\ No newline at end of file
+import os, unittest, hashlib, string
+
+class BaseTest(unittest.TestCase):
+ """
+ Test concatenated compression in tarfiles
+ """
+
+ def tearDown(self):
+ '''
+ Remove temporal files created by unit tests
+ '''
+ os.system("rm -rf big small small2 sample.*")
+
+ def create_file(self, path, length):
+ '''
+ Creates a file with some gibberish inside, returning the md5sum of that
+ file. File path and length are specified as function arguments.
+ '''
+ f = open(path, 'w')
+ s = string.lowercase + string.digits + "\n"
+ if len(s) < length:
+ s += s*(length/len(s))
+ data = s[:length]
+ f.write(data)
+ f.close()
+ return self.md5sum(path)
+
+ def md5sum(self, filename):
+ '''
+ Returns the md5sum of a file specified by its filename/path
+ '''
+ md5 = hashlib.md5()
+ with open(filename,'rb') as f:
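+            # feed the hash in chunks so large files need not fit in memory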
+ for chunk in iter(lambda: f.read(128*md5.block_size), b''):
+ md5.update(chunk)
+ return md5.hexdigest()
\ No newline at end of file
import os, unittest, hashlib, string
-from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE
+from deltatar.tarfile import TarFile, GNU_FORMAT
import filesplit
+from . import BaseTest
-class ConcatCompressTest(unittest.TestCase):
+class ConcatCompressTest(BaseTest):
"""
Test concatenated compression in tarfiles
"""
- def tearDown(self):
- '''
- Remove temporal files created by unit tests
- '''
- os.system("rm -rf big small small2 sample.*")
-
- def create_file(self, path, length):
- '''
- Creates a file with some gibberish inside, returning the md5sum of that
- file. File path and length are specified as function arguments.
- '''
- f = open(path, 'w')
- s = string.lowercase + string.digits + "\n"
- if len(s) < length:
- s += s*(length/len(s))
- data = s[:length]
- f.write(data)
- f.close()
- return self.md5sum(path)
-
- def md5sum(self, filename):
- '''
- Returns the md5sum of a file specified by its filename/path
- '''
- md5 = hashlib.md5()
- with open(filename,'rb') as f:
- for chunk in iter(lambda: f.read(128*md5.block_size), b''):
- md5.update(chunk)
- return md5.hexdigest()
-
def test_zcat_extract_concat(self):
"""
Create a tar file with only one file inside, using concat compression
import os, unittest, hashlib, string
from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE
+from . import BaseTest
def new_volume_handler(tarobj, base_name, volume_number):
'''
tarobj.open_volume(volume_path)
-class MultivolGnuFormatTest(unittest.TestCase):
+class MultivolGnuFormatTest(BaseTest):
"""
Test multivolume support in tarfile. Tar Format is specified at class level.
"""
# case of GNU format this is the same as tarfile_overhead.
tarvol_overhead = 3*BLOCKSIZE
- def tearDown(self):
- '''
- Remove temporal files created by unit tests
- '''
- os.system("rm -rf big small small2 sample.tar*")
-
- def create_file(self, path, length):
- '''
- Creates a file with some gibberish inside, returning the md5sum of that
- file. File path and length are specified as function arguments.
- '''
- f = open(path, 'w')
- s = string.lowercase + string.digits + "\n"
- if len(s) < length:
- s += s*(length/len(s))
- data = s[:length]
- f.write(data)
- f.close()
- return self.md5sum(path)
-
- def md5sum(self, filename):
- '''
- Returns the md5sum of a file specified by its filename/path
- '''
- md5 = hashlib.md5()
- with open(filename,'rb') as f:
- for chunk in iter(lambda: f.read(128*md5.block_size), b''):
- md5.update(chunk)
- return md5.hexdigest()
-
def test_no_volume(self):
"""
Create a tar file with only one file inside and no extra volumes
--- /dev/null
+import os
+
+from deltatar.tarfile import TarFile
+from . import BaseTest
+import rescue_tar
+
+def new_volume_handler(tarobj, base_name, volume_number):
+ '''
+ Handles the new volumes
+ '''
+ volume_path = "%s.%d" % (base_name, volume_number)
+ tarobj.open_volume(volume_path)
+
+class RescueTarTest(BaseTest):
+ def test_rescue_ok(self):
+ '''
+ Test rescue_tar when no file is broken, without using multivol tars.
+ '''
+
+ # create sample data
+ hash = dict()
+ hash["big"] = self.create_file("big", 50000)
+ hash["big2"] = self.create_file("big2", 10200)
+ hash["small"] = self.create_file("small", 100)
+ hash["small2"] = self.create_file("small2", 354)
+
+ # create the tar file with volumes
+ tarobj = TarFile.open("sample.tar.gz",
+ mode="w#gz",
+ concat_compression=True)
+ tarobj.add("big")
+ tarobj.add("big2")
+ tarobj.add("small")
+ tarobj.add("small2")
+ tarobj.close()
+
+ assert os.path.exists("sample.tar.gz")
+ os.unlink("big")
+ os.unlink("big2")
+ os.unlink("small")
+ os.unlink("small2")
+
+ # extract
+ rescue_tar.rescue("sample.tar.gz")
+
+ # check output
+ for key, value in hash.iteritems():
+ assert os.path.exists(key)
+ assert value == self.md5sum(key)
+
+ def test_rescue_broken(self):
+ '''
+        Use the rescue_tar utility to split the file into compressed tar
+        blocks that can be decompressed and untarred individually, thanks
+        to the concat gzip tar format. Here we simulate that one of the
+        files is corrupted; the rest should extract just fine.
+ '''
+
+ # create sample data
+ hash = dict()
+ hash["big"] = self.create_file("big", 50000)
+ hash["big2"] = self.create_file("big2", 10200)
+ hash["small"] = self.create_file("small", 100)
+ hash["small2"] = self.create_file("small2", 354)
+
+ # create the tar file with volumes
+ tarobj = TarFile.open("sample.tar.gz",
+ mode="w#gz",
+ concat_compression=True)
+ tarobj.add("big")
+ tarobj.add("big2")
+ tarobj.add("small")
+ tarobj.add("small2")
+ tarobj.close()
+
+ assert os.path.exists("sample.tar.gz")
+
+        # corrupt bytes near the start of the archive, inside the first
+        # compressed member (the one holding "big")
+ f = open('sample.tar.gz', 'r+b')
+ f.seek(100)
+ f.write("breaking things")
+ f.close()
+
+ os.unlink("big")
+ os.unlink("big2")
+ os.unlink("small")
+ os.unlink("small2")
+
+ # extract
+ rescue_tar.rescue("sample.tar.gz")
+
+ # check output
+ for key, value in hash.iteritems():
+ if key == "big":
+ continue
+ assert os.path.exists(key)
+ assert value == self.md5sum(key)