From 3759f7963f84adb812bb628ada60ccad5933819c Mon Sep 17 00:00:00 2001 From: Eduardo Robles Elvira Date: Mon, 8 Jul 2013 13:02:22 +0200 Subject: [PATCH] adding rescue file splitter and some unit tests using it --- filesplit.py | 68 +++++++++++++++++ testing/test_concat_compress.py | 159 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 1 deletions(-) create mode 100644 filesplit.py diff --git a/filesplit.py b/filesplit.py new file mode 100644 index 0000000..8c4a7a2 --- /dev/null +++ b/filesplit.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +import argparse + +BUFSIZE = 16 * 1024 + +def split_file(separator, prefix, input_file): + ''' + splits a file when it finds a separator string, including the separator at the beginning + of the new file + ''' + i = 0 + pos = 0 + buf = "" + sep_len = len(separator) + if sep_len == 0: + raise Exception("empty separator") + + output = open(prefix + str(i), 'w') + + # buffered search. we try not to have the whole input file in memory, as + # it's not needed + with open(input_file, 'r') as f: + while True: + buf += f.read(BUFSIZE) + if len(buf) == 0: + break + + # split using the separator + while separator in buf: + idx = buf.index(separator) + + if idx > 0: + output.write(buf[0:idx]) + output.close() + i += 1 + output = open(prefix + str(i), 'w') + output.write(buf[idx:idx + sep_len]) + else: + output.write(buf[0:sep_len]) + + buf = buf[idx + sep_len:] + + # corner case: separator is between this buf and next one.
In this + # case, we write to current output everything before that and + # iterate + if separator[0] in buf[-sep_len:]: + output.write(buf[:-sep_len]) + buf = buf[-sep_len:] + continue + + # else: continue writing to the current output and iterate + output.write(buf) + buf = "" + + output.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-s", "--separator", required=True, + help="string for the separator") + parser.add_argument("-p", "--prefix", required=True, + help="prefix for split files") + parser.add_argument("input_file", help="input file") + + args = parser.parse_args() + split_file(separator=args.separator, prefix=args.prefix, input_file=args.input_file) diff --git a/testing/test_concat_compress.py b/testing/test_concat_compress.py index dbe7806..409ba72 100644 --- a/testing/test_concat_compress.py +++ b/testing/test_concat_compress.py @@ -2,6 +2,7 @@ import os, unittest, hashlib, string from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE +import filesplit class ConcatCompressTest(unittest.TestCase): """ @@ -12,7 +13,7 @@ class ConcatCompressTest(unittest.TestCase): ''' Remove temporal files created by unit tests ''' - os.system("rm -rf big small small2 sample.tar*") + os.system("rm -rf big small small2 sample.*") def create_file(self, path, length): ''' @@ -160,3 +161,159 @@ class ConcatCompressTest(unittest.TestCase): for key, value in hash.iteritems(): assert os.path.exists(key) assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + individually decompressed and "untarred", thanks to be using the + concat gzip tar format. 
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + tarobj = TarFile.open("sample.tar.gz.%d" % i, + mode="r|gz") + tarobj.extractall() + tarobj.close() + + for key, value in hash.iteritems(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract_gnu(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + can be individually decompressed and "untarred", thanks to using the + concat gzip tar format. We do the extraction with standard gnu tar and + gzip command line commands.
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # extract using the command line this time + os.system("python filesplit.py -s $'\\x1f\\x8b' -p sample.tar.gz. sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + os.system("gzip -cd sample.tar.gz.%d > sample.%d.tar" % (i, i)) + os.system("tar xf sample.%d.tar" % i) + + for key, value in hash.iteritems(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract_broken(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + can be individually decompressed and "untarred", thanks to using the + concat gzip tar format. In this case, we simulate that one of the files + is corrupted. The rest will decompress just fine.
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + # overwrite stuff in the middle of the big file + f = open('sample.tar.gz', 'r+b') + f.seek(100) + f.write("breaking things") + f.close() + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # equivalent to $ python filesplit.py -s $'\x1f\x8b' -p sample.tar.gz. sample.tar.gz + filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + try: + tarobj = TarFile.open("sample.tar.gz.%d" % i, + mode="r|gz") + tarobj.extractall() + tarobj.close() + except Exception as e: + if i == 1: # big file doesn't extract well because it's corrupted + pass + else: + raise Exception("Error extracting a tar.gz not related to the broken 'big' file") + + for key, value in hash.iteritems(): + if key != "big": + assert os.path.exists(key) + assert value == self.md5sum(key) -- 1.7.1