--- /dev/null
+#!/usr/bin/env python
+
+import argparse
+
BUFSIZE = 16 * 1024


def split_file(separator, prefix, input_file, bufsize=BUFSIZE):
    '''
    Split input_file into numbered pieces at the occurrences of separator.

    The separator is matched literally (it is not a regular expression).
    Pieces are written to prefix + "0", prefix + "1", ... and each piece
    after the first starts with the separator that triggered the split.
    A separator located at the very start of the input, or immediately
    after another separator, does not start a new piece: it is written to
    the current piece instead.

    Both files are opened in binary mode, and the input is consumed in
    chunks of bufsize bytes so it never has to be held in memory as a whole.
    The result does not depend on where the chunk boundaries fall.

    Arguments:
      separator  -- byte string to look for. A text string is also accepted
                    and is converted with latin-1, i.e. every code point
                    below 256 stands for the byte with the same value.
      prefix     -- path prefix of the output pieces.
      input_file -- path of the file to split.
      bufsize    -- number of bytes requested per read (default BUFSIZE).

    Raises ValueError if the separator is empty or bufsize is not positive.
    '''
    if not isinstance(separator, bytes):
        separator = separator.encode('latin-1')

    sep_len = len(separator)
    if sep_len == 0:
        raise ValueError("empty separator")
    if bufsize < 1:
        raise ValueError("bufsize must be a positive integer")

    # A separator straddling two reads can leave at most sep_len - 1 of its
    # bytes at the end of the data seen so far, so that is all we hold back.
    keep = sep_len - 1
    index = 0
    pending = b""
    # True while nothing but a separator (or nothing at all) has been written
    # since the last piece boundary / the start of the input.
    at_boundary = True

    with open(input_file, 'rb') as source:
        output = open(prefix + str(index), 'wb')
        try:
            while True:
                chunk = source.read(bufsize)
                if not chunk:
                    break
                pending += chunk

                # Handle every complete separator currently in the buffer.
                start = 0
                while True:
                    found = pending.find(separator, start)
                    if found < 0:
                        break
                    if found > start or not at_boundary:
                        # Data precedes this separator: finish the current
                        # piece and start the next one.
                        output.write(pending[start:found])
                        output.close()
                        index += 1
                        output = open(prefix + str(index), 'wb')
                    output.write(separator)
                    at_boundary = True
                    start = found + sep_len

                # Flush what cannot be part of a separator anymore and keep
                # only the tail that might be completed by the next read.
                flush_end = len(pending) - keep
                if flush_end > start:
                    output.write(pending[start:flush_end])
                    at_boundary = False
                    start = flush_end
                pending = pending[start:]

            # End of input: the held-back tail is plain data.
            output.write(pending)
        finally:
            output.close()
+
if __name__ == "__main__":
    # Command-line entry point: read the separator, the output prefix and
    # the input path from argv and hand them to split_file.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-s", "--separator",
        required=True,
        help="string for the separator")
    cli.add_argument(
        "-p", "--prefix",
        required=True,
        help="prefix for split files")
    cli.add_argument("input_file", help="input file")

    options = cli.parse_args()
    split_file(options.separator, options.prefix, options.input_file)
from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE
+import filesplit
class ConcatCompressTest(unittest.TestCase):
"""
'''
 Remove temporary files created by unit tests
'''
- os.system("rm -rf big small small2 sample.tar*")
+ os.system("rm -rf big small small2 sample.*")
def create_file(self, path, length):
'''
for key, value in hash.iteritems():
assert os.path.exists(key)
assert value == self.md5sum(key)
+
+ def test_multiple_files_rescue_extract(self):
+ '''
+ Use filesplit utility to split the file in compressed tar blocks that
+ individually decompressed and "untarred", thanks to be using the
+ concat gzip tar format.
+ '''
+
+ # create sample data
+ hash = dict()
+ hash["big"] = self.create_file("big", 50000)
+ hash["small"] = self.create_file("small", 100)
+ hash["small2"] = self.create_file("small2", 354)
+
+ # create the tar file with volumes
+ tarobj = TarFile.open("sample.tar.gz",
+ mode="w#gz",
+ concat_compression=True)
+ tarobj.add("big")
+ tarobj.add("small")
+ tarobj.add("small2")
+ tarobj.close()
+
+ assert os.path.exists("sample.tar.gz")
+
+ os.unlink("big")
+ os.unlink("small")
+ os.unlink("small2")
+
+ filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz")
+
+ assert os.path.exists("sample.tar.gz.0") # beginning of the tar file
+ assert os.path.exists("sample.tar.gz.1") # first file
+ assert os.path.exists("sample.tar.gz.2") # second file
+ assert os.path.exists("sample.tar.gz.3") # third file
+ assert not os.path.exists("sample.tar.gz.4") # nothing else
+
+ # extract and check output
+ for i in xrange(1, 4):
+ tarobj = TarFile.open("sample.tar.gz.%d" % i,
+ mode="r|gz")
+ tarobj.extractall()
+ tarobj.close()
+
+ for key, value in hash.iteritems():
+ assert os.path.exists(key)
+ assert value == self.md5sum(key)
+
+ def test_multiple_files_rescue_extract_gnu(self):
+ '''
+ Use filesplit utility to split the file in compressed tar blocks that
+ individually decompressed and "untarred", thanks to be using the
+ concat gzip tar format. We do the extraction with standard gnu tar and
+ gzip command line commands.
+ '''
+
+ # create sample data
+ hash = dict()
+ hash["big"] = self.create_file("big", 50000)
+ hash["small"] = self.create_file("small", 100)
+ hash["small2"] = self.create_file("small2", 354)
+
+ # create the tar file with volumes
+ tarobj = TarFile.open("sample.tar.gz",
+ mode="w#gz",
+ concat_compression=True)
+ tarobj.add("big")
+ tarobj.add("small")
+ tarobj.add("small2")
+ tarobj.close()
+
+ assert os.path.exists("sample.tar.gz")
+
+ os.unlink("big")
+ os.unlink("small")
+ os.unlink("small2")
+
+ # extract using the command line this time
+ os.system("python filesplit.py -s $'\\x1f\\x8b' -p sample.tar.gz. sample.tar.gz")
+
+ assert os.path.exists("sample.tar.gz.0") # beginning of the tar file
+ assert os.path.exists("sample.tar.gz.1") # first file
+ assert os.path.exists("sample.tar.gz.2") # second file
+ assert os.path.exists("sample.tar.gz.3") # third file
+ assert not os.path.exists("sample.tar.gz.4") # nothing else
+
+ # extract and check output
+ for i in xrange(1, 4):
+ os.system("gzip -cd sample.tar.gz.%d > sample.%d.tar" % (i, i))
+ os.system("tar xf sample.%d.tar" % i)
+
+ for key, value in hash.iteritems():
+ assert os.path.exists(key)
+ assert value == self.md5sum(key)
+
+ def test_multiple_files_rescue_extract_broken(self):
+ '''
+ Use filesplit utility to split the file in compressed tar blocks that
+ individually decompressed and "untarred", thanks to be using the
+ concat gzip tar format. In this case, we simulate that one of the files
+ is corrupted. The rest will decompress just fine.
+ '''
+
+ # create sample data
+ hash = dict()
+ hash["big"] = self.create_file("big", 50000)
+ hash["small"] = self.create_file("small", 100)
+ hash["small2"] = self.create_file("small2", 354)
+
+ # create the tar file with volumes
+ tarobj = TarFile.open("sample.tar.gz",
+ mode="w#gz",
+ concat_compression=True)
+ tarobj.add("big")
+ tarobj.add("small")
+ tarobj.add("small2")
+ tarobj.close()
+
+ assert os.path.exists("sample.tar.gz")
+
+ # overwrite stuff in the middle of the big file
+ f = open('sample.tar.gz', 'r+b')
+ f.seek(100)
+ f.write("breaking things")
+ f.close()
+
+ os.unlink("big")
+ os.unlink("small")
+ os.unlink("small2")
+
+ # equivalent to $ python filesplit.py -s $'\x1f\x8b' -p sample.tar.gz. sample.tar.gz
+ filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz")
+
+ assert os.path.exists("sample.tar.gz.0") # beginning of the tar file
+ assert os.path.exists("sample.tar.gz.1") # first file
+ assert os.path.exists("sample.tar.gz.2") # second file
+ assert os.path.exists("sample.tar.gz.3") # third file
+ assert not os.path.exists("sample.tar.gz.4") # nothing else
+
+ # extract and check output
+ for i in xrange(1, 4):
+ try:
+ tarobj = TarFile.open("sample.tar.gz.%d" % i,
+ mode="r|gz")
+ tarobj.extractall()
+ tarobj.close()
+ except Exception as e:
+ if i == 1: # big file doesn't extract well because it's corrupted
+ pass
+ else:
+ raise Exception("Error extracting a tar.gz not related to the broken 'big' file")
+
+ for key, value in hash.iteritems():
+ if key != "big":
+ assert os.path.exists(key)
+ assert value == self.md5sum(key)