From 3759f7963f84adb812bb628ada60ccad5933819c Mon Sep 17 00:00:00 2001 From: Eduardo Robles Elvira Date: Mon, 8 Jul 2013 13:02:22 +0200 Subject: [PATCH] adding rescue file splitter and some unit tests using it --- filesplit.py | 68 +++++++++++++++++ testing/test_concat_compress.py | 159 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 1 deletions(-) create mode 100644 filesplit.py diff --git a/filesplit.py b/filesplit.py new file mode 100644 index 0000000..8c4a7a2 --- /dev/null +++ b/filesplit.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +import argparse + +BUFSIZE = 16 * 1024 + +def split_file(separator, prefix, input_file): + ''' + splits a file when it finds a separator string, including the separator at the beginning + of the new file + ''' + i = 0 + pos = 0 + buf = "" + sep_len = len(separator) + if sep_len == 0: + raise Exception("empty separator") + + output = open(prefix + str(i), 'w') + + # buffered search. we try not to have the whole input file in memory, as + # it's not needed + with open(input_file, 'r') as f: + while True: + buf += f.read(BUFSIZE) + if len(buf) == 0: + break + + # split using the separator + while separator in buf: + idx = buf.index(separator) + + if idx > 0: + output.write(buf[0:idx]) + output.close() + i += 1 + output = open(prefix + str(i), 'w') + output.write(buf[idx:idx + sep_len]) + else: + output.write(buf[0:sep_len]) + + buf = buf[idx + sep_len:] + + # corner case: separator is between this buf and next one.
In this + # case, we write to current output everything before that and + # iterate + if separator[0] in buf[-sep_len:]: + output.write(buf[:-sep_len]) + buf = buf[-sep_len:] + continue + + # else: continue writing to the current output and iterate + output.write(buf) + buf = "" + + output.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("-s", "--separator", required=True, + help="string for the separator") + parser.add_argument("-p", "--prefix", required=True, + help="prefix for split files") + parser.add_argument("input_file", help="input file") + + args = parser.parse_args() + split_file(separator=args.separator, prefix=args.prefix, input_file=args.input_file) diff --git a/testing/test_concat_compress.py b/testing/test_concat_compress.py index dbe7806..409ba72 100644 --- a/testing/test_concat_compress.py +++ b/testing/test_concat_compress.py @@ -2,6 +2,7 @@ import os, unittest, hashlib, string from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE +import filesplit class ConcatCompressTest(unittest.TestCase): """ @@ -12,7 +13,7 @@ class ConcatCompressTest(unittest.TestCase): ''' Remove temporal files created by unit tests ''' - os.system("rm -rf big small small2 sample.tar*") + os.system("rm -rf big small small2 sample.*") def create_file(self, path, length): ''' @@ -160,3 +161,159 @@ class ConcatCompressTest(unittest.TestCase): for key, value in hash.iteritems(): assert os.path.exists(key) assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + individually decompressed and "untarred", thanks to be using the + concat gzip tar format. 
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + tarobj = TarFile.open("sample.tar.gz.%d" % i, + mode="r|gz") + tarobj.extractall() + tarobj.close() + + for key, value in hash.iteritems(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract_gnu(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + can be individually decompressed and "untarred", thanks to using the + concat gzip tar format. We do the extraction with standard gnu tar and + gzip command line commands.
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # extract using the command line this time + os.system("python filesplit.py -s $'\\x1f\\x8b' -p sample.tar.gz. sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + os.system("gzip -cd sample.tar.gz.%d > sample.%d.tar" % (i, i)) + os.system("tar xf sample.%d.tar" % i) + + for key, value in hash.iteritems(): + assert os.path.exists(key) + assert value == self.md5sum(key) + + def test_multiple_files_rescue_extract_broken(self): + ''' + Use filesplit utility to split the file in compressed tar blocks that + can be individually decompressed and "untarred", thanks to using the + concat gzip tar format. In this case, we simulate that one of the files + is corrupted. The rest will decompress just fine.
+ ''' + + # create sample data + hash = dict() + hash["big"] = self.create_file("big", 50000) + hash["small"] = self.create_file("small", 100) + hash["small2"] = self.create_file("small2", 354) + + # create the tar file with volumes + tarobj = TarFile.open("sample.tar.gz", + mode="w#gz", + concat_compression=True) + tarobj.add("big") + tarobj.add("small") + tarobj.add("small2") + tarobj.close() + + assert os.path.exists("sample.tar.gz") + + # overwrite stuff in the middle of the big file + f = open('sample.tar.gz', 'r+b') + f.seek(100) + f.write("breaking things") + f.close() + + os.unlink("big") + os.unlink("small") + os.unlink("small2") + + # equivalent to $ python filesplit.py -s $'\x1f\x8b' -p sample.tar.gz. sample.tar.gz + filesplit.split_file('\x1f\x8b', "sample.tar.gz.", "sample.tar.gz") + + assert os.path.exists("sample.tar.gz.0") # beginning of the tar file + assert os.path.exists("sample.tar.gz.1") # first file + assert os.path.exists("sample.tar.gz.2") # second file + assert os.path.exists("sample.tar.gz.3") # third file + assert not os.path.exists("sample.tar.gz.4") # nothing else + + # extract and check output + for i in xrange(1, 4): + try: + tarobj = TarFile.open("sample.tar.gz.%d" % i, + mode="r|gz") + tarobj.extractall() + tarobj.close() + except Exception as e: + if i == 1: # big file doesn't extract well because it's corrupted + pass + else: + raise Exception("Error extracting a tar.gz not related to the broken 'big' file") + + for key, value in hash.iteritems(): + if key != "big": + assert os.path.exists(key) + assert value == self.md5sum(key) -- 1.7.1