working in rescue tar utility

author Eduardo Robles Elvira <edulix@wadobo.com>

Wed, 10 Jul 2013 11:00:06 +0000 (13:00 +0200)

committer Eduardo Robles Elvira <edulix@wadobo.com>

Wed, 10 Jul 2013 11:00:21 +0000 (13:00 +0200)
author Eduardo Robles Elvira <edulix@wadobo.com>
Wed, 10 Jul 2013 11:00:06 +0000 (13:00 +0200)
committer Eduardo Robles Elvira <edulix@wadobo.com>
Wed, 10 Jul 2013 11:00:21 +0000 (13:00 +0200)
diff --git a/filesplit.py b/filesplit.py

index 8c4a7a2..db1c476 100644 (file)
--- a/filesplit.py
+++ b/filesplit.py
@@ -4,7 +4,7 @@ import argparse
 
 BUFSIZE = 16 * 1024
 
-def split_file(separator, prefix, input_file):
+def split_file(separator, prefix, input_file, new_file_func=None):
     '''
     splits a file when it finds a regexp, including the regexp in the begining
     of the new file
@@ -16,7 +16,10 @@ def split_file(separator, prefix, input_file):
     if sep_len == 0:
         raise Exception("empty separator")
 
-    output = open(prefix + str(i), 'w')
+    if new_file_func is None:
+        new_file_func = lambda prefix, i: open(prefix + str(i), 'w')
+
+    output = new_file_func(prefix, i)
 
     # buffered search. we try not to have the while input file in memory, as
     # it's not needed
@@ -34,7 +37,7 @@ def split_file(separator, prefix, input_file):
                     output.write(buf[0:idx])
                     output.close()
                     i += 1
-                    output = open(prefix + str(i), 'w')
+                    output = new_file_func(prefix, i)
                     output.write(buf[idx:idx + sep_len])
                 else:
                     output.write(buf[0:sep_len])
diff --git a/rescue_tar.py b/rescue_tar.py

new file mode 100644 (file)

index 0000000..ce028fd
--- /dev/null
+++ b/rescue_tar.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+import tempfile
+from functools import partial
+
+from deltatar import tarfile
+import filesplit
+
+def rescue(tar_files, rescue_dir=None):
+    '''
+    Rescues a multivolume tarfile. Checks file name extension to detect
+    format (compression, etc). Assumes it to be multivolume tar.
+
+    tar_files is a path, or a list of paths, to the tar volumes. The
+    extension of the first one (".tar.gz" or ".tar") selects the read mode
+    used for all of them.
+
+    rescue_dir is the directory where the gzip chunk files are written when
+    the tar file is compressed. By default, it is the directory of the first
+    tar file.
+
+    Raises Exception if tar_files is not a non-empty list of paths to
+    existing files, or if the extension of the first file is not supported.
+    Errors raised while extracting are ignored on purpose, so that as much
+    data as possible is recovered. tarobj.extractall() is called without a
+    target path.
+    '''
+    if isinstance(tar_files, basestring):
+        tar_files = [tar_files]
+
+    if not isinstance(tar_files, list):
+        raise Exception("tar_files must be a list")
+
+    if not tar_files:
+        raise Exception("tar_files must not be empty")
+
+    for f in tar_files:
+        if not isinstance(f, basestring):
+            raise Exception("tar_files must be a list of strings")
+        if not os.path.exists(f):
+            raise Exception("tar file '%s' doesn't exist" % f)
+
+    # setup rescue_dir
+    if rescue_dir is None:
+        rescue_dir = os.path.dirname(tar_files[0])
+
+    # autodetect file type by extension
+    first_tar_file = tar_files[0]
+    if first_tar_file.endswith(".tar.gz"):
+        mode = "r#gz"
+    elif first_tar_file.endswith(".tar"):
+        mode = "r"
+    else:
+        # fail early with a clear message instead of a NameError on "mode"
+        raise Exception("tar file '%s' has an unsupported extension, "
+            "expected .tar or .tar.gz" % first_tar_file)
+
+    extract_files = tar_files
+
+    # divide in compressed tar block files if it's r#gz
+    if mode == "r#gz":
+        base_name = os.path.basename(first_tar_file)
+        extract_files = []
+
+        # function used to create each chunk file. The chunks of all the
+        # volumes share one numbering: the number of a new chunk is the
+        # number of chunks created so far. The index given by split_file
+        # restarts with each input file, so it's not used.
+        #
+        # NOTE(review): with the default rescue_dir, chunk paths
+        # (first.extension.N) have the same form as the volume paths
+        # described in the command line help - confirm they can't collide.
+        def new_gz(prefix, i):
+            path = "%s.%d" % (prefix, len(extract_files))
+            extract_files.append(path)
+            # gzip data is binary
+            return open(path, 'wb')
+
+        # split in compressed chunks, using the gzip magic number as
+        # separator
+        for f in tar_files:
+            filesplit.split_file('\x1f\x8b',
+                os.path.join(rescue_dir, base_name), f, new_gz)
+
+    # includes volumes already extracted with new_volume_handler
+    already_extracted_vols = []
+
+    def new_volume_handler(tarobj, base_name, volume_number):
+        '''
+        Handles the new volumes when extracting. It's called with the same
+        three arguments as any other volume handler; the path of the next
+        volume is calculated from base_name instead of volume_number.
+        '''
+
+        # handle the special case where the first file is whatever.tar.gz and
+        # the second is whatever.tar.gz.0
+        base_name_split = base_name.split('.')
+        next_num = 0
+        try:
+            next_num = int(base_name_split[-1]) + 1
+            base_name = ".".join(base_name_split[:-1])
+        except ValueError:
+            # no numeric suffix: this is the first file, next one is .0
+            pass
+
+        volume_path = "%s.%d" % (base_name, next_num)
+        already_extracted_vols.append(volume_path)
+        tarobj.open_volume(volume_path)
+
+    # extract files, as much as possible
+    for f in extract_files:
+        if f in already_extracted_vols:
+            continue
+        tarobj = None
+        try:
+            tarobj = tarfile.TarFile.open(f, mode=mode,
+                new_volume_handler=new_volume_handler)
+            tarobj.extractall()
+        except Exception:
+            # best effort: a broken file must not prevent the extraction of
+            # the rest. KeyboardInterrupt and SystemExit are not swallowed.
+            pass
+        finally:
+            if tarobj is not None:
+                try:
+                    tarobj.close()
+                except Exception:
+                    pass
+
+if __name__ == "__main__":
+    # command line entry point: parses the arguments and rescues the given
+    # tar files
+    parser = argparse.ArgumentParser()
+
+    # rescue() uses the directory of the first tar file when no rescue_dir
+    # is given
+    parser.add_argument("--rescue_dir", help="directory where rescue files "
+        "should be created. Directory of the first tar file by default")
+    parser.add_argument("tar_files", nargs="+", help="list of files of a "
+        "multitar file to rescue. Assumes format first.extension "
+        "second.extension.0 third.extension.1 ...")
+
+    args = parser.parse_args()
+    rescue(tar_files=args.tar_files, rescue_dir=args.rescue_dir)
diff --git a/runtests.py b/runtests.py

index 9e5f8bb..856aace 100644 (file)
--- a/runtests.py
+++ b/runtests.py
@@ -4,6 +4,7 @@ import unittest
 
 from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest
 from testing.test_concat_compress import ConcatCompressTest
+from testing.test_rescue_tar import RescueTarTest
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
diff --git a/testing/__init__.py b/testing/__init__.py

index e69de29..d47b054 100644 (file)
--- a/testing/__init__.py
+++ b/testing/__init__.py
@@ -0,0 +1,37 @@
+import os, unittest, hashlib, string
+import random
+
+class BaseTest(unittest.TestCase):
+    """
+    Base class of the test cases. Provides helpers to create sample files
+    and to calculate their md5sum, and removes the sample files after each
+    test.
+    """
+
+    def tearDown(self):
+        '''
+        Remove temporal files created by unit tests
+        '''
+        # big2 is created by the rescue tar tests
+        os.system("rm -rf big big2 small small2 sample.*")
+
+    def create_file(self, path, length):
+        '''
+        Creates a file with some gibberish inside, returning the md5sum of that
+        file. File path and length are specified as function arguments.
+        '''
+        # ascii_lowercase doesn't depend on the locale, so the content of
+        # the file is always the same
+        s = string.ascii_lowercase + string.digits + "\n"
+        if len(s) < length:
+            # repeat the pattern until it's at least length characters long
+            s += s * (length // len(s))
+        with open(path, 'w') as f:
+            f.write(s[:length])
+        return self.md5sum(path)
+
+    def md5sum(self, filename):
+        '''
+        Returns the md5sum of a file specified by its filename/path
+        '''
+        md5 = hashlib.md5()
+        # read in chunks, not to have the whole file in memory
+        with open(filename, 'rb') as f:
+            for chunk in iter(lambda: f.read(128 * md5.block_size), b''):
+                md5.update(chunk)
+        return md5.hexdigest()
\ No newline at end of file
diff --git a/testing/test_concat_compress.py b/testing/test_concat_compress.py

index 409ba72..2659792 100644 (file)
--- a/testing/test_concat_compress.py
+++ b/testing/test_concat_compress.py
@@ -1,44 +1,15 @@
 import os, unittest, hashlib, string
 
-from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE
+from deltatar.tarfile import TarFile, GNU_FORMAT
 
 import filesplit
+from . import BaseTest
 
-class ConcatCompressTest(unittest.TestCase):
+class ConcatCompressTest(BaseTest):
     """
     Test concatenated compression in tarfiles
     """
 
-    def tearDown(self):
-        '''
-        Remove temporal files created by unit tests
-        '''
-        os.system("rm -rf big small small2 sample.*")
-
-    def create_file(self, path, length):
-        '''
-        Creates a file with some gibberish inside, returning the md5sum of that
-        file. File path and length are specified as function arguments.
-        '''
-        f = open(path, 'w')
-        s = string.lowercase + string.digits + "\n"
-        if len(s) < length:
-            s += s*(length/len(s))
-        data = s[:length]
-        f.write(data)
-        f.close()
-        return self.md5sum(path)
-
-    def md5sum(self, filename):
-        '''
-        Returns the md5sum of a file specified by its filename/path
-        '''
-        md5 = hashlib.md5()
-        with open(filename,'rb') as f:
-            for chunk in iter(lambda: f.read(128*md5.block_size), b''):
-                md5.update(chunk)
-        return md5.hexdigest()
-
     def test_zcat_extract_concat(self):
         """
         Create a tar file with only one file inside, using concat compression
diff --git a/testing/test_multivol.py b/testing/test_multivol.py

index 60e6f45..cdb2d5b 100644 (file)
--- a/testing/test_multivol.py
+++ b/testing/test_multivol.py
@@ -1,6 +1,7 @@
 import os, unittest, hashlib, string
 
 from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE
+from . import BaseTest
 
 def new_volume_handler(tarobj, base_name, volume_number):
     '''
@@ -10,7 +11,7 @@ def new_volume_handler(tarobj, base_name, volume_number):
     tarobj.open_volume(volume_path)
 
 
-class MultivolGnuFormatTest(unittest.TestCase):
+class MultivolGnuFormatTest(BaseTest):
     """
     Test multivolume support in tarfile. Tar Format is specified at class level.
     """
@@ -34,36 +35,6 @@ class MultivolGnuFormatTest(unittest.TestCase):
     # case of GNU format this is the same as tarfile_overhead.
     tarvol_overhead = 3*BLOCKSIZE
 
-    def tearDown(self):
-        '''
-        Remove temporal files created by unit tests
-        '''
-        os.system("rm -rf big small small2 sample.tar*")
-
-    def create_file(self, path, length):
-        '''
-        Creates a file with some gibberish inside, returning the md5sum of that
-        file. File path and length are specified as function arguments.
-        '''
-        f = open(path, 'w')
-        s = string.lowercase + string.digits + "\n"
-        if len(s) < length:
-            s += s*(length/len(s))
-        data = s[:length]
-        f.write(data)
-        f.close()
-        return self.md5sum(path)
-
-    def md5sum(self, filename):
-        '''
-        Returns the md5sum of a file specified by its filename/path
-        '''
-        md5 = hashlib.md5()
-        with open(filename,'rb') as f:
-            for chunk in iter(lambda: f.read(128*md5.block_size), b''):
-                md5.update(chunk)
-        return md5.hexdigest()
-
     def test_no_volume(self):
         """
         Create a tar file with only one file inside and no extra volumes
diff --git a/testing/test_rescue_tar.py b/testing/test_rescue_tar.py

new file mode 100644 (file)

index 0000000..829b158
--- /dev/null
+++ b/testing/test_rescue_tar.py
@@ -0,0 +1,97 @@
+import os, unittest, hashlib, string
+
+from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE
+from . import BaseTest
+import rescue_tar
+
+def new_volume_handler(tarobj, base_name, volume_number):
+    '''
+    Opens the next volume in tarobj. The path of the volume is the base
+    name followed by a dot and the volume number.
+    '''
+    tarobj.open_volume("%s.%d" % (base_name, volume_number))
+
+class RescueTarTest(BaseTest):
+    '''
+    Tests the rescue_tar utility with gzip tar files created using concat
+    compression
+    '''
+
+    # (name, length) of the sample files, in the order in which they are
+    # added to the tar file
+    SAMPLE_FILES = (
+        ("big", 50000),
+        ("big2", 10200),
+        ("small", 100),
+        ("small2", 354),
+    )
+
+    def tearDown(self):
+        '''
+        Remove temporal files created by unit tests, including big2
+        '''
+        BaseTest.tearDown(self)
+        if os.path.exists("big2"):
+            os.unlink("big2")
+
+    def _create_sample_tar(self):
+        '''
+        Creates the sample files and adds them to sample.tar.gz. Returns a
+        dict with the md5sum of each sample file by name.
+        '''
+        # create sample data
+        hashes = dict()
+        for name, length in self.SAMPLE_FILES:
+            hashes[name] = self.create_file(name, length)
+
+        # create the tar file, with each member compressed on its own
+        tarobj = TarFile.open("sample.tar.gz",
+                              mode="w#gz",
+                              concat_compression=True)
+        for name, length in self.SAMPLE_FILES:
+            tarobj.add(name)
+        tarobj.close()
+
+        self.assertTrue(os.path.exists("sample.tar.gz"))
+        return hashes
+
+    def _remove_sample_files(self):
+        '''
+        Removes the sample files, so that the only copy is inside the tar
+        '''
+        for name, length in self.SAMPLE_FILES:
+            os.unlink(name)
+
+    def _check_rescued(self, hashes, skip=()):
+        '''
+        Checks that the files in hashes exist and have the expected md5sum,
+        ignoring those whose name is in skip
+        '''
+        for key, value in hashes.items():
+            if key in skip:
+                continue
+            self.assertTrue(os.path.exists(key))
+            self.assertEqual(value, self.md5sum(key))
+
+    def test_rescue_ok(self):
+        '''
+        Test rescue_tar when no file is broken, without using multivol tars.
+        '''
+        hashes = self._create_sample_tar()
+        self._remove_sample_files()
+
+        # extract
+        rescue_tar.rescue("sample.tar.gz")
+
+        # check output
+        self._check_rescued(hashes)
+
+    def test_rescue_broken(self):
+        '''
+        Use rescue_tar utility to split the file in compressed tar blocks that
+        are individually decompressed and "untarred", thanks to using the
+        concat gzip tar format. In this case, we simulate that one of the
+        files is corrupted. The rest will decompress just fine.
+        '''
+        hashes = self._create_sample_tar()
+
+        # overwrite stuff near the start of the tar file. The check below
+        # skips "big", the first file added, as the one that gets broken
+        f = open('sample.tar.gz', 'r+b')
+        try:
+            f.seek(100)
+            f.write(b"breaking things")
+        finally:
+            f.close()
+
+        self._remove_sample_files()
+
+        # extract
+        rescue_tar.rescue("sample.tar.gz")
+
+        # check output
+        self._check_rescued(hashes, skip=("big",))
author	Eduardo Robles Elvira <edulix@wadobo.com>
	Wed, 10 Jul 2013 11:00:06 +0000 (13:00 +0200)
committer	Eduardo Robles Elvira <edulix@wadobo.com>
	Wed, 10 Jul 2013 11:00:21 +0000 (13:00 +0200)
filesplit.py		patch \| blob \| blame \| history
rescue_tar.py	[new file with mode: 0644]	patch \| blob
runtests.py		patch \| blob \| blame \| history
testing/__init__.py		patch \| blob \| blame \| history
testing/test_concat_compress.py		patch \| blob \| blame \| history
testing/test_multivol.py		patch \| blob \| blame \| history
testing/test_rescue_tar.py	[new file with mode: 0644]	patch \| blob