adding initial concat compression support and a simple unit test

author Eduardo Robles Elvira <edulix@wadobo.com>

Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)

committer Eduardo Robles Elvira <edulix@wadobo.com>

Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)
author Eduardo Robles Elvira <edulix@wadobo.com>
Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)
committer Eduardo Robles Elvira <edulix@wadobo.com>
Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)
diff --git a/deltatar/tarfile.py b/deltatar/tarfile.py

index 5d356e3..e7fc6da 100644 (file)
--- a/deltatar/tarfile.py
+++ b/deltatar/tarfile.py
@@ -37,7 +37,7 @@ version     = "0.9.0"
 __author__  = "Lars Gustäbel (lars@gustaebel.de)"
 __date__    = "$Date$"
 __cvsid__   = "$Id$"
-__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend."
+__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
 
 #---------
 # Imports
@@ -418,6 +418,7 @@ class _Stream:
         self.bufsize  = bufsize
         self.buf      = ""
         self.pos      = 0L
+        self.concat_pos = 0L
         self.closed   = False
 
         if comptype == "gz":
@@ -462,12 +463,41 @@ class _Stream:
             self.name = self.name[:-3]
         self.__write(self.name + NUL)
 
+    def new_compression_block(self):
+        '''
+        Used to notify that a new tar block is coming, so that a new gzip block is started
+        '''
+        if self.mode != "w":
+            raise CompressionError("new compression blocks can only be added in mode 'w'")
+
+        if self.comptype == "gz":
+            self._new_gz_block()
+        else:
+            raise CompressionError("Concat compression only available for comptype 'gz'")
+
+    def _new_gz_block(self):
+        '''
+        Start a new gzip block, closing the previous one
+        '''
+        import zlib
+        self.close(close_fileobj=False)
+        self.closed = False
+        self.concat_pos = 0L
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
+                                            -self.zlib.MAX_WBITS,
+                                            self.zlib.DEF_MEM_LEVEL,
+                                            0)
+        timestamp = struct.pack("<L", long(time.time()))
+        self.__write("\037\213\010\000%s\002\377" % timestamp)
+
     def write(self, s):
         """Write string s to the stream.
         """
         if self.comptype == "gz":
             self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL
         self.pos += len(s)
+        self.concat_pos += len(s)
         if self.comptype != "tar":
             s = self.cmp.compress(s)
         self.__write(s)
@@ -481,7 +511,7 @@ class _Stream:
             self.fileobj.write(self.buf[:self.bufsize])
             self.buf = self.buf[self.bufsize:]
 
-    def close(self):
+    def close(self, close_fileobj=True):
         """Close the _Stream object. No operation should be
            done on it afterwards.
         """
@@ -502,9 +532,9 @@ class _Stream:
                 # To avoid irksome warnings from the `struct` module, force
                 # it to look positive on all boxes.
                 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffffL))
-                self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFFL))
+                self.fileobj.write(struct.pack("<L", self.concat_pos & 0xffffFFFFL))
 
-        if not self._extfileobj:
+        if close_fileobj and not self._extfileobj:
             self.fileobj.close()
 
         self.closed = True
@@ -1545,10 +1575,13 @@ class TarFile(object):
 
     fileobject = ExFileObject   # The default ExFileObject class to use.
 
+    concat_compression = False  # If True, each file is stored in its own gzip
+                                # member, for robustness.
+
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
             errors=None, pax_headers=None, debug=None, errorlevel=None,
-            max_volume_size=None, new_volume_handler=None):
+            max_volume_size=None, new_volume_handler=None, concat_compression=False):
         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
            read from an existing archive, 'a' to append data to an existing
            file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1560,6 +1593,7 @@ class TarFile(object):
         if len(mode) > 1 or mode not in "raw":
             raise ValueError("mode must be 'r', 'a' or 'w'")
         self.mode = mode
+        self.concat_compression = concat_compression
         self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
 
         if not fileobj:
@@ -1746,6 +1780,25 @@ class TarFile(object):
             t._extfileobj = False
             return t
 
+        elif "#" in mode:
+            filemode, comptype = mode.split("#", 1)
+            filemode = filemode or "r"
+            comptype = comptype
+
+            if filemode not in "rw":
+                raise ValueError("mode must be 'r' or 'w'")
+
+            if comptype not in ["gz"]:
+                raise ValueError("comptype must be 'gz'")
+
+            kwargs['concat_compression'] = True
+
+            t = cls(name, filemode,
+                    _Stream(name, filemode, comptype, fileobj, bufsize),
+                    **kwargs)
+            t._extfileobj = False
+            return t
+
         elif mode in "aw":
             return cls.taropen(name, mode, fileobj, **kwargs)
 
@@ -2061,7 +2114,7 @@ class TarFile(object):
         '''
         size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
         # limit size left to a discrete number of blocks, because we won't
-        # write only half a block when writting the end of a volume
+        # write only half a block when writing the end of a volume
         # and filling with zeros
         blocks, remainder = divmod(size_left, BLOCKSIZE)
         return blocks*BLOCKSIZE
@@ -2076,6 +2129,8 @@ class TarFile(object):
         self._check("aw")
 
         tarinfo = copy.copy(tarinfo)
+        if self.concat_compression:
+            self.fileobj.new_compression_block()
 
         buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
         self.fileobj.write(buf)
diff --git a/runtests.py b/runtests.py

index 4610920..9e5f8bb 100644 (file)
--- a/runtests.py
+++ b/runtests.py
@@ -3,6 +3,7 @@
 import unittest
 
 from testing.test_multivol import MultivolGnuFormatTest, MultivolPaxFormatTest
+from testing.test_concat_compress import ConcatCompressTest
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
diff --git a/testing/test_concat_compress.py b/testing/test_concat_compress.py

new file mode 100644 (file)

index 0000000..f125aa2
--- /dev/null
+++ b/testing/test_concat_compress.py
@@ -0,0 +1,63 @@
+import os, unittest, hashlib, string
+
+from deltatar.tarfile import TarFile, PAX_FORMAT, GNU_FORMAT, BLOCKSIZE, _Stream, RECORDSIZE
+
+
+class ConcatCompressTest(unittest.TestCase):
+    """
+    Test concatenated compression in tarfiles
+    """
+
+    def tearDown(self):
+        '''
+        Remove temporary files created by unit tests
+        '''
+        os.system("rm -rf big small small2 sample.tar*")
+
+    def create_file(self, path, length):
+        '''
+        Creates a file with some gibberish inside, returning the md5sum of that
+        file. File path and length are specified as function arguments.
+        '''
+        f = open(path, 'w')
+        s = string.lowercase + string.digits + "\n"
+        if len(s) < length:
+            s += s*(length/len(s))
+        data = s[:length]
+        f.write(data)
+        f.close()
+        return self.md5sum(path)
+
+    def md5sum(self, filename):
+        '''
+        Returns the md5sum of a file specified by its filename/path
+        '''
+        md5 = hashlib.md5()
+        with open(filename,'rb') as f:
+            for chunk in iter(lambda: f.read(128*md5.block_size), b''):
+                md5.update(chunk)
+        return md5.hexdigest()
+
+    def test_zip_compress_concat(self):
+        """
+        Create a tar file with only one file inside, using concat compression
+        mode. Then decompress it with zcat and untar it with gnu tar.
+        """
+
+        # create the content of the file to compress and hash it
+        hash = self.create_file("big", 50000)
+
+        # create the tar file using concat compression
+        tarobj = TarFile.open("sample.tar.gz",
+                              mode="w#gz",
+                              format=GNU_FORMAT,
+                              concat_compression=True)
+        tarobj.add("big")
+        tarobj.close()
+        os.unlink("big")
+
+        # extract with normal tar and check output
+        os.system("zcat sample.tar.gz > sample.tar")
+        os.system("tar xf sample.tar")
+        assert os.path.exists("big")
+        assert hash == self.md5sum("big")
author	Eduardo Robles Elvira <edulix@wadobo.com>
	Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)
committer	Eduardo Robles Elvira <edulix@wadobo.com>
	Wed, 3 Jul 2013 11:25:09 +0000 (13:25 +0200)
deltatar/tarfile.py		patch \| blob \| blame \| history
runtests.py		patch \| blob \| blame \| history
testing/test_concat_compress.py	[new file with mode: 0644]	patch \| blob