adding unit tests for tarfile and fixing some cases that were not working
author Eduardo Robles Elvira <edulix@wadobo.com>
Tue, 18 Jun 2013 10:22:36 +0000 (12:22 +0200)
committer Eduardo Robles Elvira <edulix@wadobo.com>
Tue, 18 Jun 2013 10:22:36 +0000 (12:22 +0200)
deltatar/__init__.py [new file with mode: 0644]
deltatar/tarfile.py [moved from tarfile.py with 98% similarity]
runtests.py [new file with mode: 0644]
tarfile_multivol_example.py [deleted file]
testing/__init__.py [new file with mode: 0644]
testing/test_multivol.py [new file with mode: 0644]

diff --git a/deltatar/__init__.py b/deltatar/__init__.py
new file mode 100644 (file)
index 0000000..8d1c8b6
--- /dev/null
@@ -0,0 +1 @@
diff --git a/tarfile.py b/deltatar/tarfile.py
similarity index 98%
rename from tarfile.py
rename to deltatar/tarfile.py
index 60c9dde..9e146a7 100644 (file)
@@ -1584,6 +1584,11 @@ class TarFile(object):
             self.errorlevel = errorlevel
 
         # Init datastructures.
+        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
+            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
+        if max_volume_size and not callable(new_volume_handler):
+            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
+
         self.max_volume_size = max_volume_size
         self.new_volume_handler = new_volume_handler
         self.closed = False
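
A quick illustration (not part of the commit) of what these new checks reject, assuming tarfile's standard 512-byte BLOCKSIZE, so 3*BLOCKSIZE is 1536:

    from deltatar.tarfile import TarFile

    # volumes smaller than 3*BLOCKSIZE (1536 bytes) are rejected: a volume
    # needs room for at least a header block, one data block and padding
    try:
        TarFile.open("sample.tar", mode="w", max_volume_size=1024,
                     new_volume_handler=lambda *args: None)
    except ValueError as e:
        print(e)

    # multivolume mode without a callable new_volume_handler is rejected
    try:
        TarFile.open("sample.tar", mode="w", max_volume_size=1024**2)
    except ValueError as e:
        print(e)
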
@@ -2026,6 +2031,17 @@ class TarFile(object):
         else:
             self.addfile(tarinfo)
 
+    def _size_left(self):
+        '''
+        Calculates the space left in the current volume; assumes self.max_volume_size is set
+        '''
+        size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
+        # limit size left to a whole number of blocks, because we won't
+        # write only half a block when writing the end of a volume
+        # and filling it with zeros
+        blocks, remainder = divmod(size_left, BLOCKSIZE)
+        return blocks*BLOCKSIZE
+
     def addfile(self, tarinfo, fileobj=None):
         """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
            given, tarinfo.size bytes are read from it and added to the archive.
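
To make the arithmetic in _size_left concrete, here is a standalone rendition using the standard 512-byte BLOCKSIZE (the numbers are hypothetical, for illustration only):

    BLOCKSIZE = 512  # tar's standard block size

    def size_left(max_volume_size, offset):
        # reserve 2 blocks for the end-of-volume marker, then round
        # down to whole blocks, exactly as TarFile._size_left does
        left = max_volume_size - 2 * BLOCKSIZE - offset
        blocks, remainder = divmod(left, BLOCKSIZE)
        return blocks * BLOCKSIZE

    # a 30000-byte volume with 1536 bytes already written:
    # 30000 - 1024 - 1536 = 27440 -> 53 whole blocks -> 27136 bytes
    print(size_left(30000, 1536))  # 27136
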
@@ -2049,7 +2065,7 @@ class TarFile(object):
 
         # handle multivolume support
         if self.max_volume_size:
-            size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
+            size_left = self._size_left()
             max_size_to_write = min(size_left, tarinfo.size - tarinfo.offset_data)
         else:
             size_left = max_size_to_write = tarinfo.size
@@ -2065,17 +2081,22 @@ class TarFile(object):
             # going to be a file split in multiple volumes.
             # if file is going to be split in multiple volumes, having a
             # remainder means that there's no more space left for a block, so
-            # we already need to create a new volume
-            if max_size_to_write == tarinfo.size and remainder > 0:
+            # we already need to create a new volume.
+            if remainder > 0:
                 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                 blocks += 1
 
+            # we already ensured above that when we are doing multivolume,
+            # there is never a remainder, since size_left is block-aligned
+            if self.max_volume_size and max_size_to_write == size_left:
+                assert remainder == 0
+
             self.offset += blocks * BLOCKSIZE
+            size_left -= blocks * BLOCKSIZE
             tarinfo.offset_data += blocks * BLOCKSIZE
-            size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
 
             # check if creating a new volume is needed
-            if size_left <= BLOCKSIZE:
+            if self.max_volume_size and size_left < BLOCKSIZE:
                 tarinfo.type = GNUTYPE_MULTIVOL
 
                 if not self.new_volume_handler or\
@@ -2093,7 +2114,7 @@ class TarFile(object):
                 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                 self.offset += len(buf)
                 self.fileobj.write(buf)
-                size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
+                size_left = self._size_left()
                 max_size_to_write = min(size_left, tarinfo.size - tarinfo.offset_data)
 
         self.members.append(tarinfo)
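
Taken together, these changes make addfile write as much of a member as fits in the current volume (block-aligned), mark it GNUTYPE_MULTIVOL, and continue it on the next volume through new_volume_handler. A minimal write-side sketch, mirroring the handler used by the tests below:

    from deltatar.tarfile import TarFile

    def new_volume_handler(tarobj, base_name, volume_number):
        # invoked by addfile when the current volume fills up; opening
        # the next volume lets the interrupted member continue there
        tarobj.open_volume("%s.%d" % (base_name, volume_number))

    tar = TarFile.open("sample.tar", mode="w",
                       max_volume_size=30000,
                       new_volume_handler=new_volume_handler)
    tar.add("big")   # anything larger than the volume size gets split
    tar.close()
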
diff --git a/runtests.py b/runtests.py
new file mode 100644 (file)
index 0000000..d07a277
--- /dev/null
@@ -0,0 +1,8 @@
+#!/usr/bin/env python
+
+import unittest
+
+from testing.test_multivol import MultivolTest
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file
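
Importing MultivolTest into runtests.py's namespace is what lets unittest.main() find it, since unittest.main() scans the __main__ module for TestCase subclasses; the suite then runs with plain `python runtests.py`. An equivalent explicit form, shown only for clarity:

    import unittest
    from testing.test_multivol import MultivolTest

    suite = unittest.TestLoader().loadTestsFromTestCase(MultivolTest)
    unittest.TextTestRunner(verbosity=2).run(suite)
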
diff --git a/tarfile_multivol_example.py b/tarfile_multivol_example.py
deleted file mode 100644 (file)
index a902bca..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-
-'''
-when reading, the file being read is not going to fail because tar will
-have written the tar file at appropriate sizes, so it's transparent for _Stream
-
-when writing, it will be tarobj who notices that the file is too big, and
-thus it will be tarobj's job to close the current stream and call
-new_volume_handler before continuing to use the stream for writing. But it will
-be still transparent from the stream object's POV.
-
-
-In the case of restarting gzip compression with #gz:
-
-For writing it will be tarobj's job to stop writing the current file and tell
-the _Stream object to handle the new file event. So it will be _Stream's job to
-do that.
-
-For reading it will be tarobj's job to notice the end of a file when reading,
-and call the _Stream object to handle the new file event, in this case for reading.
-
-'''
-
-from tarfile import TarFile
-
-def new_volume_handler(tarobj, base_name, volume_number):
-    volume_path = "%s.%d" % (base_name, volume_number)
-    print "new volume: ", volume_path
-    tarobj.open_volume(volume_path)
-
-
-# write
-tar = TarFile.open("sample.tar",
-                   mode="w",
-                   max_volume_size=(1024**2)*4,
-                   new_volume_handler=new_volume_handler)
-tar.add("big")
-tar.close()
-
-## read
-#tar = tarfile.open("sample.tar.gz",
-                   #mode="r#gz",
-                   #new_volume_handler=new_volume)
-#for name in ["foo", "bar", "quux"]:
-    #tar.add(name)
-#tar.close()
-
-# when creating a
\ No newline at end of file
diff --git a/testing/__init__.py b/testing/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/testing/test_multivol.py b/testing/test_multivol.py
new file mode 100644 (file)
index 0000000..08b1f1e
--- /dev/null
@@ -0,0 +1,159 @@
+import sys, os, unittest, hashlib, random, string
+
+from deltatar.tarfile import TarFile
+
+def new_volume_handler(tarobj, base_name, volume_number):
+    volume_path = "%s.%d" % (base_name, volume_number)
+    tarobj.open_volume(volume_path)
+
+
+class MultivolTest(unittest.TestCase):
+    """Test multivolume support in tarfile"""
+
+    def tearDown(self):
+        os.system("rm -rf big small small2 sample.tar*")
+
+    def create_file(self, path, data):
+        f = open(path, 'w')
+        f.write(data)
+        f.close()
+
+    def create_random_file(self, path, length):
+        f = open(path, 'w')
+        s = string.lowercase + string.digits
+        data = ''.join(random.choice(s) for _ in xrange(length))
+        f.write(data)
+        f.close()
+
+    def md5sum(self, filename):
+        md5 = hashlib.md5()
+        with open(filename,'rb') as f:
+            for chunk in iter(lambda: f.read(128*md5.block_size), b''):
+                md5.update(chunk)
+        return md5.hexdigest()
+
+    def test_no_volume(self):
+        """Test normal tarfile creation, no volumes """
+
+        # create the content of the file to compress and hash it
+        s = "hello" * 10000
+        assert len(s) == 50000
+        self.create_file("big", s)
+        hash = self.md5sum("big")
+
+        # create the tar file, without volumes
+        tarobj = TarFile.open("sample.tar",
+                           mode="w")
+        tarobj.add("big")
+        tarobj.close()
+
+        # check that only a single tar file was created
+        assert os.path.exists("sample.tar")
+        assert not os.path.exists("sample.tar.1")
+
+        os.unlink("big")
+        assert not os.path.exists("big")
+
+        # extract and check
+        os.system("tar xfM sample.tar")
+        assert os.path.exists("big")
+        assert hash == self.md5sum("big")
+
+    def test_volume_creation1(self):
+        """Test volumes creation"""
+
+        # create the content of the file to compress and hash it
+        s = "hello" * 10000
+        assert len(s) == 50000
+        self.create_file("big", s)
+        hash = self.md5sum("big")
+
+        # create the tar file with volumes
+        tarobj = TarFile.open("sample.tar",
+                              mode="w",
+                              max_volume_size=30000,
+                              new_volume_handler=new_volume_handler)
+        tarobj.add("big")
+        tarobj.close()
+
+        # check that the tar volumes were correctly created
+        assert os.path.exists("sample.tar")
+        assert os.path.exists("sample.tar.1")
+        assert not os.path.exists("sample.tar.2")
+
+        os.unlink("big")
+        assert not os.path.exists("big")
+
+        # extract with normal tar and check output
+        os.system("tar xfM sample.tar --file=sample.tar.1")
+        assert os.path.exists("big")
+        assert hash == self.md5sum("big")
+
+    def test_volume_creation2(self):
+        """Test volumes creation with two volumes"""
+
+        # create the content of the file to compress and hash it
+        s = "hello" * 10000
+        assert len(s) == 50000
+        self.create_file("big", s)
+        hash = self.md5sum("big")
+
+        # create the tar file with volumes
+        tarobj = TarFile.open("sample.tar",
+                              mode="w",
+                              max_volume_size=20000,
+                              new_volume_handler=new_volume_handler)
+        tarobj.add("big")
+        tarobj.close()
+
+        # check that the tar volumes were correctly created
+        assert os.path.exists("sample.tar")
+        assert os.path.exists("sample.tar.1")
+        assert os.path.exists("sample.tar.2")
+        assert not os.path.exists("sample.tar.3")
+
+        os.unlink("big")
+        assert not os.path.exists("big")
+
+        # extract with normal tar and check output
+        os.system("tar xfM sample.tar --file=sample.tar.1 --file=sample.tar.2")
+        assert os.path.exists("big")
+        assert hash == self.md5sum("big")
+
+    def test_multiple_files_volume(self):
+        """Test multiple files spread across several volumes"""
+
+        # create sample data
+        hash = dict()
+        self.create_random_file("big", 50000)
+        hash["big"] = self.md5sum("big")
+        self.create_random_file("small", 100)
+        hash["small"] = self.md5sum("small")
+        self.create_random_file("small2", 354)
+        hash["small2"] = self.md5sum("small2")
+
+        # create the tar file with volumes
+        tarobj = TarFile.open("sample.tar",
+                              mode="w",
+                              max_volume_size=20000,
+                              new_volume_handler=new_volume_handler)
+        tarobj.add("big")
+        tarobj.add("small")
+        tarobj.add("small2")
+        tarobj.close()
+
+        # check that the tar volumes were correctly created
+        assert os.path.exists("sample.tar")
+        assert os.path.exists("sample.tar.1")
+        assert os.path.exists("sample.tar.2")
+        assert not os.path.exists("sample.tar.3")
+
+        os.unlink("big")
+        os.unlink("small")
+        os.unlink("small2")
+
+        # extract with normal tar and check output
+        os.system("tar xfM sample.tar --file=sample.tar.1 --file=sample.tar.2")
+        for key, value in hash.iteritems():
+            assert os.path.exists(key)
+            assert value == self.md5sum(key)
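
These tests verify extraction with GNU tar's multivolume mode (tar xfM). The deleted example file above also sketched, though commented out, reading a multivolume archive back through this module itself; a hedged reconstruction of that read path follows. It is not exercised by this suite, and the assumption that the handler is invoked symmetrically on read is mine, not the commit's:

    from deltatar.tarfile import TarFile

    def new_volume_handler(tarobj, base_name, volume_number):
        # assumed to be called when the reader hits the end of a volume
        # in the middle of a member, mirroring the write side
        tarobj.open_volume("%s.%d" % (base_name, volume_number))

    tar = TarFile.open("sample.tar", mode="r",
                       new_volume_handler=new_volume_handler)
    tar.extractall()
    tar.close()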