changed TarFile.addfile to get better sized volumes if compressing
author Christian Herdtweck <christian.herdtweck@intra2net.com>
Thu, 9 Jun 2016 15:58:04 +0000 (17:58 +0200)
committer Christian Herdtweck <christian.herdtweck@intra2net.com>
Wed, 15 Jun 2016 11:18:02 +0000 (13:18 +0200)
deltatar/tarfile.py

index 230b47e..e0eacde 100644 (file)
@@ -2393,6 +2393,13 @@ class TarFile(object):
         self.fileobj.write(buf)
         self.offset += len(buf)
 
+        if self.max_volume_size:
+            if isinstance(self.fileobj, _Stream):
+                _size_left = self._size_left_stream
+            else:
+                _size_left = self._size_left_file
+        else:
+            _size_left = lambda: tarinfo.size
 
         # If there's no data to follow, finish
         if not fileobj:
@@ -2400,71 +2407,73 @@ class TarFile(object):
                 self.members.append(tarinfo)
             return
 
-        # handle multivolume support
-        if self.max_volume_size:
-            size_left = self._size_left()
-            # we only split volumes in the middle of a file, that means we have
-            # to write at least one block
-            if size_left < BLOCKSIZE:
-                size_left = BLOCKSIZE
-            max_size_to_write = min(size_left, tarinfo.size - tarinfo.volume_offset)
-        else:
-            size_left = max_size_to_write = tarinfo.size
-
-        # iterate, one iteration per volume (usually only one volume)
-        while tarinfo.volume_offset < tarinfo.size:
-            copyfileobj(fileobj, self.fileobj, max_size_to_write)
-            blocks, remainder = divmod(max_size_to_write, BLOCKSIZE)
-
-            # only fill with zeros the remainder in a block if it's not
-            # going to be a file splitted in multiple volumes.
-            # if file is going to be split in multiple volumes, having a
-            # remainder means that there's no more space left for a block, so
-            # we already need to create a new volume.
-            if remainder > 0:
-                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
-                blocks += 1
+        target_size_left = _size_left()
+        source_size_left = tarinfo.size
+        assert tarinfo.volume_offset == 0
+
+        # we only split volumes in the middle of a file, that means we have
+        # to write at least one block
+        if target_size_left < BLOCKSIZE:
+            target_size_left = BLOCKSIZE
 
-            # we already assured previously that if we are doing multivolume,
-            # there's not going to be a remainder
-            if self.max_volume_size and max_size_to_write == size_left:
-                assert remainder == 0
+        # loop over multiple volumes
+        while source_size_left > 0:
 
+            # Write as much data as possible from source into target.
+            # When compressing data, we cannot easily predict how much data we
+            # can write until target_size_left == 0 --> need to iterate
+            size_can_write = min(target_size_left, source_size_left)
 
-            self.offset += blocks * BLOCKSIZE
-            size_left -= blocks * BLOCKSIZE
-            tarinfo.volume_offset += blocks * BLOCKSIZE
+            while size_can_write > 0:
+                copyfileobj(fileobj, self.fileobj, size_can_write)
+                self.offset += size_can_write
+                source_size_left -= size_can_write
+                target_size_left = _size_left()
+                size_can_write = min(target_size_left, source_size_left)
 
-            # check if creating a new volume is needed
-            if tarinfo.volume_offset < tarinfo.size and\
-                self.max_volume_size and size_left < 3*BLOCKSIZE:
+            # now target_size_left == 0 or source_size_left == 0
+
+            # if there is data left to write, we need to create a new volume
+            if source_size_left > 0:
 
                 tarinfo.type = GNUTYPE_MULTIVOL
 
                 if not self.new_volume_handler or\
                     not callable(self.new_volume_handler):
                     raise Exception("We need to create a new volume and you "
-                        "didn't supply a new_volume_handler")
+                                    "didn't supply a new_volume_handler")
 
                 # the new volume handler should do everything needed to
                 # start working in a new volume. usually, the handler calls
                 # to self.open_volume
                 self.volume_number += 1
 
-                # set to be used by open_volume, becuase in the case of a PAX
+                # set to be used by open_volume, because in the case of a PAX
                 # tar it needs to write information about the volume and offset
                 # in the global header
+                tarinfo.volume_offset = tarinfo.size - source_size_left
                 self.volume_tarinfo = tarinfo
+
                 self.new_volume_handler(self, self.base_name, self.volume_number)
 
                 self.volume_tarinfo = None
 
                 # write new volume header
                 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
-                self.offset += len(buf)
                 self.fileobj.write(buf)
-                size_left = self._size_left()
-                max_size_to_write = min(size_left, tarinfo.size - tarinfo.volume_offset)
+                self.offset += len(buf)
+
+                # adjust variables; open_volume should have reset self.offset
+                # --> _size_left should be big again
+                target_size_left = _size_left()
+                size_can_write = min(target_size_left, source_size_left)
+
+        # now, all data has been written. We may have to fill up the rest of
+        # the block in target with 0s
+        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
+        if remainder > 0:
+            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
+            self.offset += BLOCKSIZE - remainder
 
         if self.save_to_members:
             self.members.append(tarinfo)