Create a streamable version of ZipFile
author Christian Herdtweck <christian.herdtweck@intra2net.com>
Fri, 18 May 2018 13:57:04 +0000 (15:57 +0200)
committer Christian Herdtweck <christian.herdtweck@intra2net.com>
Fri, 18 May 2018 13:57:04 +0000 (15:57 +0200)
Python's ZipFile requires data in memory or on disc in order to compress it.
This module contains class ZipStream that extends ZipFile to allow read-only,
non-seekable streams as input.

For python < 3.5 this requires python-zipfile35; implementation is MUCH simpler
for python >= 3.6

src/zip_stream.py [new file with mode: 0644]

diff --git a/src/zip_stream.py b/src/zip_stream.py
new file mode 100644 (file)
index 0000000..36e46ba
--- /dev/null
@@ -0,0 +1,205 @@
+""" Streamable version of zipfile
+
+Python's :py:class:`zipfile.ZipFile` can write to non-seekable streams only
+since version 3.5 and only implements adding files as wholes. This module
+implements class :py:class:`ZipStream` which is a subclass of ZipFile that can
+read from non-seekable input streams and write to non-seekable output streams.
+
+.. codeauthor:: Intra2net AG <info@intra2net>
+"""
+
+import sys
+import os
+
+if sys.version_info.major >= 3 and sys.version_info.minor >= 6:
+    from zipfile import *
+    import shutil
+elif sys.version_info.major >= 3 and sys.version_info.minor >= 5:
+    from stat import S_ISDIR
+    import time
+    import zlib
+    crc32 = zlib.crc32
+    import bz2
+    import struct
+    from zipfile import *
+else:
+    from stat import S_ISDIR
+    import time
+    import zlib
+    crc32 = zlib.crc32
+    import bz2
+    import struct
+    # backport of zipfile from python 3.5; works at least for py3.3
+    from zipfile35 import *
+from type_helpers import isstr
+
+# Size threshold used by ZipStream._write_stream_35 to decide whether an
+# entry needs ZIP64 headers.
+# NOTE(review): presumably redefined here because `from zipfile import *`
+# does not export zipfile's own ZIP64_LIMIT -- confirm.
+ZIP64_LIMIT = (1 << 31) - 1
+
+def _get_compressor(compress_type):
+    """
+    Create the compressor object for a zip compression method.
+
+    Copied from zipfile.py in py3.5, where it is a private helper that is
+    not provided by ``from zipfile import *``.
+
+    :param compress_type: one of the ``ZIP_*`` compression constants
+    :returns: compressor object offering ``compress()`` and ``flush()``, or
+              ``None`` if the data is to be stored without compression
+              (ZIP_STORED or any unrecognised value)
+    """
+    if compress_type == ZIP_DEFLATED:
+        # negative wbits: raw deflate stream without zlib header/trailer
+        return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+                                zlib.DEFLATED, -15)
+    elif compress_type == ZIP_BZIP2:
+        return bz2.BZ2Compressor()
+    elif compress_type == ZIP_LZMA:
+        # LZMACompressor is a helper class of zipfile that is not listed in
+        # zipfile.__all__, so the star import at module level does not bring
+        # it into scope; import it explicitly to avoid a NameError.
+        from zipfile import LZMACompressor
+        return LZMACompressor()
+    else:
+        return None
+
+class ZipStream(ZipFile):
+    """
+    Subclass of ZipFile that supports non-seekable input and output
+
+    Typical use: create a :py:class:`ZipInfo` with :py:meth:`create_zipinfo`
+    and hand it together with a readable byte stream to
+    :py:meth:`write_stream`.
+    """
+
+    def create_zipinfo(self, filename, arcname=None):
+        """
+        Create ZipInfo for given file
+
+        Optionally set arcname as name of file inside archive.
+
+        Adapted from zipfile.py in (ZipInfo.from_file in py3.6, ZipFile.write
+        in py3.5)
+
+        :param filename: path of an existing file; it is stat'ed to obtain
+                         modification time, mode and size
+        :param arcname: name of the entry inside the archive; defaults to
+                        filename (without drive and leading separators)
+        :returns: ZipInfo whose compress_type is the compression of this
+                  archive (ZIP_STORED for directories)
+        """
+        if sys.version_info.major >= 3 and sys.version_info.minor >= 6:
+            zinfo = ZipInfo.from_file(filename, arcname)
+            # from_file leaves compress_type at the ZipInfo default
+            # (ZIP_STORED). Apply the compression of this archive, like
+            # ZipFile.write does and like the py3.5 code below; otherwise
+            # streamed entries would never be compressed.
+            if not zinfo.is_dir():
+                zinfo.compress_type = self.compression
+            return zinfo
+
+        st = os.stat(filename)
+        isdir = S_ISDIR(st.st_mode)
+        mtime = time.localtime(st.st_mtime)
+        date_time = mtime[0:6]
+        # Create ZipInfo instance to store file information
+        if arcname is None:
+            arcname = filename
+        # drop drive letter and leading path separators from archive name
+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
+        while arcname[0] in (os.sep, os.altsep):
+            arcname = arcname[1:]
+        if isdir:
+            arcname += '/'
+        zinfo = ZipInfo(arcname, date_time)
+        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes
+        if isdir:
+            zinfo.compress_type = ZIP_STORED
+            zinfo.file_size = 0
+            zinfo.external_attr |= 0x10  # MS-DOS directory flag
+        else:
+            zinfo.compress_type = self.compression
+            zinfo.file_size = st.st_size
+
+        return zinfo
+
+    def write_stream(self, src, zinfo):
+        """
+        Add data from byte stream src to archive with info in ZipInfo.
+
+        Param zinfo must be a ZipInfo, created e.g. with
+        :py:meth:`ZipStream.create_zipinfo`
+
+        Note: you cannot add directories this way (removed the corresponding
+        code).
+
+        This is a shortened version of python's
+        :py:func:`zipfile.ZipFile.write`.
+
+        :param src: object with a ``read(size)`` method returning bytes; it
+                    is read until it returns an empty result
+        :param zinfo: ZipInfo describing the new archive entry
+        :raises ValueError: if zinfo describes a directory
+        """
+        if sys.version_info.major >= 3 and sys.version_info.minor >= 6:
+            return self._write_stream_36(src, zinfo)
+        else:
+            return self._write_stream_35(src, zinfo)
+
+    def _write_stream_35(self, src, zinfo):
+        """
+        Implementation of _write_stream based on ZipFile.write (py 3.5)
+
+        Writes the local file header first, then the (compressed) data in
+        chunks and finally CRC and sizes, since these are only known after
+        src has been read completely.
+
+        :raises RuntimeError: if the archive was already closed
+        :raises ValueError: if zinfo describes a directory
+        """
+        if not self.fp:
+            raise RuntimeError(
+                "Attempt to write to ZIP archive that was already closed")
+        # Same restriction as in _write_stream_36; ZipInfo.is_dir() does not
+        # exist before py3.6, so test the trailing slash directly.
+        if zinfo.filename.endswith('/'):
+            raise ValueError('streaming a dir entry does not make sense')
+
+        zinfo.flag_bits = 0x00
+
+        with self._lock:
+            zinfo.header_offset = self.fp.tell()    # Start of header bytes
+            if zinfo.compress_type == ZIP_LZMA:
+                # Compressed data includes an end-of-stream (EOS) marker
+                zinfo.flag_bits |= 0x02
+
+            self._writecheck(zinfo)
+            self._didModify = True
+
+            cmpr = _get_compressor(zinfo.compress_type)
+            # Flag bit 3: CRC and sizes follow the data instead of being
+            # part of the header (the header cannot be rewritten later on a
+            # non-seekable output)
+            zinfo.flag_bits |= 0x08
+
+            # Must overwrite CRC and sizes with correct data later
+            zinfo.CRC = CRC = 0
+            zinfo.compress_size = compress_size = 0
+            # Compressed size can be larger than uncompressed size.
+            # The decision is based on the size announced in zinfo, the real
+            # amount of data in src is not known yet.
+            zip64 = self._allowZip64 and \
+                zinfo.file_size * 1.05 > ZIP64_LIMIT
+            self.fp.write(zinfo.FileHeader(zip64))
+            file_size = 0
+            while True:
+                buf = src.read(1024 * 8)
+                if not buf:
+                    break
+                file_size += len(buf)
+                # checksum is calculated over the uncompressed data
+                CRC = crc32(buf, CRC)
+                if cmpr:
+                    buf = cmpr.compress(buf)
+                    compress_size += len(buf)
+                self.fp.write(buf)
+            if cmpr:
+                # write whatever the compressor still holds back
+                buf = cmpr.flush()
+                compress_size += len(buf)
+                self.fp.write(buf)
+                zinfo.compress_size = compress_size
+            else:
+                zinfo.compress_size = file_size
+            zinfo.CRC = CRC
+            zinfo.file_size = file_size
+
+            # Write CRC and file sizes after the file data
+            fmt = '<LQQ' if zip64 else '<LLL'
+            self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size,
+                                      zinfo.file_size))
+            # the central directory will start after this entry
+            self.start_dir = self.fp.tell()
+            self.filelist.append(zinfo)
+            self.NameToInfo[zinfo.filename] = zinfo
+
+    def _write_stream_36(self, src, zinfo):
+        """
+        Implementation of _write_stream based on ZipFile.write (py 3.6)
+
+        Opens the archive entry for writing and copies src into it in chunks.
+
+        :raises ValueError: if the archive was already closed, if another
+                            entry is still open for writing or if zinfo
+                            describes a directory
+        """
+        if not self.fp:
+            raise ValueError(
+                "Attempt to write to ZIP archive that was already closed")
+        if self._writing:
+            raise ValueError(
+                "Can't write to ZIP archive while an open writing handle exists"
+            )
+
+        if zinfo.is_dir():
+            raise ValueError('streaming a dir entry does not make sense')
+        if zinfo.compress_type is None:
+            zinfo.compress_type = self.compression
+
+        with self.open(zinfo, 'w') as dest:
+            shutil.copyfileobj(src, dest, 1024*8)
+
+
+if __name__ == '__main__':
+    # Small manual test: pack all files named on the command line into a
+    # zip archive that is written to stdout. Files ending in '.gz' are
+    # decompressed on the fly and stored under their name without '.gz'.
+    # All diagnostics go to stderr since stdout carries the archive.
+    import gzip
+
+    print('[stderr] Py version is {}, ZipFile is {}, Encoding is {}, is a tty: {}'
+          .format(sys.version, ZipFile, sys.stdout.encoding,
+                  sys.stdout.isatty()),
+          file=sys.stderr)
+
+    # sys.stdout.buffer is the binary stream underneath sys.stdout
+    with ZipStream(sys.stdout.buffer, 'w',
+                   compression=ZIP_DEFLATED) as archive:
+        for arg in sys.argv[1:]:
+            basename = os.path.basename(arg)
+            if arg.endswith('.gz'):
+                print('[stderr] reading from compressed file {}'.format(arg),
+                      file=sys.stderr)
+                # NOTE: the info is created from the .gz file itself, so the
+                # size it announces is that of the compressed input, not of
+                # the decompressed data streamed below
+                info = archive.create_zipinfo(arg, basename[:-3])
+                with gzip.open(arg, 'rb') as reader:
+                    archive.write_stream(reader, info)
+            else:
+                print('[stderr] reading from regular file {}'.format(arg),
+                      file=sys.stderr)
+                info = archive.create_zipinfo(arg, basename)
+                with open(arg, 'rb') as reader:
+                    archive.write_stream(reader, info)