implement tolerant gz header parser

author Philipp Gesang <philipp.gesang@intra2net.com>

Thu, 24 Aug 2017 15:24:36 +0000 (17:24 +0200)

committer Thomas Jarosch <thomas.jarosch@intra2net.com>

Mon, 2 Apr 2018 11:34:09 +0000 (13:34 +0200)
author Philipp Gesang <philipp.gesang@intra2net.com>
Thu, 24 Aug 2017 15:24:36 +0000 (17:24 +0200)
committer Thomas Jarosch <thomas.jarosch@intra2net.com>
Mon, 2 Apr 2018 11:34:09 +0000 (13:34 +0200)
diff --git a/deltatar/tarfile.py b/deltatar/tarfile.py

index cd77208..d44b100 100644 (file)
--- a/deltatar/tarfile.py
+++ b/deltatar/tarfile.py
@@ -42,17 +42,18 @@ __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robl
 # Imports
 #---------
 import binascii
-import sys
-import os
+import copy
+import errno
 import io
+import mmap
+import operator
+import os
+import re
 import shutil
 import stat
-import errno
-import time
 import struct
-import copy
-import re
-import operator
+import sys
+import time
 
 import traceback # XXX
 
@@ -119,7 +120,12 @@ GZ_FMT_HEADER        = b"<BBBBLBB"
 GZ_HEADER_SIZE       = 10   # not including the name
 GZ_MAGIC             = (0x1f, 0x8b) # 0o37, 0o213
 GZ_METHOD_DEFLATE    = 0x08 # 0o10
-GZ_FLAG_ORIG_NAME    = 0x08 # 0o10, default in gzip
+GZ_FLAG_FTEXT        = 1 << 0 # ASCII payload
+GZ_FLAG_FHCRC        = 1 << 1 # CRC16
+GZ_FLAG_FEXTRA       = 1 << 2 # extra field
+GZ_FLAG_FNAME        = 1 << 3 # set by default in gzip
+GZ_FLAG_FCOMMENT     = 1 << 4 # NUL-terminated comment
+GZ_FLAG_RESERVED     = 7 << 5 # unassigned
 GZ_DEFLATE_FLAGS     = 0x00 # 0o00, never read (deflate.c)
 GZ_OS_CODE           = 0x03 # 0o03, default in gzip (tailor.h)
 GZ_MAGIC_BYTES       = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
@@ -130,6 +136,8 @@ TOLERANCE_STRICT  = 0
 TOLERANCE_RECOVER = 1 # rely on offsets in index
 TOLERANCE_RESCUE  = 2 # deduce metadata from archive contents
 
+BUFSIZE           = 16 * 1024
+
 #---------------------------------------------------------
 # archive handling mode
 #---------------------------------------------------------
@@ -309,7 +317,6 @@ def copyfileobj(src, dst, length=None):
         shutil.copyfileobj(src, dst)
         return
 
-    BUFSIZE = 16 * 1024
     blocks, remainder = divmod(length, BUFSIZE)
     for b in range(blocks):
         buf = src.read(BUFSIZE)
@@ -430,7 +437,7 @@ def gz_header (name=None):
     if name is None:
         name = b""
     else:
-        flags |= GZ_FLAG_ORIG_NAME
+        flags |= GZ_FLAG_FNAME
         if type(name) is str:
             name = name.encode("iso-8859-1", "replace")
         if name.endswith(b".pdtcrypt"):
@@ -830,27 +837,27 @@ class _Stream:
         if read2 != GZ_MAGIC_BYTES:
             raise ReadError("not a gzip file")
 
-        read1 = self.__read(1)
-        if read1 != b"\010":
+        read1 = ord (self.__read(1))
+        if read1 != GZ_METHOD_DEFLATE:
             raise CompressionError("unsupported compression method")
 
         self.flags = flag = ord(self.__read(1))
-        self.__read(6)
+        self.__read(6) # discard timestamp[4], deflate flags, os code
 
-        if flag & 4:
+        if flag & GZ_FLAG_FEXTRA:
             xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
             self.read(xlen)
-        if flag & 8:
+        if flag & GZ_FLAG_FNAME:
             while True:
                 s = self.__read(1)
                 if not s or s == NUL:
                     break
-        if flag & 16:
+        if flag & GZ_FLAG_FCOMMENT:
             while True:
                 s = self.__read(1)
                 if not s or s == NUL:
                     break
-        if flag & 2:
+        if flag & GZ_FLAG_FHCRC:
             self.__read(2)
 
     def _init_read_encrypt (self):
@@ -3308,17 +3315,227 @@ class TarIter:
 # support functionality for rescue mode
 #---------------------------------------------------------
 
+def locate_gz_hdr_candidates (fd):
+    """
+    Walk over instances of the GZ magic in the payload, collecting their
+    positions. If the offset of the first found instance is not zero, the file
+    begins with leading garbage.
+
+    Note that since the GZ magic consists of only two bytes, we expect a lot of
+    false positives inside binary data.
+
+    :param fd:  File descriptor of the file to scan. The whole file is mapped
+                read-only for the duration of the call; the descriptor itself
+                is left open.
+    :return:    The list of offsets in the file, in ascending order. Empty if
+                the file has zero length or does not contain the magic.
+    """
+    cands = []
+
+    if os.fstat (fd).st_size == 0:
+        # mmap refuses to map an empty file (ValueError); there is nothing
+        # to be found in it anyways
+        return cands
+
+    mm = mmap.mmap (fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
+    try:
+        pos = mm.find (GZ_MAGIC_BYTES)
+        while pos != -1:
+            cands.append (pos)
+            # resume the search right behind the match
+            pos = mm.find (GZ_MAGIC_BYTES, pos + len (GZ_MAGIC_BYTES))
+    finally:
+        mm.close () # release the mapping even if the scan fails
+
+    return cands
+
+
+# verdicts returned by inspect_gz_hdr() for a header candidate
+HDR_CAND_GOOD       = 0 # header marks begin of valid object
+HDR_CAND_FISHY      = 1 # inconclusive
+HDR_CAND_JUNK       = 2 # not a header / object unreadable
+
+
+def read_cstring (fd, max=-1, encoding=None):
+    """
+    Read one NUL-terminated string from *fd* into a Python string. If *max* is
+    non-negative, reading will terminate after the specified number of bytes.
+
+    The string is read one byte at a time so that the file position ends up
+    exactly behind the terminating NUL.
+
+    Optionally, an *encoding* may be specified to interpret the data as.
+
+    :param fd:       File descriptor positioned at the first byte of the
+                     string.
+    :param max:      Length limit in bytes; negative means unlimited. The
+                     limit is checked before a byte is appended, so a string
+                     of *max* + 1 bytes is still accepted.
+    :param encoding: Codec to decode the bytes with; *None* returns the raw
+                     bytes.
+    :returns: *None* if parsing failed (EOF before the terminating NUL) or
+              the maximum number of bytes has been exceeded; the data
+              without the terminator otherwise.
+    """
+    buf = bytearray ()
+
+    while True:
+        c = os.read (fd, 1)
+        if not c:
+            # EOF without terminator; an empty read never equals NUL, so
+            # without this check an unlimited read would spin forever
+            return None
+        if c == NUL:
+            break
+        if max >= 0 and len (buf) > max:
+            return None
+        buf += c
+
+    buf = bytes (buf)
+    if encoding is not None:
+        return buf.decode (encoding)
+
+    return buf
+
+
+def inspect_gz_hdr (fd, off):
+    """
+    Attempt to parse a Gzip header in *fd* at position *off*. The format is
+    documented as RFC1952.
+
+    Returns a verdict about the quality of that header plus the parsed header
+    when readable. Problematic sizes such as fields running past the EOF are
+    treated as garbage. Properties in which the header merely doesn’t conform
+    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
+    validation is possible on embedded strings because they are single-byte
+    encoded.
+
+    :param fd:  File descriptor of the archive; its file position is moved.
+    :param off: Offset of the first magic byte of the candidate.
+    :returns:   A pair *(verdict, hdr)*. *verdict* is one of the *HDR_CAND_**
+                constants. *hdr* is *None* for junk; otherwise a dict with
+                the keys *fname*, *flags*, *dflags*, *mtime*, *oscode* and
+                *hlen*, the latter being the total header length in bytes.
+    """
+    fname   = None
+    verdict = HDR_CAND_GOOD
+
+    if os.lseek (fd, off, os.SEEK_SET) != off:
+        return HDR_CAND_JUNK, None
+
+    raw = os.read (fd, GZ_HEADER_SIZE)
+    if len (raw) != GZ_HEADER_SIZE:
+        return HDR_CAND_JUNK, None
+
+    try:
+        m1, m2, meth, flags, mtime, dflags, oscode = \
+            struct.unpack (GZ_FMT_HEADER, raw)
+    except struct.error:
+        return HDR_CAND_JUNK, None
+
+    if (m1, m2) != GZ_MAGIC: # caller handed us something else entirely
+        return HDR_CAND_JUNK, None
+    if meth != GZ_METHOD_DEFLATE: # only deflate is supported
+        return HDR_CAND_JUNK, None
+
+    if mtime > int (time.time ()): # timestamp from the future
+        verdict = HDR_CAND_FISHY
+
+    if dflags != GZ_DEFLATE_FLAGS:
+        verdict = HDR_CAND_FISHY
+
+    if oscode != GZ_OS_CODE:
+        verdict = HDR_CAND_FISHY
+
+    if flags & GZ_FLAG_FTEXT: # created by some contrarian
+        verdict = HDR_CAND_FISHY
+    if flags & GZ_FLAG_FEXTRA:
+        rawlen = os.read (fd, 2)
+        if len (rawlen) != 2: # eof inside header
+            return HDR_CAND_JUNK, None
+        # unpack always yields a tuple, even for a single field
+        xlen, = struct.unpack ("<H", rawlen)
+        xtra = os.read (fd, xlen)
+        if len (xtra) != xlen: # eof inside header
+            return HDR_CAND_JUNK, None
+    if flags & GZ_FLAG_FNAME:
+        # read up to the next NUL byte, not exceeding the maximum path length
+        # allowed by tar(5)
+        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
+                              encoding="iso-8859-1")
+        if fname is None:
+            return HDR_CAND_JUNK, None
+    if flags & GZ_FLAG_FCOMMENT:
+        # the comment is only skipped over; it must not clobber the file name
+        comment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
+                                encoding="iso-8859-1")
+        if comment is None:
+            return HDR_CAND_JUNK, None
+    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
+        crc16 = os.read (fd, 2)
+        if len (crc16) != 2: # eof inside header
+            return HDR_CAND_JUNK, None
+    if flags & GZ_FLAG_RESERVED:
+        # according to the RFC, these must not be set
+        verdict = HDR_CAND_FISHY
+
+    # everything consumed so far belongs to the header
+    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off
+
+    return verdict, \
+           { "fname"  : fname
+           , "flags"  : flags
+           , "dflags" : dflags
+           , "mtime"  : mtime
+           , "oscode" : oscode
+           , "hlen"   : hlen
+           }
+
+
+def try_decompress (ifd, off, hdr):
+    """
+    Attempt to inflate the raw deflate stream starting at *off* in *ifd*.
+
+    Input is fed to the decompressor in chunks of *BUFSIZE* bytes until the
+    stream ends, the data turns out to be undecodable, or the file runs out.
+    The decompressed data itself is thrown away; only sizes are tracked.
+
+    :param ifd: File descriptor of the archive; its file position is moved.
+    :param off: Offset of the first byte behind the gzip header.
+    :param hdr: Parsed header of the object; currently not consulted.
+    :returns:   A pair containing the size of the decompressed data and
+                the length of the input consumed. Note that the latter value
+                may exceed the length of the compressed data because input
+                is read in whole chunks, not up to the end of the object.
+    """
+    import zlib
+    # negative window bits select a raw stream without header or trailer
+    inflater = zlib.decompressobj (-zlib.MAX_WBITS)
+    consumed = 0 # bytes of input handed to the decompressor
+    produced = 0 # size of decompressed data
+    finished = False
+
+    os.lseek (ifd, off, os.SEEK_SET)
+    while not finished:
+        chunk = os.read (ifd, BUFSIZE)
+        consumed += len (chunk)
+        try:
+            produced += len (inflater.decompress (chunk))
+        except zlib.error: # undecodable input; terminate softly
+            break # fishy
+        # stop at the end of the stream or on a short read, the latter
+        # meaning eof, but not end of decompressed object (fishy)
+        finished = inflater.eof or len (chunk) != BUFSIZE
+
+    return produced, consumed
+
+def readable_gz_objects_offsets (ifd, cands):
+    """
+    Filter the header candidates *cands* down to those offsets in *ifd* at
+    which a gzipped object can actually be read.
+
+    A candidate is kept if its header is not junk and decompressing the
+    data behind the header both consumes input and yields output.
+
+    :param ifd:   File descriptor of the archive.
+    :param cands: Iterable of offsets at which the GZ magic was found.
+    :returns:     The list of accepted offsets, in the order of *cands*.
+    """
+    usable = (HDR_CAND_GOOD, HDR_CAND_FISHY)
+    good   = []
+
+    for cand in cands:
+        vdt, hdr = inspect_gz_hdr (ifd, cand)
+        if vdt not in usable:
+            continue # ignore unreadable ones
+        # the compressed payload starts right behind the header
+        dlen, clen = try_decompress (ifd, cand + hdr ["hlen"], hdr)
+        if dlen > 0 and clen > 0:
+            good.append (cand)
+
+    return good
+
+
+def reconstruct_offsets_gz (fname):
+    """
+    Recover the offsets of gzipped objects in the file *fname*.
+
+    First all GZ header-like offsets (“candidates”) are collected, then each
+    of those locations is checked whether it can be processed as compressed
+    data. The file is opened read-only and closed again before returning,
+    also in case of an error.
+
+    :returns: The list of offsets that passed the check.
+    """
+    fd = os.open (fname, os.O_RDONLY)
+
+    try:
+        offsets = readable_gz_objects_offsets \
+                        (fd, locate_gz_hdr_candidates (fd))
+    finally:
+        os.close (fd)
+
+    return offsets
+
+
 def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
     decr = None
-    ks   = secret [0]
 
-    if ks == crypto.PDTCRYPT_SECRET_PW:
-        decr = crypto.Decrypt (password=secret [1])
-    elif ks == crypto.PDTCRYPT_SECRET_KEY:
-        key = binascii.unhexlify (secret [1])
-        decr = crypto.Decrypt (key=key)
-    else:
-        raise RuntimeError
+    if secret is not None:
+        ks   = secret [0]
+
+        if ks == crypto.PDTCRYPT_SECRET_PW:
+            decr = crypto.Decrypt (password=secret [1])
+        elif ks == crypto.PDTCRYPT_SECRET_KEY:
+            key = binascii.unhexlify (secret [1])
+            decr = crypto.Decrypt (key=key)
+        else:
+            raise RuntimeError
 
     tarobj = \
         TarFile.open_at_offset (offset,
@@ -3374,7 +3591,6 @@ def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
     psidx   = [] # pseudo index, return value
     offsets = None
     secret  = None
-    mode    = "r" + mode
 
     if password is not None:
         secret = (crypto.PDTCRYPT_SECRET_PW, password)
@@ -3383,14 +3599,17 @@ def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
 
     if secret is not None:
         offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
-        fileobj = bltn_open (backup_tar_path, "rb")
-        infos   = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
-                    for off in offsets ]
-        def aux (o, ti):
-            ie = idxent_of_tarinfo (ti)
-            ie ["offset"] = o
-            return ie
-        psidx   = [ aux (o, ti) for o, ti in infos ]
+    elif mode == "#gz":
+        offsets = reconstruct_offsets_gz (backup_tar_path)
+
+    fileobj = bltn_open (backup_tar_path, "rb")
+    infos   = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
+                for off in offsets ]
+    def aux (o, ti):
+        ie = idxent_of_tarinfo (ti)
+        ie ["offset"] = o
+        return ie
+    psidx   = [ aux (o, ti) for o, ti in infos ]
 
     return psidx
author	Philipp Gesang <philipp.gesang@intra2net.com>
	Thu, 24 Aug 2017 15:24:36 +0000 (17:24 +0200)
committer	Thomas Jarosch <thomas.jarosch@intra2net.com>
	Mon, 2 Apr 2018 11:34:09 +0000 (13:34 +0200)