From dfd7865ec6815df79cff6a853be0d749acc420d3 Mon Sep 17 00:00:00 2001 From: Philipp Gesang Date: Thu, 24 Aug 2017 17:24:36 +0200 Subject: [PATCH] implement tolerant gz header parser MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Since they assume a stream object, we cannot rely on the original tarfile GZ handling. Add a “tolerant” one according to the format spec that notices malformed or unexpected (in Deltatar context) values, but glosses over them if they do not necessarily impact the readability of the object. Also use the new symbolic constants in the existing GZ reader instead of magic numbers. --- deltatar/tarfile.py | 287 +++++++++++++++++++++++++++++++++++++++++++++------ 1 files changed, 253 insertions(+), 34 deletions(-) diff --git a/deltatar/tarfile.py b/deltatar/tarfile.py index cd77208..d44b100 100644 --- a/deltatar/tarfile.py +++ b/deltatar/tarfile.py @@ -42,17 +42,18 @@ __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robl # Imports #--------- import binascii -import sys -import os +import copy +import errno import io +import mmap +import operator +import os +import re import shutil import stat -import errno -import time import struct -import copy -import re -import operator +import sys +import time import traceback # XXX @@ -119,7 +120,12 @@ GZ_FMT_HEADER = b"= 0 and l > max: + return None + buf += c + l += 1 + if encoding is not None: + buf = buf.decode (encoding) + + return buf + + +def inspect_gz_hdr (fd, off): + """ + Attempt to parse a Gzip header in *fd* at position *off*. The format is + documented as RFC1952. + + Returns a verdict about the quality of that header plus the parsed header + when readable. Problematic sizes such as fields running past the EOF are + treated as garbage. Properties in which the header merely doesn’t conform + to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No + validation is possible on embedded strings because they are single-byte + encoded. + """ + fname = None + flags = 0x00 + dflags = 0x00 + mtime = 0x00000000 + oscode = 0x00 + verdict = HDR_CAND_GOOD + + os.lseek (fd, off, os.SEEK_SET) + if os.lseek (fd, 0, os.SEEK_CUR) != off: + return HDR_CAND_JUNK, None + + raw = os.read (fd, GZ_HEADER_SIZE) + if len (raw) != GZ_HEADER_SIZE: + return HDR_CAND_JUNK, None + + flags = 0x0 + try: + _m1, _m2, meth, flags, mtime, dflags, oscode = \ + struct.unpack (GZ_FMT_HEADER, raw) + if meth != GZ_METHOD_DEFLATE: # only deflate is supported + return HDR_CAND_JUNK, None + except struct.error as exn: + return HDR_CAND_JUNK, None + + if mtime > int (time.time ()): + verdict = HDR_CAND_FISHY + + if dflags != GZ_DEFLATE_FLAGS: + verdict = HDR_CAND_FISHY + + if oscode != GZ_OS_CODE: + verdict = HDR_CAND_FISHY + + if flags & GZ_FLAG_FTEXT: # created by some contrarian + verdict = HDR_CAND_FISHY + if flags & GZ_FLAG_FEXTRA: + xlen = struct.unpack (" 0 and clen > 0: + good.append (cand) + + return good + + +def reconstruct_offsets_gz (fname): + """ + From the given file, retrieve all GZ header-like offsets (“candidates”). + Then check each of those locations whether they can be processed as + compressed data. + """ + ifd = os.open (fname, os.O_RDONLY) + + try: + cands = locate_gz_hdr_candidates (ifd) + return readable_gz_objects_offsets (ifd, cands) + finally: + os.close (ifd) + + def read_tarobj_at_offset (fileobj, offset, mode, secret=None): decr = None - ks = secret [0] - if ks == crypto.PDTCRYPT_SECRET_PW: - decr = crypto.Decrypt (password=secret [1]) - elif ks == crypto.PDTCRYPT_SECRET_KEY: - key = binascii.unhexlify (secret [1]) - decr = crypto.Decrypt (key=key) - else: - raise RuntimeError + if secret is not None: + ks = secret [0] + + if ks == crypto.PDTCRYPT_SECRET_PW: + decr = crypto.Decrypt (password=secret [1]) + elif ks == crypto.PDTCRYPT_SECRET_KEY: + key = binascii.unhexlify (secret [1]) + decr = crypto.Decrypt (key=key) + else: + raise RuntimeError tarobj = \ TarFile.open_at_offset (offset, @@ -3374,7 +3591,6 @@ def gen_rescue_index (backup_tar_path, mode, password=None, key=None): psidx = [] # pseudo index, return value offsets = None secret = None - mode = "r" + mode if password is not None: secret = (crypto.PDTCRYPT_SECRET_PW, password) @@ -3383,14 +3599,17 @@ def gen_rescue_index (backup_tar_path, mode, password=None, key=None): if secret is not None: offsets = crypto.reconstruct_offsets (backup_tar_path, secret) - fileobj = bltn_open (backup_tar_path, "rb") - infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret)) - for off in offsets ] - def aux (o, ti): - ie = idxent_of_tarinfo (ti) - ie ["offset"] = o - return ie - psidx = [ aux (o, ti) for o, ti in infos ] + elif mode == "#gz": + offsets = reconstruct_offsets_gz (backup_tar_path) + + fileobj = bltn_open (backup_tar_path, "rb") + infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret)) + for off in offsets ] + def aux (o, ti): + ie = idxent_of_tarinfo (ti) + ie ["offset"] = o + return ie + psidx = [ aux (o, ti) for o, ti in infos ] return psidx -- 1.7.1