# Imports
#---------
import binascii
-import sys
-import os
+import copy
+import errno
import io
+import mmap
+import operator
+import os
+import re
import shutil
import stat
-import errno
-import time
import struct
-import copy
-import re
-import operator
+import sys
+import time
import traceback # XXX
GZ_HEADER_SIZE = 10 # not including the name
GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08 # 0o10
-GZ_FLAG_ORIG_NAME = 0x08 # 0o10, default in gzip
+GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
+GZ_FLAG_FHCRC = 1 << 1 # CRC16
+GZ_FLAG_FEXTRA = 1 << 2 # extra field
+GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
+GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
+GZ_FLAG_RESERVED = 7 << 5 # unassigned
GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
+BUFSIZE = 16 * 1024
+
#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------
shutil.copyfileobj(src, dst)
return
- BUFSIZE = 16 * 1024
blocks, remainder = divmod(length, BUFSIZE)
for b in range(blocks):
buf = src.read(BUFSIZE)
if name is None:
name = b""
else:
- flags |= GZ_FLAG_ORIG_NAME
+ flags |= GZ_FLAG_FNAME
if type(name) is str:
name = name.encode("iso-8859-1", "replace")
if name.endswith(b".pdtcrypt"):
if read2 != GZ_MAGIC_BYTES:
raise ReadError("not a gzip file")
- read1 = self.__read(1)
- if read1 != b"\010":
+ read1 = ord (self.__read(1))
+ if read1 != GZ_METHOD_DEFLATE:
raise CompressionError("unsupported compression method")
self.flags = flag = ord(self.__read(1))
- self.__read(6)
+ self.__read(6) # discard timestamp[4], deflate flags, os code
- if flag & 4:
+ if flag & GZ_FLAG_FEXTRA:
xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
self.read(xlen)
- if flag & 8:
+ if flag & GZ_FLAG_FNAME:
while True:
s = self.__read(1)
if not s or s == NUL:
break
- if flag & 16:
+ if flag & GZ_FLAG_FCOMMENT:
while True:
s = self.__read(1)
if not s or s == NUL:
break
- if flag & 2:
+ if flag & GZ_FLAG_FHCRC:
self.__read(2)
def _init_read_encrypt (self):
# support functionality for rescue mode
#---------------------------------------------------------
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :param fd: File descriptor of the file to scan; it must be open for
               reading. The file position of *fd* is not used.
    :return:   The list of offsets in the file, in ascending order. Empty if
               the file is empty or contains no magic.
    """
    # mmap refuses to map zero bytes; an empty file trivially has no headers.
    if os.fstat (fd).st_size == 0:
        return []

    cands = []
    step  = len (GZ_MAGIC_BYTES)

    # The context manager guarantees that the mapping is released again, even
    # if the scan is interrupted by an exception.
    with mmap.mmap (fd, 0, access=mmap.ACCESS_READ) as mm:
        pos = mm.find (GZ_MAGIC_BYTES)
        while pos != -1:
            cands.append (pos)
            pos = mm.find (GZ_MAGIC_BYTES, pos + step)

    return cands
+
+
# Verdicts assigned to a gzip header candidate; numbered consecutively from 0.
( HDR_CAND_GOOD     # header marks begin of valid object
, HDR_CAND_FISHY    # inconclusive
, HDR_CAND_JUNK     # not a header / object unreadable
) = range (3)
+
+
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, the string may be at most *max* bytes long, not counting the
    terminating NUL.

    Optionally, an *encoding* may be specified to interpret the data as.

    :param fd:       File descriptor to read from, starting at its current
                     position. On success the position is left just past the
                     terminating NUL.
    :param max:      Maximum string length in bytes; negative for unlimited.
                     (The name shadows the builtin but is part of the
                     keyword interface used by callers.)
    :param encoding: Codec name passed to *bytes.decode*, or *None* to return
                     the raw bytes.
    :returns:        *None* if the end of file was hit before a NUL byte or the
                     maximum number of bytes has been exceeded; the data
                     (*bytes*, or *str* if *encoding* was given) otherwise.
    """
    buf = bytearray ()

    while True:
        c = os.read (fd, 1)
        if not c: # EOF before the terminator: not a valid C string
            return None
        if c == NUL:
            break
        if 0 <= max <= len (buf): # one more byte would exceed the limit
            return None
        buf += c

    buf = bytes (buf)
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
+
+
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn't conform
    to the spec (garbage flag bits, bogus timestamp) are considered "fishy". No
    validation is possible on embedded strings because they are single-byte
    encoded.

    :param fd:  File descriptor of the archive; its file position is changed.
    :param off: Absolute offset of the header candidate inside the file.
    :returns:   A pair *(verdict, hdr)*. *verdict* is one of the *HDR_CAND_**
                constants. *hdr* is *None* for *HDR_CAND_JUNK*; otherwise a
                dict with the keys "fname", "flags", "dflags", "mtime",
                "oscode" and "hlen" (total header length in bytes, including
                all optional fields).
    """
    fname   = None
    verdict = HDR_CAND_GOOD

    if os.lseek (fd, off, os.SEEK_SET) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
    except struct.error:
        return HDR_CAND_JUNK, None

    if meth != GZ_METHOD_DEFLATE: # only deflate is supported
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()): # timestamp lies in the future
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        rawlen = os.read (fd, 2)
        if len (rawlen) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        xlen, = struct.unpack ("<H", rawlen) # unpack always yields a tuple
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # The comment is only skipped over; it must not clobber the file name
        # obtained from the FNAME field.
        comment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                encoding="iso-8859-1")
        if comment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    # everything consumed since *off* belongs to the header
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }
+
+
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip. The data at
    *off* is treated as a raw deflate stream, i.e. *off* must point past the
    gzip header.

    :param ifd: File descriptor of the archive; its file position is changed.
    :param off: Absolute offset of the first byte of compressed data.
    :param hdr: Parsed header of the object; currently unused.
    :returns:   A pair containing the size of the decompressed data and the
                length of the input consumed. If the stream terminated
                properly, the latter is the exact length of the compressed
                data. If decompression failed or the file ended prematurely,
                it is the number of bytes read from the file up to that point
                and thus may exceed the length of the compressed data.
    """
    import zlib
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)
    pos   = off
    dlen  = 0 # size of decompressed data

    os.lseek (ifd, pos, os.SEEK_SET)
    while True:
        cnk = os.read (ifd, BUFSIZE)
        pos += len (cnk)
        try:
            data = decmp.decompress (cnk)
        except zlib.error: # corrupt deflate data; terminate softly
            break # fishy
        dlen += len (data)
        if decmp.eof:
            # Bytes following the end of the stream were read from the file
            # but not consumed by zlib; do not count them as input.
            pos -= len (decmp.unused_data)
            break
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return dlen, pos - off
+
def readable_gz_objects_offsets (ifd, cands):
    """
    Filter the header candidates *cands* down to those offsets in *ifd* at
    which a gzipped object can actually be parsed.

    A candidate is kept if its header is not rated as junk and decompressing
    the data behind the header both consumes input and yields output.

    :param ifd:   File descriptor of the archive.
    :param cands: Iterable of candidate header offsets.
    :returns:     List of the accepted offsets, in the order of *cands*.
    """
    usable = []

    for offset in cands:
        verdict, header = inspect_gz_hdr (ifd, offset)
        if verdict == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if verdict not in (HDR_CAND_GOOD, HDR_CAND_FISHY):
            continue

        payload_off = offset + header ["hlen"]
        n_out, n_in = try_decompress (ifd, payload_off, header)
        if n_out > 0 and n_in > 0:
            usable.append (offset)

    return usable
+
+
def reconstruct_offsets_gz (fname):
    """
    Collect all GZ header-like offsets ("candidates") from the file *fname*
    and return those of them at which compressed data can be processed.

    :param fname: Path of the file to examine; opened read-only.
    :returns:     Whatever *readable_gz_objects_offsets* returns for the
                  candidates found in the file.
    """
    ifd = os.open (fname, os.O_RDONLY)

    try:
        return readable_gz_objects_offsets (ifd,
                                            locate_gz_hdr_candidates (ifd))
    finally:
        # release the descriptor regardless of how the scan ended
        os.close (ifd)
+
+
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
decr = None
- ks = secret [0]
- if ks == crypto.PDTCRYPT_SECRET_PW:
- decr = crypto.Decrypt (password=secret [1])
- elif ks == crypto.PDTCRYPT_SECRET_KEY:
- key = binascii.unhexlify (secret [1])
- decr = crypto.Decrypt (key=key)
- else:
- raise RuntimeError
+ if secret is not None:
+ ks = secret [0]
+
+ if ks == crypto.PDTCRYPT_SECRET_PW:
+ decr = crypto.Decrypt (password=secret [1])
+ elif ks == crypto.PDTCRYPT_SECRET_KEY:
+ key = binascii.unhexlify (secret [1])
+ decr = crypto.Decrypt (key=key)
+ else:
+ raise RuntimeError
tarobj = \
TarFile.open_at_offset (offset,
psidx = [] # pseudo index, return value
offsets = None
secret = None
- mode = "r" + mode
if password is not None:
secret = (crypto.PDTCRYPT_SECRET_PW, password)
if secret is not None:
offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
- fileobj = bltn_open (backup_tar_path, "rb")
- infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
- for off in offsets ]
- def aux (o, ti):
- ie = idxent_of_tarinfo (ti)
- ie ["offset"] = o
- return ie
- psidx = [ aux (o, ti) for o, ti in infos ]
+ elif mode == "#gz":
+ offsets = reconstruct_offsets_gz (backup_tar_path)
+
+ fileobj = bltn_open (backup_tar_path, "rb")
+ infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
+ for off in offsets ]
+ def aux (o, ti):
+ ie = idxent_of_tarinfo (ti)
+ ie ["offset"] = o
+ return ie
+ psidx = [ aux (o, ti) for o, ti in infos ]
return psidx