From 8fc6040c3398ac76a76d84018dac6bc2b8015dca Mon Sep 17 00:00:00 2001
From: Philipp Gesang
Date: Fri, 25 Aug 2017 14:23:17 +0200
Subject: [PATCH] implement tolerant GNU tar header parser

When skimming a file for tar objects, only consider the GNU
header magic and whether the blocks are aligned.
---
 deltatar/tarfile.py |  134 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 128 insertions(+), 6 deletions(-)

diff --git a/deltatar/tarfile.py b/deltatar/tarfile.py
index 5477c04..c63bb23 100644
--- a/deltatar/tarfile.py
+++ b/deltatar/tarfile.py
@@ -3316,12 +3316,135 @@ class TarIter:
 # support functionality for rescue mode
 #---------------------------------------------------------
 
-def locate_tar_hdr_candidates (fd):
-    raise NotImplementedError ("too soon")
+TAR_FMT_HDR = (# See tar(5):
+    "<"
+    "100s"  # ← char name[100];           /* 100 */
+    "8s"    # ← char mode[8];             /* 108 */
+    "8s"    # ← char uid[8];              /* 116 */
+    "8s"    # ← char gid[8];              /* 124 */
+    "12s"   # ← char size[12];            /* 136 */
+    "12s"   # ← char mtime[12];           /* 148 */
+    "8s"    # ← char checksum[8];         /* 156 */
+    "B"     # ← char typeflag[1];         /* 157 */
+    "100s"  # ← char linkname[100];       /* 257 */
+    "6s"    # ← char magic[6];            /* 263 */
+    "2s"    # ← char version[2];          /* 265 */
+    "32s"   # ← char uname[32];           /* 297 */
+    "32s"   # ← char gname[32];           /* 329 */
+    "8s"    # ← char devmajor[8];         /* 337 */
+    "8s"    # ← char devminor[8];         /* 345 */
+    "12s"   # ← char atime[12];           /* 357 */
+    "12s"   # ← char ctime[12];           /* 369 */
+    "12s"   # ← char offset[12];          /* 381 */
+    "4s"    # ← char longnames[4];        /* 385 */
+    "B"     # ← char unused[1];           /* 386 */
+    ""      #   struct {
+    "12s"   # ←   char offset[12];
+    "12s"   # ←   char numbytes[12];
+    "12s"   # ←   char offset[12];
+    "12s"   # ←   char numbytes[12];
+    "12s"   # ←   char offset[12];
+    "12s"   # ←   char numbytes[12];
+    "12s"   # ←   char offset[12];
+    "12s"   # ←   char numbytes[12];
+    ""      #   } sparse[4];              /* 482 */
+    "B"     # ← char isextended[1];       /* 483 */
+    "12s"   # ← char realsize[12];        /* 495 */
+    "17s"   # ← char pad[17];             /* 512 */
+)
+
+# Names of the values produced by unpacking a block with TAR_FMT_HDR, in
+# struct order; the four sparse map entries are numbered 1 through 4.
+TAR_HDR_FIELDS = \
+    ( "name", "mode", "uid", "gid", "size", "mtime", "checksum"
+    , "typeflag", "linkname", "magic", "version", "uname", "gname"
+    , "devmajor", "devminor", "atime", "ctime", "offset", "longnames"
+    , "unused"
+    , "offset1", "numbytes1", "offset2", "numbytes2"
+    , "offset3", "numbytes3", "offset4", "numbytes4"
+    , "isextended", "realsize", "pad"
+    )
+
+# The “magic” and “version” fields are special:
+#
+# tar(5)
+#       magic   The magic field holds the five characters “ustar” followed by a
+#               space.  Note that POSIX ustar archives have a trailing null.
+#
+# however, “tar.h”:
+#
+#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
+#      Found in an archive, it indicates an old GNU header format, which will be
+#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
+#      valid, though the header is not truly POSIX conforming.  */
+#
+# Only the six bytes of the “magic” field proper are compared below; the
+# contents of the “version” field are not checked.
+TAR_FMT_OLDGNU_MAGIC = b"ustar "
+
+def read_gnu_tar_hdr (data):
+    """
+    Attempt to interpret ``data`` as a GNU tar header block.
+
+    Only the block size and the magic are validated; neither the checksum
+    nor any other field is inspected. Fields are returned undecoded, i.e.
+    as the raw bytes found in the header, except for “typeflag” and
+    “isextended” which are unpacked as integers.
+
+    :param data:    bytes of exactly one tar block (``BLOCKSIZE``)
+    :returns:       dict mapping header field names to raw values (without
+                    the “unused” and “pad” fillers), or *None* if ``data``
+                    does not look like a GNU tar header
+    """
+    if len (data) != BLOCKSIZE: # header requires one complete block
+        return None
+
+    try:
+        values = struct.unpack (TAR_FMT_HDR, data)
+    except struct.error:
+        return None
+
+    hdr = dict (zip (TAR_HDR_FIELDS, values))
+    if hdr ["magic"] != TAR_FMT_OLDGNU_MAGIC:
+        return None
+
+    # return all except “unused” and “pad”
+    del hdr ["unused"]
+    del hdr ["pad"]
+    return hdr
+
+
+def readable_tar_objects_offsets (ifd):
+    """
+    Traverse blocks in file, trying to extract tar headers.
+
+    Reading starts at the current position of the descriptor, which is
+    assumed to be the beginning of the file. Scanning stops at end of file;
+    a trailing partial block is ignored.
+
+    :param ifd:     file descriptor open for reading
+    :returns:       list of offsets, relative to the starting position, of
+                    all blocks that carry the GNU tar header magic
+    """
+    pos     = 0
+    offsets = []
 
+    while True:
+        blk = os.read (ifd, BLOCKSIZE)
+        # os.read() may return less than requested without having hit EOF;
+        # top up the block so a short read does not end the scan early.
+        while 0 < len (blk) < BLOCKSIZE:
+            rest = os.read (ifd, BLOCKSIZE - len (blk))
+            if not rest: # EOF inside a block
+                break
+            blk += rest
+        if len (blk) != BLOCKSIZE:
+            break
+        if read_gnu_tar_hdr (blk) is not None:
+            offsets.append (pos)
+        pos += BLOCKSIZE
 
-def readable_tar_objects_offsets (ifd, cands):
-    raise NotImplementedError ("too soon")
+    return offsets
 
 
 def locate_gz_hdr_candidates (fd):
@@ -3541,8 +3664,7 @@ def reconstruct_offsets_tar (fname):
     ifd = os.open (fname, os.O_RDONLY)
 
     try:
-        cands = locate_tar_hdr_candidates (ifd)
-        return readable_tar_objects_offsets (ifd, cands)
+        return readable_tar_objects_offsets (ifd)
     finally:
         os.close (ifd)
 
-- 
1.7.1