implement tolerant GNU tar header parser
author Philipp Gesang <philipp.gesang@intra2net.com>
Fri, 25 Aug 2017 12:23:17 +0000 (14:23 +0200)
committer Thomas Jarosch <thomas.jarosch@intra2net.com>
Mon, 2 Apr 2018 11:34:09 +0000 (13:34 +0200)
When skimming a file for tar objects, only consider the GNU
header magic and whether the blocks are aligned.

deltatar/tarfile.py

index 5477c04..c63bb23 100644 (file)
@@ -3316,12 +3316,135 @@ class TarIter:
 # support functionality for rescue mode
 #---------------------------------------------------------
 
-def locate_tar_hdr_candidates (fd):
-    raise NotImplementedError ("too soon")
+TAR_FMT_HDR = (# struct format string for one header block; layout per tar(5):
+    "<"    #   standard sizes, no alignment padding between fields
+    "100s" # ← char name[100];          /* 100 */
+      "8s" # ← char mode[8];            /* 108 */
+      "8s" # ← char uid[8];             /* 116 */
+      "8s" # ← char gid[8];             /* 124 */
+     "12s" # ← char size[12];           /* 136 */
+     "12s" # ← char mtime[12];          /* 148 */
+      "8s" # ← char checksum[8];        /* 156 */
+       "B" # ← char typeflag[1];        /* 157 */
+    "100s" # ← char linkname[100];      /* 257 */
+      "6s" # ← char magic[6];           /* 263 */
+      "2s" # ← char version[2];         /* 265 */
+     "32s" # ← char uname[32];          /* 297 */
+     "32s" # ← char gname[32];          /* 329 */
+      "8s" # ← char devmajor[8];        /* 337 */
+      "8s" # ← char devminor[8];        /* 345 */
+     "12s" # ← char atime[12];          /* 357 */
+     "12s" # ← char ctime[12];          /* 369 */
+     "12s" # ← char offset[12];         /* 381 */
+      "4s" # ← char longnames[4];       /* 385 */
+       "B" # ← char unused[1];          /* 386 */
+        "" #   struct {
+     "12s" # ←       char offset[12];
+     "12s" # ←       char numbytes[12];
+     "12s" # ←       char offset[12];
+     "12s" # ←       char numbytes[12];
+     "12s" # ←       char offset[12];
+     "12s" # ←       char numbytes[12];
+     "12s" # ←       char offset[12];
+     "12s" # ←       char numbytes[12];
+        "" #   } sparse[4];             /* 482 */
+       "B" # ← char isextended[1];      /* 483 */
+     "12s" # ← char realsize[12];       /* 495 */
+     "17s" # ← char pad[17];            /* 512 */
+) # 512 bytes in total; "s" fields unpack to bytes, "B" fields to int
+
+# The “magic” and “version” fields are special:
+#
+# tar(5)
+#    magic   The magic field holds the five characters “ustar” followed by a
+#            space.  Note that POSIX ustar archives have a trailing null.
+#
+# however, “tar.h”:
+#
+#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
+#      Found in an archive, it indicates an old GNU header format, which will be
+#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
+#      valid, though the header is not truly POSIX conforming.  */
+#
+#
+TAR_FMT_OLDGNU_MAGIC = b"ustar " # compared against the 6-byte “magic” field only
+
+def read_gnu_tar_hdr (data): # parse one block as GNU tar header: dict of raw fields, or None
+    if len (data) != BLOCKSIZE: # header requires one complete block
+        return None
+
+    try: # split the block into its fixed-width fields
+        name, mode, \
+            uid, gid, \
+            size, mtime, \
+            checksum, \
+            typeflag, \
+            linkname, \
+            magic, \
+            version, \
+            uname, \
+            gname, \
+            devmajor, \
+            devminor, \
+            atime, \
+            ctime, \
+            offset, \
+            longnames, \
+            unused, \
+            offset1, numbytes1, \
+            offset2, numbytes2, \
+            offset3, numbytes3, \
+            offset4, numbytes4, \
+            isextended, \
+            realsize, \
+            pad = struct.unpack (TAR_FMT_HDR, data)
+    except struct.error: # e.g. data length differs from the size the format describes
+        return None
+
+    if magic != TAR_FMT_OLDGNU_MAGIC: # the magic alone decides; nothing else is validated here
+        return None
+
+    # return all except “unused” and “pad”; values stay as unpacked, not parsed further
+    return \
+        { "name"        : name,     "mode"        : mode
+        , "uid"         : uid ,     "gid"         : gid
+        , "size"        : size,     "mtime"       : mtime
+        , "checksum"    : checksum
+        , "typeflag"    : typeflag
+        , "linkname"    : linkname
+        , "magic"       : magic
+        , "version"     : version
+        , "uname"       : uname,    "gname"       : gname
+        , "devmajor"    : devmajor, "devminor"    : devminor
+        , "atime"       : atime,    "ctime"       : ctime
+        , "offset"      : offset
+        , "longnames"   : longnames
+        , "offset1"     : offset1,  "numbytes1"   : numbytes1
+        , "offset2"     : offset2,  "numbytes2"   : numbytes2
+        , "offset3"     : offset3,  "numbytes3"   : numbytes3
+        , "offset4"     : offset4,  "numbytes4"   : numbytes4
+        , "isextended"  : isextended
+        , "realsize"    : realsize
+        }
+
+
+def readable_tar_objects_offsets (ifd): # ifd: OS-level file descriptor, consumed via os.read
+    """
+    Traverse blocks in file, trying to extract tar headers.
+    """
+    pos     = 0 # offset of the block about to be read; assumes ifd starts at 0 -- TODO confirm
+    offsets = [] # positions of the blocks that parsed as GNU tar headers
 
+    while True:
+        blk = os.read (ifd, BLOCKSIZE)
+        if len (blk) != BLOCKSIZE: # end of file or short read: stop scanning
+            break
+        hdr = read_gnu_tar_hdr (blk)
+        if hdr is not None: # block carries the GNU header magic
+            offsets.append (pos)
+        pos += BLOCKSIZE # candidates are only looked for on block boundaries
 
-def readable_tar_objects_offsets (ifd, cands):
-    raise NotImplementedError ("too soon")
+    return offsets
 
 
 def locate_gz_hdr_candidates (fd):
@@ -3541,8 +3664,7 @@ def reconstruct_offsets_tar (fname):
     ifd = os.open (fname, os.O_RDONLY)
 
     try:
-        cands = locate_tar_hdr_candidates (ifd)
-        return readable_tar_objects_offsets (ifd, cands)
+        return readable_tar_objects_offsets (ifd)
     finally:
         os.close (ifd)