2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
59 import traceback # XXX
68 # os.symlink on Windows prior to 6.0 raises NotImplementedError
69 symlink_exception = (AttributeError, NotImplementedError)
71 # OSError (winerror=1314) will be raised if the caller does not hold the
72 # SeCreateSymbolicLinkPrivilege privilege
73 symlink_exception += (OSError,)
77 # from tarfile import *
78 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
80 from builtins import open as _open # Since 'open' is TarFile.open
82 #---------------------------------------------------------
84 #---------------------------------------------------------
85 NUL = b"\0" # the null character
86 BLOCKSIZE = 512 # length of processing blocks
87 RECORDSIZE = BLOCKSIZE * 20 # length of records
88 GNU_MAGIC = b"ustar \0" # magic gnu tar string
89 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
91 LENGTH_NAME = 100 # maximum length of a filename
92 LENGTH_LINK = 100 # maximum length of a linkname
93 LENGTH_PREFIX = 155 # maximum length of the prefix field
95 REGTYPE = b"0" # regular file
96 AREGTYPE = b"\0" # regular file
97 LNKTYPE = b"1" # link (inside tarfile)
98 SYMTYPE = b"2" # symbolic link
99 CHRTYPE = b"3" # character special device
100 BLKTYPE = b"4" # block special device
101 DIRTYPE = b"5" # directory
102 FIFOTYPE = b"6" # fifo special device
103 CONTTYPE = b"7" # contiguous file
105 GNUTYPE_LONGNAME = b"L" # GNU tar longname
106 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
107 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
108 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
111 XHDTYPE = b"x" # POSIX.1-2001 extended header
112 XGLTYPE = b"g" # POSIX.1-2001 global header
113 SOLARIS_XHDTYPE = b"X" # Solaris extended header
115 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
116 GNU_FORMAT = 1 # GNU tar format
117 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
118 DEFAULT_FORMAT = GNU_FORMAT
120 GZ_FMT_HEADER = b"<BBBBLBB"
121 GZ_HEADER_SIZE = 10 # not including the name
122 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
123 GZ_METHOD_DEFLATE = 0x08 # 0o10
124 GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
125 GZ_FLAG_FHCRC = 1 << 1 # CRC16
126 GZ_FLAG_FEXTRA = 1 << 2 # extra field
127 GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
128 GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
129 GZ_FLAG_RESERVED = 7 << 5 # unassigned
130 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
131 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
132 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
133 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
137 TOLERANCE_RECOVER = 1 # rely on offsets in index
138 TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
142 #---------------------------------------------------------
143 # archive handling mode
144 #---------------------------------------------------------
147 ARCMODE_ENCRYPT = 1 << 0
148 ARCMODE_COMPRESS = 1 << 1
149 ARCMODE_CONCAT = 1 << 2
152 if m == ARCMODE_PLAIN:
156 def chkappend (b, s):
161 if first is True: first = False
164 chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
165 chkappend (ARCMODE_COMPRESS, "COMPRESS")
166 chkappend (ARCMODE_CONCAT, "CONCAT")
170 def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
172 if bool (concat) is True:
173 ret |= ARCMODE_CONCAT
174 if encryption is not None:
175 ret |= ARCMODE_ENCRYPT
177 ret |= ARCMODE_COMPRESS
180 #---------------------------------------------------------
182 #---------------------------------------------------------
183 # File types that tarfile supports:
184 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
185 SYMTYPE, DIRTYPE, FIFOTYPE,
186 CONTTYPE, CHRTYPE, BLKTYPE,
187 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
188 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
190 # File types that will be treated as a regular file.
191 REGULAR_TYPES = (REGTYPE, AREGTYPE,
192 CONTTYPE, GNUTYPE_SPARSE)
194 # File types that are part of the GNU tar format.
195 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
196 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
198 # Fields from a pax header that override a TarInfo attribute.
199 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
200 "uid", "gid", "uname", "gname")
202 # Fields from a pax header that are affected by hdrcharset.
203 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
205 # Fields in a pax header that are numbers, all other fields
206 # are treated as strings.
207 PAX_NUMBER_FIELDS = {
216 #---------------------------------------------------------
218 #---------------------------------------------------------
220 if os.name in ("nt", "ce"):
223 ENCODING = sys.getfilesystemencoding()
225 #---------------------------------------------------------
226 # Some useful functions
227 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

    The encoded string is truncated to *length* bytes and padded
    with NULs up to that length (see the return expression).
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte onwards is discarded before
    decoding with the given encoding and error handler; without the
    truncation the trailing NUL padding would leak into the result.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object.

    str input is encoded first; the result is truncated to *length*
    bytes and NUL-padded, like stn() but accepting bytes unchanged.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
252 """Convert a number field to a python number.
254 # There are two possible encodings for a number field, see
256 if s[0] in (0o200, 0o377):
258 for i in range(len(s) - 1):
262 n = -(256 ** (len(s) - 1) - n)
265 n = int(nts(s, "ascii", "strict") or "0", 8)
267 raise InvalidHeaderError("invalid header")
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # store negative values as two's complement of 256**digits
            n = 256 ** digits + n

        for i in range(digits - 1):
            # emit base-256 digits big-endian by inserting after the marker
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
    characters except for the chksum field which is treated as if
    it was filled with spaces. According to the GNU tar sources,
    some tars (Sun and NeXT) calculate chksum with signed char,
    which will be different if there are chars in the buffer with
    the high bit set. So we calculate two checksums, unsigned and
    signed.
    """
    # 148B/148b = bytes before the chksum field, 8x skips the 8-byte
    # chksum field itself, 356B/356b = the rest of the 512-byte header.
    # 256 accounts for the skipped field counted as 8 spaces (8 * 0x20).
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
    If length is None, copy the entire content.

    Raises OSError if src is exhausted before *length* bytes were read.
    """
    if length == 0:
        return
    if length is None:
        # unbounded copy: delegate to the stdlib helper
        shutil.copyfileobj(src, dst)
        return

    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
335 """Deprecated in this location; use stat.filemode."""
337 warnings.warn("deprecated in favor of stat.filemode",
338 DeprecationWarning, 2)
339 return stat.filemode(mode)
class TarError(Exception):
    """Root of the tar error hierarchy; all archive errors derive from it."""
class ExtractError(TarError):
    """Raised for general errors while extracting members."""
class ReadError(TarError):
    """Raised when a tar archive cannot be read or parsed."""
class CompressionError(TarError):
    """Raised when a required compression method is unavailable."""
class StreamError(TarError):
    """Raised for operations unsupported on stream-like TarFiles."""
class HeaderError(TarError):
    """Base class for all member-header errors."""
class EmptyHeaderError(HeaderError):
    """Raised when a header block is entirely empty."""
class TruncatedHeaderError(HeaderError):
    """Raised when a header block is cut short."""
class EOFHeaderError(HeaderError):
    """Raised when a header block signals end of file."""
class InvalidHeaderError(HeaderError):
    """Raised when a header block fails validation."""
class SubsequentHeaderError(HeaderError):
    """Raised for missing or invalid extended headers."""
class InvalidEncryptionError(TarError):
    """Raised for undefined crypto modes and combinations."""
class DecryptionError(TarError):
    """Raised when decrypting archive data fails."""
class EncryptionError(TarError):
    """Raised when encrypting archive data fails."""
class EndOfFile(Exception):
    """Signals an end-of-file condition that is not an error."""
387 #---------------------------
388 # internal stream interface
389 #---------------------------
391 """Low-level file object. Supports reading and writing.
392 It is used instead of a regular file object for streaming
396 def __init__(self, name, mode):
399 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
401 if hasattr(os, "O_BINARY"):
402 _mode |= os.O_BINARY # pylint: disable=no-member
403 self.fd = os.open(name, _mode, 0o666)
409 def read(self, size):
410 ret = os.read(self.fd, size)
411 self.offset += len(ret)
414 def write(self, s, pos=None):
417 os.lseek (self.fd, pos, os.SEEK_SET)
418 n = os.write(self.fd, s)
420 self.offset += len(s)
422 append = pos + n - p0
424 self.offset += append
425 os.lseek (self.fd, p0, os.SEEK_SET)
430 def seek_set (self, pos):
431 os.lseek (self.fd, pos, os.SEEK_SET)
435 def gz_header (name=None):
436 timestamp = int(time.time())
442 flags |= GZ_FLAG_FNAME
443 if type(name) is str:
444 name = name.encode("iso-8859-1", "replace")
445 if name.endswith(b".pdtcrypt"):
447 if name.endswith(b".gz"):
449 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
452 hdr = struct.pack (GZ_FMT_HEADER,
453 GZ_MAGIC [0], GZ_MAGIC [1],
454 GZ_METHOD_DEFLATE, flags,
456 GZ_DEFLATE_FLAGS, GZ_OS_CODE)
462 """Class that serves as an adapter between TarFile and
463 a stream-like object. The stream-like object only
464 needs to have a read() or write() method and is accessed
465 blockwise. Use of gzip or bzip2 compression is possible.
466 A stream-like object could be for example: sys.stdin,
467 sys.stdout, a socket, a tape device etc.
469 _Stream is intended to be used only internally but is
470 nevertherless used externally by Deltatar.
472 When encrypting, the ``enccounter`` will be used for
473 initializing the first cryptographic context. When
474 decrypting, its value will be compared to the decrypted
475 object. Decryption fails if the value does not match.
476 In effect, this means that a ``_Stream`` whose ctor was
477 passed ``enccounter`` can only be used to encrypt or
478 decrypt a single object.
481 remainder = -1 # track size in encrypted entries
482 tolerance = TOLERANCE_STRICT
484 def __init__(self, name, mode, comptype, fileobj, bufsize,
485 concat=False, encryption=None, enccounter=None,
486 compresslevel=9, tolerance=TOLERANCE_STRICT):
487 """Construct a _Stream object.
489 self.arcmode = arcmode_set (concat, encryption, comptype)
490 self.tolerance = tolerance
492 self._extfileobj = True
494 fileobj = _LowLevelFile(name, mode)
495 self._extfileobj = False
498 # Enable transparent compression detection for the
500 fileobj = _StreamProxy(fileobj)
501 comptype = fileobj.getcomptype()
505 self.enccounter = None
506 if self.arcmode & ARCMODE_ENCRYPT:
507 self.enccounter = enccounter
509 self.name = name or ""
511 self.comptype = comptype
513 self.fileobj = fileobj
514 self.bufsize = bufsize
520 self.last_block_offset = 0
521 self.dbuf = b"" # ???
522 self.exception = None # communicate decompression failure
523 self.compresslevel = compresslevel
524 self.bytes_written = 0
526 self.encryption = encryption
534 raise CompressionError("zlib module is not available")
537 self.exception = zlib.error
540 if not (self.arcmode & ARCMODE_CONCAT):
541 if self.arcmode & ARCMODE_ENCRYPT:
542 self._init_write_encrypt (name)
543 self._init_write_gz ()
544 self.crc = zlib.crc32(b"") & 0xFFFFffff
546 elif comptype == "bz2":
547 if self.arcmode & ARCMODE_ENCRYPT:
548 raise InvalidEncryptionError("encryption not available for "
549 "compression “%s”" % comptype)
553 raise CompressionError("bz2 module is not available")
556 self.cmp = bz2.BZ2Decompressor()
557 self.exception = OSError
559 self.cmp = bz2.BZ2Compressor()
561 elif comptype == 'xz':
562 if self.arcmode & ARCMODE_ENCRYPT:
563 raise InvalidEncryptionError("encryption not available for "
564 "compression “%s”" % comptype)
568 raise CompressionError("lzma module is not available")
571 self.cmp = lzma.LZMADecompressor()
572 self.exception = lzma.LZMAError
574 self.cmp = lzma.LZMACompressor()
576 elif comptype == "tar":
577 if not (self.arcmode & ARCMODE_CONCAT) \
579 and self.arcmode & ARCMODE_ENCRYPT:
580 self._init_write_encrypt (name)
583 if self.arcmode & ARCMODE_ENCRYPT:
584 raise InvalidEncryptionError("encryption not available for "
585 "compression “%s”" % comptype)
586 raise CompressionError("unknown compression type %r" % comptype)
589 if not self._extfileobj:
595 if hasattr(self, "closed") and not self.closed:
598 except crypto.InternalError:
599 # context already finalized due to abort but close() tried
604 def next (self, name):
605 if self.arcmode & ARCMODE_COMPRESS:
606 if getattr (self, "cmp", None) is not None:
607 self._finalize_write_gz ()
609 if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
610 self.last_block_offset = self.fileobj.tell()
611 if self.arcmode & ARCMODE_ENCRYPT:
612 self._finalize_write_encrypt ()
613 self._init_write_encrypt (name, set_last_block_offset=True)
614 if self.arcmode & ARCMODE_COMPRESS:
615 self._init_write_gz (set_last_block_offset =
616 not (self.arcmode & ARCMODE_ENCRYPT))
617 return self.last_block_offset
def next_volume (self, name):
    """Prepare the stream for a new archive volume named *name*.

    Finalizes any gzip member still in progress, then starts fresh
    encryption and/or compression contexts depending on arcmode.
    NOTE(review): appears to be used only in concat mode -- with
    non-concat modes the ctor handles this (see comment below); confirm.
    """
    # with non-concat modes, this is taken care by the _Stream
    # ctor as invoked by the newvol handler
    if self.arcmode & ARCMODE_COMPRESS:
        if getattr (self, "cmp", None) is not None:
            # e. g. compressed PAX header written
            self._finalize_write_gz ()
    if self.arcmode & ARCMODE_ENCRYPT:
        self._init_write_encrypt (name)
    if self.arcmode & ARCMODE_COMPRESS:
        self._init_write_gz ()
633 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
635 Save position for delayed write of header; fill the header location
638 # first thing, proclaim new object to the encryption context
639 # secondly, assemble the header with the updated parameters
640 # and commit it directly to the underlying stream, bypassing the
641 # encryption layer in .__write().
642 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
644 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
645 self.lasthdr = self.fileobj.tell()
646 self.__write_to_file(dummyhdr)
647 if set_last_block_offset is True:
648 self.last_block_offset = self.lasthdr
651 def _finalize_write_encrypt (self):
653 Seek back to header position, read dummy bytes, finalize crypto
654 obtaining the actual header, write header, seek back to current
657 Returns the list of IV fixed parts as used during encryption.
659 if self.lasthdr is not None:
660 pos0 = self.fileobj.tell ()
661 self.fileobj.seek_set (self.lasthdr)
662 dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
663 pos1 = self.fileobj.tell ()
664 dpos = pos1 - self.lasthdr
665 assert dpos == crypto.PDTCRYPT_HDR_SIZE
666 self.fileobj.seek_set (pos0)
667 data, hdr, _ = self.encryption.done (dummy)
668 self.__write_to_file(hdr, pos=self.lasthdr)
669 self.__write_to_file(data) # append remainder of data
673 def _finalize_write_gz (self):
674 if self.cmp is not None:
675 chunk = self.buf + self.cmp.flush()
677 if self.comptype == "gz":
678 # The native zlib crc is an unsigned 32-bit integer, but
679 # the Python wrapper implicitly casts that to a signed C
680 # long. So, on a 32-bit box self.crc may "look negative",
681 # while the same crc on a 64-bit box may "look positive".
682 # To avoid irksome warnings from the `struct` module, force
683 # it to look positive on all boxes.
684 chunk += struct.pack("<L", self.crc & 0xffffffff)
685 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
686 self.__enc_write (chunk)
690 def _init_write_gz (self, set_last_block_offset=False):
692 Add a new gzip block, closing last one
695 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
696 first = self.cmp is None
697 self.cmp = self.zlib.compressobj(self.compresslevel,
699 -self.zlib.MAX_WBITS,
700 self.zlib.DEF_MEM_LEVEL,
703 # if aes, we encrypt after compression
704 if set_last_block_offset is True:
705 self.last_block_offset = self.fileobj.tell()
707 self.__write(gz_header (self.name if first is True else None))
711 """Write string s to the stream.
713 if self.comptype == "gz":
714 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
716 self.concat_pos += len(s)
717 if self.cmp is not None:
718 s = self.cmp.compress(s)
722 """Write what’s left in the buffer to the stream."""
723 self.__write (b"") # → len (buf) <= bufsiz
724 self.__enc_write (self.buf)
727 def __write(self, s):
728 """Writes (and encodes) string s to the stream blockwise
730 will wait with encoding/writing until block is complete
733 while len(self.buf) > self.bufsize:
734 self.__enc_write(self.buf[:self.bufsize])
735 self.buf = self.buf[self.bufsize:]
738 def __write_to_file(self, s, pos=None):
740 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
741 given, the stream will seek to that position first and back afterwards,
742 and the total of bytes written is not updated.
744 self.fileobj.write(s, pos)
746 self.bytes_written += len(s)
749 def __enc_write(self, s):
751 If encryption is active, the string s is encrypted before being written
756 if self.arcmode & ARCMODE_ENCRYPT:
759 n, ct = self.encryption.process(buf)
760 self.__write_to_file(ct)
763 # The entire plaintext was not consumed: The size limit
764 # for encrypted objects was reached. Transparently create
765 # a new encrypted object and continue processing the input.
766 self._finalize_write_encrypt ()
767 self._init_write_encrypt ()
769 self.__write_to_file(s)
772 def estim_file_size(self):
773 """ estimates size of file if closing it now
775 The result may differ greatly from the amount of data sent to write()
776 due to compression, encryption and buffering.
778 In tests the result (before calling close()) was up to 12k smaller than
779 the final file size if compression is being used because zlib/bz2
780 compressors do not allow inspection of their buffered data :-(
782 Still, we add what close() would add: 8 bytes for gz checksum, one
783 encryption block size if encryption is used and the size of our own
787 return self.bytes_written
789 result = self.bytes_written
791 result += len(self.buf)
792 if self.comptype == 'gz':
793 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
796 def close(self, close_fileobj=True):
797 """Close the _Stream object. No operation should be
798 done on it afterwards.
804 if close_fileobj is True:
807 if self.arcmode & ARCMODE_COMPRESS:
808 self._finalize_write_gz ()
809 # end of Tar archive marker (two empty blocks) was written
810 # finalize encryption last; no writes may be performed after
813 if self.arcmode & ARCMODE_ENCRYPT:
814 self._finalize_write_encrypt ()
816 if not self._extfileobj:
819 # read the zlib crc and length and check them
820 if self.mode == "r" and self.comptype == "gz":
821 read_crc = self.__read(4)
822 read_length = self.__read(4)
823 calculated_crc = self.crc
824 if struct.unpack("<L", read_crc)[0] != calculated_crc:
825 raise CompressionError("bad gzip crc")
829 def _init_read_gz(self):
830 """Initialize for reading a gzip compressed fileobj.
832 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
834 read2 = self.__read(2)
836 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
837 "%d" % self.fileobj.tell())
838 # taken from gzip.GzipFile with some alterations
839 if read2 != GZ_MAGIC_BYTES:
840 raise ReadError("not a gzip file")
842 read1 = self.__read(1)
844 raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
845 "gzip header at pos %d" % self.fileobj.tell())
846 if ord (read1) != GZ_METHOD_DEFLATE:
847 raise CompressionError("unsupported compression method")
849 self.flags = flag = ord(self.__read(1))
850 self.__read(6) # discard timestamp[4], deflate flags, os code
852 if flag & GZ_FLAG_FEXTRA:
853 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
855 if flag & GZ_FLAG_FNAME:
858 if not s or s == NUL:
860 if flag & GZ_FLAG_FCOMMENT:
863 if not s or s == NUL:
865 if flag & GZ_FLAG_FHCRC:
868 def _init_read_encrypt (self):
869 """Initialize encryption for next entry in archive. Read a header and
870 notify the crypto context."""
871 if self.arcmode & ARCMODE_ENCRYPT:
872 lasthdr = self.fileobj.tell ()
874 hdr = crypto.hdr_read_stream (self.fileobj)
875 except crypto.EndOfFile:
877 except crypto.InvalidHeader as exn:
878 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
879 "processing %r at pos %d"
880 % (exn, self.fileobj, lasthdr)) \
882 if self.enccounter is not None:
883 # enforce that the iv counter in the header matches an
884 # explicitly requested one
885 iv = crypto.hdr_iv_counter (hdr)
886 if iv != self.enccounter:
887 raise DecryptionError ("expected IV counter %d, got %d"
888 % (self.enccounter, iv))
889 self.lasthdr = lasthdr
890 self.remainder = hdr ["ctsize"] # distance to next header
892 self.encryption.next (hdr)
893 except crypto.InvalidParameter as exn:
894 raise DecryptionError ("Crypto.next(): error “%s” "
895 "processing %r at pos %d"
896 % (exn, self.fileobj, lasthdr)) \
902 def _read_encrypt (self, buf):
904 Demote a program error to a decryption error in tolerant mode. This
905 allows recovery from corrupted headers and invalid data.
908 return self.encryption.process (buf)
909 except RuntimeError as exn:
910 if self.tolerance != TOLERANCE_STRICT:
911 raise DecryptionError (exn)
915 def _finalize_read_encrypt (self):
919 if self.arcmode & ARCMODE_ENCRYPT \
920 and self.lasthdr is not None :
921 assert self.remainder >= 0
922 if self.remainder > 0:
925 data = self.encryption.done ()
926 except crypto.InvalidGCMTag as exn:
927 raise DecryptionError ("decryption failed: %s" % exn)
932 """Return the stream's file pointer position.
936 def seek(self, pos=0):
937 """Set the stream's file pointer to pos. Negative seeking
940 if pos - self.pos >= 0:
941 blocks, remainder = divmod(pos - self.pos, self.bufsize)
942 for i in range(blocks):
943 self.read(self.bufsize)
946 raise StreamError("seeking backwards is not allowed")
949 def read(self, size=None):
950 """Return the next size number of bytes from the stream.
951 If size is not defined, return all bytes of the stream
957 buf = self._read(self.bufsize)
963 buf = self._read(size)
968 """Reads just one line, new line character included
970 # if \n in dbuf, no read neads to be done
971 if b'\n' in self.dbuf:
972 pos = self.dbuf.index(b'\n') + 1
973 ret = self.dbuf[:pos]
974 self.dbuf = self.dbuf[pos:]
979 chunk = self._read(self.bufsize)
981 # nothing more to read, so return the buffer
987 # if \n found, return the new line
990 pos = dbuf.index(b'\n') + 1
991 self.dbuf = dbuf[pos:] + self.dbuf
994 def _read(self, size):
995 """Return size bytes from the stream.
1001 buf = self.__read(self.bufsize)
1005 if self.cmp is not None:
1007 buf = self.cmp.decompress(buf)
1008 except self.exception as exn:
1009 raise ReadError("invalid compressed data (%r)" % exn)
1010 except Exception as e:
1011 # happens at the end of the file
1012 # _init_read_gz failed in the previous iteration so
1013 # self.cmp.decompress fails here
1014 if self.arcmode & ARCMODE_CONCAT:
1017 raise ReadError("invalid compressed data")
1018 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1019 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1020 if self.arcmode & ARCMODE_CONCAT \
1021 and len(self.cmp.unused_data) != 0:
1022 self.buf = self.cmp.unused_data + self.buf
1023 self.close(close_fileobj=False)
1025 self._init_read_gz()
1026 except DecryptionError:
1027 if self.tolerance != TOLERANCE_STRICT:
1028 # return whatever data was processed successfully
1034 except ReadError: # gzip troubles
1035 if self.tolerance == TOLERANCE_RESCUE:
1042 # happens at the end of the file
1044 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1049 self.dbuf = t[size:]
1053 def __read(self, size):
1055 Return size bytes from stream. If internal buffer is empty, read
1056 another block from the stream.
1058 The function returns up to size bytes of data. When an error occurs
1059 during decryption, everything until the end of the last successfully
1060 finalized object is returned.
1063 t = [self.buf] if c > 0 else []
1064 good_crypto = len (t)
1069 if self.arcmode & ARCMODE_ENCRYPT:
1070 if self.remainder <= 0:
1071 # prepare next object
1072 if self._init_read_encrypt () is False: # EOF
1076 # only read up to the end of the encrypted object
1077 todo = min (size, self.remainder)
1078 buf = self.fileobj.read(todo)
1079 if self.arcmode & ARCMODE_ENCRYPT:
1081 buf = self._read_encrypt (buf)
1082 if todo == self.remainder:
1083 # at the end of a crypto object; finalization will fail if
1084 # the GCM tag does not match
1085 trailing = self._finalize_read_encrypt ()
1086 good_crypto = len (t) + 1
1087 if len (trailing) > 0:
1091 self.remainder -= todo
1092 except DecryptionError:
1093 if self.tolerance == TOLERANCE_STRICT:
1095 self.encryption.drop ()
1096 if self.tolerance == TOLERANCE_RECOVER:
1097 if good_crypto == 0:
1099 # this may occur at any of the three crypto operations above.
1100 # some objects did validate; discard all data after it; next
1101 # call will start with the bad object and error out immediately
1102 self.buf = b"".join (t [good_crypto:])
1103 return b"".join (t [:good_crypto])
1104 elif self.tolerance == TOLERANCE_RESCUE:
1105 # keep what we have so far despite the finalization issue
1110 raise RuntimeError("internal error: bad tolerance level")
1112 if not buf: ## XXX stream terminated prematurely; this should be an error
1123 class _StreamProxy(object):
1124 """Small proxy class that enables transparent compression
1125 detection for the Stream interface (mode 'r|*').
1128 def __init__(self, fileobj):
1129 self.fileobj = fileobj
1130 self.buf = self.fileobj.read(BLOCKSIZE)
1132 def read(self, size): # pylint: disable=method-hidden
1133 self.read = self.fileobj.read
1136 def getcomptype(self):
1137 if self.buf.startswith(GZ_MAGIC_DEFLATE):
1139 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
1141 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
1147 self.fileobj.close()
1150 #------------------------
1151 # Extraction file object
1152 #------------------------
1153 class _FileInFile(object):
1154 """A thin wrapper around an existing file object that
1155 provides a part of its data as an individual file
1159 def __init__(self, fileobj, offset, size, blockinfo=None):
1160 self.fileobj = fileobj
1161 self.offset = offset
1164 self.name = getattr(fileobj, "name", None)
1167 if blockinfo is None:
1168 blockinfo = [(0, size)]
1170 # Construct a map with data and zero blocks.
1174 realpos = self.offset
1175 for offset, size in blockinfo:
1176 if offset > lastpos:
1177 self.map.append((False, lastpos, offset, None))
1178 self.map.append((True, offset, offset + size, realpos))
1180 lastpos = offset + size
1181 if lastpos < self.size:
1182 self.map.append((False, lastpos, self.size, None))
1194 return self.fileobj.seekable()
1197 """Return the current file position.
1199 return self.position
1201 def seek(self, position, whence=io.SEEK_SET):
1202 """Seek to a position in the file.
1204 if whence == io.SEEK_SET:
1205 self.position = min(max(position, 0), self.size)
1206 elif whence == io.SEEK_CUR:
1208 self.position = max(self.position + position, 0)
1210 self.position = min(self.position + position, self.size)
1211 elif whence == io.SEEK_END:
1212 self.position = max(min(self.size + position, self.size), 0)
1214 raise ValueError("Invalid argument")
1215 return self.position
1217 def read(self, size=None):
1218 """Read data from the file.
1221 size = self.size - self.position
1223 size = min(size, self.size - self.position)
1228 data, start, stop, offset = self.map[self.map_index]
1229 if start <= self.position < stop:
1233 if self.map_index == len(self.map):
1235 length = min(size, stop - self.position)
1237 self.fileobj.seek(offset + (self.position - start))
1238 buf += self.fileobj.read(length)
1242 self.position += length
1245 def readinto(self, b):
1246 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member."""

    def __init__(self, tarfile, tarinfo):
        # Present the member's byte range (honoring sparse maps) as a
        # raw file, then let BufferedReader add buffering on top.
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
1266 class TarInfo(object):
1267 """Informational class which holds the details about an
1268 archive member given by a tar header block.
1269 TarInfo objects are returned by TarFile.getmember(),
1270 TarFile.getmembers() and TarFile.gettarinfo() and are
1271 usually created internally.
1274 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1275 "chksum", "type", "linkname", "uname", "gname",
1276 "devmajor", "devminor", "volume_offset",
1277 "offset", "offset_data", "pax_headers", "sparse",
1278 "tarfile", "_sparse_structs", "_link_target")
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    self.name = name        # member name
    self.mode = 0o644       # file permissions
    self.uid = 0            # user id
    self.gid = 0            # group id
    self.size = 0           # file size
    self.mtime = 0          # modification time
    self.chksum = 0         # header checksum
    self.type = REGTYPE     # member type
    self.linkname = ""      # link name
    self.uname = ""         # user name
    self.gname = ""         # group name
    self.devmajor = 0       # device major number
    self.devminor = 0       # device minor number
    self.offset = 0         # the tar header starts here
    self.offset_data = 0    # the file's data starts here
    self.volume_offset = 0  # the file's data corresponds with the data
                            # starting at this position
    self.sparse = None      # sparse member information
    self.pax_headers = {}   # pax header information
1306 # In pax headers the "name" and "linkname" field are called
1307 # "path" and "linkpath".
1310 def _setpath(self, name):
1312 path = property(_getpath, _setpath)
    def _getlinkpath(self):
        # In pax headers the "linkname" field is called "linkpath".
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # Alias so pax-handling code can address the field uniformly.
    linkpath = property(_getlinkpath, _setlinkpath)
1321 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1323 def get_info(self, encoding=None, errors=None):
1324 """Return the TarInfo's attributes as a dictionary.
1328 "mode": self.mode & 0o7777,
1332 "mtime": self.mtime,
1333 "chksum": self.chksum,
1335 "linkname": self.linkname,
1336 "uname": self.uname,
1337 "gname": self.gname,
1338 "devmajor": self.devmajor,
1339 "devminor": self.devminor,
1340 "offset_data": self.offset_data,
1341 "volume_offset": self.volume_offset
1344 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1349 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1350 errors="surrogateescape"):
1351 """Return a tar header as a string of 512 byte blocks.
1353 info = self.get_info(encoding, errors)
1355 if format == USTAR_FORMAT:
1356 return self.create_ustar_header(info, encoding, errors)
1357 elif format == GNU_FORMAT:
1358 return self.create_gnu_header(info, encoding, errors)
1359 elif format == PAX_FORMAT:
1360 return self.create_pax_header(info, encoding, errors)
1362 raise ValueError("invalid format")
1364 def create_ustar_header(self, info, encoding, errors):
1365 """Return the object as a ustar header block.
1367 info["magic"] = POSIX_MAGIC
1369 if len(info["linkname"]) > LENGTH_LINK:
1370 raise ValueError("linkname is too long")
1372 if len(info["name"]) > LENGTH_NAME:
1373 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1375 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1377 def create_gnu_header(self, info, encoding, errors):
1378 """Return the object as a GNU header block sequence.
1380 info["magic"] = GNU_MAGIC
1382 if self.ismultivol():
1384 itn(info.get("atime", 0), 12, GNU_FORMAT),
1385 itn(info.get("ctime", 0), 12, GNU_FORMAT),
1386 itn(self.volume_offset, 12, GNU_FORMAT),
1387 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1389 info['prefix'] = b"".join(prefix)
1390 info['size'] = info['size'] - self.volume_offset
1393 if len(info["linkname"]) > LENGTH_LINK:
1394 buf += self._create_gnu_long_header(info["linkname"],
1395 GNUTYPE_LONGLINK, encoding, errors)
1397 if len(info["name"]) > LENGTH_NAME:
1398 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1401 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1403 def create_pax_header(self, info, encoding, errors):
1404 """Return the object as a ustar header block. If it cannot be
1405 represented this way, prepend a pax extended header sequence
1406 with supplement information.
1408 info["magic"] = POSIX_MAGIC
1409 pax_headers = self.pax_headers.copy()
1410 if self.ismultivol():
1411 info['size'] = info['size'] - self.volume_offset
1413 # Test string fields for values that exceed the field length or cannot
1414 # be represented in ASCII encoding.
1415 for name, hname, length in (
1416 ("name", "path", LENGTH_NAME),
1417 ("linkname", "linkpath", LENGTH_LINK),
1418 ("uname", "uname", 32),
1419 ("gname", "gname", 32)):
1421 if hname in pax_headers:
1422 # The pax header has priority.
1425 # Try to encode the string as ASCII.
1427 info[name].encode("ascii", "strict")
1428 except UnicodeEncodeError:
1429 pax_headers[hname] = info[name]
1432 if len(info[name]) > length:
1433 pax_headers[hname] = info[name]
1435 # Test number fields for values that exceed the field limit or values
1436 # that like to be stored as float.
1437 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1438 if name in pax_headers:
1439 # The pax header has priority. Avoid overflow.
1444 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1445 pax_headers[name] = str(val)
1448 # Create a pax extended header if necessary.
1450 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1454 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1457 def create_pax_global_header(cls, pax_headers):
1458 """Return the object as a pax global header block sequence.
1460 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1462 def _posix_split_name(self, name):
1463 """Split a name longer than 100 chars into a prefix
1466 prefix = name[:LENGTH_PREFIX + 1]
1467 while prefix and prefix[-1] != "/":
1468 prefix = prefix[:-1]
1470 name = name[len(prefix):]
1471 prefix = prefix[:-1]
1473 if not prefix or len(name) > LENGTH_NAME:
1474 raise ValueError("name is too long")
1478 def _create_header(info, format, encoding, errors):
1479 """Return a header block. info is a dictionary with file
1480 information, format must be one of the *_FORMAT constants.
1483 stn(info.get("name", ""), 100, encoding, errors),
1484 itn(info.get("mode", 0) & 0o7777, 8, format),
1485 itn(info.get("uid", 0), 8, format),
1486 itn(info.get("gid", 0), 8, format),
1487 itn(info.get("size", 0), 12, format),
1488 itn(info.get("mtime", 0), 12, format),
1489 b" ", # checksum field
1490 info.get("type", REGTYPE),
1491 stn(info.get("linkname", ""), 100, encoding, errors),
1492 info.get("magic", POSIX_MAGIC),
1493 stn(info.get("uname", ""), 32, encoding, errors),
1494 stn(info.get("gname", ""), 32, encoding, errors),
1495 itn(info.get("devmajor", 0), 8, format),
1496 itn(info.get("devminor", 0), 8, format),
1497 sbtn(info.get("prefix", ""), 155, encoding, errors)
1500 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1501 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1502 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1506 def _create_payload(payload):
1507 """Return the string payload filled with zero bytes
1508 up to the next 512 byte border.
1510 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1512 payload += (BLOCKSIZE - remainder) * NUL
1516 def _create_gnu_long_header(cls, name, type, encoding, errors):
1517 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1520 name = name.encode(encoding, errors) + NUL
1523 info["name"] = "././@LongLink"
1525 info["size"] = len(name)
1526 info["magic"] = GNU_MAGIC
1528 # create extended header + name blocks.
1529 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1530 cls._create_payload(name)
1533 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1534 """Return a POSIX.1-2008 extended or global header sequence
1535 that contains a list of keyword, value pairs. The values
1538 # Check if one of the fields contains surrogate characters and thereby
1539 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1541 for keyword, value in pax_headers.items():
1543 value.encode("utf-8", "strict")
1544 except UnicodeEncodeError:
1550 # Put the hdrcharset field at the beginning of the header.
1551 records += b"21 hdrcharset=BINARY\n"
1553 for keyword, value in pax_headers.items():
1554 keyword = keyword.encode("utf-8")
1556 # Try to restore the original byte representation of `value'.
1557 # Needless to say, that the encoding must match the string.
1558 value = value.encode(encoding, "surrogateescape")
1560 value = value.encode("utf-8")
1562 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1569 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1571 # We use a hardcoded "././@PaxHeader" name like star does
1572 # instead of the one that POSIX recommends.
1574 info["name"] = "././@PaxHeader"
1576 info["size"] = len(records)
1577 info["magic"] = POSIX_MAGIC
1579 # Create pax header + record blocks.
1580 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1581 cls._create_payload(records)
1584 def frombuf(cls, buf, encoding, errors):
1585 """Construct a TarInfo object from a 512 byte bytes object.
1588 raise EmptyHeaderError("empty header")
1589 if len(buf) != BLOCKSIZE:
1590 raise TruncatedHeaderError("truncated header")
1591 if buf.count(NUL) == BLOCKSIZE:
1592 raise EOFHeaderError("end of file header")
1594 chksum = nti(buf[148:156])
1595 if chksum not in calc_chksums(buf):
1596 raise InvalidHeaderError("bad checksum")
1599 obj.name = nts(buf[0:100], encoding, errors)
1600 obj.mode = nti(buf[100:108])
1601 obj.uid = nti(buf[108:116])
1602 obj.gid = nti(buf[116:124])
1603 obj.size = nti(buf[124:136])
1604 obj.mtime = nti(buf[136:148])
1606 obj.type = buf[156:157]
1607 obj.linkname = nts(buf[157:257], encoding, errors)
1608 obj.uname = nts(buf[265:297], encoding, errors)
1609 obj.gname = nts(buf[297:329], encoding, errors)
1610 obj.devmajor = nti(buf[329:337])
1611 obj.devminor = nti(buf[337:345])
1612 prefix = nts(buf[345:500], encoding, errors)
1614 # The old GNU sparse format occupies some of the unused
1615 # space in the buffer for up to 4 sparse structures.
1616 # Save the them for later processing in _proc_sparse().
1617 if obj.type == GNUTYPE_SPARSE:
1622 offset = nti(buf[pos:pos + 12])
1623 numbytes = nti(buf[pos + 12:pos + 24])
1626 structs.append((offset, numbytes))
1628 isextended = bool(buf[482])
1629 origsize = nti(buf[483:495])
1630 obj._sparse_structs = (structs, isextended, origsize)
1632 # Old V7 tar format represents a directory as a regular
1633 # file with a trailing slash.
1634 if obj.type == AREGTYPE and obj.name.endswith("/"):
1637 # Remove redundant slashes from directories.
1639 obj.name = obj.name.rstrip("/")
1641 # Reconstruct a ustar longname.
1642 if prefix and obj.type not in GNU_TYPES:
1643 obj.name = prefix + "/" + obj.name
1645 obj.offset_data = nti(buf[369:381])
1649 def fromtarfile(cls, tarfile):
1650 """Return the next TarInfo object from TarFile object
1653 buf = tarfile.fileobj.read(BLOCKSIZE)
1654 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1655 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1656 return obj._proc_member(tarfile)
1658 #--------------------------------------------------------------------------
1659 # The following are methods that are called depending on the type of a
1660 # member. The entry point is _proc_member() which can be overridden in a
1661 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1662 # implement the following
1664 # 1. Set self.offset_data to the position where the data blocks begin,
1665 # if there is data that follows.
1666 # 2. Set tarfile.offset to the position where the next member's header will
1668 # 3. Return self or another valid TarInfo object.
1669 def _proc_member(self, tarfile):
1670 """Choose the right processing method depending on
1671 the type and call it.
1673 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1674 return self._proc_gnulong(tarfile)
1675 elif self.type == GNUTYPE_SPARSE:
1676 return self._proc_sparse(tarfile)
1677 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1678 return self._proc_pax(tarfile)
1680 return self._proc_builtin(tarfile)
1682 def _proc_builtin(self, tarfile):
1683 """Process a builtin type or an unknown type which
1684 will be treated as a regular file.
1686 self.offset_data = tarfile.fileobj.tell()
1687 offset = self.offset_data
1688 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1689 # Skip the following data blocks.
1690 offset += self._block(self.size)
1691 tarfile.offset = offset
1693 # Patch the TarInfo object with saved global
1694 # header information.
1695 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1699 def _proc_gnulong(self, tarfile):
1700 """Process the blocks that hold a GNU longname
1703 buf = tarfile.fileobj.read(self._block(self.size))
1705 # Fetch the next header and process it.
1707 next = self.fromtarfile(tarfile)
1709 raise SubsequentHeaderError("missing or bad subsequent header")
1711 # Patch the TarInfo object from the next header with
1712 # the longname information.
1713 next.offset = self.offset
1714 if self.type == GNUTYPE_LONGNAME:
1715 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1716 elif self.type == GNUTYPE_LONGLINK:
1717 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1721 def _proc_sparse(self, tarfile):
1722 """Process a GNU sparse header plus extra headers.
1724 # We already collected some sparse structures in frombuf().
1725 structs, isextended, origsize = self._sparse_structs
1726 del self._sparse_structs
1728 # Collect sparse structures from extended header blocks.
1730 buf = tarfile.fileobj.read(BLOCKSIZE)
1734 offset = nti(buf[pos:pos + 12])
1735 numbytes = nti(buf[pos + 12:pos + 24])
1738 if offset and numbytes:
1739 structs.append((offset, numbytes))
1741 isextended = bool(buf[504])
1742 self.sparse = structs
1744 self.offset_data = tarfile.fileobj.tell()
1745 tarfile.offset = self.offset_data + self._block(self.size)
1746 self.size = origsize
1749 def _proc_pax(self, tarfile):
1750 """Process an extended or global header as described in
1753 # Read the header information.
1754 buf = tarfile.fileobj.read(self._block(self.size))
1756 # A pax header stores supplemental information for either
1757 # the following file (extended) or all following files
1759 if self.type == XGLTYPE:
1760 pax_headers = tarfile.pax_headers
1762 pax_headers = tarfile.pax_headers.copy()
1764 # Check if the pax header contains a hdrcharset field. This tells us
1765 # the encoding of the path, linkpath, uname and gname fields. Normally,
1766 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1767 # implementations are allowed to store them as raw binary strings if
1768 # the translation to UTF-8 fails.
1769 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1770 if match is not None:
1771 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1773 # For the time being, we don't care about anything other than "BINARY".
1774 # The only other value that is currently allowed by the standard is
1775 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1776 hdrcharset = pax_headers.get("hdrcharset")
1777 if hdrcharset == "BINARY":
1778 encoding = tarfile.encoding
1782 # Parse pax header information. A record looks like that:
1783 # "%d %s=%s\n" % (length, keyword, value). length is the size
1784 # of the complete record including the length field itself and
1785 # the newline. keyword and value are both UTF-8 encoded strings.
1786 regex = re.compile(br"(\d+) ([^=]+)=")
1789 match = regex.match(buf, pos)
1793 length, keyword = match.groups()
1794 length = int(length)
1795 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1797 # Normally, we could just use "utf-8" as the encoding and "strict"
1798 # as the error handler, but we better not take the risk. For
1799 # example, GNU tar <= 1.23 is known to store filenames it cannot
1800 # translate to UTF-8 as raw strings (unfortunately without a
1801 # hdrcharset=BINARY header).
1802 # We first try the strict standard encoding, and if that fails we
1803 # fall back on the user's encoding and error handler.
1804 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1806 if keyword in PAX_NAME_FIELDS:
1807 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1810 value = self._decode_pax_field(value, "utf-8", "utf-8",
1813 pax_headers[keyword] = value
1817 # Fetch the next header.
1819 next = self.fromtarfile(tarfile)
1821 raise SubsequentHeaderError("missing or bad subsequent header")
1823 # Process GNU sparse information.
1824 if "GNU.sparse.map" in pax_headers:
1825 # GNU extended sparse format version 0.1.
1826 self._proc_gnusparse_01(next, pax_headers)
1828 elif "GNU.sparse.size" in pax_headers:
1829 # GNU extended sparse format version 0.0.
1830 self._proc_gnusparse_00(next, pax_headers, buf)
1832 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1833 # GNU extended sparse format version 1.0.
1834 self._proc_gnusparse_10(next, pax_headers, tarfile)
1836 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1837 # Patch the TarInfo object with the extended header info.
1838 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1839 next.offset = self.offset
1841 if "size" in pax_headers:
1842 # If the extended header replaces the size field,
1843 # we need to recalculate the offset where the next
1845 offset = next.offset_data
1846 if next.isreg() or next.type not in SUPPORTED_TYPES:
1847 offset += next._block(next.size)
1848 tarfile.offset = offset
1850 if next is not None:
1851 if "GNU.volume.filename" in pax_headers:
1852 if pax_headers["GNU.volume.filename"] == next.name:
1853 if "GNU.volume.size" in pax_headers:
1854 next.size = int(pax_headers["GNU.volume.size"])
1855 if "GNU.volume.offset" in pax_headers:
1856 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1858 for key in pax_headers.keys():
1859 if key.startswith("GNU.volume"):
1860 del tarfile.pax_headers[key]
1864 def _proc_gnusparse_00(self, next, pax_headers, buf):
1865 """Process a GNU tar extended sparse header, version 0.0.
1868 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1869 offsets.append(int(match.group(1)))
1871 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1872 numbytes.append(int(match.group(1)))
1873 next.sparse = list(zip(offsets, numbytes))
1875 def _proc_gnusparse_01(self, next, pax_headers):
1876 """Process a GNU tar extended sparse header, version 0.1.
1878 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1879 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1881 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1882 """Process a GNU tar extended sparse header, version 1.0.
1886 buf = tarfile.fileobj.read(BLOCKSIZE)
1887 fields, buf = buf.split(b"\n", 1)
1888 fields = int(fields)
1889 while len(sparse) < fields * 2:
1890 if b"\n" not in buf:
1891 buf += tarfile.fileobj.read(BLOCKSIZE)
1892 number, buf = buf.split(b"\n", 1)
1893 sparse.append(int(number))
1894 next.offset_data = tarfile.fileobj.tell()
1895 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1897 def _apply_pax_info(self, pax_headers, encoding, errors):
1898 """Replace fields with supplemental information from a previous
1899 pax extended or global header.
1901 for keyword, value in pax_headers.items():
1902 if keyword == "GNU.sparse.name":
1903 setattr(self, "path", value)
1904 elif keyword == "GNU.sparse.size":
1905 setattr(self, "size", int(value))
1906 elif keyword == "GNU.sparse.realsize":
1907 setattr(self, "size", int(value))
1908 elif keyword in PAX_FIELDS:
1909 if keyword in PAX_NUMBER_FIELDS:
1911 value = PAX_NUMBER_FIELDS[keyword](value)
1914 if keyword == "path":
1915 value = value.rstrip("/") # pylint: disable=no-member
1916 setattr(self, keyword, value)
1918 self.pax_headers = pax_headers.copy()
1920 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1921 """Decode a single field from a pax record.
1924 return value.decode(encoding, "strict")
1925 except UnicodeDecodeError:
1926 return value.decode(fallback_encoding, fallback_errors)
1928 def _block(self, count):
1929 """Round up a byte count by BLOCKSIZE and return it,
1930 e.g. _block(834) => 1024.
1932 blocks, remainder = divmod(count, BLOCKSIZE)
1935 return blocks * BLOCKSIZE
1938 return self.type in REGULAR_TYPES
1942 return self.type == DIRTYPE
1944 return self.type == SYMTYPE
1946 return self.type == LNKTYPE
1948 return self.type == CHRTYPE
1950 return self.type == BLKTYPE
1952 return self.type == FIFOTYPE
1954 return self.sparse is not None
1956 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1957 def ismultivol(self):
1958 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1959 "GNU.volume.offset" in self.pax_headers
1962 class TarFile(object):
1963 """The TarFile Class provides an interface to tar archives.
1966 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1968 dereference = False # If true, add content of linked file to the
1969 # tar file, else the link.
1971 ignore_zeros = False # If true, skips empty or invalid blocks and
1972 # continues processing.
1974 max_volume_size = None # If different from None, establishes maximum
1975 # size of tar volumes
1977 new_volume_handler = None # function handler to be executed before when
1978 # a new volume is needed
1980 volume_number = 0 # current volume number, used for multi volume
1983 errorlevel = 1 # If 0, fatal errors only appear in debug
1984 # messages (if debug >= 0). If > 0, errors
1985 # are passed to the caller as exceptions.
1987 format = DEFAULT_FORMAT # The format to use when creating an archive.
1989 encoding = ENCODING # Encoding for 8-bit character strings.
1991 errors = None # Error handler for unicode conversion.
1993 tarinfo = TarInfo # The default TarInfo class to use.
1995 fileobject = ExFileObject # The file-object for extractfile().
1997 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
2000 save_to_members = True # If new members are saved. This can be disabled
2001 # if you manage lots of files and don't want
2002 # to have high memory usage
2004 cache_uid2user = {} # cache to avoid getpwuid calls. It always parses /etc/passwd.
2005 cache_gid2group = {} # same cache for groups
2007 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2008 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2009 errors="surrogateescape", pax_headers=None, debug=None,
2010 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2011 concat=False, nacl=None,
2012 save_to_members=True):
2013 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2014 read from an existing archive, 'a' to append data to an existing
2015 file or 'w' to create a new file overwriting an existing one. `mode'
2017 If `fileobj' is given, it is used for reading or writing data. If it
2018 can be determined, `mode' is overridden by `fileobj's mode.
2019 `fileobj' is not closed, when TarFile is closed.
2021 if len(mode) > 1 or mode not in "raw":
2022 raise ValueError("mode must be 'r', 'a' or 'w'")
2024 self.arcmode = arcmode_set (concat)
2026 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2029 if self.mode == "a" and not os.path.exists(name):
2030 # Create nonexistent files in append mode.
2033 fileobj = bltn_open(name, self._mode)
2034 self._extfileobj = False
2036 if name is None and hasattr(fileobj, "name"):
2038 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2039 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2040 self._mode = fileobj.mode
2041 self._extfileobj = True
2042 self.name = os.path.abspath(name) if name else None
2043 self.base_name = self.name = os.path.abspath(name) if name else None
2044 self.fileobj = fileobj
2047 if format is not None:
2048 self.format = format
2049 if tarinfo is not None:
2050 self.tarinfo = tarinfo
2051 if dereference is not None:
2052 self.dereference = dereference
2053 if ignore_zeros is not None:
2054 self.ignore_zeros = ignore_zeros
2055 if encoding is not None:
2056 self.encoding = encoding
2058 self.errors = errors
2060 if pax_headers is not None and self.format == PAX_FORMAT:
2061 self.pax_headers = pax_headers
2063 self.pax_headers = {}
2065 if debug is not None:
2067 if errorlevel is not None:
2068 self.errorlevel = errorlevel
2070 # Init datastructures.
2071 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2072 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2073 if max_volume_size and not callable(new_volume_handler):
2074 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2076 self.max_volume_size = int(max_volume_size)
2078 self.max_volume_size = None
2080 self.save_to_members = save_to_members
2081 self.new_volume_handler = new_volume_handler
2083 self.members = [] # list of members as TarInfo objects
2084 self._loaded = False # flag if all members have been read
2085 self.offset = self.fileobj.tell()
2086 # current position in the archive file
2087 self.inodes = {} # dictionary caching the inodes of
2088 # archive members already added
2091 if self.mode == "r":
2092 self.firstmember = None
2093 self.firstmember = self.next()
2095 if self.mode == "a":
2096 # Move to the end of the archive,
2097 # before the first empty block.
2099 self.fileobj.seek(self.offset)
2101 tarinfo = self.tarinfo.fromtarfile(self)
2102 self.members.append(tarinfo)
2103 except EOFHeaderError:
2104 self.fileobj.seek(self.offset)
2106 except HeaderError as e:
2107 raise ReadError(str(e))
2109 if self.mode in "aw":
2112 if self.pax_headers:
2113 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2114 self.fileobj.write(buf)
2115 self.offset += len(buf)
2117 if not self._extfileobj:
2118 self.fileobj.close()
2122 #--------------------------------------------------------------------------
2123 # Below are the classmethods which act as alternate constructors to the
2124 # TarFile class. The open() method is the only one that is needed for
2125 # public use; it is the "super"-constructor and is able to select an
2126 # adequate "sub"-constructor for a particular compression using the mapping
2129 # This concept allows one to subclass TarFile without losing the comfort of
2130 # the super-constructor. A sub-constructor is registered and made available
2131 # by adding it to the mapping in OPEN_METH.
2134 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2135 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2137 """Open a tar archive for reading, writing or appending. Return
2138 an appropriate TarFile class.
2141 'r' or 'r:*' open for reading with transparent compression
2142 'r:' open for reading exclusively uncompressed
2143 'r:gz' open for reading with gzip compression
2144 'r:bz2' open for reading with bzip2 compression
2145 'r:xz' open for reading with lzma compression
2146 'a' or 'a:' open for appending, creating the file if necessary
2147 'w' or 'w:' open for writing without compression
2148 'w:gz' open for writing with gzip compression
2149 'w:bz2' open for writing with bzip2 compression
2150 'w:xz' open for writing with lzma compression
2152 'r|*' open a stream of tar blocks with transparent compression
2153 'r|' open an uncompressed stream of tar blocks for reading
2154 'r|gz' open a gzip compressed stream of tar blocks
2155 'r|bz2' open a bzip2 compressed stream of tar blocks
2156 'r|xz' open an lzma compressed stream of tar blocks
2157 'w|' open an uncompressed stream for writing
2158 'w|gz' open a gzip compressed stream for writing
2159 'w|bz2' open a bzip2 compressed stream for writing
2160 'w|xz' open an lzma compressed stream for writing
2162 'r#gz' open a stream of gzip compressed tar blocks for reading
2163 'w#gz' open a stream of gzip compressed tar blocks for writing
2165 if not name and not fileobj:
2166 raise ValueError("nothing to open")
2168 if mode in ("r", "r:*"):
2169 # Find out which *open() is appropriate for opening the file.
2170 for comptype in cls.OPEN_METH:
2171 func = getattr(cls, cls.OPEN_METH[comptype])
2172 if fileobj is not None:
2173 saved_pos = fileobj.tell()
2175 return func(name, "r", fileobj, **kwargs)
2176 except (ReadError, CompressionError) as e:
2177 # usually nothing exceptional but sometimes is
2178 if fileobj is not None:
2179 fileobj.seek(saved_pos)
2181 raise ReadError("file could not be opened successfully")
2184 filemode, comptype = mode.split(":", 1)
2185 filemode = filemode or "r"
2186 comptype = comptype or "tar"
2188 # Select the *open() function according to
2189 # given compression.
2190 if comptype in cls.OPEN_METH:
2191 func = getattr(cls, cls.OPEN_METH[comptype])
2193 raise CompressionError("unknown compression type %r" % comptype)
2195 # Pass on compression level for gzip / bzip2.
2196 if comptype == 'gz' or comptype == 'bz2':
2197 kwargs['compresslevel'] = compresslevel
2199 if 'max_volume_size' in kwargs:
2200 if comptype != 'tar' and filemode in 'wa' \
2201 and kwargs['max_volume_size']:
2203 warnings.warn('Only the first volume will be compressed '
2204 'for modes with "w:"!')
2206 return func(name, filemode, fileobj, **kwargs)
2209 filemode, comptype = mode.split("|", 1)
2210 filemode = filemode or "r"
2211 comptype = comptype or "tar"
2213 if filemode not in "rw":
2214 raise ValueError("mode must be 'r' or 'w'")
2216 t = cls(name, filemode,
2217 _Stream(name, filemode, comptype, fileobj, bufsize,
2218 compresslevel=compresslevel),
2220 t._extfileobj = False
2224 filemode, comptype = mode.split("#", 1)
2225 filemode = filemode or "r"
2227 if filemode not in "rw":
2228 raise ValueError ("mode %s not compatible with concat "
2229 "archive; must be 'r' or 'w'" % mode)
2231 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2232 concat=True, encryption=encryption,
2233 compresslevel=compresslevel, tolerance=tolerance)
2234 kwargs ["concat"] = True
2236 t = cls(name, filemode, stream, **kwargs)
2237 except: # XXX except what?
2239 raise # XXX raise what?
2240 t._extfileobj = False
2244 return cls.taropen(name, mode, fileobj, **kwargs)
2246 raise ValueError("undiscernible mode %r" % mode)
2250 def open_at_offset(cls, offset, *a, **kwa):
2252 Same as ``.open()``, but start reading at the given offset. Assumes a
2253 seekable file object. Returns *None* if opening failed due to a read
2256 fileobj = kwa.get ("fileobj")
2257 if fileobj is not None:
2258 fileobj.seek (offset)
2260 return cls.open (*a, **kwa)
2264 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2265 """Open uncompressed tar archive name for reading or writing.
2267 if len(mode) > 1 or mode not in "raw":
2268 raise ValueError("mode must be 'r', 'a' or 'w'")
2269 return cls(name, mode, fileobj, **kwargs)
2272 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2273 """Open gzip compressed tar archive name for reading or writing.
2274 Appending is not allowed.
2276 if len(mode) > 1 or mode not in "rw":
2277 raise ValueError("mode must be 'r' or 'w'")
2282 except (ImportError, AttributeError):
2283 raise CompressionError("gzip module is not available")
2285 extfileobj = fileobj is not None
2287 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2288 t = cls.taropen(name, mode, fileobj, **kwargs)
2290 if not extfileobj and fileobj is not None:
2294 raise ReadError("not a gzip file")
2296 if not extfileobj and fileobj is not None:
2299 t._extfileobj = extfileobj
2303 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2304 """Open bzip2 compressed tar archive name for reading or writing.
2305 Appending is not allowed.
2307 if len(mode) > 1 or mode not in "rw":
2308 raise ValueError("mode must be 'r' or 'w'.")
2313 raise CompressionError("bz2 module is not available")
2315 fileobj = bz2.BZ2File(fileobj or name, mode,
2316 compresslevel=compresslevel)
2319 t = cls.taropen(name, mode, fileobj, **kwargs)
2320 except (OSError, EOFError):
2322 raise ReadError("not a bzip2 file")
2323 t._extfileobj = False
2327 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2328 """Open lzma compressed tar archive name for reading or writing.
2329 Appending is not allowed.
2331 if mode not in ("r", "w"):
2332 raise ValueError("mode must be 'r' or 'w'")
2337 raise CompressionError("lzma module is not available")
2339 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2342 t = cls.taropen(name, mode, fileobj, **kwargs)
2343 except (lzma.LZMAError, EOFError):
2345 raise ReadError("not an lzma file")
2346 t._extfileobj = False
# All *open() methods are registered here.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
#--------------------------------------------------------------------------
# The public methods which TarFile provides:

def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive. A special case are empty archives which are
       initialized accordingly so the two mandatory blocks of zeros are
       written abiding by the requested encryption and compression settings.
    """
    # NOTE(review): early-return guard restored from upstream tarfile —
    # confirm the original used the same idempotency check.
    if self.closed:
        return

    if self.mode in "aw":
        if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell() == 0:
            # Empty archive: open a (compressed/encrypted) object so the
            # trailing zero blocks land inside it.
            self.fileobj.next("")
        self.fileobj.write(NUL * (BLOCKSIZE * 2))
        self.offset += (BLOCKSIZE * 2)
        # fill up the end with zero-blocks
        # (like option -b20 for tar does)
        blocks, remainder = divmod(self.offset, RECORDSIZE)
        if remainder > 0:
            self.fileobj.write(NUL * (RECORDSIZE - remainder))

    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    tarinfo = self._getmember(name)
    if tarinfo is None:
        raise KeyError("filename %r not found" % name)
    return tarinfo
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if not self._loaded:    # if we want to obtain a list of
        self._load()        # all members, we first have to
                            # scan the whole archive.
    return self.members
def get_last_member_offset(self):
    """Return the last member offset. Usually this is self.fileobj.tell(),
       but when there's encryption or concat compression going on it's more
       complicated than that.
    """
    return self.last_block_offset
2412 """Return the members of the archive as a list of their names. It has
2413 the same order as the list returned by getmembers().
2415 return [tarinfo.name for tarinfo in self.getmembers()]
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/")
    arcname = arcname.lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is None:
        if hasattr(os, "lstat") and not self.dereference:
            statres = os.lstat(name)
        else:
            statres = os.stat(name)
    else:
        statres = os.fstat(fileobj.fileno())
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        if not self.dereference and statres.st_nlink > 1 and \
                inode in self.inodes and arcname != self.inodes[inode]:
            # Is it a hardlink to an already
            # archived file?
            type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            type = REGTYPE
            if inode[0] and self.save_to_members:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        type = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        type = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        type = CHRTYPE
    elif stat.S_ISBLK(stmd):
        type = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    if type == REGTYPE:
        tarinfo.size = statres.st_size
    else:
        tarinfo.size = 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = type
    tarinfo.linkname = linkname

    # uid→uname lookups are expensive; memoize them per TarFile.
    if pwd:
        if tarinfo.uid in self.cache_uid2user:
            tarinfo.uname = self.cache_uid2user[tarinfo.uid]
        else:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                self.cache_uid2user[tarinfo.uid] = tarinfo.uname
            except KeyError:
                # remember user does not exist:
                # same default value as in tarinfo class
                self.cache_uid2user[tarinfo.uid] = ""
    if grp:
        if tarinfo.gid in self.cache_gid2group:
            tarinfo.gname = self.cache_gid2group[tarinfo.gid]
        else:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                self.cache_gid2group[tarinfo.gid] = tarinfo.gname
            except KeyError:
                # remember group does not exist:
                # same default value as in tarinfo class
                self.cache_gid2group[tarinfo.gid] = ""

    if type in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    for tarinfo in self:
        if verbose:
            print(stat.filemode(tarinfo.mode), end=' ')
            print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                             tarinfo.gname or tarinfo.gid), end=' ')
            if tarinfo.ischr() or tarinfo.isblk():
                print("%10s" % ("%d,%d" \
                                % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
            else:
                print("%10d" % tarinfo.size, end=' ')
            print("%d-%02d-%02d %02d:%02d:%02d" \
                  % time.localtime(tarinfo.mtime)[:6], end=' ')

        print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

        if verbose:
            if tarinfo.issym():
                print("->", tarinfo.linkname, end=' ')
            if tarinfo.islnk():
                print("link to", tarinfo.linkname, end=' ')
        print()
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                      DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)
    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)
    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                         recursive, exclude, filter=filter)
    else:
        self.addfile(tarinfo)
def _size_left_file(self):
    """Calculates size left in a volume with a maximum volume size.

       Assumes self.max_volume_size is set.
       If using compression through a _Stream, use _size_left_stream instead
    """
    # left-over size = max_size - offset - 2 zero-blocks written in close
    size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
    # limit size left to a discrete number of blocks, because we won't
    # write only half a block when writting the end of a volume
    # and filling with zeros
    return BLOCKSIZE * (size_left // BLOCKSIZE)
def _size_left_stream(self):
    """ Calculates size left in a volume if using comression/encryption

        Assumes self.max_volume_size is set and self.fileobj is a _Stream
        (otherwise use _size_left_file)
    """
    # left-over size = max_size - bytes written - 2 zero-blocks (close)
    size_left = self.max_volume_size - self.fileobj.estim_file_size() \
        - 2*BLOCKSIZE
    return BLOCKSIZE * (size_left // BLOCKSIZE)
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.

       NOTE(review): restored from a garbled dump — verify the volume-split
       loop against the project history before relying on it.
    """
    self._check("aw")

    tarinfo = copy.copy(tarinfo)

    if self.arcmode & ARCMODE_CONCAT:
        self.last_block_offset = self.fileobj.next(tarinfo.name)
    else:
        self.last_block_offset = self.fileobj.tell()

    buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(buf)
    self.offset += len(buf)

    if self.max_volume_size:
        if isinstance(self.fileobj, _Stream):
            _size_left = self._size_left_stream
        else:
            _size_left = self._size_left_file
    else:
        _size_left = lambda: tarinfo.size

    # If there's no data to follow, finish
    if not fileobj:
        if self.save_to_members:
            self.members.append(tarinfo)
        return

    target_size_left = _size_left()
    source_size_left = tarinfo.size
    assert tarinfo.volume_offset == 0

    # we only split volumes in the middle of a file, that means we have
    # to write at least one block
    if target_size_left < BLOCKSIZE:
        target_size_left = BLOCKSIZE

    # loop over multiple volumes
    while source_size_left > 0:

        # Write as much data as possble from source into target.
        # When compressing data, we cannot easily predict how much data we
        # can write until target_size_left == 0 --> need to iterate
        size_can_write = min(target_size_left, source_size_left)

        while size_can_write > 0:
            copyfileobj(fileobj, self.fileobj, size_can_write)
            self.offset += size_can_write
            source_size_left -= size_can_write
            target_size_left = _size_left()
            size_can_write = min(target_size_left, source_size_left)

        # now target_size_left == 0 or source_size_left == 0

        # if there is data left to write, we need to create a new volume
        if source_size_left > 0:
            # Only finalize the crypto entry here if we’re continuing with
            # another one; otherwise, the encryption must include the block
            # padding below.
            tarinfo.type = GNUTYPE_MULTIVOL

            if not self.new_volume_handler or \
                    not callable(self.new_volume_handler):
                raise Exception("We need to create a new volume and you "
                                "didn't supply a new_volume_handler")

            # the new volume handler should do everything needed to
            # start working in a new volume. usually, the handler calls
            # to self.open_volume
            self.volume_number += 1

            # set to be used by open_volume, because in the case of a PAX
            # tar it needs to write information about the volume and offset
            # in the global header
            tarinfo.volume_offset = tarinfo.size - source_size_left
            self.volume_tarinfo = tarinfo

            # the “new_volume_handler” is supposed to call .close() on the
            # “fileobj” _Stream
            self.new_volume_handler(self, self.base_name, self.volume_number)

            self.volume_tarinfo = None

            if self.arcmode & ARCMODE_CONCAT:
                self.fileobj.next_volume(tarinfo.name)

            # write new volume header
            buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
            self.fileobj.write(buf)
            self.offset += len(buf)

            # adjust variables; open_volume should have reset self.offset
            # --> _size_left should be big again
            target_size_left = _size_left()
            size_can_write = min(target_size_left, source_size_left)
            self._dbg(3, 'new volume')

    # now, all data has been written. We may have to fill up the rest of
    # the block in target with 0s
    remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
    if remainder > 0:
        self.fileobj.write(NUL * (BLOCKSIZE - remainder))
        self.offset += BLOCKSIZE - remainder

    if self.save_to_members:
        self.members.append(tarinfo)
def open_volume(self, name="", fileobj=None, encryption=None):
    """
    Called by the user to change this tar file to point to a new volume.

    NOTE(review): restored from a garbled dump — verify against project
    history, in particular the PAX global-header handling.
    """
    # open the file using either fileobj or name
    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        self._extfileobj = False

        if isinstance(self.fileobj, _Stream):
            self._dbg(3, 'open_volume: create a _Stream')
            fileobj = _Stream(name=name,
                              mode=self.fileobj.mode,
                              comptype=self.fileobj.comptype,
                              fileobj=None,
                              bufsize=self.fileobj.bufsize,
                              encryption=encryption or self.fileobj.encryption,
                              concat=self.fileobj.arcmode & ARCMODE_CONCAT,
                              tolerance=self.fileobj.tolerance)
        else:
            # here, we lose information about compression/encryption!
            self._dbg(3, 'open_volume: builtin open')
            fileobj = bltn_open(name, self._mode)
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
        self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # init data structures
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in "aw":
            self._loaded = True

            if self.format == PAX_FORMAT:
                volume_info = {
                    "GNU.volume.filename": str(self.volume_tarinfo.name),
                    "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                    "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                }

                self.pax_headers.update(volume_info)

                if isinstance(self.fileobj, _Stream):
                    self.fileobj._init_write_gz()
                buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except Exception as exn:
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
def extractall(self, path=".", members=None, filter=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    directories = []

    if members is None:
        members = self

    for tarinfo in members:
        # Continuation parts of a multivolume member live in later
        # volumes; skip them here.
        if self.volume_number > 0 and tarinfo.ismultivol():
            continue

        if filter and not filter(tarinfo):
            continue

        if tarinfo.isdir():
            # Extract directories with a safe mode.
            directories.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o0700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())

    # Reverse sort directories.
    directories.sort(key=lambda a: a.name)
    directories.reverse()

    # Set correct owner, mtime and filemode on directories.
    for tarinfo in directories:
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
def extract(self, member, path="", set_attrs=True, symlink_cb=None):
    """Extract a member from the archive to the current working directory,
       using its full name. Its file information is extracted as accurately
       as possible. `member' may be a filename or a TarInfo object. You can
       specify a different directory using `path'. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False.
       ``symlink_cb`` is a hook accepting a function that is passed the
       ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
       ``member`` indicates a symlink in which case only the callback
       passed will be applied, skipping the actual extraction. In case the
       callback is invoked, its return value is passed on to the caller.
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    if symlink_cb is not None and tarinfo.issym():
        return symlink_cb(member, path, set_attrs)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        else:
            if e.filename is None:
                self._dbg(1, "tarfile: %s" % e.strerror)
            else:
                self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        else:
            self._dbg(1, "tarfile: %s" % e)
def extractfile(self, member):
    """Extract a member from the archive as a file object. `member' may be
       a filename or a TarInfo object. If `member' is a regular file or a
       link, an io.BufferedReader object is returned. Otherwise, None is
       returned.
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    if tarinfo.isreg() or tarinfo.ismultivol() or \
            tarinfo.type not in SUPPORTED_TYPES:
        # If a member's type is unknown, it is treated as a
        # regular file.
        return self.fileobject(self, tarinfo)

    elif tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # A small but ugly workaround for the case that someone tries
            # to extract a (sym)link as a file-object from a non-seekable
            # stream of tar blocks.
            raise StreamError("cannot extract (sym)link as file object")
        else:
            # A (sym)link's file object is its target's file object.
            return self.extractfile(self._find_link_target(tarinfo))
    else:
        # If there's no data associated with the member (directory, chrdev,
        # blkdev, etc.), return None instead of a file object.
        return None
2963 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2964 """Extract the TarInfo object tarinfo to a physical
2965 file called targetpath.
2967 # Fetch the TarInfo object for the given name
2968 # and build the destination pathname, replacing
2969 # forward slashes to platform specific separators.
2970 targetpath = targetpath.rstrip("/")
2971 targetpath = targetpath.replace("/", os.sep)
2973 # Create all upper directories.
2974 upperdirs = os.path.dirname(targetpath)
2975 if upperdirs and not os.path.exists(upperdirs):
2976 # Create directories that are not part of the archive with
2977 # default permissions.
2978 os.makedirs(upperdirs)
2980 if tarinfo.islnk() or tarinfo.issym():
2981 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2983 self._dbg(1, tarinfo.name)
2986 self.makefile(tarinfo, targetpath)
2987 elif tarinfo.isdir():
2988 self.makedir(tarinfo, targetpath)
2989 elif tarinfo.isfifo():
2990 self.makefifo(tarinfo, targetpath)
2991 elif tarinfo.ischr() or tarinfo.isblk():
2992 self.makedev(tarinfo, targetpath)
2993 elif tarinfo.islnk() or tarinfo.issym():
2994 self.makelink(tarinfo, targetpath)
2995 elif tarinfo.type not in SUPPORTED_TYPES:
2996 self.makeunknown(tarinfo, targetpath)
2998 self.makefile(tarinfo, targetpath)
3001 self.chown(tarinfo, targetpath)
3002 if not tarinfo.issym():
3003 self.chmod(tarinfo, targetpath)
3004 self.utime(tarinfo, targetpath)
#--------------------------------------------------------------------------
# Below are the different file methods. They are called via
# _extract_member() when extract() is called. They can be replaced in a
# subclass to implement other functionality.

def makedir(self, tarinfo, targetpath):
    """Make a directory called targetpath.
    """
    try:
        # Use a safe mode for the directory, the real mode is set
        # later in _extract_member().
        os.mkdir(targetpath, 0o0700)
    except FileExistsError:
        pass
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath.

       NOTE(review): restored from a garbled dump — the multivolume retry
       loop (re-reading from the next volume after the handler swaps
       self.fileobj) should be verified against project history.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    iterate = True
    target = bltn_open(targetpath, "wb")

    try:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
            target.seek(tarinfo.size)
            target.truncate()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
    finally:
        target.close()
def makeunknown(self, tarinfo, targetpath):
    """Make a file from a TarInfo object with an unknown type
       at targetpath.
    """
    self.makefile(tarinfo, targetpath)
    self._dbg(1, "tarfile: Unknown file type %r, " \
                 "extracted as regular file." % tarinfo.type)
def makefifo(self, tarinfo, targetpath):
    """Make a fifo called targetpath.
    """
    if hasattr(os, "mkfifo"):
        os.mkfifo(targetpath)
    else:
        raise ExtractError("fifo not supported by system")
def makedev(self, tarinfo, targetpath):
    """Make a character or block device called targetpath.
    """
    if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
        raise ExtractError("special devices not supported by system")

    mode = tarinfo.mode
    if tarinfo.isblk():
        mode |= stat.S_IFBLK
    else:
        mode |= stat.S_IFCHR

    os.mknod(targetpath, mode,
             os.makedev(tarinfo.devmajor, tarinfo.devminor))
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced file
       instead of a link.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # See extract(): _link_target was prepared there.
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo.
    """
    if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
        # We have to be root to do so.
        try:
            g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            g = tarinfo.gid
        try:
            u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            u = tarinfo.uid
        try:
            if tarinfo.issym() and hasattr(os, "lchown"):
                os.lchown(targetpath, u, g)
            else:
                os.chown(targetpath, u, g)
        except OSError as e:
            raise ExtractError("could not change owner")
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo.
    """
    if hasattr(os, 'chmod'):
        try:
            os.chmod(targetpath, tarinfo.mode)
        except OSError as e:
            raise ExtractError("could not change mode")
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo.
    """
    if not hasattr(os, 'utime'):
        return
    try:
        os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
    except OSError as e:
        raise ExtractError("could not change modification time")
#--------------------------------------------------------------------------

def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")
    if self.firstmember is not None:
        m = self.firstmember
        self.firstmember = None
        return m

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    if tarinfo is not None:
        if self.save_to_members:
            self.members.append(tarinfo)
    else:
        self._loaded = True

    return tarinfo
3206 #--------------------------------------------------------------------------
3207 # Little helper methods:
3209 def _getmember(self, name, tarinfo=None, normalize=False):
3210 """Find an archive member by name from bottom to top.
3211 If tarinfo is given, it is used as the starting point.
3213 # Ensure that all members have been loaded.
3214 members = self.getmembers()
3216 # Limit the member search list up to tarinfo.
3217 if tarinfo is not None:
3218 members = members[:members.index(tarinfo)]
3221 name = os.path.normpath(name)
3223 for member in reversed(members):
3225 member_name = os.path.normpath(member.name)
3227 member_name = member.name
3229 if name == member_name:
3233 """Read through the entire archive file and look for readable
3237 tarinfo = self.next()
3242 def _check(self, mode=None):
3243 """Check if TarFile is still open, and if the operation's mode
3244 corresponds to TarFile's mode.
3247 raise OSError("%s is closed" % self.__class__.__name__)
3248 if mode is not None and self.mode not in mode:
3249 raise OSError("bad operation for mode %r" % self.mode)
3251 def _find_link_target(self, tarinfo):
3252 """Find the target member of a symlink or hardlink member in the
3256 # Always search the entire archive.
3257 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3260 # Search the archive before the link, because a hard link is
3261 # just a reference to an already archived file.
3262 linkname = tarinfo.linkname
3265 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3267 raise KeyError("linkname %r not found" % linkname)
3271 """Provide an iterator object.
3274 return iter(self.members)
3276 return TarIter(self)
3278 def _dbg(self, level, msg, *args):
3279 """Write debugging output to sys.stderr.
3281 if level <= self.debug:
3282 print(msg.format(*args), file=sys.stderr)
3284 def __enter__(self):
3288 def __exit__(self, type, value, traceback):
3292 # An exception occurred. We must not call close() because
3293 # it would try to write end-of-archive blocks and padding.
3294 if not self._extfileobj:
3295 self.fileobj.close()
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.

        if self.index == 0 and self.tarfile.firstmember is not None:
            tarinfo = self.tarfile.next()
        elif self.index < len(self.tarfile.members):
            tarinfo = self.tarfile.members[self.index]
        elif not self.tarfile._loaded:
            tarinfo = self.tarfile.next()
            if not tarinfo:
                self.tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
#---------------------------------------------------------
# support functionality for rescue mode
#---------------------------------------------------------

# struct(3) format of a 512-byte old-GNU tar header; "<" forces
# byte-exact, unaligned layout. Field offsets in the comments are
# cumulative byte positions (cf. tar(5)).
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100];          /* 100 */
    "8s"   # ← char mode[8];            /* 108 */
    "8s"   # ← char uid[8];             /* 116 */
    "8s"   # ← char gid[8];             /* 124 */
    "12s"  # ← char size[12];           /* 136 */
    "12s"  # ← char mtime[12];          /* 148 */
    "8s"   # ← char checksum[8];        /* 156 */
    "B"    # ← char typeflag[1];        /* 157 */
    "100s" # ← char linkname[100];      /* 257 */
    "6s"   # ← char magic[6];           /* 263 */
    "2s"   # ← char version[2];         /* 265 */
    "32s"  # ← char uname[32];          /* 297 */
    "32s"  # ← char gname[32];          /* 329 */
    "8s"   # ← char devmajor[8];        /* 337 */
    "8s"   # ← char devminor[8];        /* 345 */
    "12s"  # ← char atime[12];          /* 357 */
    "12s"  # ← char ctime[12];          /* 369 */
    "12s"  # ← char offset[12];         /* 381 */
    "4s"   # ← char longnames[4];       /* 385 */
    "B"    # ← char unused[1];          /* 386 */
    ""     #   struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     # } sparse[4];               /* 482 */
    "B"    # ← char isextended[1];      /* 483 */
    "12s"  # ← char realsize[12];       /* 495 */
    "17s"  # ← char pad[17];            /* 512 */
)

# The “magic” and “version” fields are special:
#
# tar(5)
#    magic   The magic field holds the five characters “ustar” followed by a
#            space.  Note that POSIX ustar archives have a trailing null.
#
# however, “tar.h”:
#
#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
#      Found in an archive, it indicates an old GNU header format, which will be
#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
#      valid, though the header is not truly POSIX conforming.  */
#
TAR_HDR_OFF_MAGIC    = 257
TAR_FMT_OLDGNU_MAGIC = b"ustar "
def read_gnu_tar_hdr(data):
    """Unpack one 512-byte block as an old-GNU tar header.

       Returns a dict of the header fields (all except the “unused” and
       “pad” filler), or None if the block is the wrong size, cannot be
       unpacked, or does not carry the old-GNU magic.
    """
    if len(data) != BLOCKSIZE: # header requires one complete block
        return None

    try:
        name, mode, \
            uid, gid, \
            size, mtime, \
            checksum, typeflag, \
            linkname, magic, \
            version, uname, \
            gname, devmajor, \
            devminor, atime, \
            ctime, offset, \
            longnames, unused, \
            offset1, numbytes1, \
            offset2, numbytes2, \
            offset3, numbytes3, \
            offset4, numbytes4, \
            isextended, realsize, \
            pad = struct.unpack(TAR_FMT_HDR, data)
    except struct.error:
        return None

    if magic != TAR_FMT_OLDGNU_MAGIC:
        return None

    # return all except “unused” and “pad”
    return \
        { "name"       : name,     "mode"      : mode
        , "uid"        : uid ,     "gid"       : gid
        , "size"       : size,     "mtime"     : mtime
        , "checksum"   : checksum
        , "typeflag"   : typeflag
        , "linkname"   : linkname
        , "magic"      : magic
        , "version"    : version
        , "uname"      : uname,    "gname"     : gname
        , "devmajor"   : devmajor, "devminor"  : devminor
        , "atime"      : atime,    "ctime"     : ctime
        , "offset"     : offset
        , "longnames"  : longnames
        , "offset1"    : offset1,  "numbytes1" : numbytes1
        , "offset2"    : offset2,  "numbytes2" : numbytes2
        , "offset3"    : offset3,  "numbytes3" : numbytes3
        , "offset4"    : offset4,  "numbytes4" : numbytes4
        , "isextended" : isextended
        , "realsize"   : realsize
        }
def tar_hdr_check_chksum(data):
    """Return True iff *data* parses as an old-GNU tar header whose stored
       checksum matches one of the recomputed checksums."""
    hdr = read_gnu_tar_hdr(data)
    if hdr is None:
        return False
    s = calc_chksums(data)
    return nti(hdr["checksum"]) in s
def readable_tar_objects_offsets(ifd):
    """
    Traverse blocks in file, trying to extract tar headers.

    Returns the list of offsets of blocks whose checksum validates as an
    old-GNU tar header.
    """
    pos = 0
    offsets = []

    mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    # The magic sits at a fixed offset inside the header, so start
    # searching from there.
    pos = TAR_HDR_OFF_MAGIC

    while True:
        pos = mm.find(TAR_FMT_OLDGNU_MAGIC, pos)
        if pos == -1:
            break
        off = pos - TAR_HDR_OFF_MAGIC
        mm.seek(off)
        blk = mm.read(BLOCKSIZE)
        if tar_hdr_check_chksum(blk) is True:
            offsets.append(off)
        pos += 1

    return offsets
def locate_gz_hdr_candidates(fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    pos = 0
    cands = []

    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    while True:
        pos = mm.find(GZ_MAGIC_BYTES, pos)
        if pos == -1:
            break
        cands.append(pos)
        pos += len(GZ_MAGIC_BYTES)

    return cands
# Verdicts for gzip header candidates found by the rescue scanner.
HDR_CAND_GOOD  = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if len (c) != 1: # EOF before NUL terminator: parse failure
            # (without this guard, os.read() returning b"" would loop forever)
            return None
        if c == b"\0": # NUL terminator found
            break
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1

    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn't conform
    to the spec (garbage flag bits, bogus timestamp) are considered "fishy".
    No validation is possible on embedded strings because they are single-byte
    encoded.

    :param fd:  readable, seekable file descriptor.
    :param off: byte offset of the candidate header.
    :returns: pair of (verdict, header-dict); the dict is *None* unless the
              verdict is GOOD or FISHY. "hlen" in the dict is the total
              header length, i.e. the distance from *off* to the payload.
    """
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off: # seek failed -> not addressable
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE: # eof inside fixed header
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()): # timestamp in the future
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        raw16 = os.read (fd, 2)
        if len (raw16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        # struct.unpack returns a tuple; take the scalar length out of it
        xlen = struct.unpack ("<H", raw16)[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # same bound as FNAME; separate local so the file name is not clobbered
        fcomment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                 encoding="iso-8859-1")
        if fcomment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    # total header length = how far we have advanced past *off*
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    # NOTE(review): callers only consume "hlen"; the remaining keys are
    # informational -- confirm against the original header dict if available
    return verdict, \
           { "hlen"   : hlen
           , "flags"  : flags
           , "mtime"  : mtime
           , "oscode" : oscode
           }
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    # raw deflate stream: negative wbits suppresses the zlib header
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)
    pos   = off # current read position in the input
    dlen  = 0   # size of decompressed data

    os.lseek (ifd, pos, os.SEEK_SET)
    while True:
        cnk = os.read (ifd, BUFSIZE)
        pos += len (cnk)
        try:
            data = decmp.decompress (cnk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break
        dlen += len (data)
        if decmp.eof is True: # end of this deflate object
            break
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
            break

    return dlen, pos - off
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :param ifd:   readable, seekable file descriptor.
    :param cands: candidate offsets as found by locate_gz_hdr_candidates().
    :returns: the subset of *cands* whose headers parse and whose payload
              actually decompresses.
    """
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            pass # ignore unreadable ones
        elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
            # payload starts right after the variable-length header
            off0 = cand + hdr ["hlen"]
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0: # produced output from real input
                good.append (cand)

    return good
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets ("candidates").
    Then check each of those locations whether they can be processed as
    gzipped objects.

    :param fname: path of the volume to scan.
    :returns: list of offsets of readable gzipped objects.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        cands = locate_gz_hdr_candidates (ifd)
        return readable_gz_objects_offsets (ifd, cands)
    finally:
        os.close (ifd) # do not leak the descriptor on any path
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets ("candidates").
    Then check each of those locations whether they can be processed as tar
    headers.

    :param fname: path of the volume to scan.
    :returns: list of offsets of readable tar headers.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        return readable_tar_objects_offsets (ifd)
    finally:
        os.close (ifd) # do not leak the descriptor on any path
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
    """
    Open *fileobj* as a tar archive at *offset* and return the first member
    found there.

    :param fileobj: seekable file object of the volume.
    :param offset:  byte offset of the candidate object.
    :param mode:    archive mode string as used by the rescue caller.
    :param secret:  optional (kind, value) pair for decryption; kind is one
                    of the crypto.PDTCRYPT_SECRET_* constants.
    :returns: a TarInfo object on success, *None* if the data at *offset*
              cannot be read as a tar object.
    """
    decr = None

    if secret is not None:
        ks = secret [0]

        if ks == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1])
        elif ks == crypto.PDTCRYPT_SECRET_KEY:
            # key secrets are passed hex-encoded
            key = binascii.unhexlify (secret [1])
            decr = crypto.Decrypt (key=key)
        else:
            raise RuntimeError ("read_tarobj_at_offset: unknown secret kind")

    try:
        # NOTE(review): keyword arguments reconstructed from the rescue
        # call sites -- confirm against TarFile.open_at_offset's signature
        tarobj = \
            TarFile.open_at_offset (offset,
                                    mode="r" + mode,
                                    fileobj=fileobj,
                                    format=GNU_FORMAT,
                                    ignore_zeros=True,
                                    encryption=decr,
                                    save_to_members=False,
                                    tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None # not a readable object at this offset

    return tarobj.next ()
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.

    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "gid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        }

    :param tarinfo: member object to convert.
    :returns: dict in pseudo-index-entry shape.
    """
    return \
        { "inode"  : 0            # ignored when reading the index
        , "uid"    : tarinfo.uid
        , "gid"    : tarinfo.gid
        , "path"   : tarinfo.name # keeping URI scheme
        , "offset" : 0            # to be added by the caller
        , "volume" : tarinfo.volume_offset
        , "mode"   : tarinfo.mode
        , "ctime"  : tarinfo.mtime # tar headers carry no ctime; reuse mtime
        , "mtime"  : tarinfo.mtime
        , "size"   : tarinfo.size
        , "type"   : tarinfo.type
        }
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Reconstruct a pseudo index from possibly damaged volumes.

    Volume paths are produced by *gen_volume_name(n)* for increasing volume
    numbers until a volume is missing. For each volume, candidate object
    offsets are recovered (encrypted, gzipped, or plain tar, depending on
    *secret*/*mode*), then each offset is probed for a readable tar member.

    :param gen_volume_name: callable mapping a volume number to its path.
    :param mode:            archive mode string selecting the rescue handler.
    :param maxvol:          optional explicit number of volumes; missing
                            volumes below this number are skipped.
    :param password:        optional password for encrypted archives.
    :param key:             optional raw key for encrypted archives.
    :returns: list of pseudo index entries (dicts).
    :raises TarError: when no rescue handler exists for *mode*.
    """
    psidx   = [] # pseudo index, return value
    offsets = None
    secret  = crypto.make_secret (password=password, key=key)
    infos   = [] # accumulated (offset, volume, TarInfo) triples

    i = 0
    while True:
        nvol  = i
        i    += 1
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            # NOTE(review): mode literals reconstructed -- confirm against
            # the mode strings used by the callers
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#tar":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError:
            # volume does not exist
            if maxvol is not None and i < maxvol:
                continue # explicit volume number specified, ignore missing ones
            break # first missing volume terminates the scan

        # NOTE(review): fileobj is intentionally kept open while the member
        # objects are alive -- confirm whether it may be closed here
        fileobj = bltn_open (vpath, "rb")

        def aux (acc, off):
            # probe one candidate offset; unreadable ones yield None
            obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret)
            if obj is not None:
                acc.append ((off, nvol, obj))
            return acc

        infos += functools.reduce (aux, offsets, [])

    def aux (o, nvol, ti):
        # convert one triple to an index entry, filling in the fields that
        # idxent_of_tarinfo() leaves for the caller
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = nvol
        return ie

    psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]

    return psidx
3823 #--------------------
3824 # exported functions
3825 #--------------------
3826 def is_tarfile(name):
3827 """Return True if name points to a tar archive that we
3828 are able to handle, else return False.