2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
58 import traceback # XXX
67 # os.symlink on Windows prior to 6.0 raises NotImplementedError
68 symlink_exception = (AttributeError, NotImplementedError)
70 # OSError (winerror=1314) will be raised if the caller does not hold the
71 # SeCreateSymbolicLinkPrivilege privilege
72 symlink_exception += (OSError,)
76 # from tarfile import *
77 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79 from builtins import open as _open # Since 'open' is TarFile.open
81 #---------------------------------------------------------
83 #---------------------------------------------------------
84 NUL = b"\0" # the null character
85 BLOCKSIZE = 512 # length of processing blocks
86 RECORDSIZE = BLOCKSIZE * 20 # length of records
87 GNU_MAGIC = b"ustar \0" # magic gnu tar string
88 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
90 LENGTH_NAME = 100 # maximum length of a filename
91 LENGTH_LINK = 100 # maximum length of a linkname
92 LENGTH_PREFIX = 155 # maximum length of the prefix field
94 REGTYPE = b"0" # regular file
95 AREGTYPE = b"\0" # regular file
96 LNKTYPE = b"1" # link (inside tarfile)
97 SYMTYPE = b"2" # symbolic link
98 CHRTYPE = b"3" # character special device
99 BLKTYPE = b"4" # block special device
100 DIRTYPE = b"5" # directory
101 FIFOTYPE = b"6" # fifo special device
102 CONTTYPE = b"7" # contiguous file
104 GNUTYPE_LONGNAME = b"L" # GNU tar longname
105 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
106 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
107 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
110 XHDTYPE = b"x" # POSIX.1-2001 extended header
111 XGLTYPE = b"g" # POSIX.1-2001 global header
112 SOLARIS_XHDTYPE = b"X" # Solaris extended header
114 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
115 GNU_FORMAT = 1 # GNU tar format
116 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
117 DEFAULT_FORMAT = GNU_FORMAT
119 GZ_FMT_HEADER = b"<BBBBLBB"
120 GZ_HEADER_SIZE = 10 # not including the name
121 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
122 GZ_METHOD_DEFLATE = 0x08 # 0o10
123 GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
124 GZ_FLAG_FHCRC = 1 << 1 # CRC16
125 GZ_FLAG_FEXTRA = 1 << 2 # extra field
126 GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
127 GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
128 GZ_FLAG_RESERVED = 7 << 5 # unassigned
129 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
130 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
131 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
132 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
136 TOLERANCE_RECOVER = 1 # rely on offsets in index
137 TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
141 #---------------------------------------------------------
142 # archive handling mode
143 #---------------------------------------------------------
# Bitmask describing how an archive is handled; combinations are OR-ed.
ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2

# NOTE(review): the source dump is truncated around these helpers; bodies
# reconstructed to match the visible lines — confirm against upstream deltatar.
def arcmode_fmt (m):
    """Render an archive-mode bitmask as a human-readable string."""
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    first = True
    ret = "["

    def chkappend (b, s):
        nonlocal first, ret
        if m & b:
            if first is True: first = False
            else: ret += " |"
            ret += " " + s

    chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
    chkappend (ARCMODE_COMPRESS, "COMPRESS")
    chkappend (ARCMODE_CONCAT, "CONCAT")
    return ret + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Compute the archive-mode bitmask implied by the given options."""
    ret = init
    if bool (concat) is True:
        ret |= ARCMODE_CONCAT
    if encryption is not None:
        ret |= ARCMODE_ENCRYPT
    if comptype == "gz":
        ret |= ARCMODE_COMPRESS
    return ret
179 #---------------------------------------------------------
181 #---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
215 #---------------------------------------------------------
217 #---------------------------------------------------------
219 if os.name in ("nt", "ce"):
222 ENCODING = sys.getfilesystemencoding()
224 #---------------------------------------------------------
225 # Some useful functions
226 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object of exactly
       *length* bytes (truncated or NUL-padded as needed).
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
       Everything from the first NUL byte onwards is discarded.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
       of exactly *length* bytes; str input is encoded first.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
251 """Convert a number field to a python number.
253 # There are two possible encodings for a number field, see
255 if s[0] in (0o200, 0o377):
257 for i in range(len(s) - 1):
261 n = -(256 ** (len(s) - 1) - n)
264 n = int(nts(s, "ascii", "strict") or "0", 8)
266 raise InvalidHeaderError("invalid header")
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # 148 bytes before the chksum field, 8 bytes chksum (counted as
    # spaces, i.e. 8 * 0x20 = 256), then the remaining 356 bytes.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
       Raises OSError if src runs out of data before *length* bytes.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    bufsize = 16 * 1024
    blocks, remainder = divmod(length, bufsize)
    for b in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
334 """Deprecated in this location; use stat.filemode."""
336 warnings.warn("deprecated in favor of stat.filemode",
337 DeprecationWarning, 2)
338 return stat.filemode(mode)
#---------------------------------------------------------
# exception hierarchy: TarError is the root; HeaderError
# groups the per-header failure modes used by frombuf().
#---------------------------------------------------------
class TarError(Exception):
    """Base exception."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

# crypto-layer errors (deltatar extension)
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Exception for error during decryption."""

class EncryptionError(TarError):
    """Exception for error during encryption."""

class EndOfFile(Exception):
    """Signal end of file condition when they’re not an error."""
386 #---------------------------
387 # internal stream interface
388 #---------------------------
390 """Low-level file object. Supports reading and writing.
391 It is used instead of a regular file object for streaming
395 def __init__(self, name, mode):
398 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
400 if hasattr(os, "O_BINARY"):
401 _mode |= os.O_BINARY # pylint: disable=no-member
402 self.fd = os.open(name, _mode, 0o666)
408 def read(self, size):
409 ret = os.read(self.fd, size)
410 self.offset += len(ret)
413 def write(self, s, pos=None):
416 os.lseek (self.fd, pos, os.SEEK_SET)
417 n = os.write(self.fd, s)
419 self.offset += len(s)
421 append = pos + n - p0
423 self.offset += append
424 os.lseek (self.fd, p0, os.SEEK_SET)
429 def seek_set (self, pos):
430 os.lseek (self.fd, pos, os.SEEK_SET)
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952); appends the NUL-terminated
    FNAME field when *name* is given.  NOTE(review): reconstructed from a
    truncated dump — verify the suffix-stripping rules against upstream."""
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        # strip container suffixes so the stored name is the payload's
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return hdr + name
461 """Class that serves as an adapter between TarFile and
462 a stream-like object. The stream-like object only
463 needs to have a read() or write() method and is accessed
464 blockwise. Use of gzip or bzip2 compression is possible.
465 A stream-like object could be for example: sys.stdin,
466 sys.stdout, a socket, a tape device etc.
468 _Stream is intended to be used only internally but is
469 nevertherless used externally by Deltatar.
471 When encrypting, the ``enccounter`` will be used for
472 initializing the first cryptographic context. When
473 decrypting, its value will be compared to the decrypted
474 object. Decryption fails if the value does not match.
475 In effect, this means that a ``_Stream`` whose ctor was
476 passed ``enccounter`` can only be used to encrypt or
477 decrypt a single object.
480 remainder = -1 # track size in encrypted entries
481 tolerance = TOLERANCE_STRICT
483 def __init__(self, name, mode, comptype, fileobj, bufsize,
484 concat=False, encryption=None, enccounter=None,
485 compresslevel=9, tolerance=TOLERANCE_STRICT):
486 """Construct a _Stream object.
488 self.arcmode = arcmode_set (concat, encryption, comptype)
489 self.tolerance = tolerance
491 self._extfileobj = True
493 fileobj = _LowLevelFile(name, mode)
494 self._extfileobj = False
497 # Enable transparent compression detection for the
499 fileobj = _StreamProxy(fileobj)
500 comptype = fileobj.getcomptype()
504 self.enccounter = None
505 if self.arcmode & ARCMODE_ENCRYPT:
506 self.enccounter = enccounter
508 self.name = name or ""
510 self.comptype = comptype
512 self.fileobj = fileobj
513 self.bufsize = bufsize
519 self.last_block_offset = 0
520 self.dbuf = b"" # ???
521 self.exception = None # communicate decompression failure
522 self.compresslevel = compresslevel
523 self.bytes_written = 0
525 self.encryption = encryption
533 raise CompressionError("zlib module is not available")
536 self.exception = zlib.error
539 if not (self.arcmode & ARCMODE_CONCAT):
540 if self.arcmode & ARCMODE_ENCRYPT:
541 self._init_write_encrypt (name)
542 self._init_write_gz ()
543 self.crc = zlib.crc32(b"") & 0xFFFFffff
545 elif comptype == "bz2":
546 if self.arcmode & ARCMODE_ENCRYPT:
547 raise InvalidEncryptionError("encryption not available for "
548 "compression “%s”" % comptype)
552 raise CompressionError("bz2 module is not available")
555 self.cmp = bz2.BZ2Decompressor()
556 self.exception = OSError
558 self.cmp = bz2.BZ2Compressor()
560 elif comptype == 'xz':
561 if self.arcmode & ARCMODE_ENCRYPT:
562 raise InvalidEncryptionError("encryption not available for "
563 "compression “%s”" % comptype)
567 raise CompressionError("lzma module is not available")
570 self.cmp = lzma.LZMADecompressor()
571 self.exception = lzma.LZMAError
573 self.cmp = lzma.LZMACompressor()
575 elif comptype == "tar":
576 if not (self.arcmode & ARCMODE_CONCAT) \
578 and self.arcmode & ARCMODE_ENCRYPT:
579 self._init_write_encrypt (name)
582 if self.arcmode & ARCMODE_ENCRYPT:
583 raise InvalidEncryptionError("encryption not available for "
584 "compression “%s”" % comptype)
585 raise CompressionError("unknown compression type %r" % comptype)
588 if not self._extfileobj:
594 if hasattr(self, "closed") and not self.closed:
597 except crypto.InternalError:
598 # context already finalized due to abort but close() tried
    def next (self, name):
        """Start the next archive object *name* within the same stream and
        return the offset of its first block.  Finalizes any open gzip
        member and crypto object first."""
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # a compressor exists → close the current gzip member
                self._finalize_write_gz ()
        # NOTE(review): the source dump appears to omit a line here —
        # confirm against upstream before relying on exact behavior.
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
    def next_volume (self, name):
        """Reset per-object compression/encryption state when the archive
        continues on a new volume."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
632 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
634 Save position for delayed write of header; fill the header location
637 # first thing, proclaim new object to the encryption context
638 # secondly, assemble the header with the updated parameters
639 # and commit it directly to the underlying stream, bypassing the
640 # encryption layer in .__write().
641 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
643 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
644 self.lasthdr = self.fileobj.tell()
645 self.__write_to_file(dummyhdr)
646 if set_last_block_offset is True:
647 self.last_block_offset = self.lasthdr
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            # re-read the placeholder header written by _init_write_encrypt
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            # patch the real header in place, then append trailing data
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            # NOTE(review): source dump appears truncated after this point
            # (lasthdr reset is not visible) — verify against upstream.
672 def _finalize_write_gz (self):
673 if self.cmp is not None:
674 chunk = self.buf + self.cmp.flush()
676 if self.comptype == "gz":
677 # The native zlib crc is an unsigned 32-bit integer, but
678 # the Python wrapper implicitly casts that to a signed C
679 # long. So, on a 32-bit box self.crc may "look negative",
680 # while the same crc on a 64-bit box may "look positive".
681 # To avoid irksome warnings from the `struct` module, force
682 # it to look positive on all boxes.
683 chunk += struct.pack("<L", self.crc & 0xffffffff)
684 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
685 self.__enc_write (chunk)
689 def _init_write_gz (self, set_last_block_offset=False):
691 Add a new gzip block, closing last one
694 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
695 first = self.cmp is None
696 self.cmp = self.zlib.compressobj(self.compresslevel,
698 -self.zlib.MAX_WBITS,
699 self.zlib.DEF_MEM_LEVEL,
702 # if aes, we encrypt after compression
703 if set_last_block_offset is True:
704 self.last_block_offset = self.fileobj.tell()
706 self.__write(gz_header (self.name if first is True else None))
710 """Write string s to the stream.
712 if self.comptype == "gz":
713 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
715 self.concat_pos += len(s)
716 if self.cmp is not None:
717 s = self.cmp.compress(s)
721 """Write what’s left in the buffer to the stream."""
722 self.__write (b"") # → len (buf) <= bufsiz
723 self.__enc_write (self.buf)
726 def __write(self, s):
727 """Writes (and encodes) string s to the stream blockwise
729 will wait with encoding/writing until block is complete
732 while len(self.buf) > self.bufsize:
733 self.__enc_write(self.buf[:self.bufsize])
734 self.buf = self.buf[self.bufsize:]
737 def __write_to_file(self, s, pos=None):
739 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
740 given, the stream will seek to that position first and back afterwards,
741 and the total of bytes written is not updated.
743 self.fileobj.write(s, pos)
745 self.bytes_written += len(s)
748 def __enc_write(self, s):
750 If encryption is active, the string s is encrypted before being written
755 if self.arcmode & ARCMODE_ENCRYPT:
758 n, ct = self.encryption.process(buf)
759 self.__write_to_file(ct)
762 # The entire plaintext was not consumed: The size limit
763 # for encrypted objects was reached. Transparently create
764 # a new encrypted object and continue processing the input.
765 self._finalize_write_encrypt ()
766 self._init_write_encrypt ()
768 self.__write_to_file(s)
771 def estim_file_size(self):
772 """ estimates size of file if closing it now
774 The result may differ greatly from the amount of data sent to write()
775 due to compression, encryption and buffering.
777 In tests the result (before calling close()) was up to 12k smaller than
778 the final file size if compression is being used because zlib/bz2
779 compressors do not allow inspection of their buffered data :-(
781 Still, we add what close() would add: 8 bytes for gz checksum, one
782 encryption block size if encryption is used and the size of our own
786 return self.bytes_written
788 result = self.bytes_written
790 result += len(self.buf)
791 if self.comptype == 'gz':
792 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
795 def close(self, close_fileobj=True):
796 """Close the _Stream object. No operation should be
797 done on it afterwards.
803 if close_fileobj is True:
806 if self.arcmode & ARCMODE_COMPRESS:
807 self._finalize_write_gz ()
808 # end of Tar archive marker (two empty blocks) was written
809 # finalize encryption last; no writes may be performed after
812 if self.arcmode & ARCMODE_ENCRYPT:
813 self._finalize_write_encrypt ()
815 if not self._extfileobj:
818 # read the zlib crc and length and check them
819 if self.mode == "r" and self.comptype == "gz":
820 read_crc = self.__read(4)
821 read_length = self.__read(4)
822 calculated_crc = self.crc
823 if struct.unpack("<L", read_crc)[0] != calculated_crc:
824 raise CompressionError("bad gzip crc")
828 def _init_read_gz(self):
829 """Initialize for reading a gzip compressed fileobj.
831 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
833 read2 = self.__read(2)
835 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
836 "%d" % self.fileobj.tell())
837 # taken from gzip.GzipFile with some alterations
838 if read2 != GZ_MAGIC_BYTES:
839 raise ReadError("not a gzip file")
841 read1 = ord (self.__read(1))
842 if read1 != GZ_METHOD_DEFLATE:
843 raise CompressionError("unsupported compression method")
845 self.flags = flag = ord(self.__read(1))
846 self.__read(6) # discard timestamp[4], deflate flags, os code
848 if flag & GZ_FLAG_FEXTRA:
849 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
851 if flag & GZ_FLAG_FNAME:
854 if not s or s == NUL:
856 if flag & GZ_FLAG_FCOMMENT:
859 if not s or s == NUL:
861 if flag & GZ_FLAG_FHCRC:
864 def _init_read_encrypt (self):
865 """Initialize encryption for next entry in archive. Read a header and
866 notify the crypto context."""
867 if self.arcmode & ARCMODE_ENCRYPT:
868 lasthdr = self.fileobj.tell ()
870 hdr = crypto.hdr_read_stream (self.fileobj)
871 except crypto.EndOfFile:
873 except crypto.InvalidHeader as exn:
874 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
875 "processing %r at pos %d"
876 % (exn, self.fileobj, lasthdr)) \
878 if self.enccounter is not None:
879 # enforce that the iv counter in the header matches an
880 # explicitly requested one
881 iv = crypto.hdr_iv_counter (hdr)
882 if iv != self.enccounter:
883 raise DecryptionError ("expected IV counter %d, got %d"
884 % (self.enccounter, iv))
885 self.lasthdr = lasthdr
886 self.remainder = hdr ["ctsize"] # distance to next header
888 self.encryption.next (hdr)
889 except crypto.InvalidParameter as exn:
890 raise DecryptionError ("Crypto.next(): error “%s” "
891 "processing %r at pos %d"
892 % (exn, self.fileobj, lasthdr)) \
898 def _read_encrypt (self, buf):
900 Demote a program error to a decryption error in tolerant mode. This
901 allows recovery from corrupted headers and invalid data.
904 return self.encryption.process (buf)
905 except RuntimeError as exn:
906 if self.tolerance != TOLERANCE_STRICT:
907 raise DecryptionError (exn)
911 def _finalize_read_encrypt (self):
915 if self.arcmode & ARCMODE_ENCRYPT \
916 and self.lasthdr is not None :
917 assert self.remainder >= 0
918 if self.remainder > 0:
921 data = self.encryption.done ()
922 except crypto.InvalidGCMTag as exn:
923 raise DecryptionError ("decryption failed: %s" % exn)
928 """Return the stream's file pointer position.
932 def seek(self, pos=0):
933 """Set the stream's file pointer to pos. Negative seeking
936 if pos - self.pos >= 0:
937 blocks, remainder = divmod(pos - self.pos, self.bufsize)
938 for i in range(blocks):
939 self.read(self.bufsize)
942 raise StreamError("seeking backwards is not allowed")
945 def read(self, size=None):
946 """Return the next size number of bytes from the stream.
947 If size is not defined, return all bytes of the stream
953 buf = self._read(self.bufsize)
959 buf = self._read(size)
964 """Reads just one line, new line character included
966 # if \n in dbuf, no read neads to be done
967 if b'\n' in self.dbuf:
968 pos = self.dbuf.index(b'\n') + 1
969 ret = self.dbuf[:pos]
970 self.dbuf = self.dbuf[pos:]
975 chunk = self._read(self.bufsize)
977 # nothing more to read, so return the buffer
983 # if \n found, return the new line
986 pos = dbuf.index(b'\n') + 1
987 self.dbuf = dbuf[pos:] + self.dbuf
990 def _read(self, size):
991 """Return size bytes from the stream.
997 buf = self.__read(self.bufsize)
1001 if self.cmp is not None:
1003 buf = self.cmp.decompress(buf)
1004 except self.exception as exn:
1005 raise ReadError("invalid compressed data (%r)" % exn)
1006 except Exception as e:
1007 # happens at the end of the file
1008 # _init_read_gz failed in the previous iteration so
1009 # self.cmp.decompress fails here
1010 if self.arcmode & ARCMODE_CONCAT:
1013 raise ReadError("invalid compressed data")
1014 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1015 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1016 if self.arcmode & ARCMODE_CONCAT \
1017 and len(self.cmp.unused_data) != 0:
1018 self.buf = self.cmp.unused_data + self.buf
1019 self.close(close_fileobj=False)
1021 self._init_read_gz()
1022 except DecryptionError:
1023 if self.tolerance != TOLERANCE_STRICT:
1024 # return whatever data was processed successfully
1030 except ReadError: # gzip troubles
1031 if self.tolerance == TOLERANCE_RESCUE:
1038 # happens at the end of the file
1040 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1045 self.dbuf = t[size:]
1049 def __read(self, size):
1051 Return size bytes from stream. If internal buffer is empty, read
1052 another block from the stream.
1054 The function returns up to size bytes of data. When an error occurs
1055 during decryption, everything until the end of the last successfully
1056 finalized object is returned.
1059 t = [self.buf] if c > 0 else []
1060 good_crypto = len (t)
1065 if self.arcmode & ARCMODE_ENCRYPT:
1066 if self.remainder <= 0:
1067 # prepare next object
1068 if self._init_read_encrypt () is False: # EOF
1072 # only read up to the end of the encrypted object
1073 todo = min (size, self.remainder)
1074 buf = self.fileobj.read(todo)
1075 if self.arcmode & ARCMODE_ENCRYPT:
1077 buf = self._read_encrypt (buf)
1078 if todo == self.remainder:
1079 # at the end of a crypto object; finalization will fail if
1080 # the GCM tag does not match
1081 trailing = self._finalize_read_encrypt ()
1082 good_crypto = len (t) + 1
1083 if len (trailing) > 0:
1087 self.remainder -= todo
1088 except DecryptionError:
1089 if self.tolerance == TOLERANCE_STRICT:
1091 self.encryption.drop ()
1092 if self.tolerance == TOLERANCE_RECOVER:
1093 if good_crypto == 0:
1095 # this may occur at any of the three crypto operations above.
1096 # some objects did validate; discard all data after it; next
1097 # call will start with the bad object and error out immediately
1098 self.buf = b"".join (t [good_crypto:])
1099 return b"".join (t [:good_crypto])
1100 elif self.tolerance == TOLERANCE_RESCUE:
1101 # keep what we have so far despite the finalization issue
1106 raise RuntimeError("internal error: bad tolerance level")
1108 if not buf: ## XXX stream terminated prematurely; this should be an error
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
       Buffers the first block so it can be sniffed and then replayed.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # First call returns the sniffed block, then delegates directly.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from magic bytes in the first block."""
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
1146 #------------------------
1147 # Extraction file object
1148 #------------------------
1149 class _FileInFile(object):
1150 """A thin wrapper around an existing file object that
1151 provides a part of its data as an individual file
1155 def __init__(self, fileobj, offset, size, blockinfo=None):
1156 self.fileobj = fileobj
1157 self.offset = offset
1160 self.name = getattr(fileobj, "name", None)
1163 if blockinfo is None:
1164 blockinfo = [(0, size)]
1166 # Construct a map with data and zero blocks.
1170 realpos = self.offset
1171 for offset, size in blockinfo:
1172 if offset > lastpos:
1173 self.map.append((False, lastpos, offset, None))
1174 self.map.append((True, offset, offset + size, realpos))
1176 lastpos = offset + size
1177 if lastpos < self.size:
1178 self.map.append((False, lastpos, self.size, None))
1190 return self.fileobj.seekable()
1193 """Return the current file position.
1195 return self.position
1197 def seek(self, position, whence=io.SEEK_SET):
1198 """Seek to a position in the file.
1200 if whence == io.SEEK_SET:
1201 self.position = min(max(position, 0), self.size)
1202 elif whence == io.SEEK_CUR:
1204 self.position = max(self.position + position, 0)
1206 self.position = min(self.position + position, self.size)
1207 elif whence == io.SEEK_END:
1208 self.position = max(min(self.size + position, self.size), 0)
1210 raise ValueError("Invalid argument")
1211 return self.position
1213 def read(self, size=None):
1214 """Read data from the file.
1217 size = self.size - self.position
1219 size = min(size, self.size - self.position)
1224 data, start, stop, offset = self.map[self.map_index]
1225 if start <= self.position < stop:
1229 if self.map_index == len(self.map):
1231 length = min(size, stop - self.position)
1233 self.fileobj.seek(offset + (self.position - start))
1234 buf += self.fileobj.read(length)
1238 self.position += length
1241 def readinto(self, b):
1242 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    # Buffered reader over a member's byte range: wraps the member via
    # _FileInFile (offset_data/size/sparse taken from the TarInfo).

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # __slots__ keeps per-member memory small; archives can hold many members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    self.name = name        # member name
    self.mode = 0o644       # file permissions
    self.uid = 0            # user id
    self.gid = 0            # group id
    self.size = 0           # file size
    self.mtime = 0          # modification time
    self.chksum = 0         # header checksum
    self.type = REGTYPE     # member type
    self.linkname = ""      # link name
    self.uname = ""         # user name
    self.gname = ""         # group name
    self.devmajor = 0       # device major number
    self.devminor = 0       # device minor number

    self.offset = 0         # the tar header starts here
    self.offset_data = 0    # the file's data starts here
    self.volume_offset = 0  # the file's data corresponds with the data
                            # starting at this position

    self.sparse = None      # sparse member information
    self.pax_headers = {}   # pax header information

# In pax headers the "name" and "linkname" field are called
# "path" and "linkpath".
def _setpath(self, name):
    """Setter for the pax-style 'path' alias of the 'name' field.

    The setter body was missing from the source; restored to the
    standard one-line assignment mirroring _setlinkpath below.
    """
    self.name = name
path = property(_getpath, _setpath)
def _getlinkpath(self):
    """Getter for the pax-style 'linkpath' alias of 'linkname'."""
    return self.linkname

def _setlinkpath(self, linkname):
    """Setter for the pax-style 'linkpath' alias of 'linkname'."""
    self.linkname = linkname

# Expose linkname under the name pax headers use for it.
linkpath = property(_getlinkpath, _setlinkpath)
def __repr__(self):
    """Return a debug representation: class name, member name, identity."""
    return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
def get_info(self, encoding=None, errors=None):
    """Return the TarInfo's attributes as a dictionary.

    NOTE(review): encoding/errors are accepted for call-site
    compatibility (tobuf passes them) but are not used here — confirm.
    """
    info = {
        "name":     self.name,
        "mode":     self.mode & 0o7777,   # strip file-type bits, keep perms
        "uid":      self.uid,
        "gid":      self.gid,
        "size":     self.size,
        "mtime":    self.mtime,
        "chksum":   self.chksum,
        "type":     self.type,
        "linkname": self.linkname,
        "uname":    self.uname,
        "gname":    self.gname,
        "devmajor": self.devmajor,
        "devminor": self.devminor,
        "offset_data": self.offset_data,
        "volume_offset": self.volume_offset
    }

    # Directories are stored with a trailing slash per tar convention.
    if info["type"] == DIRTYPE and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
          errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.

    Dispatches to the creator method matching the requested format.
    Raises ValueError for an unknown format constant.
    """
    info = self.get_info(encoding, errors)

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    elif format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    elif format == PAX_FORMAT:
        return self.create_pax_header(info, encoding, errors)
    else:
        raise ValueError("invalid format")
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

    ustar has hard field limits: linkname must fit in 100 chars,
    and an over-long name may be split across the prefix field.
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    if len(info["name"]) > LENGTH_NAME:
        info["prefix"], info["name"] = self._posix_split_name(info["name"])

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

    For multivolume members the GNU header's prefix area is reused to
    carry atime/ctime/volume offset, and the stored size is reduced to
    the portion contained in this volume.
    """
    info["magic"] = GNU_MAGIC

    if self.ismultivol():
        prefix = [
            itn(info.get("atime", 0), 12, GNU_FORMAT),
            itn(info.get("ctime", 0), 12, GNU_FORMAT),
            itn(self.volume_offset, 12, GNU_FORMAT),
            itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
        ]
        info['prefix'] = b"".join(prefix)
        # Only the part of the file stored in this volume counts here.
        info['size'] = info['size'] - self.volume_offset

    buf = b""
    if len(info["linkname"]) > LENGTH_LINK:
        # Over-long link targets get a preceding GNU long-link record.
        buf += self._create_gnu_long_header(info["linkname"],
            GNUTYPE_LONGLINK, encoding, errors)

    if len(info["name"]) > LENGTH_NAME:
        # Over-long names get a preceding GNU long-name record.
        buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                            encoding, errors)

    return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
def create_pax_header(self, info, encoding, errors):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()
    if self.ismultivol():
        # Only the part of the file stored in this volume counts here.
        info['size'] = info['size'] - self.volume_offset

    # Test string fields for values that exceed the field length or cannot
    # be represented in ASCII encoding.
    for name, hname, length in (
            ("name", "path", LENGTH_NAME),
            ("linkname", "linkpath", LENGTH_LINK),
            ("uname", "uname", 32),
            ("gname", "gname", 32)):

        if hname in pax_headers:
            # The pax header has priority.
            continue

        # Try to encode the string as ASCII.
        try:
            info[name].encode("ascii", "strict")
        except UnicodeEncodeError:
            pax_headers[hname] = info[name]
            continue

        if len(info[name]) > length:
            pax_headers[hname] = info[name]

    # Test number fields for values that exceed the field limit or values
    # that like to be stored as float.
    for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        # Octal fields hold digits-1 characters plus a terminator.
        if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        buf = b""

    # The ustar fallback header is ASCII with lossy replacement; the real
    # values live in the pax records above.
    return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.

    Global headers (XGLTYPE) apply to all following members.
    """
    return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
def _posix_split_name(self, name):
    """Split a name longer than 100 chars into a prefix
       and a name part.

    The split must fall on a "/" so both halves remain valid paths;
    raises ValueError if no such split fits the ustar field limits.
    """
    prefix = name[:LENGTH_PREFIX + 1]
    # Back up to the last slash within the prefix field.
    while prefix and prefix[-1] != "/":
        prefix = prefix[:-1]

    name = name[len(prefix):]
    prefix = prefix[:-1]   # drop the trailing slash itself

    if not prefix or len(name) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, name
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        b"        ", # checksum field, filled in below
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        sbtn(info.get("prefix", ""), 155, encoding, errors)
    ]

    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Splice the checksum into its field: 512-364 = offset 148,
    # 512-357 = offset 155 (6 octal digits + NUL).
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.
    """
    blocks, remainder = divmod(len(payload), BLOCKSIZE)
    if remainder > 0:
        payload += (BLOCKSIZE - remainder) * NUL
    return payload
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.
    """
    # The long string is carried as the data payload of a pseudo member.
    name = name.encode(encoding, errors) + NUL

    info = {}
    info["name"] = "././@LongLink"   # conventional pseudo-member name
    info["type"] = type
    info["size"] = len(name)
    info["magic"] = GNU_MAGIC

    # create extended header + name blocks.
    return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
            cls._create_payload(name)
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # Check if one of the fields contains surrogate characters and thereby
    # forces hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for keyword, value in pax_headers.items():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    records = b""
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        records += b"21 hdrcharset=BINARY\n"

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value = value.encode(encoding, "surrogateescape")
        else:
            value = value.encode("utf-8")

        # Each record is "%d %s=%s\n" where the leading number is the
        # total record length INCLUDING itself — hence the fixpoint loop.
        l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        n = p = 0
        while True:
            n = l + len(str(p))
            if n == p:
                break
            p = n
        records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {}
    info["name"] = "././@PaxHeader"
    info["type"] = type
    info["size"] = len(records)
    info["magic"] = POSIX_MAGIC

    # Create pax header + record blocks.
    return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
            cls._create_payload(records)
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

    Raises EmptyHeaderError / TruncatedHeaderError / EOFHeaderError /
    InvalidHeaderError so callers can distinguish normal archive end
    from corruption.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save the them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    else:
        # NOTE(review): multivolume fork — GNU headers keep the member's
        # data offset in the prefix area (see create_gnu_header); confirm
        # this branch against the original file.
        obj.offset_data = nti(buf[369:381])
    return obj
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.
    """
    buf = tarfile.fileobj.read(BLOCKSIZE)
    obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
    # Header starts one block before the current read position.
    obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
    return obj._proc_member(tarfile)
1654 #--------------------------------------------------------------------------
1655 # The following are methods that are called depending on the type of a
1656 # member. The entry point is _proc_member() which can be overridden in a
1657 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1658 # implement the following
1660 # 1. Set self.offset_data to the position where the data blocks begin,
1661 # if there is data that follows.
1662 # 2. Set tarfile.offset to the position where the next member's header will
1664 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    elif self.type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    else:
        return self._proc_builtin(tarfile)
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    self.offset_data = tarfile.fileobj.tell()
    offset = self.offset_data
    if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        offset += self._block(self.size)
    tarfile.offset = offset   # where the next header starts

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.
    """
    buf = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    next.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        next.name = nts(buf, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

    return next
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers."""
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks.
    while isextended:
        buf = tarfile.fileobj.read(BLOCKSIZE)
        pos = 0
        # Each extension block holds up to 21 (offset, numbytes) pairs.
        for i in range(21):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[504])   # flag: more extension blocks follow
    self.sparse = structs

    self.offset_data = tarfile.fileobj.tell()
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize   # logical (expanded) size, not on-disk size
    return self
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    if next is not None:
        # Multivolume fork: "GNU.volume.*" keys describe the continuation
        # of a member split across volumes; apply them to the next member
        # and then drop them so they don't leak into later members.
        if "GNU.volume.filename" in pax_headers:
            if pax_headers["GNU.volume.filename"] == next.name:
                if "GNU.volume.size" in pax_headers:
                    next.size = int(pax_headers["GNU.volume.size"])
                if "GNU.volume.offset" in pax_headers:
                    next.volume_offset = int(pax_headers["GNU.volume.offset"])

            # NOTE(review): iterating pax_headers.keys() while deleting from
            # tarfile.pax_headers — when both are the same dict (global
            # header, XGLTYPE) this raises RuntimeError in Python 3; a
            # list(...) snapshot would be safer. Confirm intended behavior.
            for key in pax_headers.keys():
                if key.startswith("GNU.volume"):
                    del tarfile.pax_headers[key]

    return next
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    Offsets and sizes are stored as repeated pax records in buf.
    """
    offsets = []
    for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
        offsets.append(int(match.group(1)))
    numbytes = []
    for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
        numbytes.append(int(match.group(1)))
    next.sparse = list(zip(offsets, numbytes))
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The map is one comma-separated list of alternating offset/size ints.
    """
    sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The map is stored in the member's data area as newline-separated
    decimal numbers: a count, then offset/size pairs.
    """
    fields = None
    sparse = []
    buf = tarfile.fileobj.read(BLOCKSIZE)
    fields, buf = buf.split(b"\n", 1)
    fields = int(fields)
    while len(sparse) < fields * 2:
        if b"\n" not in buf:
            # Map spills over into the next block.
            buf += tarfile.fileobj.read(BLOCKSIZE)
        number, buf = buf.split(b"\n", 1)
        sparse.append(int(number))
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            setattr(self, "path", value)
        elif keyword == "GNU.sparse.size":
            setattr(self, "size", int(value))
        elif keyword == "GNU.sparse.realsize":
            setattr(self, "size", int(value))
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    # Unparsable numeric field: fall back to 0 rather
                    # than aborting the whole member.
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")  # pylint: disable=no-member
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

    Try the strict primary encoding first, then fall back to the
    user-supplied encoding/error handler (see _proc_pax for rationale).
    """
    try:
        return value.decode(encoding, "strict")
    except UnicodeDecodeError:
        return value.decode(fallback_encoding, fallback_errors)
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.

    The rounding branch was missing from the source; restored to the
    standard divmod-based implementation.
    """
    blocks, remainder = divmod(count, BLOCKSIZE)
    if remainder:
        blocks += 1
    return blocks * BLOCKSIZE
def isreg(self):
    """True if the member is a regular file (any REGULAR_TYPES)."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Alias of isreg()."""
    return self.isreg()
def isdir(self):
    """True if the member is a directory."""
    return self.type == DIRTYPE
def issym(self):
    """True if the member is a symbolic link."""
    return self.type == SYMTYPE
def islnk(self):
    """True if the member is a hard link."""
    return self.type == LNKTYPE
def ischr(self):
    """True if the member is a character device."""
    return self.type == CHRTYPE
def isblk(self):
    """True if the member is a block device."""
    return self.type == BLKTYPE
def isfifo(self):
    """True if the member is a FIFO."""
    return self.type == FIFOTYPE
def issparse(self):
    """True if the member carries a sparse map."""
    return self.sparse is not None
def isdev(self):
    """True if the member is any device or FIFO node."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
def ismultivol(self):
    """Return True if this member is part of a multivolume archive.

    A member counts as multivolume if it carries the GNU multivolume
    type flag, a non-zero volume offset, or a pending pax volume-offset
    header.
    """
    if self.type == GNUTYPE_MULTIVOL:
        return True
    if self.volume_offset > 0:
        return True
    return "GNU.volume.offset" in self.pax_headers
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE(review): class-level mutable dicts are shared across ALL TarFile
    # instances (and threads) by design here — confirm that is intended.
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2003 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2004 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2005 errors="surrogateescape", pax_headers=None, debug=None,
2006 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2007 concat=False, nacl=None,
2008 save_to_members=True):
2009 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2010 read from an existing archive, 'a' to append data to an existing
2011 file or 'w' to create a new file overwriting an existing one. `mode'
2013 If `fileobj' is given, it is used for reading or writing data. If it
2014 can be determined, `mode' is overridden by `fileobj's mode.
2015 `fileobj' is not closed, when TarFile is closed.
2017 if len(mode) > 1 or mode not in "raw":
2018 raise ValueError("mode must be 'r', 'a' or 'w'")
2020 self.arcmode = arcmode_set (concat)
2022 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2025 if self.mode == "a" and not os.path.exists(name):
2026 # Create nonexistent files in append mode.
2029 fileobj = bltn_open(name, self._mode)
2030 self._extfileobj = False
2032 if name is None and hasattr(fileobj, "name"):
2034 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2035 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2036 self._mode = fileobj.mode
2037 self._extfileobj = True
2038 self.name = os.path.abspath(name) if name else None
2039 self.base_name = self.name = os.path.abspath(name) if name else None
2040 self.fileobj = fileobj
2043 if format is not None:
2044 self.format = format
2045 if tarinfo is not None:
2046 self.tarinfo = tarinfo
2047 if dereference is not None:
2048 self.dereference = dereference
2049 if ignore_zeros is not None:
2050 self.ignore_zeros = ignore_zeros
2051 if encoding is not None:
2052 self.encoding = encoding
2054 self.errors = errors
2056 if pax_headers is not None and self.format == PAX_FORMAT:
2057 self.pax_headers = pax_headers
2059 self.pax_headers = {}
2061 if debug is not None:
2063 if errorlevel is not None:
2064 self.errorlevel = errorlevel
2066 # Init datastructures.
2067 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2068 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2069 if max_volume_size and not callable(new_volume_handler):
2070 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2072 self.max_volume_size = int(max_volume_size)
2074 self.max_volume_size = None
2076 self.save_to_members = save_to_members
2077 self.new_volume_handler = new_volume_handler
2079 self.members = [] # list of members as TarInfo objects
2080 self._loaded = False # flag if all members have been read
2081 self.offset = self.fileobj.tell()
2082 # current position in the archive file
2083 self.inodes = {} # dictionary caching the inodes of
2084 # archive members already added
2087 if self.mode == "r":
2088 self.firstmember = None
2089 self.firstmember = self.next()
2091 if self.mode == "a":
2092 # Move to the end of the archive,
2093 # before the first empty block.
2095 self.fileobj.seek(self.offset)
2097 tarinfo = self.tarinfo.fromtarfile(self)
2098 self.members.append(tarinfo)
2099 except EOFHeaderError:
2100 self.fileobj.seek(self.offset)
2102 except HeaderError as e:
2103 raise ReadError(str(e))
2105 if self.mode in "aw":
2108 if self.pax_headers:
2109 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2110 self.fileobj.write(buf)
2111 self.offset += len(buf)
2113 if not self._extfileobj:
2114 self.fileobj.close()
2118 #--------------------------------------------------------------------------
2119 # Below are the classmethods which act as alternate constructors to the
2120 # TarFile class. The open() method is the only one that is needed for
2121 # public use; it is the "super"-constructor and is able to select an
2122 # adequate "sub"-constructor for a particular compression using the mapping
2125 # This concept allows one to subclass TarFile without losing the comfort of
2126 # the super-constructor. A sub-constructor is registered and made available
2127 # by adding it to the mapping in OPEN_METH.
2130 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2131 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2133 """Open a tar archive for reading, writing or appending. Return
2134 an appropriate TarFile class.
2137 'r' or 'r:*' open for reading with transparent compression
2138 'r:' open for reading exclusively uncompressed
2139 'r:gz' open for reading with gzip compression
2140 'r:bz2' open for reading with bzip2 compression
2141 'r:xz' open for reading with lzma compression
2142 'a' or 'a:' open for appending, creating the file if necessary
2143 'w' or 'w:' open for writing without compression
2144 'w:gz' open for writing with gzip compression
2145 'w:bz2' open for writing with bzip2 compression
2146 'w:xz' open for writing with lzma compression
2148 'r|*' open a stream of tar blocks with transparent compression
2149 'r|' open an uncompressed stream of tar blocks for reading
2150 'r|gz' open a gzip compressed stream of tar blocks
2151 'r|bz2' open a bzip2 compressed stream of tar blocks
2152 'r|xz' open an lzma compressed stream of tar blocks
2153 'w|' open an uncompressed stream for writing
2154 'w|gz' open a gzip compressed stream for writing
2155 'w|bz2' open a bzip2 compressed stream for writing
2156 'w|xz' open an lzma compressed stream for writing
2158 'r#gz' open a stream of gzip compressed tar blocks for reading
2159 'w#gz' open a stream of gzip compressed tar blocks for writing
2161 if not name and not fileobj:
2162 raise ValueError("nothing to open")
2164 if mode in ("r", "r:*"):
2165 # Find out which *open() is appropriate for opening the file.
2166 for comptype in cls.OPEN_METH:
2167 func = getattr(cls, cls.OPEN_METH[comptype])
2168 if fileobj is not None:
2169 saved_pos = fileobj.tell()
2171 return func(name, "r", fileobj, **kwargs)
2172 except (ReadError, CompressionError) as e:
2173 # usually nothing exceptional but sometimes is
2174 if fileobj is not None:
2175 fileobj.seek(saved_pos)
2177 raise ReadError("file could not be opened successfully")
2180 filemode, comptype = mode.split(":", 1)
2181 filemode = filemode or "r"
2182 comptype = comptype or "tar"
2184 # Select the *open() function according to
2185 # given compression.
2186 if comptype in cls.OPEN_METH:
2187 func = getattr(cls, cls.OPEN_METH[comptype])
2189 raise CompressionError("unknown compression type %r" % comptype)
2191 # Pass on compression level for gzip / bzip2.
2192 if comptype == 'gz' or comptype == 'bz2':
2193 kwargs['compresslevel'] = compresslevel
2195 if 'max_volume_size' in kwargs:
2196 if comptype != 'tar' and filemode in 'wa' \
2197 and kwargs['max_volume_size']:
2199 warnings.warn('Only the first volume will be compressed '
2200 'for modes with "w:"!')
2202 return func(name, filemode, fileobj, **kwargs)
2205 filemode, comptype = mode.split("|", 1)
2206 filemode = filemode or "r"
2207 comptype = comptype or "tar"
2209 if filemode not in "rw":
2210 raise ValueError("mode must be 'r' or 'w'")
2212 t = cls(name, filemode,
2213 _Stream(name, filemode, comptype, fileobj, bufsize,
2214 compresslevel=compresslevel),
2216 t._extfileobj = False
2220 filemode, comptype = mode.split("#", 1)
2221 filemode = filemode or "r"
2223 if filemode not in "rw":
2224 raise ValueError ("mode %s not compatible with concat "
2225 "archive; must be 'r' or 'w'" % mode)
2227 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2228 concat=True, encryption=encryption,
2229 compresslevel=compresslevel, tolerance=tolerance)
2230 kwargs ["concat"] = True
2232 t = cls(name, filemode, stream, **kwargs)
2233 except: # XXX except what?
2235 raise # XXX raise what?
2236 t._extfileobj = False
2240 return cls.taropen(name, mode, fileobj, **kwargs)
2242 raise ValueError("undiscernible mode %r" % mode)
@classmethod
def open_at_offset(cls, offset, *a, **kwa):
    """
    Same as ``.open()``, but start reading at the given offset. Assumes a
    seekable file object.
    """
    fileobj = kwa.get ("fileobj")
    if fileobj is not None:
        fileobj.seek (offset)
    return cls.open (*a, **kwa)
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.
    """
    if len(mode) > 1 or mode not in "raw":
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if len(mode) > 1 or mode not in "rw":
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except OSError:
        # Close the GzipFile we created ourselves; a caller-supplied
        # fileobj stays open (we don't own it).
        if not extfileobj and fileobj is not None:
            fileobj.close()
        if fileobj is None:
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if len(mode) > 1 or mode not in "rw":
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(fileobj or name, mode,
                          compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (OSError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    t._extfileobj = False
    return t
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (lzma.LZMAError, EOFError):
        fileobj.close()
        raise ReadError("not an lzma file")
    t._extfileobj = False
    return t
# All *open() methods are registered here.
# Maps the compression suffix used in mode strings ("r:gz" etc.) to the
# name of the classmethod implementing it; open() dispatches via getattr.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
#--------------------------------------------------------------------------
# The public methods which TarFile provides:

def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive. A special case are empty archives which are
       initialized accordingly so the two mandatory blocks of zeros are
       written abiding by the requested encryption and compression settings.
    """
    if self.closed:
        return

    if self.mode in "aw":
        if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
            # Empty concat archive: start an (empty) object so the
            # trailing zero blocks get the right encryption/compression.
            self.fileobj.next ("")
        self.fileobj.write(NUL * (BLOCKSIZE * 2))
        self.offset += (BLOCKSIZE * 2)
        # fill up the end with zero-blocks
        # (like option -b20 for tar does)
        blocks, remainder = divmod(self.offset, RECORDSIZE)
        if remainder > 0:
            self.fileobj.write(NUL * (RECORDSIZE - remainder))

    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
2377 def getmember(self, name):
2378 """Return a TarInfo object for member `name'. If `name' can not be
2379 found in the archive, KeyError is raised. If a member occurs more
2380 than once in the archive, its last occurrence is assumed to be the
2381 most up-to-date version.
2383 tarinfo = self._getmember(name)
2385 raise KeyError("filename %r not found" % name)
2388 def getmembers(self):
2389 """Return the members of the archive as a list of TarInfo objects. The
2390 list has the same order as the members in the archive.
2393 if not self._loaded: # if we want to obtain a list of
2394 self._load() # all members, we first have to
2395 # scan the whole archive.
2398 def get_last_member_offset(self):
2399 """Return the last member offset. Usually this is self.fileobj.tell(),
2400 but when there's encryption or concat compression going on it's more
2401 complicated than that.
2403 return self.last_block_offset
2406 """Return the members of the archive as a list of their names. It has
2407 the same order as the list returned by getmembers().
2409 return [tarinfo.name for tarinfo in self.getmembers()]
2411 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2412 """Create a TarInfo object for either the file `name' or the file
2413 object `fileobj' (using os.fstat on its file descriptor). You can
2414 modify some of the TarInfo's attributes before you add it using
2415 addfile(). If given, `arcname' specifies an alternative name for the
2416 file in the archive.
2420 # When fileobj is given, replace name by
2421 # fileobj's real name.
2422 if fileobj is not None:
2425 # Building the name of the member in the archive.
2426 # Backward slashes are converted to forward slashes,
2427 # Absolute paths are turned to relative paths.
2430 drv, arcname = os.path.splitdrive(arcname)
2431 arcname = arcname.replace(os.sep, "/")
2432 arcname = arcname.lstrip("/")
2434 # Now, fill the TarInfo object with
2435 # information specific for the file.
2436 tarinfo = self.tarinfo()
2437 tarinfo.tarfile = self
2439 # Use os.stat or os.lstat, depending on platform
2440 # and if symlinks shall be resolved.
2442 if hasattr(os, "lstat") and not self.dereference:
2443 statres = os.lstat(name)
2445 statres = os.stat(name)
2447 statres = os.fstat(fileobj.fileno())
2450 stmd = statres.st_mode
2451 if stat.S_ISREG(stmd):
2452 inode = (statres.st_ino, statres.st_dev)
2453 if not self.dereference and statres.st_nlink > 1 and \
2454 inode in self.inodes and arcname != self.inodes[inode]:
2455 # Is it a hardlink to an already
2458 linkname = self.inodes[inode]
2460 # The inode is added only if its valid.
2461 # For win32 it is always 0.
2463 if inode[0] and self.save_to_members:
2464 self.inodes[inode] = arcname
2465 elif stat.S_ISDIR(stmd):
2467 elif stat.S_ISFIFO(stmd):
2469 elif stat.S_ISLNK(stmd):
2471 linkname = os.readlink(name)
2472 elif stat.S_ISCHR(stmd):
2474 elif stat.S_ISBLK(stmd):
2479 # Fill the TarInfo object with all
2480 # information we can get.
2481 tarinfo.name = arcname
2483 tarinfo.uid = statres.st_uid
2484 tarinfo.gid = statres.st_gid
2486 tarinfo.size = statres.st_size
2489 tarinfo.mtime = statres.st_mtime
2491 tarinfo.linkname = linkname
2493 if tarinfo.uid in self.cache_uid2user:
2494 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2497 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2498 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2500 # remember user does not exist:
2501 # same default value as in tarinfo class
2502 self.cache_uid2user[tarinfo.uid] = ""
2504 if tarinfo.gid in self.cache_gid2group:
2505 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2508 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2509 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2511 # remember group does not exist:
2512 # same default value as in tarinfo class
2513 self.cache_gid2group[tarinfo.gid] = ""
2515 if type in (CHRTYPE, BLKTYPE):
2516 if hasattr(os, "major") and hasattr(os, "minor"):
2517 tarinfo.devmajor = os.major(statres.st_rdev)
2518 tarinfo.devminor = os.minor(statres.st_rdev)
2521 def list(self, verbose=True):
2522 """Print a table of contents to sys.stdout. If `verbose' is False, only
2523 the names of the members are printed. If it is True, an `ls -l'-like
2528 for tarinfo in self:
2530 print(stat.filemode(tarinfo.mode), end=' ')
2531 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2532 tarinfo.gname or tarinfo.gid), end=' ')
2533 if tarinfo.ischr() or tarinfo.isblk():
2534 print("%10s" % ("%d,%d" \
2535 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2537 print("%10d" % tarinfo.size, end=' ')
2538 print("%d-%02d-%02d %02d:%02d:%02d" \
2539 % time.localtime(tarinfo.mtime)[:6], end=' ')
2541 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2545 print("->", tarinfo.linkname, end=' ')
2547 print("link to", tarinfo.linkname, end=' ')
2550 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2551 """Add the file `name' to the archive. `name' may be any type of file
2552 (directory, fifo, symbolic link, etc.). If given, `arcname'
2553 specifies an alternative name for the file in the archive.
2554 Directories are added recursively by default. This can be avoided by
2555 setting `recursive' to False. `exclude' is a function that should
2556 return True for each filename to be excluded. `filter' is a function
2557 that expects a TarInfo object argument and returns the changed
2558 TarInfo object, if it returns None the TarInfo object will be
2559 excluded from the archive.
2566 # Exclude pathnames.
2567 if exclude is not None:
2569 warnings.warn("use the filter argument instead",
2570 DeprecationWarning, 2)
2572 self._dbg(2, "tarfile: Excluded %r" % name)
2575 # Skip if somebody tries to archive the archive...
2576 if self.name is not None and os.path.abspath(name) == self.name:
2577 self._dbg(2, "tarfile: Skipped %r" % name)
2582 # Create a TarInfo object from the file.
2583 tarinfo = self.gettarinfo(name, arcname)
2586 self._dbg(1, "tarfile: Unsupported type %r" % name)
2589 # Change or exclude the TarInfo object.
2590 if filter is not None:
2591 tarinfo = filter(tarinfo)
2593 self._dbg(2, "tarfile: Excluded %r" % name)
2596 # Append the tar header and data to the archive.
2598 with bltn_open(name, "rb") as f:
2599 self.addfile(tarinfo, f)
2601 elif tarinfo.isdir():
2602 self.addfile(tarinfo)
2604 for f in os.listdir(name):
2605 self.add(os.path.join(name, f), os.path.join(arcname, f),
2606 recursive, exclude, filter=filter)
2609 self.addfile(tarinfo)
2611 def _size_left_file(self):
2612 """Calculates size left in a volume with a maximum volume size.
2614 Assumes self.max_volume_size is set.
2615 If using compression through a _Stream, use _size_left_stream instead
2617 # left-over size = max_size - offset - 2 zero-blocks written in close
2618 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2619 # limit size left to a discrete number of blocks, because we won't
2620 # write only half a block when writting the end of a volume
2621 # and filling with zeros
2622 return BLOCKSIZE * (size_left // BLOCKSIZE)
2624 def _size_left_stream(self):
2625 """ Calculates size left in a volume if using comression/encryption
2627 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2628 (otherwise use _size_left_file)
2630 # left-over size = max_size - bytes written - 2 zero-blocks (close)
2631 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2633 return BLOCKSIZE * (size_left // BLOCKSIZE)
2635 def addfile(self, tarinfo, fileobj=None):
2636 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2637 given, tarinfo.size bytes are read from it and added to the archive.
2638 You can create TarInfo objects using gettarinfo().
2639 On Windows platforms, `fileobj' should always be opened with mode
2640 'rb' to avoid irritation about the file size.
2644 tarinfo = copy.copy(tarinfo)
2646 if self.arcmode & ARCMODE_CONCAT:
2647 self.last_block_offset = self.fileobj.next (tarinfo.name)
2649 self.last_block_offset = self.fileobj.tell()
2651 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2652 self.fileobj.write(buf)
2653 self.offset += len(buf)
2655 if self.max_volume_size:
2656 if isinstance(self.fileobj, _Stream):
2657 _size_left = self._size_left_stream
2659 _size_left = self._size_left_file
2661 _size_left = lambda: tarinfo.size
2663 # If there's no data to follow, finish
2665 if self.save_to_members:
2666 self.members.append(tarinfo)
2669 target_size_left = _size_left()
2670 source_size_left = tarinfo.size
2671 assert tarinfo.volume_offset == 0
2673 # we only split volumes in the middle of a file, that means we have
2674 # to write at least one block
2675 if target_size_left < BLOCKSIZE:
2676 target_size_left = BLOCKSIZE
2678 # loop over multiple volumes
2679 while source_size_left > 0:
2681 # Write as much data as possble from source into target.
2682 # When compressing data, we cannot easily predict how much data we
2683 # can write until target_size_left == 0 --> need to iterate
2684 size_can_write = min(target_size_left, source_size_left)
2686 while size_can_write > 0:
2687 copyfileobj(fileobj, self.fileobj, size_can_write)
2688 self.offset += size_can_write
2689 source_size_left -= size_can_write
2690 target_size_left = _size_left()
2691 size_can_write = min(target_size_left, source_size_left)
2693 # now target_size_left == 0 or source_size_left == 0
2695 # if there is data left to write, we need to create a new volume
2696 if source_size_left > 0:
2697 # Only finalize the crypto entry here if we’re continuing with
2698 # another one; otherwise, the encryption must include the block
2700 tarinfo.type = GNUTYPE_MULTIVOL
2702 if not self.new_volume_handler or\
2703 not callable(self.new_volume_handler):
2704 raise Exception("We need to create a new volume and you "
2705 "didn't supply a new_volume_handler")
2708 # the new volume handler should do everything needed to
2709 # start working in a new volume. usually, the handler calls
2710 # to self.open_volume
2711 self.volume_number += 1
2713 # set to be used by open_volume, because in the case of a PAX
2714 # tar it needs to write information about the volume and offset
2715 # in the global header
2716 tarinfo.volume_offset = tarinfo.size - source_size_left
2717 self.volume_tarinfo = tarinfo
2719 # the “new_volume_handler” is supposed to call .close() on the
2721 self.new_volume_handler(self, self.base_name, self.volume_number)
2723 self.volume_tarinfo = None
2725 if self.arcmode & ARCMODE_CONCAT:
2726 self.fileobj.next_volume (tarinfo.name)
2728 # write new volume header
2729 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2730 self.fileobj.write(buf)
2731 self.offset += len(buf)
2733 # adjust variables; open_volume should have reset self.offset
2734 # --> _size_left should be big again
2735 target_size_left = _size_left()
2736 size_can_write = min(target_size_left, source_size_left)
2737 self._dbg(3, 'new volume')
2739 # now, all data has been written. We may have to fill up the rest of
2740 # the block in target with 0s
2741 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2743 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2744 self.offset += BLOCKSIZE - remainder
2746 if self.save_to_members:
2747 self.members.append(tarinfo)
2749 def open_volume(self, name="", fileobj=None, encryption=None):
2751 Called by the user to change this tar file to point to a new volume.
2753 # open the file using either fileobj or name
2755 if self.mode == "a" and not os.path.exists(name):
2756 # Create nonexistent files in append mode.
2759 self._extfileobj = False
2761 if isinstance(self.fileobj, _Stream):
2762 self._dbg(3, 'open_volume: create a _Stream')
2763 fileobj = _Stream(name=name,
2764 mode=self.fileobj.mode,
2765 comptype=self.fileobj.comptype,
2767 bufsize=self.fileobj.bufsize,
2768 encryption=encryption or self.fileobj.encryption,
2769 concat=self.fileobj.arcmode & ARCMODE_CONCAT)
2771 # here, we lose information about compression/encryption!
2772 self._dbg(3, 'open_volume: builtin open')
2773 fileobj = bltn_open(name, self._mode)
2775 if name is None and hasattr(fileobj, "name"):
2777 if hasattr(fileobj, "mode"):
2778 self._mode = fileobj.mode
2779 self._extfileobj = True
2780 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2781 self.name = os.path.abspath(name) if name else None
2782 self.fileobj = fileobj
2784 # init data structures
2786 self.members = [] # list of members as TarInfo objects
2787 self._loaded = False # flag if all members have been read
2788 self.offset = self.fileobj.tell()
2789 # current position in the archive file
2790 self.inodes = {} # dictionary caching the inodes of
2791 # archive members already added
2794 if self.mode == "r":
2795 self.firstmember = None
2796 self.firstmember = self.next()
2798 if self.mode == "a":
2799 # Move to the end of the archive,
2800 # before the first empty block.
2802 self.fileobj.seek(self.offset)
2804 tarinfo = self.tarinfo.fromtarfile(self)
2805 self.members.append(tarinfo)
2806 except EOFHeaderError:
2807 self.fileobj.seek(self.offset)
2809 except HeaderError as e:
2810 raise ReadError(str(e))
2812 if self.mode in "aw":
2815 if self.format == PAX_FORMAT:
2817 "GNU.volume.filename": str(self.volume_tarinfo.name),
2818 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2819 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2822 self.pax_headers.update(volume_info)
2824 if isinstance(self.fileobj, _Stream):
2825 self.fileobj._init_write_gz ()
2826 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2827 self.fileobj.write(buf)
2828 self.offset += len(buf)
2829 except Exception as exn:
2830 if not self._extfileobj:
2831 self.fileobj.close()
2835 def extractall(self, path=".", members=None, filter=None):
2836 """Extract all members from the archive to the current working
2837 directory and set owner, modification time and permissions on
2838 directories afterwards. `path' specifies a different directory
2839 to extract to. `members' is optional and must be a subset of the
2840 list returned by getmembers().
2847 for tarinfo in members:
2848 if self.volume_number > 0 and tarinfo.ismultivol():
2851 if filter and not filter(tarinfo):
2855 # Extract directories with a safe mode.
2856 directories.append(tarinfo)
2857 tarinfo = copy.copy(tarinfo)
2858 tarinfo.mode = 0o0700
2859 # Do not set_attrs directories, as we will do that further down
2860 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
2862 # Reverse sort directories.
2863 directories.sort(key=lambda a: a.name)
2864 directories.reverse()
2866 # Set correct owner, mtime and filemode on directories.
2867 for tarinfo in directories:
2868 dirpath = os.path.join(path, tarinfo.name)
2870 self.chown(tarinfo, dirpath)
2871 self.utime(tarinfo, dirpath)
2872 self.chmod(tarinfo, dirpath)
2873 except ExtractError as e:
2874 if self.errorlevel > 1:
2877 self._dbg(1, "tarfile: %s" % e)
2879 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
2880 """Extract a member from the archive to the current working directory,
2881 using its full name. Its file information is extracted as accurately
2882 as possible. `member' may be a filename or a TarInfo object. You can
2883 specify a different directory using `path'. File attributes (owner,
2884 mtime, mode) are set unless `set_attrs' is False.
2885 ``symlink_cb`` is a hook accepting a function that is passed the
2886 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2887 ``member`` indicates a symlink in which case only the callback
2888 passed will be applied, skipping the actual extraction. In case the
2889 callback is invoked, its return value is passed on to the caller.
2893 if isinstance(member, str):
2894 tarinfo = self.getmember(member)
2898 # Prepare the link target for makelink().
2900 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2902 if symlink_cb is not None and tarinfo.issym():
2903 return symlink_cb(member, path, set_attrs)
2906 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2907 set_attrs=set_attrs)
2908 except EnvironmentError as e:
2909 if self.errorlevel > 0:
2912 if e.filename is None:
2913 self._dbg(1, "tarfile: %s" % e.strerror)
2915 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2916 except ExtractError as e:
2917 if self.errorlevel > 1:
2920 self._dbg(1, "tarfile: %s" % e)
2922 def extractfile(self, member):
2923 """Extract a member from the archive as a file object. `member' may be
2924 a filename or a TarInfo object. If `member' is a regular file or a
2925 link, an io.BufferedReader object is returned. Otherwise, None is
2930 if isinstance(member, str):
2931 tarinfo = self.getmember(member)
2935 if tarinfo.isreg() or tarinfo.ismultivol() or\
2936 tarinfo.type not in SUPPORTED_TYPES:
2937 # If a member's type is unknown, it is treated as a
2939 return self.fileobject(self, tarinfo)
2941 elif tarinfo.islnk() or tarinfo.issym():
2942 if isinstance(self.fileobj, _Stream):
2943 # A small but ugly workaround for the case that someone tries
2944 # to extract a (sym)link as a file-object from a non-seekable
2945 # stream of tar blocks.
2946 raise StreamError("cannot extract (sym)link as file object")
2948 # A (sym)link's file object is its target's file object.
2949 return self.extractfile(self._find_link_target(tarinfo))
2951 # If there's no data associated with the member (directory, chrdev,
2952 # blkdev, etc.), return None instead of a file object.
2955 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2956 """Extract the TarInfo object tarinfo to a physical
2957 file called targetpath.
2959 # Fetch the TarInfo object for the given name
2960 # and build the destination pathname, replacing
2961 # forward slashes to platform specific separators.
2962 targetpath = targetpath.rstrip("/")
2963 targetpath = targetpath.replace("/", os.sep)
2965 # Create all upper directories.
2966 upperdirs = os.path.dirname(targetpath)
2967 if upperdirs and not os.path.exists(upperdirs):
2968 # Create directories that are not part of the archive with
2969 # default permissions.
2970 os.makedirs(upperdirs)
2972 if tarinfo.islnk() or tarinfo.issym():
2973 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2975 self._dbg(1, tarinfo.name)
2978 self.makefile(tarinfo, targetpath)
2979 elif tarinfo.isdir():
2980 self.makedir(tarinfo, targetpath)
2981 elif tarinfo.isfifo():
2982 self.makefifo(tarinfo, targetpath)
2983 elif tarinfo.ischr() or tarinfo.isblk():
2984 self.makedev(tarinfo, targetpath)
2985 elif tarinfo.islnk() or tarinfo.issym():
2986 self.makelink(tarinfo, targetpath)
2987 elif tarinfo.type not in SUPPORTED_TYPES:
2988 self.makeunknown(tarinfo, targetpath)
2990 self.makefile(tarinfo, targetpath)
2993 self.chown(tarinfo, targetpath)
2994 if not tarinfo.issym():
2995 self.chmod(tarinfo, targetpath)
2996 self.utime(tarinfo, targetpath)
2998 #--------------------------------------------------------------------------
2999 # Below are the different file methods. They are called via
3000 # _extract_member() when extract() is called. They can be replaced in a
3001 # subclass to implement other functionality.
3003 def makedir(self, tarinfo, targetpath):
3004 """Make a directory called targetpath.
3007 # Use a safe mode for the directory, the real mode is set
3008 # later in _extract_member().
3009 os.mkdir(targetpath, 0o0700)
3010 except FileExistsError:
3013 def makefile(self, tarinfo, targetpath):
3014 """Make a file called targetpath.
3016 source = self.fileobj
3017 source.seek(tarinfo.offset_data)
3020 target = bltn_open(targetpath, "wb")
3022 if tarinfo.sparse is not None:
3024 for offset, size in tarinfo.sparse:
3026 copyfileobj(source, target, size)
3027 target.seek(tarinfo.size)
3036 copyfileobj(source, target, tarinfo.size)
3039 # only if we are extracting a multivolume this can be treated
3040 if not self.new_volume_handler:
3042 raise Exception("We need to read a new volume and you"
3043 " didn't supply a new_volume_handler")
3045 # the new volume handler should do everything needed to
3046 # start working in a new volume. usually, the handler calls
3047 # to self.open_volume
3048 self.volume_number += 1
3049 self.new_volume_handler(self, self.base_name, self.volume_number)
3050 tarinfo = self.firstmember
3051 source = self.fileobj
3056 def makeunknown(self, tarinfo, targetpath):
3057 """Make a file from a TarInfo object with an unknown type
3060 self.makefile(tarinfo, targetpath)
3061 self._dbg(1, "tarfile: Unknown file type %r, " \
3062 "extracted as regular file." % tarinfo.type)
3064 def makefifo(self, tarinfo, targetpath):
3065 """Make a fifo called targetpath.
3067 if hasattr(os, "mkfifo"):
3068 os.mkfifo(targetpath)
3070 raise ExtractError("fifo not supported by system")
3072 def makedev(self, tarinfo, targetpath):
3073 """Make a character or block device called targetpath.
3075 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3076 raise ExtractError("special devices not supported by system")
3080 mode |= stat.S_IFBLK
3082 mode |= stat.S_IFCHR
3084 os.mknod(targetpath, mode,
3085 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3087 def makelink(self, tarinfo, targetpath):
3088 """Make a (symbolic) link called targetpath. If it cannot be created
3089 (platform limitation), we try to make a copy of the referenced file
3093 # For systems that support symbolic and hard links.
3095 os.symlink(tarinfo.linkname, targetpath)
3098 if os.path.exists(tarinfo._link_target):
3099 os.link(tarinfo._link_target, targetpath)
3101 self._extract_member(self._find_link_target(tarinfo),
3103 except symlink_exception:
3105 self._extract_member(self._find_link_target(tarinfo),
3108 raise ExtractError("unable to resolve link inside archive")
3110 def chown(self, tarinfo, targetpath):
3111 """Set owner of targetpath according to tarinfo.
3113 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3114 # We have to be root to do so.
3116 g = grp.getgrnam(tarinfo.gname)[2]
3120 u = pwd.getpwnam(tarinfo.uname)[2]
3124 if tarinfo.issym() and hasattr(os, "lchown"):
3125 os.lchown(targetpath, u, g)
3127 os.chown(targetpath, u, g)
3128 except OSError as e:
3129 raise ExtractError("could not change owner")
3131 def chmod(self, tarinfo, targetpath):
3132 """Set file permissions of targetpath according to tarinfo.
3134 if hasattr(os, 'chmod'):
3136 os.chmod(targetpath, tarinfo.mode)
3137 except OSError as e:
3138 raise ExtractError("could not change mode")
3140 def utime(self, tarinfo, targetpath):
3141 """Set modification time of targetpath according to tarinfo.
3143 if not hasattr(os, 'utime'):
3146 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3147 except OSError as e:
3148 raise ExtractError("could not change modification time")
3150 #--------------------------------------------------------------------------
3152 """Return the next member of the archive as a TarInfo object, when
3153 TarFile is opened for reading. Return None if there is no more
3157 if self.firstmember is not None:
3158 m = self.firstmember
3159 self.firstmember = None
3162 # Read the next block.
3163 self.fileobj.seek(self.offset)
3167 tarinfo = self.tarinfo.fromtarfile(self)
3168 except EOFHeaderError as e:
3169 if self.ignore_zeros:
3170 self._dbg(2, "0x%X: %s" % (self.offset, e))
3171 self.offset += BLOCKSIZE
3173 except InvalidHeaderError as e:
3174 if self.ignore_zeros:
3175 self._dbg(2, "0x%X: %s" % (self.offset, e))
3176 self.offset += BLOCKSIZE
3178 elif self.offset == 0:
3179 raise ReadError(str(e))
3180 except EmptyHeaderError:
3181 if self.offset == 0:
3182 raise ReadError("empty file")
3183 except TruncatedHeaderError as e:
3184 if self.offset == 0:
3185 raise ReadError(str(e))
3186 except SubsequentHeaderError as e:
3187 raise ReadError(str(e))
3190 if tarinfo is not None:
3191 if self.save_to_members:
3192 self.members.append(tarinfo)
3198 #--------------------------------------------------------------------------
3199 # Little helper methods:
3201 def _getmember(self, name, tarinfo=None, normalize=False):
3202 """Find an archive member by name from bottom to top.
3203 If tarinfo is given, it is used as the starting point.
3205 # Ensure that all members have been loaded.
3206 members = self.getmembers()
3208 # Limit the member search list up to tarinfo.
3209 if tarinfo is not None:
3210 members = members[:members.index(tarinfo)]
3213 name = os.path.normpath(name)
3215 for member in reversed(members):
3217 member_name = os.path.normpath(member.name)
3219 member_name = member.name
3221 if name == member_name:
3225 """Read through the entire archive file and look for readable
3229 tarinfo = self.next()
3234 def _check(self, mode=None):
3235 """Check if TarFile is still open, and if the operation's mode
3236 corresponds to TarFile's mode.
3239 raise OSError("%s is closed" % self.__class__.__name__)
3240 if mode is not None and self.mode not in mode:
3241 raise OSError("bad operation for mode %r" % self.mode)
3243 def _find_link_target(self, tarinfo):
3244 """Find the target member of a symlink or hardlink member in the
3248 # Always search the entire archive.
3249 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3252 # Search the archive before the link, because a hard link is
3253 # just a reference to an already archived file.
3254 linkname = tarinfo.linkname
3257 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3259 raise KeyError("linkname %r not found" % linkname)
3263 """Provide an iterator object.
3266 return iter(self.members)
3268 return TarIter(self)
3270 def _dbg(self, level, msg, *args):
3271 """Write debugging output to sys.stderr.
3273 if level <= self.debug:
3274 print(msg.format(*args), file=sys.stderr)
3276 def __enter__(self):
3280 def __exit__(self, type, value, traceback):
3284 # An exception occurred. We must not call close() because
3285 # it would try to write end-of-archive blocks and padding.
3286 if not self._extfileobj:
3287 self.fileobj.close()
3294 for tarinfo in TarFile(...):
3298 def __init__(self, tarfile):
3299 """Construct a TarIter object.
3301 self.tarfile = tarfile
3304 """Return iterator object.
3308 """Return the next item using TarFile's next() method.
3309 When all members have been read, set TarFile as _loaded.
3311 # Fix for SF #1100429: Under rare circumstances it can
3312 # happen that getmembers() is called during iteration,
3313 # which will cause TarIter to stop prematurely.
3315 if self.index == 0 and self.tarfile.firstmember is not None:
3316 tarinfo = self.tarfile.next()
3317 elif self.index < len(self.tarfile.members):
3318 tarinfo = self.tarfile.members[self.index]
3319 elif not self.tarfile._loaded:
3320 tarinfo = self.tarfile.next()
3322 self.tarfile._loaded = True
3330 #---------------------------------------------------------
3331 # support functionality for rescue mode
3332 #---------------------------------------------------------
3334 TAR_FMT_HDR = (# See tar(5):
3336 "100s" # ← char name[100]; /* 100 */
3337 "8s" # ← char mode[8]; /* 108 */
3338 "8s" # ← char uid[8]; /* 116 */
3339 "8s" # ← char gid[8]; /* 124 */
3340 "12s" # ← char size[12]; /* 136 */
3341 "12s" # ← char mtime[12]; /* 148 */
3342 "8s" # ← char checksum[8]; /* 156 */
3343 "B" # ← char typeflag[1]; /* 157 */
3344 "100s" # ← char linkname[100]; /* 257 */
3345 "6s" # ← char magic[6]; /* 263 */
3346 "2s" # ← char version[2]; /* 265 */
3347 "32s" # ← char uname[32]; /* 297 */
3348 "32s" # ← char gname[32]; /* 329 */
3349 "8s" # ← char devmajor[8]; /* 337 */
3350 "8s" # ← char devminor[8]; /* 345 */
3351 "12s" # ← char atime[12]; /* 357 */
3352 "12s" # ← char ctime[12]; /* 369 */
3353 "12s" # ← char offset[12]; /* 381 */
3354 "4s" # ← char longnames[4]; /* 385 */
3355 "B" # ← char unused[1]; /* 386 */
3357 "12s" # ← char offset[12];
3358 "12s" # ← char numbytes[12];
3359 "12s" # ← char offset[12];
3360 "12s" # ← char numbytes[12];
3361 "12s" # ← char offset[12];
3362 "12s" # ← char numbytes[12];
3363 "12s" # ← char offset[12];
3364 "12s" # ← char numbytes[12];
3365 "" # } sparse[4]; /* 482 */
3366 "B" # ← char isextended[1]; /* 483 */
3367 "12s" # ← char realsize[12]; /* 495 */
3368 "17s" # ← char pad[17]; /* 512 */
3371 # The “magic” and “version” fields are special:
3374 # magic The magic field holds the five characters “ustar” followed by a
3375 # space. Note that POSIX ustar archives have a trailing null.
3379 # /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3380 # Found in an archive, it indicates an old GNU header format, which will be
3381 # hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3382 # valid, though the header is not truly POSIX conforming. */
3385 TAR_FMT_OLDGNU_MAGIC = b"ustar "
3387 def read_gnu_tar_hdr (data):
3388 if len (data) != BLOCKSIZE: # header requires one complete block
3409 offset1, numbytes1, \
3410 offset2, numbytes2, \
3411 offset3, numbytes3, \
3412 offset4, numbytes4, \
3415 pad = struct.unpack (TAR_FMT_HDR, data)
3416 except struct.error:
3419 if magic != TAR_FMT_OLDGNU_MAGIC:
3422 # return all except “unused” and “pad”
3424 { "name" : name, "mode" : mode
3425 , "uid" : uid , "gid" : gid
3426 , "size" : size, "mtime" : mtime
3427 , "checksum" : checksum
3428 , "typeflag" : typeflag
3429 , "linkname" : linkname
3431 , "version" : version
3432 , "uname" : uname, "gname" : gname
3433 , "devmajor" : devmajor, "devminor" : devminor
3434 , "atime" : atime, "ctime" : ctime
3436 , "longnames" : longnames
3437 , "offset1" : offset1, "numbytes1" : numbytes1
3438 , "offset2" : offset2, "numbytes2" : numbytes2
3439 , "offset3" : offset3, "numbytes3" : numbytes3
3440 , "offset4" : offset4, "numbytes4" : numbytes4
3441 , "isextended" : isextended
3442 , "realsize" : realsize
3446 def readable_tar_objects_offsets (ifd):
3448 Traverse blocks in file, trying to extract tar headers.
3454 blk = os.read (ifd, BLOCKSIZE)
3455 if len (blk) != BLOCKSIZE:
3457 hdr = read_gnu_tar_hdr (blk)
3459 offsets.append (pos)
3465 def locate_gz_hdr_candidates (fd):
3467 Walk over instances of the GZ magic in the payload, collecting their
3468 positions. If the offset of the first found instance is not zero, the file
3469 begins with leading garbage.
3471 Note that since the GZ magic consists of only two bytes, we expect a lot of
3472 false positives inside binary data.
3474 :return: The list of offsets in the file.
3478 mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3481 pos = mm.find (GZ_MAGIC_BYTES, pos)
3485 pos += len (GZ_MAGIC_BYTES)
3490 HDR_CAND_GOOD = 0 # header marks begin of valid object
3491 HDR_CAND_FISHY = 1 # inconclusive
3492 HDR_CAND_JUNK = 2 # not a header / object unreadable
3495 def read_cstring (fd, max=-1, encoding=None):
3497 Read one NUL-terminated string from *fd* into a Python string. If *max* is
3498 non-negative, reading will terminate after the specified number of bytes.
3500 Optionally, an *encoding* may be specified to interpret the data as.
3502 :returns: *None* if parsing failed or the maximum number of bytes has been
3503 exceeded; a Python string with the data otherwise.
3512 if max >= 0 and l > max:
3516 if encoding is not None:
3517 buf = buf.decode (encoding)
# Validate a candidate gzip member header (RFC 1952) at *off* in *fd*.
# Returns (verdict, parsed-header) where verdict is one of the HDR_CAND_*
# constants; JUNK verdicts are paired with None.
# NOTE(review): the final return of the parsed header dict (after line 3594)
# is elided in this excerpt.
3522 def inspect_gz_hdr (fd, off):
3524     Attempt to parse a Gzip header in *fd* at position *off*. The format is
3525     documented as RFC1952.
3527     Returns a verdict about the quality of that header plus the parsed header
3528     when readable. Problematic sizes such as fields running past the EOF are
3529     treated as garbage. Properties in which the header merely doesn’t conform
3530     to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
3531     validation is possible on embedded strings because they are single-byte
# Start optimistic; individual checks below may only downgrade the verdict.
3539     verdict = HDR_CAND_GOOD
# Seek to the candidate and verify the seek actually landed there.
3541     os.lseek (fd, off, os.SEEK_SET)
3542     if os.lseek (fd, 0, os.SEEK_CUR) != off:
3543     return HDR_CAND_JUNK, None
# Fixed-size portion of the header; a short read means EOF inside the header.
3545     raw = os.read (fd, GZ_HEADER_SIZE)
3546     if len (raw) != GZ_HEADER_SIZE:
3547     return HDR_CAND_JUNK, None
# Decode magic, compression method, flag byte, mtime, extra flags, OS code.
3551     _m1, _m2, meth, flags, mtime, dflags, oscode = \
3552     struct.unpack (GZ_FMT_HEADER, raw)
3553     if meth != GZ_METHOD_DEFLATE: # only deflate is supported
3554     return HDR_CAND_JUNK, None
# An unparseable fixed header is junk, not merely fishy.
3555     except struct.error as exn:
3556     return HDR_CAND_JUNK, None
# Timestamps in the future are suspicious but not fatal.
3558     if mtime > int (time.time ()):
3559     verdict = HDR_CAND_FISHY
# Unexpected XFL / OS bytes likewise only downgrade the verdict.
3561     if dflags != GZ_DEFLATE_FLAGS:
3562     verdict = HDR_CAND_FISHY
3564     if oscode != GZ_OS_CODE:
3565     verdict = HDR_CAND_FISHY
3567     if flags & GZ_FLAG_FTEXT: # created by some contrarian
3568     verdict = HDR_CAND_FISHY
# FEXTRA: a little-endian u16 length followed by that many payload bytes.
3569     if flags & GZ_FLAG_FEXTRA:
# NOTE(review): struct.unpack always returns a tuple, so *xlen* here is a
# 1-tuple and the following os.read (fd, xlen) would raise TypeError at
# runtime; this almost certainly needs a trailing ``[0]``. Left untouched
# because the function's tail is not visible in this excerpt.
3570     xlen = struct.unpack ("<H", os.read (fd, 2))
3571     xtra = os.read (fd, xlen)
3572     if len (xtra) != xlen: # eof inside header
3573     return HDR_CAND_JUNK, None
3574     if flags & GZ_FLAG_FNAME:
3575     # read up to the next NUL byte, not exceeding the maximum path length
3577     fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3578     encoding="iso-8859-1")
3580     return HDR_CAND_JUNK, None
# FCOMMENT is parsed the same way as FNAME.
# NOTE(review): this branch reuses the variable *fname* for the comment
# field — harmless if the header dict doesn't carry the comment, but worth
# confirming against the (elided) return value.
3581     if flags & GZ_FLAG_FCOMMENT:
3582     fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3583     encoding="iso-8859-1")
3585     return HDR_CAND_JUNK, None
3586     if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
3587     crc16 = os.read (fd, 2)
3588     if len (crc16) != 2: # eof inside header
3589     return HDR_CAND_JUNK, None
3590     if flags & GZ_FLAG_RESERVED:
3591     # according to the RFC, these must not be set
3592     verdict = HDR_CAND_FISHY
# Total header length actually consumed: current position minus start offset.
3594     hlen = os.lseek (fd, 0, os.SEEK_CUR) - off
# Feed the compressed object at *off* in *ifd* through zlib, returning the
# decompressed size and the number of input bytes consumed (upper bound).
# NOTE(review): the chunk-reading loop structure and the dlen accumulation
# are elided in this excerpt (numbering jumps 3617→3619→3621→3623).
3606 def try_decompress (ifd, off, hdr):
3608     Attempt to process the object starting at *off* with gzip.
3610     :returns: A pair containing the values of the decompressed data and
3611     the length of the input consumed. Note that the latter value
3612     may exceed the length of the compressed data because the
3613     *zlib* module does not provide a means to query how much
3614     of the input it processed before the end of an object.
# Negative wbits selects a raw deflate stream: the gzip header was already
# consumed by inspect_gz_hdr, so no wrapper is expected here.
3617     decmp = zlib.decompressobj (-zlib.MAX_WBITS)
3619     dlen = 0 # size of decompressed data
# Position the descriptor at the current read offset for this chunk.
3621     os.lseek (ifd, pos, os.SEEK_SET)
3623     cnk = os.read (ifd, BUFSIZE)
3626     data = decmp.decompress (cnk)
# A zlib error mid-stream ends the object without propagating the failure.
3627     except zlib.error as exn: # probably CRC32 mismatch; terminate softly
# decompressobj.eof becomes True once the end of the deflate stream is seen.
3630     if decmp.eof is True:
3632     if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
3635     return dlen, pos - off
# Filter gzip-header *cands* in *ifd* down to offsets that actually begin a
# decompressible object.
# NOTE(review): the result-list setup, its append on success, and the return
# are elided in this excerpt.
3637 def readable_gz_objects_offsets (ifd, cands):
3639     Inspect header candidates for parseable *ifd* gzipped objects.
3646         vdt, hdr = inspect_gz_hdr (ifd, cand)
3647         if vdt == HDR_CAND_JUNK:
3648             pass # ignore unreadable ones
# GOOD and FISHY headers both get a decompression attempt; the payload
# starts right after the parsed header (hlen bytes past the candidate).
3649         elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
3650             off0 = cand + hdr ["hlen"]
3651             dlen, clen = try_decompress (ifd, off0, hdr)
# Only keep candidates that yielded actual output from actual input.
3652             if dlen > 0 and clen > 0:
# Rebuild the list of gzip object offsets for *fname* by scanning for magic
# bytes and vetting each candidate.
# NOTE(review): no os.close for *ifd* is visible here — presumably it lives
# in an elided try/finally (numbering jumps 3668→3673); confirm the fd is
# not leaked.
3658 def reconstruct_offsets_gz (fname):
3660     From the given file, retrieve all GZ header-like offsets (“candidates”).
3661     Then check each of those locations whether they can be processed as
3664     ifd = os.open (fname, os.O_RDONLY)
3667     cands = locate_gz_hdr_candidates (ifd)
3668     return readable_gz_objects_offsets (ifd, cands)
# Rebuild the list of tar object offsets for *fname* (uncompressed archives).
# NOTE(review): as with reconstruct_offsets_gz, the os.close of *ifd* is not
# visible in this excerpt — confirm it happens in the elided lines.
3673 def reconstruct_offsets_tar (fname):
3675     From the given file, retrieve all tar header-like offsets (“candidates”).
3676     Then check each of those locations whether they can be processed as tar
3679     ifd = os.open (fname, os.O_RDONLY)
3682     return readable_tar_objects_offsets (ifd)
# Open *fileobj* as a TarFile anchored at *offset* and return its first
# member, decrypting first when a *secret* (kind, value) pair is supplied.
# NOTE(review): the construction of the TarFile keyword arguments between
# lines 3702 and 3708, and any error handling, are elided in this excerpt.
3687 def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
3690     if secret is not None:
# secret is a (kind, value) pair: kind selects password- vs. key-based
# decryption; the key variant is hex-encoded and must be unhexlified.
3693     if ks == crypto.PDTCRYPT_SECRET_PW:
3694     decr = crypto.Decrypt (password=secret [1])
3695     elif ks == crypto.PDTCRYPT_SECRET_KEY:
3696     key = binascii.unhexlify (secret [1])
3697     decr = crypto.Decrypt (key=key)
# Rescue mode: don't track members in memory and tolerate damaged input.
3702     TarFile.open_at_offset (offset,
3708     save_to_members=False,
3709     tolerance=TOLERANCE_RESCUE)
# Return the first member found at that offset.
3711     return tarobj.next ()
# Convert a TarInfo into the dict shape used by the backup index, filling
# fields TarInfo cannot supply with neutral placeholders.
3714 def idxent_of_tarinfo (tarinfo):
3716     Scrape the information relevant for the index from a *TarInfo* object.
3717     Keys like the inode number that lack a corresponding field in a TarInfo
3718     will be set to some neutral value.
3723     , "path" : "snapshot://annotations.db"
3727     , "ctime" : 1502798115
3728     , "mtime" : 1502196423
3737     { "inode" : 0 # ignored when reading the index
3738     , "uid" : tarinfo.uid
3739     , "gid" : tarinfo.gid
3740     , "path" : tarinfo.name # keeping URI scheme
3741     , "offset" : 0 # to be added by the caller
3742     , "volume" : tarinfo.volume_offset
3743     , "mode" : tarinfo.mode
# NOTE(review): ctime is deliberately(?) populated from mtime — presumably
# because TarInfo carries no ctime field; confirm index consumers accept this.
3744     , "ctime" : tarinfo.mtime
3745     , "mtime" : tarinfo.mtime
3746     , "size" : tarinfo.size
3747     , "type" : tarinfo.type
# Build a pseudo-index for a damaged backup by brute-force locating object
# offsets (encrypted / gzip / plain tar, chosen by *mode*) and reading one
# tar member at each offset.
# NOTE(review): the mode-dispatch conditionals around lines 3755-3762 and
# the *aux* helper used at line 3772 are elided in this excerpt.
3751 def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
3752     psidx = [] # pseudo index, return value
# make_secret presumably returns None when neither password nor key is
# given — TODO confirm against crypto module.
3754     secret = crypto.make_secret (password=password, key=key)
# Encrypted archives: offsets come from the crypto layer.
3756     if secret is not None:
3757     offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
# Otherwise scan for gzip or raw tar objects depending on *mode*.
3759     offsets = reconstruct_offsets_gz (backup_tar_path)
3761     offsets = reconstruct_offsets_tar (backup_tar_path)
3763     raise TarError ("no rescue handling for mode “%s”" % mode)
3765     fileobj = bltn_open (backup_tar_path, "rb")
# Pair every offset with the tar member read there.
3766     infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
3767     for off in offsets ]
3769     ie = idxent_of_tarinfo (ti)
3772     psidx = [ aux (o, ti) for o, ti in infos ]
3776 #--------------------
3777 # exported functions
3778 #--------------------
3779 def is_tarfile(name):
3780 """Return True if name points to a tar archive that we
3781 are able to handle, else return False.