do not discard valid data in buffers when in tolerant mode
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
7584f5c9
ERE
45import sys
46import os
be60ffd0 47import io
7584f5c9
ERE
48import shutil
49import stat
50import errno
51import time
52import struct
53import copy
54import re
55import operator
56
c7c736b6
PG
57import traceback # XXX
58
8ab8fac5 59from . import crypto
6e812ad9 60
7584f5c9
ERE
61try:
62 import grp, pwd
63except ImportError:
64 grp = pwd = None
65
be60ffd0
ERE
66# os.symlink on Windows prior to 6.0 raises NotImplementedError
67symlink_exception = (AttributeError, NotImplementedError)
68try:
69 # OSError (winerror=1314) will be raised if the caller does not hold the
70 # SeCreateSymbolicLinkPrivilege privilege
71 symlink_exception += (OSError,)
72except NameError:
73 pass
74
7584f5c9
ERE
75# from tarfile import *
76__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
77
be60ffd0
ERE
78from builtins import open as _open # Since 'open' is TarFile.open
79
7584f5c9
ERE
80#---------------------------------------------------------
81# tar constants
82#---------------------------------------------------------
be60ffd0 83NUL = b"\0" # the null character
7584f5c9
ERE
84BLOCKSIZE = 512 # length of processing blocks
85RECORDSIZE = BLOCKSIZE * 20 # length of records
be60ffd0
ERE
86GNU_MAGIC = b"ustar \0" # magic gnu tar string
87POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
7584f5c9
ERE
88
89LENGTH_NAME = 100 # maximum length of a filename
90LENGTH_LINK = 100 # maximum length of a linkname
91LENGTH_PREFIX = 155 # maximum length of the prefix field
92
be60ffd0
ERE
93REGTYPE = b"0" # regular file
94AREGTYPE = b"\0" # regular file
95LNKTYPE = b"1" # link (inside tarfile)
96SYMTYPE = b"2" # symbolic link
97CHRTYPE = b"3" # character special device
98BLKTYPE = b"4" # block special device
99DIRTYPE = b"5" # directory
100FIFOTYPE = b"6" # fifo special device
101CONTTYPE = b"7" # contiguous file
102
103GNUTYPE_LONGNAME = b"L" # GNU tar longname
104GNUTYPE_LONGLINK = b"K" # GNU tar longlink
105GNUTYPE_SPARSE = b"S" # GNU tar sparse file
106GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
68ddf955 107 # another volume
7584f5c9 108
be60ffd0
ERE
109XHDTYPE = b"x" # POSIX.1-2001 extended header
110XGLTYPE = b"g" # POSIX.1-2001 global header
111SOLARIS_XHDTYPE = b"X" # Solaris extended header
7584f5c9
ERE
112
113USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
114GNU_FORMAT = 1 # GNU tar format
115PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
116DEFAULT_FORMAT = GNU_FORMAT
117
15a81fc0 118GZ_FMT_HEADER = b"<BBBBLBB"
203cb25e 119GZ_HEADER_SIZE = 10 # not including the name
15a81fc0
PG
120GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
121GZ_METHOD_DEFLATE = 0x08 # 0o10
122GZ_FLAG_ORIG_NAME = 0x08 # 0o10, default in gzip
123GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
124GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
d601d33b
PG
125GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
126GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
127 GZ_METHOD_DEFLATE)
15a81fc0 128
7584f5c9 129#---------------------------------------------------------
d1c38f40
PG
130# archive handling mode
131#---------------------------------------------------------
132
ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2

def arcmode_fmt (m):
    """Render an archive-mode bitmask as a human readable string."""
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    # collect the label of every bit that is set, in canonical order
    flags = ((ARCMODE_ENCRYPT, "ENCRYPT"),
             (ARCMODE_COMPRESS, "COMPRESS"),
             (ARCMODE_CONCAT, "CONCAT"))
    active = [label for bit, label in flags if m & bit]
    return "[ " + " | ".join (active) + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Build an archive-mode bitmask from the given stream options."""
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
166
167#---------------------------------------------------------
7584f5c9
ERE
168# tarfile constants
169#---------------------------------------------------------
170# File types that tarfile supports:
171SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
172 SYMTYPE, DIRTYPE, FIFOTYPE,
173 CONTTYPE, CHRTYPE, BLKTYPE,
174 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 175 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
176
177# File types that will be treated as a regular file.
178REGULAR_TYPES = (REGTYPE, AREGTYPE,
179 CONTTYPE, GNUTYPE_SPARSE)
180
181# File types that are part of the GNU tar format.
182GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 183 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
184
185# Fields from a pax header that override a TarInfo attribute.
186PAX_FIELDS = ("path", "linkpath", "size", "mtime",
187 "uid", "gid", "uname", "gname")
188
be60ffd0
ERE
189# Fields from a pax header that are affected by hdrcharset.
190PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
191
7584f5c9
ERE
192# Fields in a pax header that are numbers, all other fields
193# are treated as strings.
194PAX_NUMBER_FIELDS = {
195 "atime": float,
196 "ctime": float,
197 "mtime": float,
198 "uid": int,
199 "gid": int,
200 "size": int
201}
202
203#---------------------------------------------------------
7584f5c9
ERE
204# initialization
205#---------------------------------------------------------
be60ffd0
ERE
206
207if os.name in ("nt", "ce"):
208 ENCODING = "utf-8"
209else:
210 ENCODING = sys.getfilesystemencoding()
7584f5c9
ERE
211
212#---------------------------------------------------------
213# Some useful functions
214#---------------------------------------------------------
215
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Encode *s* and fit it into a NUL-padded field of *length* bytes.

    Over-long values are truncated; short values are padded with NULs.
    """
    encoded = s.encode(encoding, errors)
    padding = NUL * (length - len(encoded))
    return encoded[:length] + padding
221
be60ffd0
ERE
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte on is discarded before decoding.
    """
    end = s.find(b"\0")
    trimmed = s if end < 0 else s[:end]
    return trimmed.decode(encoding, errors)
229
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
    of specific size.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length] + NUL * (length - len(data))
7584f5c9
ERE
237
def nti(s):
    """Convert a tar number field to a python number.

    Two encodings exist (see itn() below): GNU base-256 with a 0o200
    (positive) or 0o377 (negative) marker byte, and the POSIX
    NUL-terminated octal string.
    """
    marker = s[0]
    if marker in (0o200, 0o377):
        # base-256: remaining bytes are a big-endian magnitude
        n = 0
        for byte in s[1:]:
            n = (n << 8) | byte
        if marker == 0o377:
            # negative: two's-complement over digits-1 bytes
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
256
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

    Returns *digits* bytes: octal text for small non-negative values,
    otherwise (GNU format only) a base-256 encoding.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        # fits in plain octal, zero-padded, NUL-terminated
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # represent negative value as two's complement
            n = 256 ** digits + n

        # emit the magnitude big-endian by repeatedly inserting the
        # low byte right after the marker
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
7584f5c9
ERE
284
def calc_chksums(buf):
    """Calculate the checksum for a member's header.

    All header bytes are summed, except the 8-byte chksum field at
    offset 148, which is treated as if filled with spaces (hence the
    constant 256 == 8 * 0x20).  According to the GNU tar sources, some
    tars (Sun and NeXT) sum with *signed* chars, which differs when
    bytes have the high bit set — so both variants are returned.

    Returns (unsigned_chksum, signed_chksum).
    """
    layout = "148{0}8x356{0}"
    unsigned = 256 + sum(struct.unpack_from(layout.format("B"), buf))
    signed = 256 + sum(struct.unpack_from(layout.format("b"), buf))
    return unsigned, signed
297
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
    If length is None, copy the entire content.

    Raises OSError on a short read, but only *after* writing whatever
    was read — partial data must not be discarded (tolerant mode
    depends on it being in the destination).
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    CHUNK = 16 * 1024
    full, tail = divmod(length, CHUNK)
    for _ in range(full):
        data = src.read(CHUNK)
        dst.write(data)
        if len(data) < CHUNK:
            raise OSError("end of file reached")
    if tail != 0:
        data = src.read(tail)
        dst.write(data)
        if len(data) < tail:
            raise OSError("end of file reached")
c7c736b6 320
7584f5c9 321
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    # kept only for backward compatibility — delegate to the stdlib
    import warnings
    warnings.warn(
        "deprecated in favor of stat.filemode", DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
328
# Exception hierarchy: everything raised by this module derives from
# TarError, with HeaderError grouping the per-header parse failures and
# the crypto-related errors added for deltatar's encryption support.
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for error during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for error during encryption."""
    pass
class EndOfFile(Exception):
    """Signal end of file condition when they’re not an error."""
7584f5c9
ERE
373
374#---------------------------
375# internal stream interface
376#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
    It is used instead of a regular file object for streaming
    access.
    """

    def __init__(self, name, mode):
        # map the high-level "r"/"w" mode onto os.open() flags
        _mode = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY  # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        # logical position, tracked manually so tell() needs no syscall
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        # If “pos” is given, write at that absolute offset but restore the
        # previous position afterwards so sequential writing can resume.
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            # a positional write may still have extended the file past the
            # saved position; account for the overhang
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)

    def tell(self):
        return self.offset

    def seek_set (self, pos):
        # absolute seek; keep the manual offset in sync
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos
420
8ab8fac5 421
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header, optionally embedding *name* as the
    FNAME field (with deltatar’s own suffixes stripped)."""
    stamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_ORIG_NAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        # drop the encryption / compression suffixes from the stored name
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    fixed = struct.pack (GZ_FMT_HEADER,
                         GZ_MAGIC [0], GZ_MAGIC [1],
                         GZ_METHOD_DEFLATE, flags,
                         stamp,
                         GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return fixed + name
446
d601d33b 447
7584f5c9
ERE
class _Stream:
    """Class that serves as an adapter between TarFile and
    a stream-like object. The stream-like object only
    needs to have a read() or write() method and is accessed
    blockwise. Use of gzip or bzip2 compression is possible.
    A stream-like object could be for example: sys.stdin,
    sys.stdout, a socket, a tape device etc.

    _Stream is intended to be used only internally but is
    nevertheless used externally by Deltatar.

    When encrypting, the ``enccounter`` will be used for
    initializing the first cryptographic context. When
    decrypting, its value will be compared to the decrypted
    object. Decryption fails if the value does not match.
    In effect, this means that a ``_Stream`` whose ctor was
    passed ``enccounter`` can only be used to encrypt or
    decrypt a single object.
    """

    remainder = -1   # track size in encrypted entries
    tolerant = False # when True, demote some hard errors to DecryptionError
c7c736b6 470
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerant=False):
        """Construct a _Stream object.
        """
        # derive the bitmask describing concat/encrypt/compress handling
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerant = tolerant

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""              # outgoing (write-side) buffer
        self.pos = 0                # logical stream position
        self.concat_pos = 0         # bytes written into current gz member
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0  # file offset of the current object
        self.dbuf = b""             # incoming (read-side) decompressed buffer
        self.exception = None       # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None         # offset of the pending crypto header

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode the first member is opened lazily
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed: only the encryption layer may need set-up
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # do not leak the fd we opened ourselves
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 580
7584f5c9
ERE
    def __del__(self):
        # best-effort close on garbage collection
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass
7584f5c9 589
c7c736b6 590
d1c38f40
PG
    def next (self, name):
        """Start a new object in the stream, returning its start offset.

        Finalizes the compression and/or encryption context of the
        previous object, flushes the write buffer, then re-initializes
        the contexts for *name*.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # plain (neither encrypted nor compressed) objects: the offset is
        # simply the current file position
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
605
606
    def next_volume (self, name):
        """Re-initialize compression/encryption when continuing on a new
        volume in concat mode."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
618
c7c736b6 619
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
636
637
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # obtain the real header plus any trailing ciphertext the
            # crypto layer still holds
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
658
659
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and append the gzip trailer (CRC32 and
        uncompressed size of the current member)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
            self.buf = b""
57db1546
PG
675
676
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        # first member of the stream? (compressor not yet created)
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        # only the first member carries the archive name in its FNAME field
        self.__write(gz_header (self.name if first is True else None))
5fdff89f 695
ac5e4184 696
7584f5c9
ERE
    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            # keep the running CRC of the *uncompressed* data
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)
707
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
713
7584f5c9 714 def __write(self, s):
548bb8d5
CH
715 """Writes (and encodes) string s to the stream blockwise
716
717 will wait with encoding/writing until block is complete
7584f5c9
ERE
718 """
719 self.buf += s
720 while len(self.buf) > self.bufsize:
6e812ad9 721 self.__enc_write(self.buf[:self.bufsize])
7584f5c9
ERE
722 self.buf = self.buf[self.bufsize:]
723
867f75f7 724
    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)
867f75f7 734
6e812ad9
DGM
735
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            # the crypto layer may consume less than the whole input per call
            while len (buf) > 0:
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
757
6e812ad9 758
784175ba
CH
759 def estim_file_size(self):
760 """ estimates size of file if closing it now
761
762 The result may differ greatly from the amount of data sent to write()
763 due to compression, encryption and buffering.
764
765 In tests the result (before calling close()) was up to 12k smaller than
766 the final file size if compression is being used because zlib/bz2
767 compressors do not allow inspection of their buffered data :-(
768
ba5a449e
CH
769 Still, we add what close() would add: 8 bytes for gz checksum, one
770 encryption block size if encryption is used and the size of our own
771 buffer
784175ba
CH
772 """
773 if self.closed:
774 return self.bytes_written
775
776 result = self.bytes_written
777 if self.buf:
778 result += len(self.buf)
779 if self.comptype == 'gz':
ba5a449e 780 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
781 return result
782
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
814
54128a00 815
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses the member header (magic, method, flags) and skips the
        optional fields selected by the flag bits.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 != b"\010":
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        # skip mtime (4 bytes), XFL, OS
        self.__read(6)

        if flag & 4:
            # FEXTRA: skip extra field, length is 2 bytes little-endian
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            # FNAME: NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: 16-bit header CRC, not verified here
            self.__read(2)
851
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False at end of file, True if a header was consumed.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr   = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
884
885
8de91f4f
PG
886 def _read_encrypt (self, buf):
887 """
888 Demote a program error to a decryption error in tolerant mode. This
889 allows recovery from corrupted headers and invalid data.
890 """
891 try:
892 return self.encryption.process (buf)
893 except RuntimeError as exn:
894 if self.tolerant is True:
895 raise DecryptionError (exn)
896 raise
897
898
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Verifies the GCM tag via the crypto layer; raises DecryptionError
        on mismatch. Returns any remaining plaintext.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
            and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
913
914
7584f5c9
ERE
915 def tell(self):
916 """Return the stream's file pointer position.
917 """
918 return self.pos
919
920 def seek(self, pos=0):
921 """Set the stream's file pointer to pos. Negative seeking
922 is forbidden.
923 """
924 if pos - self.pos >= 0:
925 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 926 for i in range(blocks):
7584f5c9
ERE
927 self.read(self.bufsize)
928 self.read(remainder)
929 else:
930 raise StreamError("seeking backwards is not allowed")
931 return self.pos
932
933 def read(self, size=None):
934 """Return the next size number of bytes from the stream.
935 If size is not defined, return all bytes of the stream
936 up to EOF.
937 """
938 if size is None:
939 t = []
940 while True:
941 buf = self._read(self.bufsize)
942 if not buf:
943 break
944 t.append(buf)
9dc7ac5c 945 buf = b"".join(t)
7584f5c9
ERE
946 else:
947 buf = self._read(size)
948 self.pos += len(buf)
949 return buf
950
3a7e1a50
ERE
951 def readline(self):
952 """Reads just one line, new line character included
953 """
f0fd5e3a 954 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
955 if b'\n' in self.dbuf:
956 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
957 ret = self.dbuf[:pos]
958 self.dbuf = self.dbuf[pos:]
959 return ret
960
1215b602 961 buf = []
3a7e1a50
ERE
962 while True:
963 chunk = self._read(self.bufsize)
964
f0fd5e3a 965 # nothing more to read, so return the buffer
3a7e1a50 966 if not chunk:
be60ffd0 967 return b''.join(buf)
3a7e1a50
ERE
968
969 buf.append(chunk)
f0fd5e3a
ERE
970
971 # if \n found, return the new line
be60ffd0
ERE
972 if b'\n' in chunk:
973 dbuf = b''.join(buf)
974 pos = dbuf.index(b'\n') + 1
1215b602 975 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
976 return dbuf[:pos]
977
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Decompresses data obtained from ``__read`` when a decompressor
        (``self.cmp``) is installed, handles concatenated compressed
        objects (ARCMODE_CONCAT), and in tolerant mode returns the data
        decoded so far instead of discarding it when re-initialization
        of the next object fails with a DecryptionError.
        """
        # start from what is already buffered in decompressed form
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file:
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    # keep a running CRC of the decompressed payload
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    # bytes past the end of the current compressed object
                    # belong to the next one: push them back and restart
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerant is True:
                            # return whatever data was processed successfully
                            # instead of throwing the buffers away
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # anything beyond `size` stays buffered for the next call
        self.dbuf = t[size:]
        return t[:size]
1028
e4e5d0b8 1029
7584f5c9 1030 def __read(self, size):
ef3b4499
PG
1031 """
1032 Return size bytes from stream. If internal buffer is empty, read
1033 another block from the stream.
1034
1035 The function returns up to size bytes of data. When an error occurs
1036 during decryption, everything until the end of the last successfully
1037 finalized object is returned.
7584f5c9
ERE
1038 """
1039 c = len(self.buf)
8de91f4f 1040 t = [self.buf] if c > 0 else []
1ed44e7b 1041 good_crypto = len (t)
8de91f4f 1042
7584f5c9 1043 while c < size:
c7c736b6 1044 todo = size
8de91f4f
PG
1045 try:
1046 if self.arcmode & ARCMODE_ENCRYPT:
1047 if self.remainder <= 0:
1048 # prepare next object
044585c6
PG
1049 if self._init_read_encrypt () is False: # EOF
1050 buf = None
1051 break # while
8de91f4f
PG
1052
1053 # only read up to the end of the encrypted object
1054 todo = min (size, self.remainder)
1055 buf = self.fileobj.read(todo)
1056 if self.arcmode & ARCMODE_ENCRYPT:
1057 # decrypt the thing
1058 buf = self._read_encrypt (buf)
1059 if todo == self.remainder:
1060 # at the end of a crypto object; finalization will fail if
1061 # the GCM tag does not match
ef3b4499 1062 trailing = self._finalize_read_encrypt ()
8de91f4f
PG
1063 good_crypto = len (t) + 1
1064 if len (trailing) > 0:
1065 buf += trailing
1066 self.remainder = 0
1067 else:
1068 self.remainder -= todo
1069 except DecryptionError:
1070 if self.tolerant is False:
1071 raise
1072 self.encryption.drop ()
1073 if good_crypto == 0:
1074 raise
1075 # this may occur at any of the three crypto operations above.
1076 # some objects did validate; discard all data after it; next
1077 # call will start with the bad object and error out immediately
1078 self.buf = b"".join (t [good_crypto:])
1079 return b"".join (t [:good_crypto])
c7c736b6
PG
1080
1081 if not buf: ## XXX stream terminated prematurely; this should be an error
7584f5c9 1082 break
c7c736b6 1083
7584f5c9
ERE
1084 t.append(buf)
1085 c += len(buf)
be60ffd0 1086 t = b"".join(t)
7584f5c9 1087 self.buf = t[size:]
fb27c6e8 1088
7584f5c9 1089 return t[:size]
7d372216 1090
7584f5c9
ERE
1091
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*') by peeking at
    the first block of the underlying file object.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # peek one block so the magic bytes can be inspected
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # the first call replays the peeked block; afterwards reads are
        # delegated directly to the underlying file object
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the peeked block's magic
        bytes: "gz", "bz2", "xz" or plain "tar".
        """
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1118
7584f5c9
ERE
1119#------------------------
1120# Extraction file object
1121#------------------------
1122class _FileInFile(object):
1123 """A thin wrapper around an existing file object that
1124 provides a part of its data as an individual file
1125 object.
1126 """
1127
be60ffd0 1128 def __init__(self, fileobj, offset, size, blockinfo=None):
7584f5c9
ERE
1129 self.fileobj = fileobj
1130 self.offset = offset
1131 self.size = size
7584f5c9 1132 self.position = 0
be60ffd0
ERE
1133 self.name = getattr(fileobj, "name", None)
1134 self.closed = False
1135
1136 if blockinfo is None:
1137 blockinfo = [(0, size)]
1138
1139 # Construct a map with data and zero blocks.
1140 self.map_index = 0
1141 self.map = []
1142 lastpos = 0
1143 realpos = self.offset
1144 for offset, size in blockinfo:
1145 if offset > lastpos:
1146 self.map.append((False, lastpos, offset, None))
1147 self.map.append((True, offset, offset + size, realpos))
1148 realpos += size
1149 lastpos = offset + size
1150 if lastpos < self.size:
1151 self.map.append((False, lastpos, self.size, None))
1152
1153 def flush(self):
1154 pass
1155
1156 def readable(self):
1157 return True
1158
1159 def writable(self):
1160 return False
1161
1162 def seekable(self):
1163 return self.fileobj.seekable()
7584f5c9
ERE
1164
1165 def tell(self):
1166 """Return the current file position.
1167 """
1168 return self.position
1169
be60ffd0 1170 def seek(self, position, whence=io.SEEK_SET):
7584f5c9
ERE
1171 """Seek to a position in the file.
1172 """
be60ffd0
ERE
1173 if whence == io.SEEK_SET:
1174 self.position = min(max(position, 0), self.size)
1175 elif whence == io.SEEK_CUR:
1176 if position < 0:
1177 self.position = max(self.position + position, 0)
1178 else:
1179 self.position = min(self.position + position, self.size)
1180 elif whence == io.SEEK_END:
1181 self.position = max(min(self.size + position, self.size), 0)
1182 else:
1183 raise ValueError("Invalid argument")
1184 return self.position
7584f5c9
ERE
1185
1186 def read(self, size=None):
1187 """Read data from the file.
1188 """
1189 if size is None:
1190 size = self.size - self.position
1191 else:
1192 size = min(size, self.size - self.position)
1193
be60ffd0 1194 buf = b""
7584f5c9 1195 while size > 0:
7584f5c9 1196 while True:
be60ffd0
ERE
1197 data, start, stop, offset = self.map[self.map_index]
1198 if start <= self.position < stop:
7584f5c9 1199 break
be60ffd0
ERE
1200 else:
1201 self.map_index += 1
1202 if self.map_index == len(self.map):
1203 self.map_index = 0
1204 length = min(size, stop - self.position)
1205 if data:
1206 self.fileobj.seek(offset + (self.position - start))
1207 buf += self.fileobj.read(length)
7584f5c9 1208 else:
be60ffd0
ERE
1209 buf += NUL * length
1210 size -= length
1211 self.position += length
1212 return buf
7584f5c9 1213
be60ffd0
ERE
1214 def readinto(self, b):
1215 buf = self.read(len(b))
1216 b[:len(buf)] = buf
1217 return len(buf)
7584f5c9
ERE
1218
1219 def close(self):
7584f5c9 1220 self.closed = True
be60ffd0 1221#class _FileInFile
7584f5c9 1222
be60ffd0
ERE
1223
class ExFileObject(io.BufferedReader):
    """Buffered file-like object exposing one archive member's data."""

    def __init__(self, tarfile, tarinfo):
        # wrap the member's region of the archive as a raw file object
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
#class ExFileObject
1231
1232#------------------
1233# Exported Classes
1234#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # __slots__ keeps per-instance memory low — an archive may yield a
    # very large number of TarInfo objects.  volume_offset and
    # _sparse_structs are deltatar/multivolume additions.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
7584f5c9
ERE
1249 def __init__(self, name=""):
1250 """Construct a TarInfo object. name is the optional name
1251 of the member.
1252 """
1253 self.name = name # member name
be60ffd0 1254 self.mode = 0o644 # file permissions
7584f5c9
ERE
1255 self.uid = 0 # user id
1256 self.gid = 0 # group id
1257 self.size = 0 # file size
1258 self.mtime = 0 # modification time
1259 self.chksum = 0 # header checksum
1260 self.type = REGTYPE # member type
1261 self.linkname = "" # link name
1262 self.uname = "" # user name
1263 self.gname = "" # group name
1264 self.devmajor = 0 # device major number
1265 self.devminor = 0 # device minor number
1266
1267 self.offset = 0 # the tar header starts here
1268 self.offset_data = 0 # the file's data starts here
0eb5048f
ERE
1269 self.volume_offset = 0 # the file's data corresponds with the data
1270 # starting at this position
7584f5c9 1271
be60ffd0 1272 self.sparse = None # sparse member information
7584f5c9
ERE
1273 self.pax_headers = {} # pax header information
1274
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    # alias: "path" reads/writes the member name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # alias: "linkpath" reads/writes the link target name
    linkpath = property(_getlinkpath, _setlinkpath)
1288
1289 def __repr__(self):
1290 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1291
be60ffd0 1292 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1293 """Return the TarInfo's attributes as a dictionary.
1294 """
1295 info = {
1296 "name": self.name,
be60ffd0 1297 "mode": self.mode & 0o7777,
7584f5c9
ERE
1298 "uid": self.uid,
1299 "gid": self.gid,
1300 "size": self.size,
1301 "mtime": self.mtime,
1302 "chksum": self.chksum,
1303 "type": self.type,
1304 "linkname": self.linkname,
1305 "uname": self.uname,
1306 "gname": self.gname,
1307 "devmajor": self.devmajor,
36a315a0 1308 "devminor": self.devminor,
0eb5048f
ERE
1309 "offset_data": self.offset_data,
1310 "volume_offset": self.volume_offset
7584f5c9
ERE
1311 }
1312
1313 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1314 info["name"] += "/"
1315
7584f5c9
ERE
1316 return info
1317
be60ffd0
ERE
1318 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1319 errors="surrogateescape"):
7584f5c9
ERE
1320 """Return a tar header as a string of 512 byte blocks.
1321 """
1322 info = self.get_info(encoding, errors)
1323
1324 if format == USTAR_FORMAT:
be60ffd0 1325 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1326 elif format == GNU_FORMAT:
be60ffd0 1327 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1328 elif format == PAX_FORMAT:
1329 return self.create_pax_header(info, encoding, errors)
1330 else:
1331 raise ValueError("invalid format")
1332
be60ffd0 1333 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1334 """Return the object as a ustar header block.
1335 """
1336 info["magic"] = POSIX_MAGIC
1337
1338 if len(info["linkname"]) > LENGTH_LINK:
1339 raise ValueError("linkname is too long")
1340
1341 if len(info["name"]) > LENGTH_NAME:
1342 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1343
be60ffd0 1344 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1345
be60ffd0 1346 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1347 """Return the object as a GNU header block sequence.
1348 """
1349 info["magic"] = GNU_MAGIC
1350
2f854e77
ERE
1351 if self.ismultivol():
1352 prefix = [
1353 itn(info.get("atime", 0), 12, GNU_FORMAT),
1354 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1355 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1356 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1357 ]
be60ffd0 1358 info['prefix'] = b"".join(prefix)
0eb5048f 1359 info['size'] = info['size'] - self.volume_offset
2f854e77 1360
be60ffd0 1361 buf = b""
7584f5c9 1362 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1363 buf += self._create_gnu_long_header(info["linkname"],
1364 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1365
1366 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1367 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1368 encoding, errors)
7584f5c9 1369
be60ffd0 1370 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1371
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # only the part stored in this volume counts for the header
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # non-ASCII value: must go into the pax header
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                # overlong value: must go into the pax header
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # octal fields hold at most (digits - 1) octal digits
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # the ustar fallback header uses plain ASCII with replacement
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1424
1425 @classmethod
1426 def create_pax_global_header(cls, pax_headers):
1427 """Return the object as a pax global header block sequence.
1428 """
be60ffd0 1429 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1430
1431 def _posix_split_name(self, name):
1432 """Split a name longer than 100 chars into a prefix
1433 and a name part.
1434 """
1435 prefix = name[:LENGTH_PREFIX + 1]
1436 while prefix and prefix[-1] != "/":
1437 prefix = prefix[:-1]
1438
1439 name = name[len(prefix):]
1440 prefix = prefix[:-1]
1441
1442 if not prefix or len(name) > LENGTH_NAME:
1443 raise ValueError("name is too long")
1444 return prefix, name
1445
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        # field layout of a 512-byte tar header, in order
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field, initially 8 blanks
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # patch the checksum into bytes 148..155 of the last block
        # (512 - 364 == 148, 512 - 357 == 155)
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1473
1474 @staticmethod
1475 def _create_payload(payload):
1476 """Return the string payload filled with zero bytes
1477 up to the next 512 byte border.
1478 """
1479 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1480 if remainder > 0:
1481 payload += (BLOCKSIZE - remainder) * NUL
1482 return payload
1483
1484 @classmethod
be60ffd0 1485 def _create_gnu_long_header(cls, name, type, encoding, errors):
7584f5c9
ERE
1486 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1487 for name.
1488 """
be60ffd0 1489 name = name.encode(encoding, errors) + NUL
7584f5c9
ERE
1490
1491 info = {}
1492 info["name"] = "././@LongLink"
1493 info["type"] = type
1494 info["size"] = len(name)
1495 info["magic"] = GNU_MAGIC
1496
1497 # create extended header + name blocks.
be60ffd0 1498 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
7584f5c9
ERE
1499 cls._create_payload(name)
1500
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "%d %s=%s\n" and the leading decimal length
            # counts itself, so iterate until the length reaches a fixpoint.
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1551
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises HeaderError subclasses for empty, truncated, all-NUL or
        checksum-invalid blocks.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # decode the fixed-offset ustar fields
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # deltatar extension: when the prefix field is unused, bytes
            # 369..381 carry the member's data offset (multivolume support)
            # — NOTE(review): diverges from upstream tarfile; verify against
            # the writer side.
            obj.offset_data = nti(buf[369:381])
        return obj
1616
1617 @classmethod
1618 def fromtarfile(cls, tarfile):
1619 """Return the next TarInfo object from TarFile object
1620 tarfile.
1621 """
1622 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1623 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1624 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1625 return obj._proc_member(tarfile)
1626
1627 #--------------------------------------------------------------------------
1628 # The following are methods that are called depending on the type of a
1629 # member. The entry point is _proc_member() which can be overridden in a
1630 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1631 # implement the following
1632 # operations:
1633 # 1. Set self.offset_data to the position where the data blocks begin,
1634 # if there is data that follows.
1635 # 2. Set tarfile.offset to the position where the next member's header will
1636 # begin.
1637 # 3. Return self or another valid TarInfo object.
1638 def _proc_member(self, tarfile):
1639 """Choose the right processing method depending on
1640 the type and call it.
1641 """
1642 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1643 return self._proc_gnulong(tarfile)
1644 elif self.type == GNUTYPE_SPARSE:
1645 return self._proc_sparse(tarfile)
1646 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1647 return self._proc_pax(tarfile)
1648 else:
1649 return self._proc_builtin(tarfile)
1650
1651 def _proc_builtin(self, tarfile):
1652 """Process a builtin type or an unknown type which
1653 will be treated as a regular file.
1654 """
1655 self.offset_data = tarfile.fileobj.tell()
1656 offset = self.offset_data
00c34a12 1657 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
7584f5c9
ERE
1658 # Skip the following data blocks.
1659 offset += self._block(self.size)
1660 tarfile.offset = offset
1661
1662 # Patch the TarInfo object with saved global
1663 # header information.
1664 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1665
1666 return self
1667
1668 def _proc_gnulong(self, tarfile):
1669 """Process the blocks that hold a GNU longname
1670 or longlink member.
1671 """
1672 buf = tarfile.fileobj.read(self._block(self.size))
1673
1674 # Fetch the next header and process it.
1675 try:
1676 next = self.fromtarfile(tarfile)
1677 except HeaderError:
1678 raise SubsequentHeaderError("missing or bad subsequent header")
1679
1680 # Patch the TarInfo object from the next header with
1681 # the longname information.
1682 next.offset = self.offset
1683 if self.type == GNUTYPE_LONGNAME:
be60ffd0 1684 next.name = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9 1685 elif self.type == GNUTYPE_LONGLINK:
be60ffd0 1686 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1687
1688 return next
1689
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # each extension block holds up to 21 (offset, numbytes) pairs
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # byte 504 flags whether yet another extension block follows
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # report the real (expanded) size of the sparse member
        self.size = origsize
        return self
1717
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length


        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        # deltatar extension: apply GNU.volume.* keywords describing a
        # member continued from a previous volume.
        if next is not None:
            if "GNU.volume.filename" in pax_headers:
                if pax_headers["GNU.volume.filename"] == next.name:
                    if "GNU.volume.size" in pax_headers:
                        next.size = int(pax_headers["GNU.volume.size"])
                    if "GNU.volume.offset" in pax_headers:
                        next.volume_offset = int(pax_headers["GNU.volume.offset"])

                # NOTE(review): for XGLTYPE headers, pax_headers IS
                # tarfile.pax_headers; deleting entries while iterating the
                # same dict looks fragile — verify this path is exercised.
                for key in pax_headers.keys():
                    if key.startswith("GNU.volume"):
                        del tarfile.pax_headers[key]

        return next
1832
be60ffd0
ERE
1833 def _proc_gnusparse_00(self, next, pax_headers, buf):
1834 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1835 """
be60ffd0
ERE
1836 offsets = []
1837 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1838 offsets.append(int(match.group(1)))
1839 numbytes = []
1840 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1841 numbytes.append(int(match.group(1)))
1842 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1843
be60ffd0
ERE
1844 def _proc_gnusparse_01(self, next, pax_headers):
1845 """Process a GNU tar extended sparse header, version 0.1.
1846 """
1847 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1848 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1849
be60ffd0
ERE
1850 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1851 """Process a GNU tar extended sparse header, version 1.0.
1852 """
1853 fields = None
1854 sparse = []
1855 buf = tarfile.fileobj.read(BLOCKSIZE)
1856 fields, buf = buf.split(b"\n", 1)
1857 fields = int(fields)
1858 while len(sparse) < fields * 2:
1859 if b"\n" not in buf:
1860 buf += tarfile.fileobj.read(BLOCKSIZE)
1861 number, buf = buf.split(b"\n", 1)
1862 sparse.append(int(number))
1863 next.offset_data = tarfile.fileobj.tell()
1864 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1865
be60ffd0
ERE
1866 def _apply_pax_info(self, pax_headers, encoding, errors):
1867 """Replace fields with supplemental information from a previous
1868 pax extended or global header.
1869 """
1870 for keyword, value in pax_headers.items():
1871 if keyword == "GNU.sparse.name":
1872 setattr(self, "path", value)
1873 elif keyword == "GNU.sparse.size":
1874 setattr(self, "size", int(value))
1875 elif keyword == "GNU.sparse.realsize":
1876 setattr(self, "size", int(value))
1877 elif keyword in PAX_FIELDS:
1878 if keyword in PAX_NUMBER_FIELDS:
1879 try:
1880 value = PAX_NUMBER_FIELDS[keyword](value)
1881 except ValueError:
1882 value = 0
1883 if keyword == "path":
f0287fb7 1884 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1885 setattr(self, keyword, value)
7584f5c9
ERE
1886
1887 self.pax_headers = pax_headers.copy()
1888
be60ffd0
ERE
1889 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1890 """Decode a single field from a pax record.
1891 """
1892 try:
1893 return value.decode(encoding, "strict")
1894 except UnicodeDecodeError:
1895 return value.decode(fallback_encoding, fallback_errors)
1896
7584f5c9
ERE
1897 def _block(self, count):
1898 """Round up a byte count by BLOCKSIZE and return it,
1899 e.g. _block(834) => 1024.
1900 """
1901 blocks, remainder = divmod(count, BLOCKSIZE)
1902 if remainder:
1903 blocks += 1
1904 return blocks * BLOCKSIZE
1905
    def isreg(self):
        """Return True if the member is a regular file."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Return True if the member is a regular file (alias of isreg)."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if a sparse map was read for this member."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a character/block device or FIFO."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        """Return True if the member is continued across volumes: either a
        GNU multivolume entry, a non-zero continuation offset, or a volume
        offset recorded in its pax headers."""
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
1929# class TarInfo
1930
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

    Class attributes below act as defaults; most can be overridden per
    instance through the corresponding __init__ keyword arguments.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE: these caches are class-level and therefore shared by all
    # TarFile instances in the process.
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        Deltatar extensions: `max_volume_size' (with a callable
        `new_volume_handler') enables multivolume archives; `concat'
        selects concat processing mode via arcmode_set(); `nacl' is
        stored for encryption use (presumably a crypto context — see
        the crypto layer); `save_to_members' can be disabled to avoid
        keeping every TarInfo in memory.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        # base_name is kept separately: multivolume handlers derive the
        # per-volume file names from it.
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        # A volume must at least hold one data block plus the two
        # zero blocks written by close().
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any failure make sure a file we opened ourselves is
            # closed again before propagating the error.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
7584f5c9
ERE
2091 #--------------------------------------------------------------------------
2092 # Below are the classmethods which act as alternate constructors to the
2093 # TarFile class. The open() method is the only one that is needed for
2094 # public use; it is the "super"-constructor and is able to select an
2095 # adequate "sub"-constructor for a particular compression using the mapping
2096 # from OPEN_METH.
2097 #
2098 # This concept allows one to subclass TarFile without losing the comfort of
2099 # the super-constructor. A sub-constructor is registered and made available
2100 # by adding it to the mapping in OPEN_METH.
2101
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerant=False, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    # Rewind so the next candidate opener starts fresh.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            if 'max_volume_size' in kwargs:
                # Compressed "w:" archives cannot be re-opened per volume,
                # so only the first volume keeps its compression.
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            # The _Stream we created belongs to us and must be closed
            # together with the TarFile.
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: per-object compression/encryption via _Stream.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerant=tolerant)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
2216 @classmethod
2217 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2218 """Open uncompressed tar archive name for reading or writing.
2219 """
2220 if len(mode) > 1 or mode not in "raw":
2221 raise ValueError("mode must be 'r', 'a' or 'w'")
2222 return cls(name, mode, fileobj, **kwargs)
2223
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller supplied the file object: we only
        # close it on error if we created it ourselves.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                raise
            raise ReadError("not a gzip file")
        except:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        # NOTE(review): unlike gzopen, an externally supplied fileobj is
        # wrapped and closed on error here, and _extfileobj is always set
        # False below — confirm this asymmetry is intended.
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
        return t
2279 @classmethod
2280 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2281 """Open lzma compressed tar archive name for reading or writing.
2282 Appending is not allowed.
2283 """
2284 if mode not in ("r", "w"):
2285 raise ValueError("mode must be 'r' or 'w'")
2286
2287 try:
2288 import lzma
2289 except ImportError:
2290 raise CompressionError("lzma module is not available")
2291
2292 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2293
2294 try:
2295 t = cls.taropen(name, mode, fileobj, **kwargs)
2296 except (lzma.LZMAError, EOFError):
2297 fileobj.close()
2298 raise ReadError("not an lzma file")
2299 t._extfileobj = False
2300 return t
2301
7584f5c9
ERE
    # All *open() methods are registered here.  Maps the compression
    # suffix of a mode string (e.g. "r:gz") to the classmethod name
    # that implements it; open() iterates this mapping for "r:*".
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz": "gzopen",     # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz": "xzopen"      # lzma compressed tar
    }
2310 #--------------------------------------------------------------------------
2311 # The public methods which TarFile provides:
2312
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # Empty concat archive: start a (nameless) object first so the
            # trailing zero blocks go through the _Stream machinery.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2336 def getmember(self, name):
2337 """Return a TarInfo object for member `name'. If `name' can not be
2338 found in the archive, KeyError is raised. If a member occurs more
2339 than once in the archive, its last occurrence is assumed to be the
2340 most up-to-date version.
2341 """
2342 tarinfo = self._getmember(name)
2343 if tarinfo is None:
2344 raise KeyError("filename %r not found" % name)
2345 return tarinfo
2346
    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
        list has the same order as the members in the archive.
        """
        self._check()
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
        return self.members
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.
        """
        # last_block_offset is recorded by addfile() when a member header
        # is written, so it already reflects any _Stream bookkeeping.
        return self.last_block_offset
2364 def getnames(self):
2365 """Return the members of the archive as a list of their names. It has
2366 the same order as the list returned by getmembers().
2367 """
2368 return [tarinfo.name for tarinfo in self.getmembers()]
2369
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types that cannot be stored (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket).
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            # Resolve uid -> user name through the class-level cache.
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    # (note: tarinfo.uname itself is left untouched here)
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            # Resolve gid -> group name through the class-level cache.
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                # permissions, owner/group (names fall back to numeric ids)
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    # devices show major,minor instead of a size
                    print("%10s" % ("%d,%d" \
                        % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded. `filter' is a function
        that expects a TarInfo object argument and returns the changed
        TarInfo object, if it returns None the TarInfo object will be
        excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                    DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Recurse into the directory, carrying exclude/filter along.
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                            recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)
defc9a22 2570 def _size_left_file(self):
be60ffd0 2571 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2572
be60ffd0 2573 Assumes self.max_volume_size is set.
ba5a449e 2574 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2575 """
ba5a449e 2576 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2577 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2578 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2579 # write only half a block when writting the end of a volume
ae48acc8 2580 # and filling with zeros
defc9a22
CH
2581 return BLOCKSIZE * (size_left // BLOCKSIZE)
2582
2583 def _size_left_stream(self):
ba5a449e
CH
2584 """ Calculates size left in a volume if using comression/encryption
2585
2586 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2587 (otherwise use _size_left_file)
2588 """
2589 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2590 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2591 - 2*BLOCKSIZE
2592 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2593
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        When max_volume_size is set, the member's data may be split across
        several volumes; the new_volume_handler is invoked at each split.
        """
        self._check("aw")

        # Work on a copy so the caller's object is not mutated
        # (type/volume_offset are changed below on volume splits).
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # Pick the applicable "space left in volume" estimator; without a
        # volume limit, pretend there is always room for the whole member.
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the “new_volume_handler” is supposed to call .close() on the
                # “fileobj” _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Either ``name`` (a path to open/create) or ``fileobj`` (an
        already-open file object) selects the new volume.  When the current
        ``self.fileobj`` is a _Stream, its mode/compression/encryption
        settings are carried over to the new stream; an explicit
        ``encryption`` argument overrides the inherited one.  In PAX write
        mode a global header describing the continued member
        (``self.volume_tarinfo``) is written into the new volume.
        '''
        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # Reuse the previous stream's parameters so every volume is
                # written/read the same way.
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Pre-read the continuation header so makefile() can resume
                # copying from self.firstmember.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # Record which member continues into this volume and at
                    # which offset inside that member it resumes.
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # Leave the object in a consistently closed state on failure.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2793
e5f5681b 2794 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2795 """Extract all members from the archive to the current working
2796 directory and set owner, modification time and permissions on
2797 directories afterwards. `path' specifies a different directory
2798 to extract to. `members' is optional and must be a subset of the
2799 list returned by getmembers().
2800 """
2801 directories = []
2802
2803 if members is None:
2804 members = self
2805
2806 for tarinfo in members:
c474439c
ERE
2807 if self.volume_number > 0 and tarinfo.ismultivol():
2808 continue
2809
974408b5 2810 if filter and not filter(tarinfo):
e5f5681b
ERE
2811 continue
2812
7584f5c9
ERE
2813 if tarinfo.isdir():
2814 # Extract directories with a safe mode.
2815 directories.append(tarinfo)
2816 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2817 tarinfo.mode = 0o0700
2818 # Do not set_attrs directories, as we will do that further down
2819 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2820
2821 # Reverse sort directories.
be60ffd0 2822 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2823 directories.reverse()
2824
2825 # Set correct owner, mtime and filemode on directories.
2826 for tarinfo in directories:
2827 dirpath = os.path.join(path, tarinfo.name)
2828 try:
2829 self.chown(tarinfo, dirpath)
2830 self.utime(tarinfo, dirpath)
2831 self.chmod(tarinfo, dirpath)
be60ffd0 2832 except ExtractError as e:
7584f5c9
ERE
2833 if self.errorlevel > 1:
2834 raise
2835 else:
2836 self._dbg(1, "tarfile: %s" % e)
2837
786addd6 2838 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2839 """Extract a member from the archive to the current working directory,
2840 using its full name. Its file information is extracted as accurately
2841 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2842 specify a different directory using `path'. File attributes (owner,
2843 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2844 ``symlink_cb`` is a hook accepting a function that is passed the
2845 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2846 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2847 passed will be applied, skipping the actual extraction. In case the
2848 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2849 """
2850 self._check("r")
2851
be60ffd0 2852 if isinstance(member, str):
7584f5c9
ERE
2853 tarinfo = self.getmember(member)
2854 else:
2855 tarinfo = member
2856
2857 # Prepare the link target for makelink().
2858 if tarinfo.islnk():
2859 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2860
9b13f5c4 2861 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2862 return symlink_cb(member, path, set_attrs)
786addd6 2863
7584f5c9 2864 try:
be60ffd0
ERE
2865 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2866 set_attrs=set_attrs)
2867 except EnvironmentError as e:
7584f5c9
ERE
2868 if self.errorlevel > 0:
2869 raise
2870 else:
2871 if e.filename is None:
2872 self._dbg(1, "tarfile: %s" % e.strerror)
2873 else:
2874 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2875 except ExtractError as e:
7584f5c9
ERE
2876 if self.errorlevel > 1:
2877 raise
2878 else:
2879 self._dbg(1, "tarfile: %s" % e)
2880
2881 def extractfile(self, member):
2882 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2883 a filename or a TarInfo object. If `member' is a regular file or a
2884 link, an io.BufferedReader object is returned. Otherwise, None is
2885 returned.
7584f5c9
ERE
2886 """
2887 self._check("r")
2888
be60ffd0 2889 if isinstance(member, str):
7584f5c9
ERE
2890 tarinfo = self.getmember(member)
2891 else:
2892 tarinfo = member
2893
be60ffd0
ERE
2894 if tarinfo.isreg() or tarinfo.ismultivol() or\
2895 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2896 # If a member's type is unknown, it is treated as a
2897 # regular file.
2898 return self.fileobject(self, tarinfo)
2899
2900 elif tarinfo.islnk() or tarinfo.issym():
2901 if isinstance(self.fileobj, _Stream):
2902 # A small but ugly workaround for the case that someone tries
2903 # to extract a (sym)link as a file-object from a non-seekable
2904 # stream of tar blocks.
2905 raise StreamError("cannot extract (sym)link as file object")
2906 else:
2907 # A (sym)link's file object is its target's file object.
2908 return self.extractfile(self._find_link_target(tarinfo))
2909 else:
2910 # If there's no data associated with the member (directory, chrdev,
2911 # blkdev, etc.), return None instead of a file object.
2912 return None
2913
be60ffd0 2914 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2915 """Extract the TarInfo object tarinfo to a physical
2916 file called targetpath.
2917 """
2918 # Fetch the TarInfo object for the given name
2919 # and build the destination pathname, replacing
2920 # forward slashes to platform specific separators.
2921 targetpath = targetpath.rstrip("/")
2922 targetpath = targetpath.replace("/", os.sep)
2923
2924 # Create all upper directories.
2925 upperdirs = os.path.dirname(targetpath)
2926 if upperdirs and not os.path.exists(upperdirs):
2927 # Create directories that are not part of the archive with
2928 # default permissions.
2929 os.makedirs(upperdirs)
2930
2931 if tarinfo.islnk() or tarinfo.issym():
2932 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2933 else:
2934 self._dbg(1, tarinfo.name)
2935
2936 if tarinfo.isreg():
2937 self.makefile(tarinfo, targetpath)
2938 elif tarinfo.isdir():
2939 self.makedir(tarinfo, targetpath)
2940 elif tarinfo.isfifo():
2941 self.makefifo(tarinfo, targetpath)
2942 elif tarinfo.ischr() or tarinfo.isblk():
2943 self.makedev(tarinfo, targetpath)
2944 elif tarinfo.islnk() or tarinfo.issym():
2945 self.makelink(tarinfo, targetpath)
2946 elif tarinfo.type not in SUPPORTED_TYPES:
2947 self.makeunknown(tarinfo, targetpath)
2948 else:
2949 self.makefile(tarinfo, targetpath)
2950
be60ffd0
ERE
2951 if set_attrs:
2952 self.chown(tarinfo, targetpath)
2953 if not tarinfo.issym():
2954 self.chmod(tarinfo, targetpath)
2955 self.utime(tarinfo, targetpath)
7584f5c9
ERE
2956
2957 #--------------------------------------------------------------------------
2958 # Below are the different file methods. They are called via
2959 # _extract_member() when extract() is called. They can be replaced in a
2960 # subclass to implement other functionality.
2961
2962 def makedir(self, tarinfo, targetpath):
2963 """Make a directory called targetpath.
2964 """
2965 try:
2966 # Use a safe mode for the directory, the real mode is set
2967 # later in _extract_member().
be60ffd0
ERE
2968 os.mkdir(targetpath, 0o0700)
2969 except FileExistsError:
2970 pass
7584f5c9
ERE
2971
2972 def makefile(self, tarinfo, targetpath):
2973 """Make a file called targetpath.
2974 """
be60ffd0
ERE
2975 source = self.fileobj
2976 source.seek(tarinfo.offset_data)
c7c736b6 2977 decrypt = False
c474439c
ERE
2978 iterate = True
2979 target = bltn_open(targetpath, "wb")
2980
be60ffd0
ERE
2981 if tarinfo.sparse is not None:
2982 try:
2983 for offset, size in tarinfo.sparse:
2984 target.seek(offset)
2985 copyfileobj(source, target, size)
2986 target.seek(tarinfo.size)
2987 target.truncate()
2988 finally:
2989 target.close()
2990 return
2991
c474439c
ERE
2992 while iterate:
2993 iterate = False
2994 try:
2995 copyfileobj(source, target, tarinfo.size)
aa828cd1 2996 except OSError:
c474439c
ERE
2997 source.close()
2998 # only if we are extracting a multivolume this can be treated
2999 if not self.new_volume_handler:
3000 target.close()
3001 raise Exception("We need to read a new volume and you"
3002 " didn't supply a new_volume_handler")
3003
3004 # the new volume handler should do everything needed to
3005 # start working in a new volume. usually, the handler calls
3006 # to self.open_volume
3007 self.volume_number += 1
3008 self.new_volume_handler(self, self.base_name, self.volume_number)
be60ffd0
ERE
3009 tarinfo = self.firstmember
3010 source = self.fileobj
c474439c 3011 iterate = True
c474439c
ERE
3012 target.close()
3013
7584f5c9
ERE
3014
3015 def makeunknown(self, tarinfo, targetpath):
3016 """Make a file from a TarInfo object with an unknown type
3017 at targetpath.
3018 """
3019 self.makefile(tarinfo, targetpath)
3020 self._dbg(1, "tarfile: Unknown file type %r, " \
3021 "extracted as regular file." % tarinfo.type)
3022
3023 def makefifo(self, tarinfo, targetpath):
3024 """Make a fifo called targetpath.
3025 """
3026 if hasattr(os, "mkfifo"):
3027 os.mkfifo(targetpath)
3028 else:
3029 raise ExtractError("fifo not supported by system")
3030
3031 def makedev(self, tarinfo, targetpath):
3032 """Make a character or block device called targetpath.
3033 """
3034 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3035 raise ExtractError("special devices not supported by system")
3036
3037 mode = tarinfo.mode
3038 if tarinfo.isblk():
3039 mode |= stat.S_IFBLK
3040 else:
3041 mode |= stat.S_IFCHR
3042
3043 os.mknod(targetpath, mode,
3044 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3045
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
        (platform limitation), we try to make a copy of the referenced file
        instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract(): _link_target was precomputed there.
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # Hard-link target absent on disk: extract the archived
                    # target member in its place instead.
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # Platform cannot create links at all: fall back to copying the
            # referenced member's content.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
3068
3069 def chown(self, tarinfo, targetpath):
3070 """Set owner of targetpath according to tarinfo.
3071 """
3072 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3073 # We have to be root to do so.
3074 try:
3075 g = grp.getgrnam(tarinfo.gname)[2]
3076 except KeyError:
3077 g = tarinfo.gid
3078 try:
3079 u = pwd.getpwnam(tarinfo.uname)[2]
3080 except KeyError:
3081 u = tarinfo.uid
3082 try:
3083 if tarinfo.issym() and hasattr(os, "lchown"):
3084 os.lchown(targetpath, u, g)
3085 else:
be60ffd0
ERE
3086 os.chown(targetpath, u, g)
3087 except OSError as e:
7584f5c9
ERE
3088 raise ExtractError("could not change owner")
3089
3090 def chmod(self, tarinfo, targetpath):
3091 """Set file permissions of targetpath according to tarinfo.
3092 """
3093 if hasattr(os, 'chmod'):
3094 try:
3095 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3096 except OSError as e:
7584f5c9
ERE
3097 raise ExtractError("could not change mode")
3098
3099 def utime(self, tarinfo, targetpath):
3100 """Set modification time of targetpath according to tarinfo.
3101 """
3102 if not hasattr(os, 'utime'):
3103 return
3104 try:
3105 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3106 except OSError as e:
7584f5c9
ERE
3107 raise ExtractError("could not change modification time")
3108
3109 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        With ``ignore_zeros`` set, zero blocks and invalid headers are
        skipped one BLOCKSIZE at a time; otherwise a bad header at offset 0
        raises ReadError.
        """
        self._check("ra")
        # A member read ahead (e.g. by open_volume) takes precedence.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: end marker, unless we are told to skip it.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # A broken very first header means this is not a tarfile.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            # Remember the member unless the caller opted out.
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # No further members: the archive is fully read.
            self._loaded = True

        return tarinfo
3156
3157 #--------------------------------------------------------------------------
3158 # Little helper methods:
3159
3160 def _getmember(self, name, tarinfo=None, normalize=False):
3161 """Find an archive member by name from bottom to top.
3162 If tarinfo is given, it is used as the starting point.
3163 """
3164 # Ensure that all members have been loaded.
3165 members = self.getmembers()
3166
3167 # Limit the member search list up to tarinfo.
3168 if tarinfo is not None:
3169 members = members[:members.index(tarinfo)]
3170
3171 if normalize:
3172 name = os.path.normpath(name)
3173
3174 for member in reversed(members):
3175 if normalize:
3176 member_name = os.path.normpath(member.name)
3177 else:
3178 member_name = member.name
3179
3180 if name == member_name:
3181 return member
3182
3183 def _load(self):
3184 """Read through the entire archive file and look for readable
3185 members.
3186 """
3187 while True:
3188 tarinfo = self.next()
3189 if tarinfo is None:
3190 break
3191 self._loaded = True
3192
3193 def _check(self, mode=None):
3194 """Check if TarFile is still open, and if the operation's mode
3195 corresponds to TarFile's mode.
3196 """
3197 if self.closed:
be60ffd0 3198 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3199 if mode is not None and self.mode not in mode:
be60ffd0 3200 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3201
3202 def _find_link_target(self, tarinfo):
3203 """Find the target member of a symlink or hardlink member in the
3204 archive.
3205 """
3206 if tarinfo.issym():
3207 # Always search the entire archive.
3208 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3209 limit = None
3210 else:
3211 # Search the archive before the link, because a hard link is
3212 # just a reference to an already archived file.
3213 linkname = tarinfo.linkname
3214 limit = tarinfo
3215
3216 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3217 if member is None:
3218 raise KeyError("linkname %r not found" % linkname)
3219 return member
3220
3221 def __iter__(self):
3222 """Provide an iterator object.
3223 """
3224 if self._loaded:
3225 return iter(self.members)
3226 else:
3227 return TarIter(self)
3228
1027433a 3229 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3230 """Write debugging output to sys.stderr.
3231 """
3232 if level <= self.debug:
1027433a 3233 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3234
3235 def __enter__(self):
3236 self._check()
3237 return self
3238
3239 def __exit__(self, type, value, traceback):
3240 if type is None:
3241 self.close()
3242 else:
3243 # An exception occurred. We must not call close() because
3244 # it would try to write end-of-archive blocks and padding.
3245 if not self._extfileobj:
3246 self.fileobj.close()
3247 self.closed = True
3248# class TarFile
3249
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Remember the TarFile and start before its first member."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator."""
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can happen that
        # getmembers() is called during iteration, which would otherwise
        # make TarIter stop prematurely.
        tf = self.tarfile
        if self.index == 0 and tf.firstmember is not None:
            tarinfo = tf.next()
        elif self.index < len(tf.members):
            # Serve members already cached by a concurrent getmembers().
            tarinfo = tf.members[self.index]
        elif not tf._loaded:
            tarinfo = tf.next()
            if not tarinfo:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
3288
7584f5c9
ERE
3289
3290#--------------------
3291# exported functions
3292#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # `open` is TarFile.open here (rebound below); it raises TarError
        # for anything we cannot parse.
        open(name).close()
    except TarError:
        return False
    return True
3303
# Preserve the builtin open() under a private name, then shadow this module's
# `open` with TarFile.open so `tarfile.open(...)` works as documented.
bltn_open = open
open = TarFile.open