draft rescue mode through all layers
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
7584f5c9
ERE
45import sys
46import os
be60ffd0 47import io
7584f5c9
ERE
48import shutil
49import stat
50import errno
51import time
52import struct
53import copy
54import re
55import operator
56
c7c736b6
PG
57import traceback # XXX
58
8ab8fac5 59from . import crypto
6e812ad9 60
7584f5c9
ERE
61try:
62 import grp, pwd
63except ImportError:
64 grp = pwd = None
65
be60ffd0
ERE
66# os.symlink on Windows prior to 6.0 raises NotImplementedError
67symlink_exception = (AttributeError, NotImplementedError)
68try:
69 # OSError (winerror=1314) will be raised if the caller does not hold the
70 # SeCreateSymbolicLinkPrivilege privilege
71 symlink_exception += (OSError,)
72except NameError:
73 pass
74
7584f5c9
ERE
75# from tarfile import *
76__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
77
be60ffd0
ERE
78from builtins import open as _open # Since 'open' is TarFile.open
79
7584f5c9
ERE
80#---------------------------------------------------------
81# tar constants
82#---------------------------------------------------------
be60ffd0 83NUL = b"\0" # the null character
7584f5c9
ERE
84BLOCKSIZE = 512 # length of processing blocks
85RECORDSIZE = BLOCKSIZE * 20 # length of records
be60ffd0
ERE
86GNU_MAGIC = b"ustar \0" # magic gnu tar string
87POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
7584f5c9
ERE
88
89LENGTH_NAME = 100 # maximum length of a filename
90LENGTH_LINK = 100 # maximum length of a linkname
91LENGTH_PREFIX = 155 # maximum length of the prefix field
92
be60ffd0
ERE
93REGTYPE = b"0" # regular file
94AREGTYPE = b"\0" # regular file
95LNKTYPE = b"1" # link (inside tarfile)
96SYMTYPE = b"2" # symbolic link
97CHRTYPE = b"3" # character special device
98BLKTYPE = b"4" # block special device
99DIRTYPE = b"5" # directory
100FIFOTYPE = b"6" # fifo special device
101CONTTYPE = b"7" # contiguous file
102
103GNUTYPE_LONGNAME = b"L" # GNU tar longname
104GNUTYPE_LONGLINK = b"K" # GNU tar longlink
105GNUTYPE_SPARSE = b"S" # GNU tar sparse file
106GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
68ddf955 107 # another volume
7584f5c9 108
be60ffd0
ERE
109XHDTYPE = b"x" # POSIX.1-2001 extended header
110XGLTYPE = b"g" # POSIX.1-2001 global header
111SOLARIS_XHDTYPE = b"X" # Solaris extended header
7584f5c9
ERE
112
113USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
114GNU_FORMAT = 1 # GNU tar format
115PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
116DEFAULT_FORMAT = GNU_FORMAT
117
15a81fc0 118GZ_FMT_HEADER = b"<BBBBLBB"
203cb25e 119GZ_HEADER_SIZE = 10 # not including the name
15a81fc0
PG
120GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
121GZ_METHOD_DEFLATE = 0x08 # 0o10
122GZ_FLAG_ORIG_NAME = 0x08 # 0o10, default in gzip
123GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
124GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
d601d33b
PG
125GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
126GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
127 GZ_METHOD_DEFLATE)
15a81fc0 128
04f4c7ab
PG
129TOLERANCE_STRICT = 0
130TOLERANCE_RECOVER = 1 # rely on offsets in index
131TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
132
7584f5c9 133#---------------------------------------------------------
d1c38f40
PG
134# archive handling mode
135#---------------------------------------------------------
136
137ARCMODE_PLAIN = 0
138ARCMODE_ENCRYPT = 1 << 0
139ARCMODE_COMPRESS = 1 << 1
140ARCMODE_CONCAT = 1 << 2
141
def arcmode_fmt (m):
    """Render an archive mode bitmask as a human-readable string.

    ARCMODE_PLAIN yields "PLAIN"; any combination of the other bits is
    rendered as e.g. "[ ENCRYPT | COMPRESS ]".
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    names = []
    if m & ARCMODE_ENCRYPT:
        names.append ("ENCRYPT")
    if m & ARCMODE_COMPRESS:
        names.append ("COMPRESS")
    if m & ARCMODE_CONCAT:
        names.append ("CONCAT")
    if not names:
        # non-zero mask with no known bit set
        return "[ ]"
    return "[ " + " | ".join (names) + " ]"
159
160
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Build an archive mode bitmask from the given options.

    Starts from *init* and ORs in CONCAT, ENCRYPT and COMPRESS bits
    depending on the arguments; only comptype "gz" enables compression.
    """
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
170
171#---------------------------------------------------------
7584f5c9
ERE
172# tarfile constants
173#---------------------------------------------------------
174# File types that tarfile supports:
175SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
176 SYMTYPE, DIRTYPE, FIFOTYPE,
177 CONTTYPE, CHRTYPE, BLKTYPE,
178 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 179 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
180
181# File types that will be treated as a regular file.
182REGULAR_TYPES = (REGTYPE, AREGTYPE,
183 CONTTYPE, GNUTYPE_SPARSE)
184
185# File types that are part of the GNU tar format.
186GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 187 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
188
189# Fields from a pax header that override a TarInfo attribute.
190PAX_FIELDS = ("path", "linkpath", "size", "mtime",
191 "uid", "gid", "uname", "gname")
192
be60ffd0
ERE
193# Fields from a pax header that are affected by hdrcharset.
194PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
195
7584f5c9
ERE
196# Fields in a pax header that are numbers, all other fields
197# are treated as strings.
198PAX_NUMBER_FIELDS = {
199 "atime": float,
200 "ctime": float,
201 "mtime": float,
202 "uid": int,
203 "gid": int,
204 "size": int
205}
206
207#---------------------------------------------------------
7584f5c9
ERE
208# initialization
209#---------------------------------------------------------
be60ffd0
ERE
210
211if os.name in ("nt", "ce"):
212 ENCODING = "utf-8"
213else:
214 ENCODING = sys.getfilesystemencoding()
7584f5c9
ERE
215
216#---------------------------------------------------------
217# Some useful functions
218#---------------------------------------------------------
219
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Encode string *s* and fit it into a null-padded field of *length* bytes.

    Overlong values are truncated; short ones are padded with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
225
be60ffd0
ERE
def nts(s, encoding, errors):
    """Decode a null-terminated bytes object into a string.

    Everything from the first NUL byte on is discarded before decoding.
    """
    return s.partition(b"\0")[0].decode(encoding, errors)
233
def sbtn(s, length, encoding, errors):
    """Convert a string or bytes into a null-terminated bytes field.

    str input is encoded first; the result is truncated or NUL-padded
    to exactly *length* bytes.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length].ljust(length, b"\0")
7584f5c9
ERE
241
def nti(s):
    """Convert a tar number field to a Python integer.

    Two encodings exist (see itn()): GNU base-256 with a leading 0o200
    (positive) or 0o377 (negative) marker byte, and plain NUL-terminated
    octal text.
    """
    marker = s[0]
    if marker in (0o200, 0o377):
        # big-endian base-256 over the remaining bytes
        n = 0
        for byte in s[1:]:
            n = (n << 8) + byte
        if marker == 0o377:
            # negative numbers are stored in two's complement
            n -= 256 ** (len(s) - 1)
        return n
    # octal text, possibly NUL-terminated; empty field counts as zero
    try:
        text = s.partition(b"\0")[0].decode("ascii", "strict")
        return int(text or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")
260
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a tar number field.

    POSIX 1003.1-1988 requires octal text followed by a NUL, which caps
    values at 8**(digits-1)-1.  GNU tar additionally allows a base-256
    big-endian encoding introduced by a 0o200 (positive) or 0o377
    (negative) byte, extending the range to +/- 256**(digits-1).
    """
    if 0 <= n < 8 ** (digits - 1):
        # fits as octal text
        return bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        field = bytearray([0o200 if n >= 0 else 0o377])
        if n < 0:
            # two's complement representation
            n += 256 ** digits
        body = bytearray()
        for _ in range(digits - 1):
            body.append(n & 0o377)
            n >>= 8
        body.reverse()        # big-endian byte order
        return field + body
    raise ValueError("overflow in number field")
7584f5c9
ERE
288
def calc_chksums(buf):
    """Compute the two possible checksums of a 512-byte header block.

    The chksum field (bytes 148-155) is treated as if filled with spaces
    (hence the constant 256 == 8 * 0x20).  Some historic tars (Sun, NeXT)
    summed signed chars, so both the unsigned and the signed variant are
    returned as a tuple.
    """
    unsigned_sum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_sum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_sum, signed_sum
301
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    With length=None the entire remaining content is copied.  Raises
    OSError if *src* runs dry before *length* bytes could be read; the
    short final chunk is still written to *dst* before raising.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    CHUNK = 16 * 1024
    full_chunks, tail = divmod(length, CHUNK)
    sizes = [CHUNK] * full_chunks
    if tail:
        sizes.append(tail)
    for want in sizes:
        data = src.read(want)
        dst.write(data)
        if len(data) < want:
            raise OSError("end of file reached")
c7c736b6 324
7584f5c9 325
def filemode(mode):
    """Deprecated shim: delegate to stat.filemode().

    Emits a DeprecationWarning pointing at the caller (stacklevel 2).
    """
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
332
class TarError(Exception):
    """Root of the tarfile exception hierarchy."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Raised when a tar archive cannot be read."""

class CompressionError(TarError):
    """Raised when a compression method is unavailable or data is corrupt."""

class StreamError(TarError):
    """Raised for operations unsupported on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for member header errors."""

class EmptyHeaderError(HeaderError):
    """Raised when a header block is entirely empty."""

class TruncatedHeaderError(HeaderError):
    """Raised when a header block is cut short."""

class EOFHeaderError(HeaderError):
    """Raised when an end-of-file marker block is encountered."""

class InvalidHeaderError(HeaderError):
    """Raised when a header block fails validation."""

class SubsequentHeaderError(HeaderError):
    """Raised for missing or invalid extended headers."""

class InvalidEncryptionError(TarError):
    """Raised for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Raised when an error occurs during decryption."""

class EncryptionError(TarError):
    """Raised when an error occurs during encryption."""

class EndOfFile(Exception):
    """Signals an end-of-file condition that is not an error."""
7584f5c9
ERE
377
378#---------------------------
379# internal stream interface
380#---------------------------
class _LowLevelFile:
    """Low-level file object built on raw os file descriptors.

    Substitutes a regular file object for streaming access; tracks a
    logical offset of its own so out-of-band writes (write with pos=)
    do not disturb the stream position.
    """

    def __init__(self, name, mode):
        # unknown modes raise KeyError, same as a plain dict lookup
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            flags |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, flags, 0o666)
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        data = os.read(self.fd, size)
        self.offset += len(data)
        return data

    def write(self, s, pos=None):
        """Write *s* at the current position, or at *pos* without moving
        the logical stream position (the fd is seeked back afterwards)."""
        if pos is None:
            os.write(self.fd, s)
            self.offset += len(s)
        else:
            saved = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
            written = os.write(self.fd, s)
            # only grow the logical offset if the write extended the
            # file past the saved position
            grown = pos + written - saved
            if grown > 0:
                self.offset += grown
            os.lseek (self.fd, saved, os.SEEK_SET)

    def tell(self):
        return self.offset

    def seek_set (self, pos):
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos
424
8ab8fac5 425
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952).

    If *name* is given it is stored in the FNAME field (ISO-8859-1
    encoded, NUL-terminated) with any trailing ".pdtcrypt" / ".gz"
    suffix stripped, and the FNAME flag is set.  Returns the packed
    header bytes.
    """
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_ORIG_NAME
        # isinstance instead of an exact type check: str subclasses
        # must be encoded too
        if isinstance(name, str):
            name = name.encode("iso-8859-1", "replace")
        # strip wrapper suffixes that do not belong in the stored name
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return hdr + name
450
d601d33b 451
7584f5c9
ERE
452class _Stream:
453 """Class that serves as an adapter between TarFile and
454 a stream-like object. The stream-like object only
455 needs to have a read() or write() method and is accessed
456 blockwise. Use of gzip or bzip2 compression is possible.
457 A stream-like object could be for example: sys.stdin,
458 sys.stdout, a socket, a tape device etc.
459
3031b7ae
PG
    _Stream is intended to be used only internally but is
    nevertheless used externally by Deltatar.
462
463 When encrypting, the ``enccounter`` will be used for
464 initializing the first cryptographic context. When
465 decrypting, its value will be compared to the decrypted
466 object. Decryption fails if the value does not match.
467 In effect, this means that a ``_Stream`` whose ctor was
468 passed ``enccounter`` can only be used to encrypt or
469 decrypt a single object.
7584f5c9
ERE
470 """
471
c7c736b6 472 remainder = -1 # track size in encrypted entries
04f4c7ab 473 tolerance = TOLERANCE_STRICT
c7c736b6 474
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        *concat*, *encryption* and *comptype* together determine the
        archive mode bits (see arcmode_set()).  *enccounter* pins the
        IV counter of the single object this stream may encrypt or
        decrypt.  *tolerance* selects strict/recover/rescue behavior
        on damaged input.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        # only honor an explicit IV counter when actually encrypting
        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None                 # (de)compressor object, if any
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""                  # outgoing (write-side) buffer
        self.pos = 0                    # logical plaintext position
        self.concat_pos = 0             # bytes written into current member
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0
        self.dbuf = b"" # ???           # incoming (read-side) buffer
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None             # position of pending crypto header

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode, per-member gz/crypto setup is
                    # deferred to next()
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                    self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed; only the crypto header may need writing
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # undo partial construction: close only a fileobj we opened
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 584
7584f5c9
ERE
    def __del__(self):
        """Best-effort close when the stream is garbage-collected."""
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass
7584f5c9 593
c7c736b6 594
d1c38f40
PG
    def next (self, name):
        """Finalize the current member and begin a new one named *name*.

        Flushes compression and buffered data, then rotates the
        encryption and/or gzip state.  Returns the offset of the new
        member's first block in the archive.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # any mode bit besides ENCRYPT/COMPRESS (i.e. CONCAT): the raw
        # file position is the member offset
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when encrypting, the crypto header position already marks
            # the member start
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
609
610
    def next_volume (self, name):
        """Re-establish compression/encryption state on a fresh volume.

        With non-concat modes, this is taken care of by the _Stream
        ctor as invoked by the newvol handler.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
622
c7c736b6 623
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        # remember where the placeholder sits so _finalize_write_encrypt()
        # can patch in the real header later
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
640
641
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            # sanity: the placeholder must be exactly one header long
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # crypto context yields trailing ciphertext plus the real header
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
662
663
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and append the gzip trailer (CRC32 + size)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    # ISIZE field: uncompressed size of this member
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
                self.buf = b""
57db1546
PG
679
680
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        # only the very first member carries the archive name in FNAME
        first = self.cmp is None
        # raw deflate (negative wbits): we emit our own gzip header/trailer
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 699
ac5e4184 700
7584f5c9
ERE
    def write(self, s):
        """Write string s to the stream.

        The gzip CRC is maintained over the *uncompressed* payload, as
        required for the trailer written by _finalize_write_gz().
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)
711
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
717
    def __write(self, s):
        """Writes (and encodes) string s to the stream blockwise

        will wait with encoding/writing until block is complete
        """
        self.buf += s
        # drain full blocks only; the tail stays buffered until __sync()
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]
727
867f75f7 728
    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)
867f75f7 738
6e812ad9
DGM
739
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                # process() may consume fewer than len(buf) bytes
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
761
6e812ad9 762
784175ba
CH
763 def estim_file_size(self):
764 """ estimates size of file if closing it now
765
766 The result may differ greatly from the amount of data sent to write()
767 due to compression, encryption and buffering.
768
769 In tests the result (before calling close()) was up to 12k smaller than
770 the final file size if compression is being used because zlib/bz2
771 compressors do not allow inspection of their buffered data :-(
772
ba5a449e
CH
773 Still, we add what close() would add: 8 bytes for gz checksum, one
774 encryption block size if encryption is used and the size of our own
775 buffer
784175ba
CH
776 """
777 if self.closed:
778 return self.bytes_written
779
780 result = self.bytes_written
781 if self.buf:
782 result += len(self.buf)
783 if self.comptype == 'gz':
ba5a449e 784 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
785 return result
786
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        With close_fileobj=False, only the gzip trailer check is
        performed (read mode) and the underlying fileobj is left open.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
818
54128a00 819
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses the RFC 1952 member header and skips its optional
        fields; raises EndOfFile at a clean end of input, ReadError on
        a bad magic, CompressionError on an unknown method.
        """
        # raw deflate: the gzip framing is handled here, not by zlib
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 != b"\010":
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        # skip MTIME (4), XFL (1), OS (1)
        self.__read(6)

        if flag & 4:
            # FEXTRA: little-endian length followed by payload
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            # FNAME: NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            # FHCRC: two-byte header checksum
            self.__read(2)
855
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False at end of file, True when a header was consumed;
        raises DecryptionError for invalid headers or parameters.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr   = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
888
889
8de91f4f
PG
    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        try:
            return self.encryption.process (buf)
        except RuntimeError as exn:
            # in recover/rescue mode surface a catchable DecryptionError
            # instead of the raw RuntimeError
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)
            raise
901
902
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Returns any remaining plaintext produced by the final crypto
        operation; raises DecryptionError on GCM tag mismatch.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
                and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # leftover ciphertext of the current object is abandoned
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
917
918
7584f5c9
ERE
    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos
923
924 def seek(self, pos=0):
925 """Set the stream's file pointer to pos. Negative seeking
926 is forbidden.
927 """
928 if pos - self.pos >= 0:
929 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 930 for i in range(blocks):
7584f5c9
ERE
931 self.read(self.bufsize)
932 self.read(remainder)
933 else:
934 raise StreamError("seeking backwards is not allowed")
935 return self.pos
936
937 def read(self, size=None):
938 """Return the next size number of bytes from the stream.
939 If size is not defined, return all bytes of the stream
940 up to EOF.
941 """
942 if size is None:
943 t = []
944 while True:
945 buf = self._read(self.bufsize)
946 if not buf:
947 break
948 t.append(buf)
9dc7ac5c 949 buf = b"".join(t)
7584f5c9
ERE
950 else:
951 buf = self._read(size)
952 self.pos += len(buf)
953 return buf
954
3a7e1a50
ERE
955 def readline(self):
956 """Reads just one line, new line character included
957 """
f0fd5e3a 958 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
959 if b'\n' in self.dbuf:
960 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
961 ret = self.dbuf[:pos]
962 self.dbuf = self.dbuf[pos:]
963 return ret
964
1215b602 965 buf = []
3a7e1a50
ERE
966 while True:
967 chunk = self._read(self.bufsize)
968
f0fd5e3a 969 # nothing more to read, so return the buffer
3a7e1a50 970 if not chunk:
be60ffd0 971 return b''.join(buf)
3a7e1a50
ERE
972
973 buf.append(chunk)
f0fd5e3a
ERE
974
975 # if \n found, return the new line
be60ffd0
ERE
976 if b'\n' in chunk:
977 dbuf = b''.join(buf)
978 pos = dbuf.index(b'\n') + 1
1215b602 979 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
980 return dbuf[:pos]
981
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Pulls raw (possibly decrypted) data via __read(), feeds it
        through the decompressor when one is active, and maintains the
        decompressed look-ahead buffer self.dbuf. Handles concatenated
        gzip members (ARCMODE_CONCAT) by re-initializing decompression
        when trailing unused data appears.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                # keep a running CRC over the decompressed payload
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                # unused_data marks the start of the next concatenated
                # gzip member: push it back onto the raw buffer and
                # restart decompression from a fresh header
                if self.arcmode & ARCMODE_CONCAT \
                    and len(self.cmp.unused_data) != 0:
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # keep the overshoot for the next call
        self.dbuf = t[size:]
        return t[:size]
1032
e4e5d0b8 1033
7584f5c9 1034 def __read(self, size):
ef3b4499
PG
1035 """
1036 Return size bytes from stream. If internal buffer is empty, read
1037 another block from the stream.
1038
1039 The function returns up to size bytes of data. When an error occurs
1040 during decryption, everything until the end of the last successfully
1041 finalized object is returned.
7584f5c9
ERE
1042 """
1043 c = len(self.buf)
8de91f4f 1044 t = [self.buf] if c > 0 else []
1ed44e7b 1045 good_crypto = len (t)
8de91f4f 1046
7584f5c9 1047 while c < size:
c7c736b6 1048 todo = size
8de91f4f
PG
1049 try:
1050 if self.arcmode & ARCMODE_ENCRYPT:
1051 if self.remainder <= 0:
1052 # prepare next object
044585c6
PG
1053 if self._init_read_encrypt () is False: # EOF
1054 buf = None
1055 break # while
8de91f4f
PG
1056
1057 # only read up to the end of the encrypted object
1058 todo = min (size, self.remainder)
1059 buf = self.fileobj.read(todo)
1060 if self.arcmode & ARCMODE_ENCRYPT:
1061 # decrypt the thing
1062 buf = self._read_encrypt (buf)
1063 if todo == self.remainder:
1064 # at the end of a crypto object; finalization will fail if
1065 # the GCM tag does not match
ef3b4499 1066 trailing = self._finalize_read_encrypt ()
8de91f4f
PG
1067 good_crypto = len (t) + 1
1068 if len (trailing) > 0:
1069 buf += trailing
1070 self.remainder = 0
1071 else:
1072 self.remainder -= todo
1073 except DecryptionError:
04f4c7ab 1074 if self.tolerance == TOLERANCE_STRICT:
8de91f4f
PG
1075 raise
1076 self.encryption.drop ()
1077 if good_crypto == 0:
1078 raise
1079 # this may occur at any of the three crypto operations above.
1080 # some objects did validate; discard all data after it; next
1081 # call will start with the bad object and error out immediately
1082 self.buf = b"".join (t [good_crypto:])
1083 return b"".join (t [:good_crypto])
c7c736b6
PG
1084
1085 if not buf: ## XXX stream terminated prematurely; this should be an error
7584f5c9 1086 break
c7c736b6 1087
7584f5c9
ERE
1088 t.append(buf)
1089 c += len(buf)
be60ffd0 1090 t = b"".join(t)
7584f5c9 1091 self.buf = t[size:]
fb27c6e8 1092
7584f5c9 1093 return t[:size]
7d372216 1094
7584f5c9
ERE
1095
class _StreamProxy(object):
    """Small proxy enabling transparent compression detection for the
    Stream interface (mode 'r|*'): it buffers the first block of the
    underlying file and sniffs the magic bytes.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # First call serves the sniffed block; afterwards the method is
        # replaced so reads delegate straight to the underlying file.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Return the detected compression: "gz", "bz2", "xz" or "tar"."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1122
7584f5c9
ERE
1123#------------------------
1124# Extraction file object
1125#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
    provides a part of its data as an individual file
    object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        # fileobj: underlying archive file; offset: absolute position of
        # this member's data; size: logical member size; blockinfo:
        # sparse (offset, size) pairs, or None for a non-sparse member.
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            # non-sparse: one contiguous data block covering everything
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks. Each entry is
        # (is_data, logical_start, logical_stop, real_offset_or_None).
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                # hole between the previous block and this one
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            # trailing hole up to the logical end of the member
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        # read-only object: nothing to flush
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

        The position is clamped to [0, size]; an unknown whence raises
        ValueError.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.

        Holes in a sparse member are materialized as NUL bytes.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # locate the map entry containing the current position;
            # the index wraps around so backwards seeks work too
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                # inside a hole: synthesize zero bytes
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        # support the io.RawIOBase-style protocol used by BufferedReader
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        # note: deliberately does not close the shared underlying fileobj
        self.closed = True
#class _FileInFile
7584f5c9 1226
be60ffd0
ERE
1227
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member's
    data, backed by a _FileInFile window over the archive stream."""

    def __init__(self, tarfile, tarinfo):
        window = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                             tarinfo.size, tarinfo.sparse)
        super().__init__(window)
#class ExFileObject
1235
1236#------------------
1237# Exported Classes
1238#------------------
class TarInfo(object):
    """Informational class which holds the details about an
    archive member given by a tar header block.
    TarInfo objects are returned by TarFile.getmember(),
    TarFile.getmembers() and TarFile.gettarinfo() and are
    usually created internally.
    """

    # __slots__ keeps per-member memory low, which matters for archives
    # with very many members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
1252
7584f5c9
ERE
1253 def __init__(self, name=""):
1254 """Construct a TarInfo object. name is the optional name
1255 of the member.
1256 """
1257 self.name = name # member name
be60ffd0 1258 self.mode = 0o644 # file permissions
7584f5c9
ERE
1259 self.uid = 0 # user id
1260 self.gid = 0 # group id
1261 self.size = 0 # file size
1262 self.mtime = 0 # modification time
1263 self.chksum = 0 # header checksum
1264 self.type = REGTYPE # member type
1265 self.linkname = "" # link name
1266 self.uname = "" # user name
1267 self.gname = "" # group name
1268 self.devmajor = 0 # device major number
1269 self.devminor = 0 # device minor number
1270
1271 self.offset = 0 # the tar header starts here
1272 self.offset_data = 0 # the file's data starts here
0eb5048f
ERE
1273 self.volume_offset = 0 # the file's data corresponds with the data
1274 # starting at this position
7584f5c9 1275
be60ffd0 1276 self.sparse = None # sparse member information
7584f5c9
ERE
1277 self.pax_headers = {} # pax header information
1278
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath", so expose them under those aliases too.
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        # debug representation: class name, member name, object identity
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1295
be60ffd0 1296 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1297 """Return the TarInfo's attributes as a dictionary.
1298 """
1299 info = {
1300 "name": self.name,
be60ffd0 1301 "mode": self.mode & 0o7777,
7584f5c9
ERE
1302 "uid": self.uid,
1303 "gid": self.gid,
1304 "size": self.size,
1305 "mtime": self.mtime,
1306 "chksum": self.chksum,
1307 "type": self.type,
1308 "linkname": self.linkname,
1309 "uname": self.uname,
1310 "gname": self.gname,
1311 "devmajor": self.devmajor,
36a315a0 1312 "devminor": self.devminor,
0eb5048f
ERE
1313 "offset_data": self.offset_data,
1314 "volume_offset": self.volume_offset
7584f5c9
ERE
1315 }
1316
1317 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1318 info["name"] += "/"
1319
7584f5c9
ERE
1320 return info
1321
be60ffd0
ERE
1322 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1323 errors="surrogateescape"):
7584f5c9
ERE
1324 """Return a tar header as a string of 512 byte blocks.
1325 """
1326 info = self.get_info(encoding, errors)
1327
1328 if format == USTAR_FORMAT:
be60ffd0 1329 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1330 elif format == GNU_FORMAT:
be60ffd0 1331 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1332 elif format == PAX_FORMAT:
1333 return self.create_pax_header(info, encoding, errors)
1334 else:
1335 raise ValueError("invalid format")
1336
be60ffd0 1337 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1338 """Return the object as a ustar header block.
1339 """
1340 info["magic"] = POSIX_MAGIC
1341
1342 if len(info["linkname"]) > LENGTH_LINK:
1343 raise ValueError("linkname is too long")
1344
1345 if len(info["name"]) > LENGTH_NAME:
1346 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1347
be60ffd0 1348 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1349
be60ffd0 1350 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1351 """Return the object as a GNU header block sequence.
1352 """
1353 info["magic"] = GNU_MAGIC
1354
2f854e77
ERE
1355 if self.ismultivol():
1356 prefix = [
1357 itn(info.get("atime", 0), 12, GNU_FORMAT),
1358 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1359 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1360 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1361 ]
be60ffd0 1362 info['prefix'] = b"".join(prefix)
0eb5048f 1363 info['size'] = info['size'] - self.volume_offset
2f854e77 1364
be60ffd0 1365 buf = b""
7584f5c9 1366 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1367 buf += self._create_gnu_long_header(info["linkname"],
1368 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1369
1370 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1371 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1372 encoding, errors)
7584f5c9 1373
be60ffd0 1374 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1375
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # only the part stored in this volume counts for the header
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # octal fields hold digits-1 characters plus a terminator
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # the ustar header always follows; overflowed fields were zeroed
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1428
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

        Global headers always use UTF-8 for their records.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1434
1435 def _posix_split_name(self, name):
1436 """Split a name longer than 100 chars into a prefix
1437 and a name part.
1438 """
1439 prefix = name[:LENGTH_PREFIX + 1]
1440 while prefix and prefix[-1] != "/":
1441 prefix = prefix[:-1]
1442
1443 name = name[len(prefix):]
1444 prefix = prefix[:-1]
1445
1446 if not prefix or len(name) > LENGTH_NAME:
1447 raise ValueError("name is too long")
1448 return prefix, name
1449
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.

        The block is first assembled with a blank checksum field, then
        the checksum is computed over the 512 bytes and patched in.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # the checksum field lives at offset 148 (i.e. 364 bytes from
        # the end of the 512-byte block) and is 7 bytes + NUL
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1477
1478 @staticmethod
1479 def _create_payload(payload):
1480 """Return the string payload filled with zero bytes
1481 up to the next 512 byte border.
1482 """
1483 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1484 if remainder > 0:
1485 payload += (BLOCKSIZE - remainder) * NUL
1486 return payload
1487
    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        for name.

        The over-long string is stored NUL-terminated as the payload of
        a pseudo-member named "././@LongLink".
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)
1504
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "%d %s=%s\n" where the leading number is the
            # total record length *including itself*; iterate until the
            # length of the number no longer changes the total (fixpoint).
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1555
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on what is wrong with the block.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # decode the fixed-offset ustar fields
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # deltatar extension: for GNU-typed members the prefix area
            # carries the multivolume data offset instead
            obj.offset_data = nti(buf[369:381])
        return obj
1620
1621 @classmethod
1622 def fromtarfile(cls, tarfile):
1623 """Return the next TarInfo object from TarFile object
1624 tarfile.
1625 """
1626 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1627 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1628 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1629 return obj._proc_member(tarfile)
1630
1631 #--------------------------------------------------------------------------
1632 # The following are methods that are called depending on the type of a
1633 # member. The entry point is _proc_member() which can be overridden in a
1634 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1635 # implement the following
1636 # operations:
1637 # 1. Set self.offset_data to the position where the data blocks begin,
1638 # if there is data that follows.
1639 # 2. Set tarfile.offset to the position where the next member's header will
1640 # begin.
1641 # 3. Return self or another valid TarInfo object.
1642 def _proc_member(self, tarfile):
1643 """Choose the right processing method depending on
1644 the type and call it.
1645 """
1646 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1647 return self._proc_gnulong(tarfile)
1648 elif self.type == GNUTYPE_SPARSE:
1649 return self._proc_sparse(tarfile)
1650 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1651 return self._proc_pax(tarfile)
1652 else:
1653 return self._proc_builtin(tarfile)
1654
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
        will be treated as a regular file.

        Records where the member's data begins and advances
        tarfile.offset past any data blocks that follow.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
1671
1672 def _proc_gnulong(self, tarfile):
1673 """Process the blocks that hold a GNU longname
1674 or longlink member.
1675 """
1676 buf = tarfile.fileobj.read(self._block(self.size))
1677
1678 # Fetch the next header and process it.
1679 try:
1680 next = self.fromtarfile(tarfile)
1681 except HeaderError:
1682 raise SubsequentHeaderError("missing or bad subsequent header")
1683
1684 # Patch the TarInfo object from the next header with
1685 # the longname information.
1686 next.offset = self.offset
1687 if self.type == GNUTYPE_LONGNAME:
be60ffd0 1688 next.name = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9 1689 elif self.type == GNUTYPE_LONGLINK:
be60ffd0 1690 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1691
1692 return next
1693
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.

        Merges the sparse structures saved by frombuf() with those from
        any extended sparse header blocks, then restores the member's
        original (logical) size.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # each extension block holds up to 21 (offset, numbytes) pairs
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # byte 504 flags yet another extension block
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # self.size held the on-disk (compacted) size; restore the real one
        self.size = origsize
        return self
1721
1722 def _proc_pax(self, tarfile):
1723 """Process an extended or global header as described in
be60ffd0 1724 POSIX.1-2008.
7584f5c9
ERE
1725 """
1726 # Read the header information.
1727 buf = tarfile.fileobj.read(self._block(self.size))
1728
1729 # A pax header stores supplemental information for either
1730 # the following file (extended) or all following files
1731 # (global).
1732 if self.type == XGLTYPE:
1733 pax_headers = tarfile.pax_headers
1734 else:
1735 pax_headers = tarfile.pax_headers.copy()
1736
be60ffd0
ERE
1737 # Check if the pax header contains a hdrcharset field. This tells us
1738 # the encoding of the path, linkpath, uname and gname fields. Normally,
1739 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1740 # implementations are allowed to store them as raw binary strings if
1741 # the translation to UTF-8 fails.
1742 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1743 if match is not None:
1744 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1745
1746 # For the time being, we don't care about anything other than "BINARY".
1747 # The only other value that is currently allowed by the standard is
1748 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1749 hdrcharset = pax_headers.get("hdrcharset")
1750 if hdrcharset == "BINARY":
1751 encoding = tarfile.encoding
1752 else:
1753 encoding = "utf-8"
1754
7584f5c9
ERE
1755 # Parse pax header information. A record looks like that:
1756 # "%d %s=%s\n" % (length, keyword, value). length is the size
1757 # of the complete record including the length field itself and
1758 # the newline. keyword and value are both UTF-8 encoded strings.
be60ffd0 1759 regex = re.compile(br"(\d+) ([^=]+)=")
7584f5c9
ERE
1760 pos = 0
1761 while True:
1762 match = regex.match(buf, pos)
1763 if not match:
1764 break
1765
1766 length, keyword = match.groups()
1767 length = int(length)
1768 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1769
be60ffd0
ERE
1770 # Normally, we could just use "utf-8" as the encoding and "strict"
1771 # as the error handler, but we better not take the risk. For
1772 # example, GNU tar <= 1.23 is known to store filenames it cannot
1773 # translate to UTF-8 as raw strings (unfortunately without a
1774 # hdrcharset=BINARY header).
1775 # We first try the strict standard encoding, and if that fails we
1776 # fall back on the user's encoding and error handler.
1777 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1778 tarfile.errors)
1779 if keyword in PAX_NAME_FIELDS:
1780 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1781 tarfile.errors)
1782 else:
1783 value = self._decode_pax_field(value, "utf-8", "utf-8",
1784 tarfile.errors)
7584f5c9
ERE
1785
1786 pax_headers[keyword] = value
1787 pos += length
1788
36a315a0 1789
7584f5c9
ERE
1790 # Fetch the next header.
1791 try:
1792 next = self.fromtarfile(tarfile)
1793 except HeaderError:
1794 raise SubsequentHeaderError("missing or bad subsequent header")
1795
be60ffd0
ERE
1796 # Process GNU sparse information.
1797 if "GNU.sparse.map" in pax_headers:
1798 # GNU extended sparse format version 0.1.
1799 self._proc_gnusparse_01(next, pax_headers)
1800
1801 elif "GNU.sparse.size" in pax_headers:
1802 # GNU extended sparse format version 0.0.
1803 self._proc_gnusparse_00(next, pax_headers, buf)
1804
1805 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1806 # GNU extended sparse format version 1.0.
1807 self._proc_gnusparse_10(next, pax_headers, tarfile)
1808
7584f5c9
ERE
1809 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1810 # Patch the TarInfo object with the extended header info.
1811 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1812 next.offset = self.offset
1813
1814 if "size" in pax_headers:
1815 # If the extended header replaces the size field,
1816 # we need to recalculate the offset where the next
1817 # header starts.
1818 offset = next.offset_data
1819 if next.isreg() or next.type not in SUPPORTED_TYPES:
1820 offset += next._block(next.size)
1821 tarfile.offset = offset
1822
c04e0751
ERE
1823 if next is not None:
1824 if "GNU.volume.filename" in pax_headers:
1825 if pax_headers["GNU.volume.filename"] == next.name:
1826 if "GNU.volume.size" in pax_headers:
1827 next.size = int(pax_headers["GNU.volume.size"])
1828 if "GNU.volume.offset" in pax_headers:
1829 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1830
1831 for key in pax_headers.keys():
1832 if key.startswith("GNU.volume"):
1833 del tarfile.pax_headers[key]
0eb5048f 1834
7584f5c9
ERE
1835 return next
1836
be60ffd0
ERE
1837 def _proc_gnusparse_00(self, next, pax_headers, buf):
1838 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1839 """
be60ffd0
ERE
1840 offsets = []
1841 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1842 offsets.append(int(match.group(1)))
1843 numbytes = []
1844 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1845 numbytes.append(int(match.group(1)))
1846 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1847
be60ffd0
ERE
1848 def _proc_gnusparse_01(self, next, pax_headers):
1849 """Process a GNU tar extended sparse header, version 0.1.
1850 """
1851 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1852 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1853
be60ffd0
ERE
1854 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1855 """Process a GNU tar extended sparse header, version 1.0.
1856 """
1857 fields = None
1858 sparse = []
1859 buf = tarfile.fileobj.read(BLOCKSIZE)
1860 fields, buf = buf.split(b"\n", 1)
1861 fields = int(fields)
1862 while len(sparse) < fields * 2:
1863 if b"\n" not in buf:
1864 buf += tarfile.fileobj.read(BLOCKSIZE)
1865 number, buf = buf.split(b"\n", 1)
1866 sparse.append(int(number))
1867 next.offset_data = tarfile.fileobj.tell()
1868 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1869
be60ffd0
ERE
1870 def _apply_pax_info(self, pax_headers, encoding, errors):
1871 """Replace fields with supplemental information from a previous
1872 pax extended or global header.
1873 """
1874 for keyword, value in pax_headers.items():
1875 if keyword == "GNU.sparse.name":
1876 setattr(self, "path", value)
1877 elif keyword == "GNU.sparse.size":
1878 setattr(self, "size", int(value))
1879 elif keyword == "GNU.sparse.realsize":
1880 setattr(self, "size", int(value))
1881 elif keyword in PAX_FIELDS:
1882 if keyword in PAX_NUMBER_FIELDS:
1883 try:
1884 value = PAX_NUMBER_FIELDS[keyword](value)
1885 except ValueError:
1886 value = 0
1887 if keyword == "path":
f0287fb7 1888 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1889 setattr(self, keyword, value)
7584f5c9
ERE
1890
1891 self.pax_headers = pax_headers.copy()
1892
be60ffd0
ERE
1893 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1894 """Decode a single field from a pax record.
1895 """
1896 try:
1897 return value.decode(encoding, "strict")
1898 except UnicodeDecodeError:
1899 return value.decode(fallback_encoding, fallback_errors)
1900
7584f5c9
ERE
1901 def _block(self, count):
1902 """Round up a byte count by BLOCKSIZE and return it,
1903 e.g. _block(834) => 1024.
1904 """
1905 blocks, remainder = divmod(count, BLOCKSIZE)
1906 if remainder:
1907 blocks += 1
1908 return blocks * BLOCKSIZE
1909
    # Type predicates: each inspects self.type (and, for issparse and
    # ismultivol, the sparse map / volume bookkeeping set by the header
    # parsing code) to classify this archive member.
    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        # A member is part of a multivolume set if its header says so, if
        # it starts at a non-zero offset inside the logical file, or if a
        # pax header recorded a volume offset for it.
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
1933# class TarInfo
1934
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

    The class attributes below are defaults; __init__ keyword arguments
    override them per instance.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE: these two caches are class attributes, i.e. shared by all
    # TarFile instances for the lifetime of the process.
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        `max_volume_size' enables multivolume support (it must be at least
        three blocks and `new_volume_handler' must be callable).
        `concat' selects concat-archive processing; `nacl' carries the
        encryption context (presumably consumed by the _Stream layer —
        TODO confirm).
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        # arcmode_set() maps the concat flag onto the ARCMODE_* bitmask.
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        # base_name is kept in addition to name: volume handlers derive the
        # per-volume file names from it.
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes, overriding the class-level defaults only when
        # a value was actually supplied.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = [] # list of members as TarInfo objects
        self._loaded = False # flag if all members have been read
        self.offset = self.fileobj.tell()
        # current position in the archive file
        self.inodes = {} # dictionary caching the inodes of
        # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    # Emit a pax global header carrying the user-supplied
                    # headers at the start of the archive.
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Cleanup on any failure (including KeyboardInterrupt), then
            # re-raise: only close the fileobj if we created it ourselves.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2095 #--------------------------------------------------------------------------
2096 # Below are the classmethods which act as alternate constructors to the
2097 # TarFile class. The open() method is the only one that is needed for
2098 # public use; it is the "super"-constructor and is able to select an
2099 # adequate "sub"-constructor for a particular compression using the mapping
2100 # from OPEN_METH.
2101 #
2102 # This concept allows one to subclass TarFile without losing the comfort of
2103 # the super-constructor. A sub-constructor is registered and made available
2104 # by adding it to the mapping in OPEN_METH.
2105
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file:
            # try every registered opener and rewind between attempts.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            if 'max_volume_size' in kwargs:
                # Multivolume with whole-file compression: only the first
                # volume comes out compressed, so warn the caller.
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            # Stream mode: wrap the file in a non-seekable _Stream.
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: per-member compression/encryption via _Stream.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
2221 @classmethod
2222 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2223 """Open uncompressed tar archive name for reading or writing.
2224 """
2225 if len(mode) > 1 or mode not in "raw":
2226 raise ValueError("mode must be 'r', 'a' or 'w'")
2227 return cls(name, mode, fileobj, **kwargs)
2228
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller handed us a file object: only
        # self-opened files may be closed on error.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                # GzipFile() itself failed before fileobj was rebound;
                # propagate the original OSError.
                raise
            raise ReadError("not a gzip file")
        except:
            # Any other failure: clean up our own fileobj, re-raise as-is.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2260 @classmethod
2261 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2262 """Open bzip2 compressed tar archive name for reading or writing.
2263 Appending is not allowed.
2264 """
2265 if len(mode) > 1 or mode not in "rw":
2266 raise ValueError("mode must be 'r' or 'w'.")
2267
2268 try:
2269 import bz2
2270 except ImportError:
2271 raise CompressionError("bz2 module is not available")
2272
be60ffd0
ERE
2273 fileobj = bz2.BZ2File(fileobj or name, mode,
2274 compresslevel=compresslevel)
7584f5c9
ERE
2275
2276 try:
2277 t = cls.taropen(name, mode, fileobj, **kwargs)
be60ffd0
ERE
2278 except (OSError, EOFError):
2279 fileobj.close()
7584f5c9
ERE
2280 raise ReadError("not a bzip2 file")
2281 t._extfileobj = False
2282 return t
2283
be60ffd0
ERE
2284 @classmethod
2285 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2286 """Open lzma compressed tar archive name for reading or writing.
2287 Appending is not allowed.
2288 """
2289 if mode not in ("r", "w"):
2290 raise ValueError("mode must be 'r' or 'w'")
2291
2292 try:
2293 import lzma
2294 except ImportError:
2295 raise CompressionError("lzma module is not available")
2296
2297 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2298
2299 try:
2300 t = cls.taropen(name, mode, fileobj, **kwargs)
2301 except (lzma.LZMAError, EOFError):
2302 fileobj.close()
2303 raise ReadError("not an lzma file")
2304 t._extfileobj = False
2305 return t
2306
7584f5c9
ERE
    # All *open() methods are registered here.  open() iterates this
    # mapping for 'r'/'r:*' auto-detection and indexes it for explicit
    # 'mode:comptype' requests.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
2315 #--------------------------------------------------------------------------
2316 # The public methods which TarFile provides:
2317
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # Empty concat archive: start an (empty-named) entry first so
            # the trailing zero blocks go through the _Stream machinery.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2341 def getmember(self, name):
2342 """Return a TarInfo object for member `name'. If `name' can not be
2343 found in the archive, KeyError is raised. If a member occurs more
2344 than once in the archive, its last occurrence is assumed to be the
2345 most up-to-date version.
2346 """
2347 tarinfo = self._getmember(name)
2348 if tarinfo is None:
2349 raise KeyError("filename %r not found" % name)
2350 return tarinfo
2351
2352 def getmembers(self):
2353 """Return the members of the archive as a list of TarInfo objects. The
2354 list has the same order as the members in the archive.
2355 """
2356 self._check()
2357 if not self._loaded: # if we want to obtain a list of
2358 self._load() # all members, we first have to
2359 # scan the whole archive.
2360 return self.members
2361
ad4402e8
ERE
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.
        """
        # last_block_offset is recorded by addfile() when the member header
        # was written (via _Stream.next() in concat mode, tell() otherwise).
        return self.last_block_offset
2369 def getnames(self):
2370 """Return the members of the archive as a list of their names. It has
2371 the same order as the list returned by getmembers().
2372 """
2373 return [tarinfo.name for tarinfo in self.getmembers()]
2374
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types tar cannot represent (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): caller gets None.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names, memoizing results (including
        # misses) in the class-level caches to avoid repeated passwd/group
        # lookups.
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                # Device files show major,minor instead of a size.
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash, like ls.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded. `filter' is a function
        that expects a TarInfo object argument and returns the changed
        TarInfo object, if it returns None the TarInfo object will be
        excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.  `exclude' is deprecated in favour of `filter'.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            # Recurse into the directory, reusing the caller's settings.
            if recursive:
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)
defc9a22 2575 def _size_left_file(self):
be60ffd0 2576 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2577
be60ffd0 2578 Assumes self.max_volume_size is set.
ba5a449e 2579 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2580 """
ba5a449e 2581 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2582 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2583 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2584 # write only half a block when writting the end of a volume
ae48acc8 2585 # and filling with zeros
defc9a22
CH
2586 return BLOCKSIZE * (size_left // BLOCKSIZE)
2587
2588 def _size_left_stream(self):
ba5a449e
CH
2589 """ Calculates size left in a volume if using comression/encryption
2590
2591 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2592 (otherwise use _size_left_file)
2593 """
2594 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2595 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2596 - 2*BLOCKSIZE
2597 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2598
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        Handles splitting the member across volumes when max_volume_size
        is set, invoking new_volume_handler at each boundary.
        """
        self._check("aw")

        # Work on a copy: the multivolume path mutates type/volume_offset.
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # In concat mode the _Stream starts a fresh compressed/encrypted
            # object per member and reports where it begins.
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # Pick the volume-space estimator appropriate for the backend;
        # without a volume limit, pretend the whole member always fits.
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we're continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the "new_volume_handler" is supposed to call .close() on the
                # "fileobj" _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Either ``name`` (a path) or ``fileobj`` (an already-open file-like
        object) selects the new volume.  When the current ``self.fileobj``
        is a ``_Stream``, a new ``_Stream`` is built with the same
        compression/encryption parameters (``encryption`` may override the
        cipher); otherwise a plain builtin ``open()`` is used and any
        compression/encryption context is lost.  Afterwards the member
        bookkeeping (``members``, ``offset``, ``inodes``) is reset for the
        new volume, and for PAX archives a global header describing the
        continued member is written out.
        '''
        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # Clone the current stream's parameters so the new volume
                # is compressed/encrypted the same way as the previous one.
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # init data structures -- the new volume starts with a clean slate
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # Record which member continues into this volume and at
                    # which offset, so readers can reassemble the file.
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # Any failure while initializing the new volume: close what we
            # own, mark the archive closed, and re-raise unchanged.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2798
e5f5681b 2799 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2800 """Extract all members from the archive to the current working
2801 directory and set owner, modification time and permissions on
2802 directories afterwards. `path' specifies a different directory
2803 to extract to. `members' is optional and must be a subset of the
2804 list returned by getmembers().
2805 """
2806 directories = []
2807
2808 if members is None:
2809 members = self
2810
2811 for tarinfo in members:
c474439c
ERE
2812 if self.volume_number > 0 and tarinfo.ismultivol():
2813 continue
2814
974408b5 2815 if filter and not filter(tarinfo):
e5f5681b
ERE
2816 continue
2817
7584f5c9
ERE
2818 if tarinfo.isdir():
2819 # Extract directories with a safe mode.
2820 directories.append(tarinfo)
2821 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2822 tarinfo.mode = 0o0700
2823 # Do not set_attrs directories, as we will do that further down
2824 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2825
2826 # Reverse sort directories.
be60ffd0 2827 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2828 directories.reverse()
2829
2830 # Set correct owner, mtime and filemode on directories.
2831 for tarinfo in directories:
2832 dirpath = os.path.join(path, tarinfo.name)
2833 try:
2834 self.chown(tarinfo, dirpath)
2835 self.utime(tarinfo, dirpath)
2836 self.chmod(tarinfo, dirpath)
be60ffd0 2837 except ExtractError as e:
7584f5c9
ERE
2838 if self.errorlevel > 1:
2839 raise
2840 else:
2841 self._dbg(1, "tarfile: %s" % e)
2842
786addd6 2843 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2844 """Extract a member from the archive to the current working directory,
2845 using its full name. Its file information is extracted as accurately
2846 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2847 specify a different directory using `path'. File attributes (owner,
2848 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2849 ``symlink_cb`` is a hook accepting a function that is passed the
2850 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2851 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2852 passed will be applied, skipping the actual extraction. In case the
2853 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2854 """
2855 self._check("r")
2856
be60ffd0 2857 if isinstance(member, str):
7584f5c9
ERE
2858 tarinfo = self.getmember(member)
2859 else:
2860 tarinfo = member
2861
2862 # Prepare the link target for makelink().
2863 if tarinfo.islnk():
2864 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2865
9b13f5c4 2866 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2867 return symlink_cb(member, path, set_attrs)
786addd6 2868
7584f5c9 2869 try:
be60ffd0
ERE
2870 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2871 set_attrs=set_attrs)
2872 except EnvironmentError as e:
7584f5c9
ERE
2873 if self.errorlevel > 0:
2874 raise
2875 else:
2876 if e.filename is None:
2877 self._dbg(1, "tarfile: %s" % e.strerror)
2878 else:
2879 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2880 except ExtractError as e:
7584f5c9
ERE
2881 if self.errorlevel > 1:
2882 raise
2883 else:
2884 self._dbg(1, "tarfile: %s" % e)
2885
2886 def extractfile(self, member):
2887 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2888 a filename or a TarInfo object. If `member' is a regular file or a
2889 link, an io.BufferedReader object is returned. Otherwise, None is
2890 returned.
7584f5c9
ERE
2891 """
2892 self._check("r")
2893
be60ffd0 2894 if isinstance(member, str):
7584f5c9
ERE
2895 tarinfo = self.getmember(member)
2896 else:
2897 tarinfo = member
2898
be60ffd0
ERE
2899 if tarinfo.isreg() or tarinfo.ismultivol() or\
2900 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2901 # If a member's type is unknown, it is treated as a
2902 # regular file.
2903 return self.fileobject(self, tarinfo)
2904
2905 elif tarinfo.islnk() or tarinfo.issym():
2906 if isinstance(self.fileobj, _Stream):
2907 # A small but ugly workaround for the case that someone tries
2908 # to extract a (sym)link as a file-object from a non-seekable
2909 # stream of tar blocks.
2910 raise StreamError("cannot extract (sym)link as file object")
2911 else:
2912 # A (sym)link's file object is its target's file object.
2913 return self.extractfile(self._find_link_target(tarinfo))
2914 else:
2915 # If there's no data associated with the member (directory, chrdev,
2916 # blkdev, etc.), return None instead of a file object.
2917 return None
2918
be60ffd0 2919 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2920 """Extract the TarInfo object tarinfo to a physical
2921 file called targetpath.
2922 """
2923 # Fetch the TarInfo object for the given name
2924 # and build the destination pathname, replacing
2925 # forward slashes to platform specific separators.
2926 targetpath = targetpath.rstrip("/")
2927 targetpath = targetpath.replace("/", os.sep)
2928
2929 # Create all upper directories.
2930 upperdirs = os.path.dirname(targetpath)
2931 if upperdirs and not os.path.exists(upperdirs):
2932 # Create directories that are not part of the archive with
2933 # default permissions.
2934 os.makedirs(upperdirs)
2935
2936 if tarinfo.islnk() or tarinfo.issym():
2937 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2938 else:
2939 self._dbg(1, tarinfo.name)
2940
2941 if tarinfo.isreg():
2942 self.makefile(tarinfo, targetpath)
2943 elif tarinfo.isdir():
2944 self.makedir(tarinfo, targetpath)
2945 elif tarinfo.isfifo():
2946 self.makefifo(tarinfo, targetpath)
2947 elif tarinfo.ischr() or tarinfo.isblk():
2948 self.makedev(tarinfo, targetpath)
2949 elif tarinfo.islnk() or tarinfo.issym():
2950 self.makelink(tarinfo, targetpath)
2951 elif tarinfo.type not in SUPPORTED_TYPES:
2952 self.makeunknown(tarinfo, targetpath)
2953 else:
2954 self.makefile(tarinfo, targetpath)
2955
be60ffd0
ERE
2956 if set_attrs:
2957 self.chown(tarinfo, targetpath)
2958 if not tarinfo.issym():
2959 self.chmod(tarinfo, targetpath)
2960 self.utime(tarinfo, targetpath)
7584f5c9
ERE
2961
2962 #--------------------------------------------------------------------------
2963 # Below are the different file methods. They are called via
2964 # _extract_member() when extract() is called. They can be replaced in a
2965 # subclass to implement other functionality.
2966
2967 def makedir(self, tarinfo, targetpath):
2968 """Make a directory called targetpath.
2969 """
2970 try:
2971 # Use a safe mode for the directory, the real mode is set
2972 # later in _extract_member().
be60ffd0
ERE
2973 os.mkdir(targetpath, 0o0700)
2974 except FileExistsError:
2975 pass
7584f5c9
ERE
2976
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.

        Copies the member's payload from the archive stream into a newly
        created file.  Sparse members are reassembled segment by segment.
        For multivolume members, a read failure (OSError) is interpreted
        as "end of this volume": the new_volume_handler is invoked to
        switch volumes and the copy is retried from the continuation
        member of the next volume.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        # NOTE(review): ``decrypt`` is never read in this method -- appears
        # to be a leftover; confirm before removing.
        decrypt = False
        iterate = True
        target = bltn_open(targetpath, "wb")

        if tarinfo.sparse is not None:
            # Sparse member: write each (offset, size) data segment, then
            # extend the file to its nominal size.
            try:
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
                target.seek(tarinfo.size)
                target.truncate()
            finally:
                target.close()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    target.close()
                    raise Exception("We need to read a new volume and you"
                        " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                # open_volume pre-read the continuation header into
                # firstmember; resume copying from there.
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
        target.close()
3018
7584f5c9
ERE
3019
3020 def makeunknown(self, tarinfo, targetpath):
3021 """Make a file from a TarInfo object with an unknown type
3022 at targetpath.
3023 """
3024 self.makefile(tarinfo, targetpath)
3025 self._dbg(1, "tarfile: Unknown file type %r, " \
3026 "extracted as regular file." % tarinfo.type)
3027
3028 def makefifo(self, tarinfo, targetpath):
3029 """Make a fifo called targetpath.
3030 """
3031 if hasattr(os, "mkfifo"):
3032 os.mkfifo(targetpath)
3033 else:
3034 raise ExtractError("fifo not supported by system")
3035
3036 def makedev(self, tarinfo, targetpath):
3037 """Make a character or block device called targetpath.
3038 """
3039 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3040 raise ExtractError("special devices not supported by system")
3041
3042 mode = tarinfo.mode
3043 if tarinfo.isblk():
3044 mode |= stat.S_IFBLK
3045 else:
3046 mode |= stat.S_IFCHR
3047
3048 os.mknod(targetpath, mode,
3049 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3050
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
        (platform limitation), we try to make a copy of the referenced file
        instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract(): _link_target was prepared there for
                # hard links.
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # Hard-link target not on disk: extract the archived
                    # target member directly to targetpath instead.
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # Platform refused to create the link: fall back to copying
            # the link target's content out of the archive.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
3073
3074 def chown(self, tarinfo, targetpath):
3075 """Set owner of targetpath according to tarinfo.
3076 """
3077 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3078 # We have to be root to do so.
3079 try:
3080 g = grp.getgrnam(tarinfo.gname)[2]
3081 except KeyError:
3082 g = tarinfo.gid
3083 try:
3084 u = pwd.getpwnam(tarinfo.uname)[2]
3085 except KeyError:
3086 u = tarinfo.uid
3087 try:
3088 if tarinfo.issym() and hasattr(os, "lchown"):
3089 os.lchown(targetpath, u, g)
3090 else:
be60ffd0
ERE
3091 os.chown(targetpath, u, g)
3092 except OSError as e:
7584f5c9
ERE
3093 raise ExtractError("could not change owner")
3094
3095 def chmod(self, tarinfo, targetpath):
3096 """Set file permissions of targetpath according to tarinfo.
3097 """
3098 if hasattr(os, 'chmod'):
3099 try:
3100 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3101 except OSError as e:
7584f5c9
ERE
3102 raise ExtractError("could not change mode")
3103
3104 def utime(self, tarinfo, targetpath):
3105 """Set modification time of targetpath according to tarinfo.
3106 """
3107 if not hasattr(os, 'utime'):
3108 return
3109 try:
3110 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3111 except OSError as e:
7584f5c9
ERE
3112 raise ExtractError("could not change modification time")
3113
3114 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        Header errors in the middle of the archive are skipped block by
        block when ``ignore_zeros`` is set; a bad header at offset 0 means
        the file is not a tar archive and raises ReadError instead.
        """
        self._check("ra")
        # A member may have been pre-fetched (e.g. by open_volume); hand
        # it out exactly once.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: normally the end-of-archive marker,
                # but skip it when ignore_zeros is on.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # Garbage right at the start: not a tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            # Record the member unless the caller disabled bookkeeping.
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            self._loaded = True

        return tarinfo
3161
3162 #--------------------------------------------------------------------------
3163 # Little helper methods:
3164
3165 def _getmember(self, name, tarinfo=None, normalize=False):
3166 """Find an archive member by name from bottom to top.
3167 If tarinfo is given, it is used as the starting point.
3168 """
3169 # Ensure that all members have been loaded.
3170 members = self.getmembers()
3171
3172 # Limit the member search list up to tarinfo.
3173 if tarinfo is not None:
3174 members = members[:members.index(tarinfo)]
3175
3176 if normalize:
3177 name = os.path.normpath(name)
3178
3179 for member in reversed(members):
3180 if normalize:
3181 member_name = os.path.normpath(member.name)
3182 else:
3183 member_name = member.name
3184
3185 if name == member_name:
3186 return member
3187
3188 def _load(self):
3189 """Read through the entire archive file and look for readable
3190 members.
3191 """
3192 while True:
3193 tarinfo = self.next()
3194 if tarinfo is None:
3195 break
3196 self._loaded = True
3197
3198 def _check(self, mode=None):
3199 """Check if TarFile is still open, and if the operation's mode
3200 corresponds to TarFile's mode.
3201 """
3202 if self.closed:
be60ffd0 3203 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3204 if mode is not None and self.mode not in mode:
be60ffd0 3205 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3206
3207 def _find_link_target(self, tarinfo):
3208 """Find the target member of a symlink or hardlink member in the
3209 archive.
3210 """
3211 if tarinfo.issym():
3212 # Always search the entire archive.
3213 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3214 limit = None
3215 else:
3216 # Search the archive before the link, because a hard link is
3217 # just a reference to an already archived file.
3218 linkname = tarinfo.linkname
3219 limit = tarinfo
3220
3221 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3222 if member is None:
3223 raise KeyError("linkname %r not found" % linkname)
3224 return member
3225
3226 def __iter__(self):
3227 """Provide an iterator object.
3228 """
3229 if self._loaded:
3230 return iter(self.members)
3231 else:
3232 return TarIter(self)
3233
1027433a 3234 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3235 """Write debugging output to sys.stderr.
3236 """
3237 if level <= self.debug:
1027433a 3238 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3239
    def __enter__(self):
        # Context-manager entry: refuse to enter a closed archive.
        self._check()
        return self
3243
3244 def __exit__(self, type, value, traceback):
3245 if type is None:
3246 self.close()
3247 else:
3248 # An exception occurred. We must not call close() because
3249 # it would try to write end-of-archive blocks and padding.
3250 if not self._extfileobj:
3251 self.fileobj.close()
3252 self.closed = True
3253# class TarFile
3254
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object."""
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: under rare circumstances getmembers() may
        # be called during iteration, which would otherwise cause TarIter
        # to stop prematurely; serve cached members by index first.
        tf = self.tarfile
        if self.index == 0 and tf.firstmember is not None:
            tarinfo = tf.next()
        elif self.index < len(tf.members):
            tarinfo = tf.members[self.index]
        elif not tf._loaded:
            tarinfo = tf.next()
            if not tarinfo:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
3293
6690f5e0
PG
3294#---------------------------------------------------------
3295# support functionality for rescue mode
3296#---------------------------------------------------------
3297
def gen_rescue_index (backup_tar_path, password=None, key=None):
    # Build a pseudo index for rescuing a damaged (encrypted) backup.
    # NOTE(review): draft implementation -- ``offsets`` is reconstructed
    # from the encrypted container but never consumed, and ``psidx`` is
    # always returned empty; the index assembly is still to be written.
    psidx = [] # pseudo index, return value
    offsets = None
    secret = None

    # Derive the crypto secret from whichever credential was supplied;
    # password takes precedence over a raw key.
    if password is not None:
        secret = (crypto.PDTCRYPT_SECRET_PW, password)
    elif key is not None:
        secret = (crypto.PDTCRYPT_SECRET_KEY, key)

    if secret is not None:
        # Recover object offsets by scanning the encrypted archive.
        offsets = crypto.reconstruct_offsets (backup_tar_path, secret)

    return psidx
7584f5c9
ERE
3312
3313#--------------------
3314# exported functions
3315#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # ``open`` here is TarFile.open (rebound at module level below).
        open(name).close()
    except TarError:
        return False
    return True
3326
bltn_open = open        # keep a reference to the builtin open()
open = TarFile.open     # module-level open() is the TarFile constructor