add restore helper handling for reconstructed indices
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
be60ffd0 47import io
dfd7865e
PG
48import mmap
49import operator
50import os
51import re
7584f5c9
ERE
52import shutil
53import stat
7584f5c9 54import struct
dfd7865e
PG
55import sys
56import time
7584f5c9 57
c7c736b6
PG
58import traceback # XXX
59
8ab8fac5 60from . import crypto
6e812ad9 61
7584f5c9
ERE
62try:
63 import grp, pwd
64except ImportError:
65 grp = pwd = None
66
be60ffd0
ERE
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass

# from tarfile import *
# Public API intentionally mirrors the stdlib tarfile module.
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

from builtins import open as _open # Since 'open' is TarFile.open
80
7584f5c9
ERE
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string (8 bytes: magic+version)
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Member type flags as stored in the header's typeflag field.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

# gzip member header (RFC 1952): magic[2], method, flags, mtime[4],
# deflate flags, OS code -- all little-endian.
GZ_FMT_HEADER = b"<BBBBLBB"
GZ_HEADER_SIZE = 10             # not including the name
GZ_MAGIC = (0x1f, 0x8b)         # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08        # 0o10
GZ_FLAG_FTEXT = 1 << 0          # ASCII payload
GZ_FLAG_FHCRC = 1 << 1          # CRC16
GZ_FLAG_FEXTRA = 1 << 2         # extra field
GZ_FLAG_FNAME = 1 << 3          # set by default in gzip
GZ_FLAG_FCOMMENT = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03               # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                GZ_METHOD_DEFLATE)

# How strictly archives are validated when reading.
TOLERANCE_STRICT = 0
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE = 2 # deduce metadata from archive contents

BUFSIZE = 16 * 1024
#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

# Bit flags describing how the archive stream is processed; OR-able,
# e.g. ARCMODE_ENCRYPT | ARCMODE_COMPRESS | ARCMODE_CONCAT.
ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2
def arcmode_fmt (m):
    """Render an ARCMODE_* bitmask as a human-readable string.

    ARCMODE_PLAIN yields "PLAIN"; any combination of the other flags
    yields e.g. "[ ENCRYPT | COMPRESS ]".
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = [label
              for bit, label in ((ARCMODE_ENCRYPT, "ENCRYPT"),
                                 (ARCMODE_COMPRESS, "COMPRESS"),
                                 (ARCMODE_CONCAT, "CONCAT"))
              if m & bit]
    if not labels:
        # unknown bits only: same output as the historical implementation
        return "[ ]"
    return "[ " + " | ".join (labels) + " ]"
167
168
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Combine archive options into an ARCMODE_* bitmask, starting from *init*."""
    selections = ((bool (concat) is True,     ARCMODE_CONCAT),
                  (encryption is not None,    ARCMODE_ENCRYPT),
                  (comptype == "gz",          ARCMODE_COMPRESS))
    mode = init
    for active, flag in selections:
        if active:
            mode |= flag
    return mode
178
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------

# On Windows ("nt"/"ce") filenames are handled as UTF-8; elsewhere the
# filesystem encoding of the current locale is honoured.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Encode *s* and fit it into a NUL-padded bytes field of *length* bytes.

    Longer values are truncated; shorter ones padded with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    padding = b"\0" * (length - len(encoded))
    return encoded[:length] + padding
233
be60ffd0
ERE
def nts(s, encoding, errors):
    """Decode a NUL-terminated bytes object into a string.

    Everything from the first NUL byte onwards is discarded; a value
    without a NUL is decoded whole.
    """
    head, _, _ = s.partition(b"\0")
    return head.decode(encoding, errors)
241
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
    of specific size.

    str input is encoded first; bytes pass through. The result is
    truncated or NUL-padded to exactly *length* bytes.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length] + b"\0" * (length - len(data))
7584f5c9
ERE
249
def nti(s):
    """Convert a tar number field to a Python integer.

    Understands both encodings produced by itn(): NUL-terminated octal
    ASCII, and GNU base-256 where the leading byte is 0o200 (positive)
    or 0o377 (negative).

    Raises InvalidHeaderError if the octal text cannot be parsed.
    """
    lead = s[0]
    if lead in (0o200, 0o377):
        # GNU base-256: big-endian payload follows the marker byte
        n = int.from_bytes(s[1:], byteorder="big")
        if lead == 0o377:
            # negative numbers are stored as the two's complement
            n -= 256 ** (len(s) - 1)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
268
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a tar number field.

    POSIX 1003.1-1988 requires numbers to be encoded as NUL-terminated
    octal strings, which limits them to (8**(digits-1))-1.  In GNU
    format, values outside that range use the base-256 encoding: a
    leading 0o200 byte marks a positive number, 0o377 a negative one,
    and the remaining digits-1 bytes hold a big-endian base-256 value
    (negatives as two's complement).

    Raises ValueError when the value fits neither encoding.
    """
    octal_limit = 8 ** (digits - 1)
    if 0 <= n < octal_limit:
        return bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL

    base256_limit = 256 ** (digits - 1)
    if format == GNU_FORMAT and -base256_limit <= n < base256_limit:
        if n >= 0:
            field = bytearray([0o200])
        else:
            field = bytearray([0o377])
            n = 256 ** digits + n       # two's complement representation
        for _ in range(digits - 1):
            field.insert(1, n & 0o377)  # emit least-significant byte first
            n >>= 8
        return field

    raise ValueError("overflow in number field")
7584f5c9
ERE
296
def calc_chksums(buf):
    """Return the (unsigned, signed) checksum of a 512-byte header block.

    Per the tar format, the 8-byte chksum field (offset 148) is summed
    as if it contained spaces -- the "8x" skips it and the constant 256
    accounts for eight 0x20 bytes.  Some historical tars (Sun, NeXT)
    summed signed chars, hence the second variant.
    """
    def _summed(fmt):
        # one C-level pass over 148 bytes + skipped field + 356 bytes
        return 256 + sum(struct.unpack_from(fmt, buf))

    return _summed("148B8x356B"), _summed("148b8x356b")
309
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    With length=None the entire remaining content is copied.  A short
    read before *length* bytes were transferred raises OSError; the
    partial data read so far is still written to *dst* first.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    full_blocks, tail = divmod(length, BUFSIZE)
    chunk_sizes = [BUFSIZE] * full_blocks
    if tail != 0:
        chunk_sizes.append(tail)
    for wanted in chunk_sizes:
        chunk = src.read(wanted)
        dst.write(chunk)
        if len(chunk) < wanted:
            raise OSError("end of file reached")
c7c736b6 331
7584f5c9 332
def filemode(mode):
    """Deprecated in this location; use stat.filemode.

    Retained for backward compatibility: warns, then forwards to the
    stdlib implementation.
    """
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
339
# Exception hierarchy: TarError is the root; HeaderError groups the
# per-header parsing failures; the crypto errors extend the stdlib set.
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for error during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for error during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end-of-file condition when it is not an error."""
    pass
7584f5c9
ERE
385
386#---------------------------
387# internal stream interface
388#---------------------------
389class _LowLevelFile:
390 """Low-level file object. Supports reading and writing.
391 It is used instead of a regular file object for streaming
392 access.
393 """
394
395 def __init__(self, name, mode):
ad4402e8 396 _mode = {
7584f5c9 397 "r": os.O_RDONLY,
c7c736b6 398 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
7584f5c9
ERE
399 }[mode]
400 if hasattr(os, "O_BINARY"):
f0287fb7 401 _mode |= os.O_BINARY # pylint: disable=no-member
be60ffd0 402 self.fd = os.open(name, _mode, 0o666)
ad4402e8 403 self.offset = 0
7584f5c9
ERE
404
405 def close(self):
406 os.close(self.fd)
407
408 def read(self, size):
ad4402e8
ERE
409 ret = os.read(self.fd, size)
410 self.offset += len(ret)
411 return ret
7584f5c9 412
867f75f7
PG
413 def write(self, s, pos=None):
414 if pos is not None:
415 p0 = self.offset
416 os.lseek (self.fd, pos, os.SEEK_SET)
417 n = os.write(self.fd, s)
418 if pos is None:
419 self.offset += len(s)
420 else:
421 append = pos + n - p0
422 if append > 0:
423 self.offset += append
424 os.lseek (self.fd, p0, os.SEEK_SET)
7584f5c9 425
ad4402e8
ERE
426 def tell(self):
427 return self.offset
428
c7c736b6
PG
429 def seek_set (self, pos):
430 os.lseek (self.fd, pos, os.SEEK_SET)
431 self.offset = pos
432
8ab8fac5 433
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952), optionally with FNAME.

    A non-None *name* is stored NUL-terminated, with any ``.pdtcrypt``
    and ``.gz`` suffix stripped, and the FNAME flag set.  RFC1952 says
    we must use ISO-8859-1 for the FNAME field.
    """
    now = int(time.time())
    flags = 0x0

    if name is None:
        fname = b""
    else:
        flags |= GZ_FLAG_FNAME
        fname = name.encode("iso-8859-1", "replace") if type(name) is str \
                else name
        if fname.endswith(b".pdtcrypt"):
            fname = fname[:-9]
        if fname.endswith(b".gz"):
            fname = fname[:-3]
        fname += NUL

    fixed = struct.pack (GZ_FMT_HEADER,
                         GZ_MAGIC [0], GZ_MAGIC [1],
                         GZ_METHOD_DEFLATE, flags,
                         now,
                         GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return fixed + fname
458
d601d33b 459
7584f5c9
ERE
460class _Stream:
461 """Class that serves as an adapter between TarFile and
462 a stream-like object. The stream-like object only
463 needs to have a read() or write() method and is accessed
464 blockwise. Use of gzip or bzip2 compression is possible.
465 A stream-like object could be for example: sys.stdin,
466 sys.stdout, a socket, a tape device etc.
467
3031b7ae
PG
468 _Stream is intended to be used only internally but is
469 nevertherless used externally by Deltatar.
470
471 When encrypting, the ``enccounter`` will be used for
472 initializing the first cryptographic context. When
473 decrypting, its value will be compared to the decrypted
474 object. Decryption fails if the value does not match.
475 In effect, this means that a ``_Stream`` whose ctor was
476 passed ``enccounter`` can only be used to encrypt or
477 decrypt a single object.
7584f5c9
ERE
478 """
479
c7c736b6 480 remainder = -1 # track size in encrypted entries
04f4c7ab 481 tolerance = TOLERANCE_STRICT
c7c736b6 482
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        ``comptype`` is one of "tar", "gz", "bz2", "xz" or "*"
        (transparent detection); ``concat`` and ``encryption`` select
        the ARCMODE_* bits.  When ``fileobj`` is None a _LowLevelFile
        is opened on ``name``.  ``tolerance`` controls how strictly
        damaged input is treated on read.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.concat_pos = 0
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode the compressor/crypto contexts are
                    # (re)initialized per object via next() instead
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                    self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed writing with encryption still needs a
                # crypto header up front (unless concat defers it)
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # don't leak the fd we opened ourselves
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 592
7584f5c9
ERE
593 def __del__(self):
594 if hasattr(self, "closed") and not self.closed:
fac2cfe1
PG
595 try:
596 self.close()
597 except crypto.InternalError:
598 # context already finalized due to abort but close() tried
599 # to use it
600 pass
7584f5c9 601
c7c736b6 602
d1c38f40
PG
    def next (self, name):
        """Start a new object in the archive stream; return its offset.

        Finalizes any open compression context, flushes the write
        buffer, then rotates the encryption and/or compression contexts
        for the next entry named *name*.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # any mode bit besides encrypt/compress (i.e. concat-only):
        # the new block starts right at the current file position
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # without encryption the gzip member header marks the block
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
617
618
    def next_volume (self, name):
        """Re-initialize write contexts at the start of a new volume.

        With non-concat modes, this is taken care of by the _Stream
        ctor as invoked by the newvol handler.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
630
c7c736b6 631
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
648
649
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            # re-read the placeholder written by _init_write_encrypt()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # finalize the context; overwrite the dummy with the real header
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
670
671
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and append the gzip trailer (CRC32 + size)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
            self.buf = b""
57db1546
PG
687
688
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        # only the very first member of the stream carries the FNAME field
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 707
ac5e4184 708
7584f5c9
ERE
709 def write(self, s):
710 """Write string s to the stream.
711 """
712 if self.comptype == "gz":
c2ffe2ec 713 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
7584f5c9 714 self.pos += len(s)
5fdff89f 715 self.concat_pos += len(s)
53732900 716 if self.cmp is not None:
7584f5c9
ERE
717 s = self.cmp.compress(s)
718 self.__write(s)
719
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        # The empty write drains all complete bufsize-chunks first, so at
        # most one partial chunk remains in self.buf; flush that as-is.
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
725
7584f5c9 726 def __write(self, s):
548bb8d5
CH
727 """Writes (and encodes) string s to the stream blockwise
728
729 will wait with encoding/writing until block is complete
7584f5c9
ERE
730 """
731 self.buf += s
732 while len(self.buf) > self.bufsize:
6e812ad9 733 self.__enc_write(self.buf[:self.bufsize])
7584f5c9
ERE
734 self.buf = self.buf[self.bufsize:]
735
867f75f7 736
5f38bff6 737 def __write_to_file(self, s, pos=None):
6e812ad9 738 '''
5f38bff6 739 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
cb7a3911 740 given, the stream will seek to that position first and back afterwards,
5f38bff6 741 and the total of bytes written is not updated.
6e812ad9 742 '''
867f75f7 743 self.fileobj.write(s, pos)
5f38bff6
PG
744 if pos is None:
745 self.bytes_written += len(s)
867f75f7 746
6e812ad9
DGM
747
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                # process() may consume only part of the input
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
769
6e812ad9 770
784175ba
CH
771 def estim_file_size(self):
772 """ estimates size of file if closing it now
773
774 The result may differ greatly from the amount of data sent to write()
775 due to compression, encryption and buffering.
776
777 In tests the result (before calling close()) was up to 12k smaller than
778 the final file size if compression is being used because zlib/bz2
779 compressors do not allow inspection of their buffered data :-(
780
ba5a449e
CH
781 Still, we add what close() would add: 8 bytes for gz checksum, one
782 encryption block size if encryption is used and the size of our own
783 buffer
784175ba
CH
784 """
785 if self.closed:
786 return self.bytes_written
787
788 result = self.bytes_written
789 if self.buf:
790 result += len(self.buf)
791 if self.comptype == 'gz':
ba5a449e 792 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
793 return result
794
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        With close_fileobj=False only the gzip trailer of a read stream
        is verified; the underlying file object is left open.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            # only close file objects we opened ourselves
            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
826
54128a00 827
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses one RFC 1952 member header, skipping the optional FEXTRA,
        FNAME, FCOMMENT and FHCRC fields.  Raises EndOfFile at EOF,
        ReadError on a bad magic, CompressionError on a method other
        than deflate.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = ord (self.__read(1))
        if read1 != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            # extra field: 2-byte little-endian length, then payload
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            # 16-bit header CRC, not verified here
            self.__read(2)
863
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False at end of file, True otherwise.  Raises
        DecryptionError on malformed headers, bad parameters, or an IV
        counter mismatch.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
896
897
8de91f4f
PG
898 def _read_encrypt (self, buf):
899 """
900 Demote a program error to a decryption error in tolerant mode. This
901 allows recovery from corrupted headers and invalid data.
902 """
903 try:
904 return self.encryption.process (buf)
905 except RuntimeError as exn:
04f4c7ab 906 if self.tolerance != TOLERANCE_STRICT:
8de91f4f
PG
907 raise DecryptionError (exn)
908 raise
909
910
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.  Returns the remaining plaintext produced by
        the crypto context, or None when no encrypted object was open.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
                and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # undecrypted ciphertext left over; drop the bookkeeping
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                # authentication tag mismatch -- presumably corrupted or
                # tampered ciphertext
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
925
926
7584f5c9
ERE
    def tell(self):
        """Return the stream's file pointer position.
        """
        # self.pos counts bytes handed out by read(); it is maintained by
        # read()/seek() and may differ from the raw fileobj position.
        return self.pos
931
932 def seek(self, pos=0):
933 """Set the stream's file pointer to pos. Negative seeking
934 is forbidden.
935 """
936 if pos - self.pos >= 0:
937 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 938 for i in range(blocks):
7584f5c9
ERE
939 self.read(self.bufsize)
940 self.read(remainder)
941 else:
942 raise StreamError("seeking backwards is not allowed")
943 return self.pos
944
945 def read(self, size=None):
946 """Return the next size number of bytes from the stream.
947 If size is not defined, return all bytes of the stream
948 up to EOF.
949 """
950 if size is None:
951 t = []
952 while True:
953 buf = self._read(self.bufsize)
954 if not buf:
955 break
956 t.append(buf)
9dc7ac5c 957 buf = b"".join(t)
7584f5c9
ERE
958 else:
959 buf = self._read(size)
960 self.pos += len(buf)
961 return buf
962
3a7e1a50
ERE
963 def readline(self):
964 """Reads just one line, new line character included
965 """
f0fd5e3a 966 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
967 if b'\n' in self.dbuf:
968 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
969 ret = self.dbuf[:pos]
970 self.dbuf = self.dbuf[pos:]
971 return ret
972
1215b602 973 buf = []
3a7e1a50
ERE
974 while True:
975 chunk = self._read(self.bufsize)
976
f0fd5e3a 977 # nothing more to read, so return the buffer
3a7e1a50 978 if not chunk:
be60ffd0 979 return b''.join(buf)
3a7e1a50
ERE
980
981 buf.append(chunk)
f0fd5e3a
ERE
982
983 # if \n found, return the new line
be60ffd0
ERE
984 if b'\n' in chunk:
985 dbuf = b''.join(buf)
986 pos = dbuf.index(b'\n') + 1
1215b602 987 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
988 return dbuf[:pos]
989
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Handles decompression and, in ARCMODE_CONCAT archives, the
        transition between concatenated gzip objects.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    # keep a running CRC-32 of the decompressed payload
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    # data past the end of the current gzip object belongs
                    # to the next one: push it back and re-init the reader
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # stash any excess for the next call
        self.dbuf = t[size:]
        return t[:size]
1040
e4e5d0b8 1041
7584f5c9 1042 def __read(self, size):
ef3b4499
PG
1043 """
1044 Return size bytes from stream. If internal buffer is empty, read
1045 another block from the stream.
1046
1047 The function returns up to size bytes of data. When an error occurs
1048 during decryption, everything until the end of the last successfully
1049 finalized object is returned.
7584f5c9
ERE
1050 """
1051 c = len(self.buf)
8de91f4f 1052 t = [self.buf] if c > 0 else []
1ed44e7b 1053 good_crypto = len (t)
8de91f4f 1054
7584f5c9 1055 while c < size:
c7c736b6 1056 todo = size
8de91f4f
PG
1057 try:
1058 if self.arcmode & ARCMODE_ENCRYPT:
1059 if self.remainder <= 0:
1060 # prepare next object
044585c6
PG
1061 if self._init_read_encrypt () is False: # EOF
1062 buf = None
1063 break # while
8de91f4f
PG
1064
1065 # only read up to the end of the encrypted object
1066 todo = min (size, self.remainder)
1067 buf = self.fileobj.read(todo)
1068 if self.arcmode & ARCMODE_ENCRYPT:
1069 # decrypt the thing
1070 buf = self._read_encrypt (buf)
1071 if todo == self.remainder:
1072 # at the end of a crypto object; finalization will fail if
1073 # the GCM tag does not match
ef3b4499 1074 trailing = self._finalize_read_encrypt ()
8de91f4f
PG
1075 good_crypto = len (t) + 1
1076 if len (trailing) > 0:
1077 buf += trailing
1078 self.remainder = 0
1079 else:
1080 self.remainder -= todo
1081 except DecryptionError:
04f4c7ab 1082 if self.tolerance == TOLERANCE_STRICT:
8de91f4f
PG
1083 raise
1084 self.encryption.drop ()
1085 if good_crypto == 0:
1086 raise
1087 # this may occur at any of the three crypto operations above.
1088 # some objects did validate; discard all data after it; next
1089 # call will start with the bad object and error out immediately
1090 self.buf = b"".join (t [good_crypto:])
1091 return b"".join (t [:good_crypto])
c7c736b6
PG
1092
1093 if not buf: ## XXX stream terminated prematurely; this should be an error
7584f5c9 1094 break
c7c736b6 1095
7584f5c9
ERE
1096 t.append(buf)
1097 c += len(buf)
be60ffd0 1098 t = b"".join(t)
7584f5c9 1099 self.buf = t[size:]
fb27c6e8 1100
7584f5c9 1101 return t[:size]
7d372216 1102
7584f5c9
ERE
1103
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # pre-read one block so the magic bytes are available for sniffing
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # first call returns the sniffed block, then the attribute is
        # rebound so subsequent reads go straight to the file object
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the buffered magic bytes."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1130
7584f5c9
ERE
1131#------------------------
1132# Extraction file object
1133#------------------------
1134class _FileInFile(object):
1135 """A thin wrapper around an existing file object that
1136 provides a part of its data as an individual file
1137 object.
1138 """
1139
be60ffd0 1140 def __init__(self, fileobj, offset, size, blockinfo=None):
7584f5c9
ERE
1141 self.fileobj = fileobj
1142 self.offset = offset
1143 self.size = size
7584f5c9 1144 self.position = 0
be60ffd0
ERE
1145 self.name = getattr(fileobj, "name", None)
1146 self.closed = False
1147
1148 if blockinfo is None:
1149 blockinfo = [(0, size)]
1150
1151 # Construct a map with data and zero blocks.
1152 self.map_index = 0
1153 self.map = []
1154 lastpos = 0
1155 realpos = self.offset
1156 for offset, size in blockinfo:
1157 if offset > lastpos:
1158 self.map.append((False, lastpos, offset, None))
1159 self.map.append((True, offset, offset + size, realpos))
1160 realpos += size
1161 lastpos = offset + size
1162 if lastpos < self.size:
1163 self.map.append((False, lastpos, self.size, None))
1164
1165 def flush(self):
1166 pass
1167
1168 def readable(self):
1169 return True
1170
1171 def writable(self):
1172 return False
1173
1174 def seekable(self):
1175 return self.fileobj.seekable()
7584f5c9
ERE
1176
1177 def tell(self):
1178 """Return the current file position.
1179 """
1180 return self.position
1181
be60ffd0 1182 def seek(self, position, whence=io.SEEK_SET):
7584f5c9
ERE
1183 """Seek to a position in the file.
1184 """
be60ffd0
ERE
1185 if whence == io.SEEK_SET:
1186 self.position = min(max(position, 0), self.size)
1187 elif whence == io.SEEK_CUR:
1188 if position < 0:
1189 self.position = max(self.position + position, 0)
1190 else:
1191 self.position = min(self.position + position, self.size)
1192 elif whence == io.SEEK_END:
1193 self.position = max(min(self.size + position, self.size), 0)
1194 else:
1195 raise ValueError("Invalid argument")
1196 return self.position
7584f5c9
ERE
1197
1198 def read(self, size=None):
1199 """Read data from the file.
1200 """
1201 if size is None:
1202 size = self.size - self.position
1203 else:
1204 size = min(size, self.size - self.position)
1205
be60ffd0 1206 buf = b""
7584f5c9 1207 while size > 0:
7584f5c9 1208 while True:
be60ffd0
ERE
1209 data, start, stop, offset = self.map[self.map_index]
1210 if start <= self.position < stop:
7584f5c9 1211 break
be60ffd0
ERE
1212 else:
1213 self.map_index += 1
1214 if self.map_index == len(self.map):
1215 self.map_index = 0
1216 length = min(size, stop - self.position)
1217 if data:
1218 self.fileobj.seek(offset + (self.position - start))
1219 buf += self.fileobj.read(length)
7584f5c9 1220 else:
be60ffd0
ERE
1221 buf += NUL * length
1222 size -= length
1223 self.position += length
1224 return buf
7584f5c9 1225
be60ffd0
ERE
1226 def readinto(self, b):
1227 buf = self.read(len(b))
1228 b[:len(buf)] = buf
1229 return len(buf)
7584f5c9
ERE
1230
1231 def close(self):
7584f5c9 1232 self.closed = True
be60ffd0 1233#class _FileInFile
7584f5c9 1234
be60ffd0
ERE
1235
class ExFileObject(io.BufferedReader):
    # File-like object returned for reading a member's data: wraps a
    # _FileInFile window over the member's data region in a buffered
    # reader, honouring sparse maps when present.

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject
1243
1244#------------------
1245# Exported Classes
1246#------------------
class TarInfo(object):
    """Informational class which holds the details about an
    archive member given by a tar header block.
    TarInfo objects are returned by TarFile.getmember(),
    TarFile.getmembers() and TarFile.gettarinfo() and are
    usually created internally.
    """

    # __slots__ keeps per-member memory low; archives may contain very
    # many entries.  volume_offset supports multivolume members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
1260
7584f5c9
ERE
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
        of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1286
1287 # In pax headers the "name" and "linkname" field are called
1288 # "path" and "linkpath".
1289 def _getpath(self):
1290 return self.name
1291 def _setpath(self, name):
1292 self.name = name
1293 path = property(_getpath, _setpath)
1294
1295 def _getlinkpath(self):
1296 return self.linkname
1297 def _setlinkpath(self, linkname):
1298 self.linkname = linkname
1299 linkpath = property(_getlinkpath, _setlinkpath)
1300
1301 def __repr__(self):
1302 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1303
be60ffd0 1304 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1305 """Return the TarInfo's attributes as a dictionary.
1306 """
1307 info = {
1308 "name": self.name,
be60ffd0 1309 "mode": self.mode & 0o7777,
7584f5c9
ERE
1310 "uid": self.uid,
1311 "gid": self.gid,
1312 "size": self.size,
1313 "mtime": self.mtime,
1314 "chksum": self.chksum,
1315 "type": self.type,
1316 "linkname": self.linkname,
1317 "uname": self.uname,
1318 "gname": self.gname,
1319 "devmajor": self.devmajor,
36a315a0 1320 "devminor": self.devminor,
0eb5048f
ERE
1321 "offset_data": self.offset_data,
1322 "volume_offset": self.volume_offset
7584f5c9
ERE
1323 }
1324
1325 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1326 info["name"] += "/"
1327
7584f5c9
ERE
1328 return info
1329
be60ffd0
ERE
1330 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1331 errors="surrogateescape"):
7584f5c9
ERE
1332 """Return a tar header as a string of 512 byte blocks.
1333 """
1334 info = self.get_info(encoding, errors)
1335
1336 if format == USTAR_FORMAT:
be60ffd0 1337 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1338 elif format == GNU_FORMAT:
be60ffd0 1339 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1340 elif format == PAX_FORMAT:
1341 return self.create_pax_header(info, encoding, errors)
1342 else:
1343 raise ValueError("invalid format")
1344
be60ffd0 1345 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1346 """Return the object as a ustar header block.
1347 """
1348 info["magic"] = POSIX_MAGIC
1349
1350 if len(info["linkname"]) > LENGTH_LINK:
1351 raise ValueError("linkname is too long")
1352
1353 if len(info["name"]) > LENGTH_NAME:
1354 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1355
be60ffd0 1356 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1357
be60ffd0 1358 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1359 """Return the object as a GNU header block sequence.
1360 """
1361 info["magic"] = GNU_MAGIC
1362
2f854e77
ERE
1363 if self.ismultivol():
1364 prefix = [
1365 itn(info.get("atime", 0), 12, GNU_FORMAT),
1366 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1367 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1368 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1369 ]
be60ffd0 1370 info['prefix'] = b"".join(prefix)
0eb5048f 1371 info['size'] = info['size'] - self.volume_offset
2f854e77 1372
be60ffd0 1373 buf = b""
7584f5c9 1374 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1375 buf += self._create_gnu_long_header(info["linkname"],
1376 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1377
1378 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1379 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1380 encoding, errors)
7584f5c9 1381
be60ffd0 1382 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1383
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # only the part stored in this volume counts towards size
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # octal fields hold at most digits-1 octal digits plus a NUL
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1436
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

        Global headers apply to all following members; values are always
        encoded as UTF-8.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1442
1443 def _posix_split_name(self, name):
1444 """Split a name longer than 100 chars into a prefix
1445 and a name part.
1446 """
1447 prefix = name[:LENGTH_PREFIX + 1]
1448 while prefix and prefix[-1] != "/":
1449 prefix = prefix[:-1]
1450
1451 name = name[len(prefix):]
1452 prefix = prefix[:-1]
1453
1454 if not prefix or len(name) > LENGTH_NAME:
1455 raise ValueError("name is too long")
1456 return prefix, name
1457
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # pad the assembled fields to a full 512-byte block
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # splice the computed checksum into its field (offset 148, i.e.
        # BLOCKSIZE - 364, through BLOCKSIZE - 357)
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1485
1486 @staticmethod
1487 def _create_payload(payload):
1488 """Return the string payload filled with zero bytes
1489 up to the next 512 byte border.
1490 """
1491 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1492 if remainder > 0:
1493 payload += (BLOCKSIZE - remainder) * NUL
1494 return payload
1495
    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        for name.
        """
        # the long name is stored NUL-terminated in the payload blocks
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)
1512
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # each record is "<length> <keyword>=<value>\n" where <length>
            # counts the whole record including itself; iterate until the
            # length value is consistent with its own digit count
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1563
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on what is wrong with the block.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # decode the fixed-offset ustar fields
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # no ustar prefix: the prefix area of GNU-style members holds
            # the data offset used when reconstructing indices
            obj.offset_data = nti(buf[369:381])
        return obj
1628
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
        tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        # the header block just read starts one block before the current
        # file position
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)
1638
1639 #--------------------------------------------------------------------------
1640 # The following are methods that are called depending on the type of a
1641 # member. The entry point is _proc_member() which can be overridden in a
1642 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1643 # implement the following
1644 # operations:
1645 # 1. Set self.offset_data to the position where the data blocks begin,
1646 # if there is data that follows.
1647 # 2. Set tarfile.offset to the position where the next member's header will
1648 # begin.
1649 # 3. Return self or another valid TarInfo object.
1650 def _proc_member(self, tarfile):
1651 """Choose the right processing method depending on
1652 the type and call it.
1653 """
1654 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1655 return self._proc_gnulong(tarfile)
1656 elif self.type == GNUTYPE_SPARSE:
1657 return self._proc_sparse(tarfile)
1658 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1659 return self._proc_pax(tarfile)
1660 else:
1661 return self._proc_builtin(tarfile)
1662
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
        will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
1679
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
        or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next
1701
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # each extension block holds up to 21 (offset, numbytes) pairs
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # byte 504 flags whether yet another extension block follows
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # the header's size field held the stored (compacted) size; expose
        # the original file size instead
        self.size = origsize
        return self
1729
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        if next is not None:
            # GNU.volume.* headers describe the continuation of a member
            # split across volumes; apply them to the matching member.
            if "GNU.volume.filename" in pax_headers:
                if pax_headers["GNU.volume.filename"] == next.name:
                    if "GNU.volume.size" in pax_headers:
                        next.size = int(pax_headers["GNU.volume.size"])
                    if "GNU.volume.offset" in pax_headers:
                        next.volume_offset = int(pax_headers["GNU.volume.offset"])

                # NOTE(review): for global headers pax_headers IS
                # tarfile.pax_headers, so deleting while iterating .keys()
                # looks like it could raise "dict changed size during
                # iteration" — confirm against callers.
                for key in pax_headers.keys():
                    if key.startswith("GNU.volume"):
                        del tarfile.pax_headers[key]

        return next
1844
be60ffd0
ERE
1845 def _proc_gnusparse_00(self, next, pax_headers, buf):
1846 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1847 """
be60ffd0
ERE
1848 offsets = []
1849 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1850 offsets.append(int(match.group(1)))
1851 numbytes = []
1852 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1853 numbytes.append(int(match.group(1)))
1854 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1855
be60ffd0
ERE
1856 def _proc_gnusparse_01(self, next, pax_headers):
1857 """Process a GNU tar extended sparse header, version 0.1.
1858 """
1859 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1860 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1861
be60ffd0
ERE
1862 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1863 """Process a GNU tar extended sparse header, version 1.0.
1864 """
1865 fields = None
1866 sparse = []
1867 buf = tarfile.fileobj.read(BLOCKSIZE)
1868 fields, buf = buf.split(b"\n", 1)
1869 fields = int(fields)
1870 while len(sparse) < fields * 2:
1871 if b"\n" not in buf:
1872 buf += tarfile.fileobj.read(BLOCKSIZE)
1873 number, buf = buf.split(b"\n", 1)
1874 sparse.append(int(number))
1875 next.offset_data = tarfile.fileobj.tell()
1876 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1877
be60ffd0
ERE
1878 def _apply_pax_info(self, pax_headers, encoding, errors):
1879 """Replace fields with supplemental information from a previous
1880 pax extended or global header.
1881 """
1882 for keyword, value in pax_headers.items():
1883 if keyword == "GNU.sparse.name":
1884 setattr(self, "path", value)
1885 elif keyword == "GNU.sparse.size":
1886 setattr(self, "size", int(value))
1887 elif keyword == "GNU.sparse.realsize":
1888 setattr(self, "size", int(value))
1889 elif keyword in PAX_FIELDS:
1890 if keyword in PAX_NUMBER_FIELDS:
1891 try:
1892 value = PAX_NUMBER_FIELDS[keyword](value)
1893 except ValueError:
1894 value = 0
1895 if keyword == "path":
f0287fb7 1896 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1897 setattr(self, keyword, value)
7584f5c9
ERE
1898
1899 self.pax_headers = pax_headers.copy()
1900
be60ffd0
ERE
1901 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1902 """Decode a single field from a pax record.
1903 """
1904 try:
1905 return value.decode(encoding, "strict")
1906 except UnicodeDecodeError:
1907 return value.decode(fallback_encoding, fallback_errors)
1908
7584f5c9
ERE
1909 def _block(self, count):
1910 """Round up a byte count by BLOCKSIZE and return it,
1911 e.g. _block(834) => 1024.
1912 """
1913 blocks, remainder = divmod(count, BLOCKSIZE)
1914 if remainder:
1915 blocks += 1
1916 return blocks * BLOCKSIZE
1917
    def isreg(self):
        # True for any of the regular-file type flags.
        return self.type in REGULAR_TYPES
    def isfile(self):
        # Alias for isreg().
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        # Hard link (as opposed to issym() for symbolic links).
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        # sparse is only populated by the GNU sparse header processors.
        return self.sparse is not None
    def isdev(self):
        # Any device-like member: character, block or FIFO.
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        # A member continued from a previous volume: either flagged with the
        # GNU multivolume type, carrying a non-zero continuation offset, or
        # announced through a GNU.volume.offset pax header.
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1941# class TarInfo
1942
1943class TarFile(object):
1944 """The TarFile Class provides an interface to tar archives.
1945 """
1946
    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
1987
7584f5c9
ERE
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        deltatar extensions:
          max_volume_size    -- maximum volume size in bytes; must be at
                                least 3*BLOCKSIZE and requires a callable
                                new_volume_handler
          new_volume_handler -- called as handler(tarfile, base_name,
                                volume_number) when a volume fills up
          concat             -- enables concat processing mode (arcmode)
          nacl               -- stored as-is on the instance; not inspected
                                here (NOTE(review): presumably a crypto key
                                object — confirm against _Stream usage)
          save_to_members    -- if False, added members are not recorded in
                                self.members (lower memory usage)
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        # base_name is kept for multivolume support: volume file names are
        # derived from it by the new_volume_handler.
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        # A volume must hold at least one header block plus the two
        # terminating zero blocks written by close().
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any failure during setup, release the file we opened
            # ourselves and re-raise; externally supplied fileobjs are the
            # caller's responsibility.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2102
7584f5c9
ERE
2103 #--------------------------------------------------------------------------
2104 # Below are the classmethods which act as alternate constructors to the
2105 # TarFile class. The open() method is the only one that is needed for
2106 # public use; it is the "super"-constructor and is able to select an
2107 # adequate "sub"-constructor for a particular compression using the mapping
2108 # from OPEN_METH.
2109 #
2110 # This concept allows one to subclass TarFile without losing the comfort of
2111 # the super-constructor. A sub-constructor is registered and made available
2112 # by adding it to the mapping in OPEN_METH.
2113
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Transparent compression: try every registered opener in turn
            # until one succeeds, rewinding the file object between attempts.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            # Multivolume only works with plain tar in ':' modes: the
            # compressed variants cannot reopen the compressor mid-stream.
            if 'max_volume_size' in kwargs:
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            # Non-seekable stream mode, backed by a _Stream object.
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            # The TarFile owns the _Stream and must close it.
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: each member is an independently compressed /
            # encrypted object inside the stream.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2228
d39d4cbf
PG
2229
2230 @classmethod
2231 def open_at_offset(cls, offset, *a, **kwa):
2232 """
2233 Same as ``.open()``, but start reading at the given offset. Assumes a
2234 seekable file object.
2235 """
2236 fileobj = kwa.get ("fileobj")
2237 if fileobj is not None:
2238 fileobj.seek (offset)
2239 return cls.open (*a, **kwa)
2240
2241
7584f5c9
ERE
2242 @classmethod
2243 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2244 """Open uncompressed tar archive name for reading or writing.
2245 """
2246 if len(mode) > 1 or mode not in "raw":
2247 raise ValueError("mode must be 'r', 'a' or 'w'")
2248 return cls(name, mode, fileobj, **kwargs)
2249
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller supplied the file object; we only
        # close objects we created ourselves.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            # If GzipFile itself could not be constructed, propagate the
            # original OSError rather than masking it as a ReadError.
            if fileobj is None:
                raise
            raise ReadError("not a gzip file")
        except:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2280
2281 @classmethod
2282 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2283 """Open bzip2 compressed tar archive name for reading or writing.
2284 Appending is not allowed.
2285 """
2286 if len(mode) > 1 or mode not in "rw":
2287 raise ValueError("mode must be 'r' or 'w'.")
2288
2289 try:
2290 import bz2
2291 except ImportError:
2292 raise CompressionError("bz2 module is not available")
2293
be60ffd0
ERE
2294 fileobj = bz2.BZ2File(fileobj or name, mode,
2295 compresslevel=compresslevel)
7584f5c9
ERE
2296
2297 try:
2298 t = cls.taropen(name, mode, fileobj, **kwargs)
be60ffd0
ERE
2299 except (OSError, EOFError):
2300 fileobj.close()
7584f5c9
ERE
2301 raise ReadError("not a bzip2 file")
2302 t._extfileobj = False
2303 return t
2304
be60ffd0
ERE
2305 @classmethod
2306 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2307 """Open lzma compressed tar archive name for reading or writing.
2308 Appending is not allowed.
2309 """
2310 if mode not in ("r", "w"):
2311 raise ValueError("mode must be 'r' or 'w'")
2312
2313 try:
2314 import lzma
2315 except ImportError:
2316 raise CompressionError("lzma module is not available")
2317
2318 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2319
2320 try:
2321 t = cls.taropen(name, mode, fileobj, **kwargs)
2322 except (lzma.LZMAError, EOFError):
2323 fileobj.close()
2324 raise ReadError("not an lzma file")
2325 t._extfileobj = False
2326 return t
2327
7584f5c9
ERE
    # All *open() methods are registered here.
    # Maps the compression suffix used in mode strings to the name of the
    # classmethod that handles it (resolved via getattr in open()).
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
2335
2336 #--------------------------------------------------------------------------
2337 # The public methods which TarFile provides:
2338
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # In concat mode an empty archive has no object open yet; start
            # one so the trailer is encrypted/compressed like regular data.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            # Two zero blocks mark the end of the archive.
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2361
2362 def getmember(self, name):
2363 """Return a TarInfo object for member `name'. If `name' can not be
2364 found in the archive, KeyError is raised. If a member occurs more
2365 than once in the archive, its last occurrence is assumed to be the
2366 most up-to-date version.
2367 """
2368 tarinfo = self._getmember(name)
2369 if tarinfo is None:
2370 raise KeyError("filename %r not found" % name)
2371 return tarinfo
2372
2373 def getmembers(self):
2374 """Return the members of the archive as a list of TarInfo objects. The
2375 list has the same order as the members in the archive.
2376 """
2377 self._check()
2378 if not self._loaded: # if we want to obtain a list of
2379 self._load() # all members, we first have to
2380 # scan the whole archive.
2381 return self.members
2382
ad4402e8
ERE
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.
        """
        # last_block_offset is recorded by addfile() when the header is
        # written (via fileobj.next() in concat mode, tell() otherwise).
        return self.last_block_offset
ad4402e8 2389
7584f5c9
ERE
2390 def getnames(self):
2391 """Return the members of the archive as a list of their names. It has
2392 the same order as the list returned by getmembers().
2393 """
2394 return [tarinfo.name for tarinfo in self.getmembers()]
2395
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types that cannot be represented in a tar
        archive (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket).
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names, memoizing results (including
        # misses) in the class-level caches to avoid repeated passwd/group
        # lookups when archiving many files.
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2505
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                # Device members show major,minor instead of a size.
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash, mirroring `ls -l`.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
7584f5c9 2534
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded. `filter' is a function
        that expects a TarInfo object argument and returns the changed
        TarInfo object, if it returns None the TarInfo object will be
        excluded from the archive.

        Note: `exclude' is deprecated; use `filter' instead.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            # Recurse into the directory, forwarding exclude/filter so they
            # apply to every nested entry as well.
            if recursive:
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)
2595
defc9a22 2596 def _size_left_file(self):
be60ffd0 2597 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2598
be60ffd0 2599 Assumes self.max_volume_size is set.
ba5a449e 2600 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2601 """
ba5a449e 2602 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2603 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2604 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2605 # write only half a block when writting the end of a volume
ae48acc8 2606 # and filling with zeros
defc9a22
CH
2607 return BLOCKSIZE * (size_left // BLOCKSIZE)
2608
2609 def _size_left_stream(self):
ba5a449e
CH
2610 """ Calculates size left in a volume if using comression/encryption
2611
2612 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2613 (otherwise use _size_left_file)
2614 """
2615 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2616 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2617 - 2*BLOCKSIZE
2618 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2619
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        When max_volume_size is set, the member data may be split across
        several volumes; the new_volume_handler is invoked for each new
        volume and the continuation headers are typed GNUTYPE_MULTIVOL.
        """
        self._check("aw")

        # Work on a copy: this method mutates type/volume_offset below.
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # In concat mode each member starts a new stream object;
            # next() returns the offset of the new object.
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # Pick the appropriate "room left in this volume" estimator; with
        # no volume limit, pretend exactly the member size is available.
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the “new_volume_handler” is supposed to call .close() on the
                # “fileobj” _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
7584f5c9 2733
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Re-targets this TarFile at the next volume file, either by *name*
        (a path) or an already-open *fileobj*.  Internal bookkeeping
        (members, offset, inode cache) is reset for the new volume.  When
        writing a PAX archive, a global header carrying the continued
        member's name/size/offset is emitted into the new volume.
        *encryption* optionally overrides the previous stream's handler.
        '''
        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # Clone the old stream's settings so compression/encryption
                # carry over to the new volume.
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            # NOTE(review): *name* defaults to "" so this branch only fires
            # when the caller passes name=None explicitly.
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Prime firstmember with the continuation header of the
                # member that spans into this volume.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # Record the continued member in a PAX global header so
                    # the volume is self-describing.  self.volume_tarinfo was
                    # set by the caller (_size_write loop) before invoking
                    # the new_volume_handler.
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # Leave the object in a consistent closed state on any failure.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2819
e5f5681b 2820 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2821 """Extract all members from the archive to the current working
2822 directory and set owner, modification time and permissions on
2823 directories afterwards. `path' specifies a different directory
2824 to extract to. `members' is optional and must be a subset of the
2825 list returned by getmembers().
2826 """
2827 directories = []
2828
2829 if members is None:
2830 members = self
2831
2832 for tarinfo in members:
c474439c
ERE
2833 if self.volume_number > 0 and tarinfo.ismultivol():
2834 continue
2835
974408b5 2836 if filter and not filter(tarinfo):
e5f5681b
ERE
2837 continue
2838
7584f5c9
ERE
2839 if tarinfo.isdir():
2840 # Extract directories with a safe mode.
2841 directories.append(tarinfo)
2842 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2843 tarinfo.mode = 0o0700
2844 # Do not set_attrs directories, as we will do that further down
2845 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2846
2847 # Reverse sort directories.
be60ffd0 2848 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2849 directories.reverse()
2850
2851 # Set correct owner, mtime and filemode on directories.
2852 for tarinfo in directories:
2853 dirpath = os.path.join(path, tarinfo.name)
2854 try:
2855 self.chown(tarinfo, dirpath)
2856 self.utime(tarinfo, dirpath)
2857 self.chmod(tarinfo, dirpath)
be60ffd0 2858 except ExtractError as e:
7584f5c9
ERE
2859 if self.errorlevel > 1:
2860 raise
2861 else:
2862 self._dbg(1, "tarfile: %s" % e)
2863
786addd6 2864 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2865 """Extract a member from the archive to the current working directory,
2866 using its full name. Its file information is extracted as accurately
2867 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2868 specify a different directory using `path'. File attributes (owner,
2869 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2870 ``symlink_cb`` is a hook accepting a function that is passed the
2871 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2872 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2873 passed will be applied, skipping the actual extraction. In case the
2874 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2875 """
2876 self._check("r")
2877
be60ffd0 2878 if isinstance(member, str):
7584f5c9
ERE
2879 tarinfo = self.getmember(member)
2880 else:
2881 tarinfo = member
2882
2883 # Prepare the link target for makelink().
2884 if tarinfo.islnk():
2885 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2886
9b13f5c4 2887 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2888 return symlink_cb(member, path, set_attrs)
786addd6 2889
7584f5c9 2890 try:
be60ffd0
ERE
2891 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2892 set_attrs=set_attrs)
2893 except EnvironmentError as e:
7584f5c9
ERE
2894 if self.errorlevel > 0:
2895 raise
2896 else:
2897 if e.filename is None:
2898 self._dbg(1, "tarfile: %s" % e.strerror)
2899 else:
2900 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2901 except ExtractError as e:
7584f5c9
ERE
2902 if self.errorlevel > 1:
2903 raise
2904 else:
2905 self._dbg(1, "tarfile: %s" % e)
2906
2907 def extractfile(self, member):
2908 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2909 a filename or a TarInfo object. If `member' is a regular file or a
2910 link, an io.BufferedReader object is returned. Otherwise, None is
2911 returned.
7584f5c9
ERE
2912 """
2913 self._check("r")
2914
be60ffd0 2915 if isinstance(member, str):
7584f5c9
ERE
2916 tarinfo = self.getmember(member)
2917 else:
2918 tarinfo = member
2919
be60ffd0
ERE
2920 if tarinfo.isreg() or tarinfo.ismultivol() or\
2921 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2922 # If a member's type is unknown, it is treated as a
2923 # regular file.
2924 return self.fileobject(self, tarinfo)
2925
2926 elif tarinfo.islnk() or tarinfo.issym():
2927 if isinstance(self.fileobj, _Stream):
2928 # A small but ugly workaround for the case that someone tries
2929 # to extract a (sym)link as a file-object from a non-seekable
2930 # stream of tar blocks.
2931 raise StreamError("cannot extract (sym)link as file object")
2932 else:
2933 # A (sym)link's file object is its target's file object.
2934 return self.extractfile(self._find_link_target(tarinfo))
2935 else:
2936 # If there's no data associated with the member (directory, chrdev,
2937 # blkdev, etc.), return None instead of a file object.
2938 return None
2939
be60ffd0 2940 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2941 """Extract the TarInfo object tarinfo to a physical
2942 file called targetpath.
2943 """
2944 # Fetch the TarInfo object for the given name
2945 # and build the destination pathname, replacing
2946 # forward slashes to platform specific separators.
2947 targetpath = targetpath.rstrip("/")
2948 targetpath = targetpath.replace("/", os.sep)
2949
2950 # Create all upper directories.
2951 upperdirs = os.path.dirname(targetpath)
2952 if upperdirs and not os.path.exists(upperdirs):
2953 # Create directories that are not part of the archive with
2954 # default permissions.
2955 os.makedirs(upperdirs)
2956
2957 if tarinfo.islnk() or tarinfo.issym():
2958 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2959 else:
2960 self._dbg(1, tarinfo.name)
2961
2962 if tarinfo.isreg():
2963 self.makefile(tarinfo, targetpath)
2964 elif tarinfo.isdir():
2965 self.makedir(tarinfo, targetpath)
2966 elif tarinfo.isfifo():
2967 self.makefifo(tarinfo, targetpath)
2968 elif tarinfo.ischr() or tarinfo.isblk():
2969 self.makedev(tarinfo, targetpath)
2970 elif tarinfo.islnk() or tarinfo.issym():
2971 self.makelink(tarinfo, targetpath)
2972 elif tarinfo.type not in SUPPORTED_TYPES:
2973 self.makeunknown(tarinfo, targetpath)
2974 else:
2975 self.makefile(tarinfo, targetpath)
2976
be60ffd0
ERE
2977 if set_attrs:
2978 self.chown(tarinfo, targetpath)
2979 if not tarinfo.issym():
2980 self.chmod(tarinfo, targetpath)
2981 self.utime(tarinfo, targetpath)
7584f5c9
ERE
2982
2983 #--------------------------------------------------------------------------
2984 # Below are the different file methods. They are called via
2985 # _extract_member() when extract() is called. They can be replaced in a
2986 # subclass to implement other functionality.
2987
2988 def makedir(self, tarinfo, targetpath):
2989 """Make a directory called targetpath.
2990 """
2991 try:
2992 # Use a safe mode for the directory, the real mode is set
2993 # later in _extract_member().
be60ffd0
ERE
2994 os.mkdir(targetpath, 0o0700)
2995 except FileExistsError:
2996 pass
7584f5c9
ERE
2997
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.

        Copies the member's payload from the archive stream to disk.
        Sparse members are reassembled region by region; members that
        continue on another volume are handled by catching the OSError
        raised at end-of-volume, switching volumes via the registered
        new_volume_handler, and resuming the copy.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        decrypt = False  # NOTE(review): never read below -- looks vestigial
        iterate = True
        target = bltn_open(targetpath, "wb")

        if tarinfo.sparse is not None:
            # Sparse member: write each data region at its recorded offset,
            # then extend the file to its nominal size.
            try:
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
                target.seek(tarinfo.size)
                target.truncate()
            finally:
                target.close()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                # End of the current volume reached mid-copy.
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    target.close()
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                # open_volume() primed firstmember with the continuation
                # header; resume copying from the new volume's stream.
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
        target.close()
3039
7584f5c9
ERE
3040
3041 def makeunknown(self, tarinfo, targetpath):
3042 """Make a file from a TarInfo object with an unknown type
3043 at targetpath.
3044 """
3045 self.makefile(tarinfo, targetpath)
3046 self._dbg(1, "tarfile: Unknown file type %r, " \
3047 "extracted as regular file." % tarinfo.type)
3048
3049 def makefifo(self, tarinfo, targetpath):
3050 """Make a fifo called targetpath.
3051 """
3052 if hasattr(os, "mkfifo"):
3053 os.mkfifo(targetpath)
3054 else:
3055 raise ExtractError("fifo not supported by system")
3056
3057 def makedev(self, tarinfo, targetpath):
3058 """Make a character or block device called targetpath.
3059 """
3060 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3061 raise ExtractError("special devices not supported by system")
3062
3063 mode = tarinfo.mode
3064 if tarinfo.isblk():
3065 mode |= stat.S_IFBLK
3066 else:
3067 mode |= stat.S_IFCHR
3068
3069 os.mknod(targetpath, mode,
3070 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3071
3072 def makelink(self, tarinfo, targetpath):
3073 """Make a (symbolic) link called targetpath. If it cannot be created
3074 (platform limitation), we try to make a copy of the referenced file
3075 instead of a link.
3076 """
be60ffd0 3077 try:
7584f5c9
ERE
3078 # For systems that support symbolic and hard links.
3079 if tarinfo.issym():
7584f5c9
ERE
3080 os.symlink(tarinfo.linkname, targetpath)
3081 else:
3082 # See extract().
3083 if os.path.exists(tarinfo._link_target):
7584f5c9
ERE
3084 os.link(tarinfo._link_target, targetpath)
3085 else:
be60ffd0
ERE
3086 self._extract_member(self._find_link_target(tarinfo),
3087 targetpath)
3088 except symlink_exception:
7584f5c9 3089 try:
be60ffd0
ERE
3090 self._extract_member(self._find_link_target(tarinfo),
3091 targetpath)
7584f5c9
ERE
3092 except KeyError:
3093 raise ExtractError("unable to resolve link inside archive")
3094
3095 def chown(self, tarinfo, targetpath):
3096 """Set owner of targetpath according to tarinfo.
3097 """
3098 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3099 # We have to be root to do so.
3100 try:
3101 g = grp.getgrnam(tarinfo.gname)[2]
3102 except KeyError:
3103 g = tarinfo.gid
3104 try:
3105 u = pwd.getpwnam(tarinfo.uname)[2]
3106 except KeyError:
3107 u = tarinfo.uid
3108 try:
3109 if tarinfo.issym() and hasattr(os, "lchown"):
3110 os.lchown(targetpath, u, g)
3111 else:
be60ffd0
ERE
3112 os.chown(targetpath, u, g)
3113 except OSError as e:
7584f5c9
ERE
3114 raise ExtractError("could not change owner")
3115
3116 def chmod(self, tarinfo, targetpath):
3117 """Set file permissions of targetpath according to tarinfo.
3118 """
3119 if hasattr(os, 'chmod'):
3120 try:
3121 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3122 except OSError as e:
7584f5c9
ERE
3123 raise ExtractError("could not change mode")
3124
3125 def utime(self, tarinfo, targetpath):
3126 """Set modification time of targetpath according to tarinfo.
3127 """
3128 if not hasattr(os, 'utime'):
3129 return
3130 try:
3131 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3132 except OSError as e:
7584f5c9
ERE
3133 raise ExtractError("could not change modification time")
3134
3135 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        A member buffered in self.firstmember (e.g. by open_volume) is
        returned first.  Header parse errors are either skipped (when
        ignore_zeros is set), converted into ReadError, or terminate the
        scan, depending on the error kind and the current offset.
        """
        self._check("ra")
        if self.firstmember is not None:
            # Hand out the member buffered by open()/open_volume().
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: skip it when tolerating zeros.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # Bad header right at the start: not a tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                # Corruption after valid members is always fatal.
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # No further members: the archive is fully read.
            self._loaded = True

        return tarinfo
3182
3183 #--------------------------------------------------------------------------
3184 # Little helper methods:
3185
3186 def _getmember(self, name, tarinfo=None, normalize=False):
3187 """Find an archive member by name from bottom to top.
3188 If tarinfo is given, it is used as the starting point.
3189 """
3190 # Ensure that all members have been loaded.
3191 members = self.getmembers()
3192
3193 # Limit the member search list up to tarinfo.
3194 if tarinfo is not None:
3195 members = members[:members.index(tarinfo)]
3196
3197 if normalize:
3198 name = os.path.normpath(name)
3199
3200 for member in reversed(members):
3201 if normalize:
3202 member_name = os.path.normpath(member.name)
3203 else:
3204 member_name = member.name
3205
3206 if name == member_name:
3207 return member
3208
3209 def _load(self):
3210 """Read through the entire archive file and look for readable
3211 members.
3212 """
3213 while True:
3214 tarinfo = self.next()
3215 if tarinfo is None:
3216 break
3217 self._loaded = True
3218
3219 def _check(self, mode=None):
3220 """Check if TarFile is still open, and if the operation's mode
3221 corresponds to TarFile's mode.
3222 """
3223 if self.closed:
be60ffd0 3224 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3225 if mode is not None and self.mode not in mode:
be60ffd0 3226 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3227
3228 def _find_link_target(self, tarinfo):
3229 """Find the target member of a symlink or hardlink member in the
3230 archive.
3231 """
3232 if tarinfo.issym():
3233 # Always search the entire archive.
3234 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3235 limit = None
3236 else:
3237 # Search the archive before the link, because a hard link is
3238 # just a reference to an already archived file.
3239 linkname = tarinfo.linkname
3240 limit = tarinfo
3241
3242 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3243 if member is None:
3244 raise KeyError("linkname %r not found" % linkname)
3245 return member
3246
3247 def __iter__(self):
3248 """Provide an iterator object.
3249 """
3250 if self._loaded:
3251 return iter(self.members)
3252 else:
3253 return TarIter(self)
3254
1027433a 3255 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3256 """Write debugging output to sys.stderr.
3257 """
3258 if level <= self.debug:
1027433a 3259 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3260
3261 def __enter__(self):
3262 self._check()
3263 return self
3264
3265 def __exit__(self, type, value, traceback):
3266 if type is None:
3267 self.close()
3268 else:
3269 # An exception occurred. We must not call close() because
3270 # it would try to write end-of-archive blocks and padding.
3271 if not self._extfileobj:
3272 self.fileobj.close()
3273 self.closed = True
3274# class TarFile
3275
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Remember the TarFile and start before its first member."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator."""
        return self

    def __next__(self):
        """Yield the next member, reading ahead via TarFile.next() until
        the archive is exhausted, then flag the TarFile as loaded.

        Serving already-cached members by index guards against
        getmembers() being called during iteration, which would otherwise
        stop the iteration prematurely (SF #1100429).
        """
        cached = self.tarfile.members
        if self.index == 0 and self.tarfile.firstmember is not None:
            tarinfo = self.tarfile.next()
        elif self.index < len(cached):
            tarinfo = cached[self.index]
        elif not self.tarfile._loaded:
            tarinfo = self.tarfile.next()
            if not tarinfo:
                self.tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
3314
6690f5e0
PG
3315#---------------------------------------------------------
3316# support functionality for rescue mode
3317#---------------------------------------------------------
3318
65b35c42
PG
def locate_tar_hdr_candidates (fd):
    """
    Stub: scan *fd* for offsets that look like tar headers.  Rescue
    support for plain, uncompressed archives is not implemented yet.
    """
    raise NotImplementedError ("too soon")
3321
3322
def readable_tar_objects_offsets (ifd, cands):
    """
    Stub: filter tar header candidates of *ifd* for readable objects.
    Rescue support for plain, uncompressed archives is not implemented yet.
    """
    raise NotImplementedError ("too soon")
3325
3326
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Collect the offsets of every occurrence of the GZ magic in the file
    underlying *fd*.  A first hit at a non-zero offset means the file
    starts with leading garbage.

    Since the GZ magic is only two bytes long, plenty of false positives
    are to be expected inside binary payload data.

    :return: The list of offsets in the file.
    """
    hits = []
    view = mmap.mmap (fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    off = view.find (GZ_MAGIC_BYTES, 0)
    while off != -1:
        hits.append (off)
        off = view.find (GZ_MAGIC_BYTES, off + len (GZ_MAGIC_BYTES))

    return hits
3350
3351
# Verdicts returned by the header inspection helpers below.
HDR_CAND_GOOD = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK = 2 # not a header / object unreadable
3355
3356
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        # BUGFIX: os.read() returns b"" at EOF, which never equals NUL --
        # previously this spun forever when max < 0. Treat EOF before the
        # terminator as a parse failure.
        if not c:
            return None
        if c == NUL:
            break
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3382
3383
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname   = None
    flags   = 0x00
    dflags  = 0x00
    mtime   = 0x00000000
    oscode  = 0x00
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        # BUGFIX: struct.unpack() yields a tuple even for a single field;
        # the extra-field length must be extracted from it before use.
        # Also guard against EOF inside the two length bytes themselves.
        raw_xlen = os.read (fd, 2)
        if len (raw_xlen) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        (xlen, ) = struct.unpack ("<H", raw_xlen)
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # BUGFIX: the comment used to be read into *fname*, clobbering the
        # member name parsed above; consume it into its own variable.
        fcomment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                 encoding="iso-8859-1")
        if fcomment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }
3466
3467
def try_decompress (ifd, off, hdr):
    """
    Attempt to inflate the deflate stream starting at *off* in *ifd*.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    inflater  = zlib.decompressobj (-zlib.MAX_WBITS)
    cursor    = off
    total_out = 0 # size of decompressed data

    os.lseek (ifd, cursor, os.SEEK_SET)
    while True:
        chunk = os.read (ifd, BUFSIZE)
        cursor += len (chunk)
        try:
            total_out += len (inflater.decompress (chunk))
        except zlib.error as exn: # probably CRC32 mismatch; terminate softly
            break # fishy
        if inflater.eof is True:
            break
        if len (chunk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return total_out, cursor - off
3498
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect the header candidates of *ifd* and keep only the offsets of
    gzip objects that actually decompress.
    """
    usable = []

    for off in cands:
        verdict, hdr = inspect_gz_hdr (ifd, off)
        if verdict == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if verdict in (HDR_CAND_GOOD, HDR_CAND_FISHY):
            dlen, clen = try_decompress (ifd, off + hdr ["hlen"], hdr)
            if dlen > 0 and clen > 0:
                usable.append (off)

    return usable
3518
3519
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”)
    and return those locations that can actually be processed as
    compressed data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_gz_objects_offsets (fd, locate_gz_hdr_candidates (fd))
    finally:
        os.close (fd)
3533
3534
65b35c42
PG
3535def reconstruct_offsets_tar (fname):
3536 """
3537 From the given file, retrieve all tar header-like offsets (“candidates”).
3538 Then check each of those locations whether they can be processed as tar
3539 data.
3540 """
3541 ifd = os.open (fname, os.O_RDONLY)
3542
3543 try:
3544 cands = locate_tar_hdr_candidates (ifd)
3545 return readable_tar_objects_offsets (ifd, cands)
3546 finally:
3547 os.close (ifd)
3548
3549
d39d4cbf
PG
3550def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
3551 decr = None
d39d4cbf 3552
dfd7865e
PG
3553 if secret is not None:
3554 ks = secret [0]
3555
3556 if ks == crypto.PDTCRYPT_SECRET_PW:
3557 decr = crypto.Decrypt (password=secret [1])
3558 elif ks == crypto.PDTCRYPT_SECRET_KEY:
3559 key = binascii.unhexlify (secret [1])
3560 decr = crypto.Decrypt (key=key)
3561 else:
3562 raise RuntimeError
d39d4cbf
PG
3563
3564 tarobj = \
3565 TarFile.open_at_offset (offset,
3566 mode=mode,
3567 fileobj=fileobj,
3568 format=GNU_FORMAT,
3569 concat='#' in mode,
3570 encryption=decr,
3571 save_to_members=False,
3572 tolerance=TOLERANCE_RESCUE)
3573
3574 return tarobj.next ()
3575
3576
2d50b7f7
PG
3577def idxent_of_tarinfo (tarinfo):
3578 """
3579 Scrape the information relevant for the index from a *TarInfo* object.
3580 Keys like the inode number that lack a corresponding field in a TarInfo
3581 will be set to some neutral value.
3582 Example output:
3583
3584 { "inode" : 0
3585 , "uid" : 0
3586 , "path" : "snapshot://annotations.db"
3587 , "offset" : 0
3588 , "volume" : 0
3589 , "mode" : 33152
3590 , "ctime" : 1502798115
3591 , "mtime" : 1502196423
3592 , "size" : 144
3593 , "type" : "file"
3594 , "gid" : 0
3595 }
3596
3597 """
3598
3599 return \
3600 { "inode" : 0 # ignored when reading the index
3601 , "uid" : tarinfo.uid
3602 , "gid" : tarinfo.gid
3603 , "path" : tarinfo.name # keeping URI scheme
3604 , "offset" : 0 # to be added by the caller
3605 , "volume" : tarinfo.volume_offset
3606 , "mode" : tarinfo.mode
3607 , "ctime" : tarinfo.mtime
3608 , "mtime" : tarinfo.mtime
3609 , "size" : tarinfo.size
3610 , "type" : tarinfo.type
3611 }
3612
3613
d39d4cbf 3614def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
6690f5e0
PG
3615 psidx = [] # pseudo index, return value
3616 offsets = None
addcec42 3617 secret = crypto.make_secret (password=password, key=key)
6690f5e0
PG
3618
3619 if secret is not None:
3620 offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
dfd7865e
PG
3621 elif mode == "#gz":
3622 offsets = reconstruct_offsets_gz (backup_tar_path)
65b35c42
PG
3623 elif mode == "#":
3624 offsets = reconstruct_offsets_tar (backup_tar_path)
3625 else:
3626 raise TarError ("no rescue handling for mode “%s”" % mode)
dfd7865e
PG
3627
3628 fileobj = bltn_open (backup_tar_path, "rb")
3629 infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
3630 for off in offsets ]
3631 def aux (o, ti):
3632 ie = idxent_of_tarinfo (ti)
3633 ie ["offset"] = o
3634 return ie
3635 psidx = [ aux (o, ti) for o, ti in infos ]
6690f5e0
PG
3636
3637 return psidx
7584f5c9
ERE
3638
3639#--------------------
3640# exported functions
3641#--------------------
def is_tarfile(name):
    """Return True if *name* points to a tar archive that this module is
    able to handle, else return False.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
3652
# Keep a handle on the builtin before shadowing it: the module exports
# TarFile.open under the name "open", while bltn_open retains access to
# the original built-in open().
bltn_open = open
open = TarFile.open