implement tolerant gz header parser
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
be60ffd0 47import io
dfd7865e
PG
48import mmap
49import operator
50import os
51import re
7584f5c9
ERE
52import shutil
53import stat
7584f5c9 54import struct
dfd7865e
PG
55import sys
56import time
7584f5c9 57
c7c736b6
PG
58import traceback # XXX
59
8ab8fac5 60from . import crypto
6e812ad9 61
7584f5c9
ERE
62try:
63 import grp, pwd
64except ImportError:
65 grp = pwd = None
66
be60ffd0
ERE
67# os.symlink on Windows prior to 6.0 raises NotImplementedError
68symlink_exception = (AttributeError, NotImplementedError)
69try:
70 # OSError (winerror=1314) will be raised if the caller does not hold the
71 # SeCreateSymbolicLinkPrivilege privilege
72 symlink_exception += (OSError,)
73except NameError:
74 pass
75
7584f5c9
ERE
76# from tarfile import *
77__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
78
be60ffd0
ERE
79from builtins import open as _open # Since 'open' is TarFile.open
80
7584f5c9
ERE
81#---------------------------------------------------------
82# tar constants
83#---------------------------------------------------------
be60ffd0 84NUL = b"\0" # the null character
7584f5c9
ERE
85BLOCKSIZE = 512 # length of processing blocks
86RECORDSIZE = BLOCKSIZE * 20 # length of records
be60ffd0
ERE
87GNU_MAGIC = b"ustar \0" # magic gnu tar string
88POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
7584f5c9
ERE
89
90LENGTH_NAME = 100 # maximum length of a filename
91LENGTH_LINK = 100 # maximum length of a linkname
92LENGTH_PREFIX = 155 # maximum length of the prefix field
93
be60ffd0
ERE
94REGTYPE = b"0" # regular file
95AREGTYPE = b"\0" # regular file
96LNKTYPE = b"1" # link (inside tarfile)
97SYMTYPE = b"2" # symbolic link
98CHRTYPE = b"3" # character special device
99BLKTYPE = b"4" # block special device
100DIRTYPE = b"5" # directory
101FIFOTYPE = b"6" # fifo special device
102CONTTYPE = b"7" # contiguous file
103
104GNUTYPE_LONGNAME = b"L" # GNU tar longname
105GNUTYPE_LONGLINK = b"K" # GNU tar longlink
106GNUTYPE_SPARSE = b"S" # GNU tar sparse file
107GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
68ddf955 108 # another volume
7584f5c9 109
be60ffd0
ERE
110XHDTYPE = b"x" # POSIX.1-2001 extended header
111XGLTYPE = b"g" # POSIX.1-2001 global header
112SOLARIS_XHDTYPE = b"X" # Solaris extended header
7584f5c9
ERE
113
114USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
115GNU_FORMAT = 1 # GNU tar format
116PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
117DEFAULT_FORMAT = GNU_FORMAT
118
15a81fc0 119GZ_FMT_HEADER = b"<BBBBLBB"
203cb25e 120GZ_HEADER_SIZE = 10 # not including the name
15a81fc0
PG
121GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
122GZ_METHOD_DEFLATE = 0x08 # 0o10
dfd7865e
PG
123GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
124GZ_FLAG_FHCRC = 1 << 1 # CRC16
125GZ_FLAG_FEXTRA = 1 << 2 # extra field
126GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
127GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
128GZ_FLAG_RESERVED = 7 << 5 # unassigned
15a81fc0
PG
129GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
130GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
d601d33b
PG
131GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
132GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
133 GZ_METHOD_DEFLATE)
15a81fc0 134
04f4c7ab
PG
135TOLERANCE_STRICT = 0
136TOLERANCE_RECOVER = 1 # rely on offsets in index
137TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
138
dfd7865e
PG
139BUFSIZE = 16 * 1024
140
7584f5c9 141#---------------------------------------------------------
d1c38f40
PG
142# archive handling mode
143#---------------------------------------------------------
144
145ARCMODE_PLAIN = 0
146ARCMODE_ENCRYPT = 1 << 0
147ARCMODE_COMPRESS = 1 << 1
148ARCMODE_CONCAT = 1 << 2
149
def arcmode_fmt (m):
    """Render an archive-mode bitmask as a human-readable string.

    ARCMODE_PLAIN yields "PLAIN"; any combination of flags yields e.g.
    "[ ENCRYPT | COMPRESS ]"; a nonzero mode with no known flag yields
    "[ ]".
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = [label
              for bit, label in ((ARCMODE_ENCRYPT,  "ENCRYPT"),
                                 (ARCMODE_COMPRESS, "COMPRESS"),
                                 (ARCMODE_CONCAT,   "CONCAT"))
              if m & bit]
    if not labels:
        return "[ ]"
    return "[ " + " | ".join (labels) + " ]"
167
168
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Combine archive-mode flags from the given options.

    Starts from *init* and ORs in CONCAT, ENCRYPT and COMPRESS bits as
    implied by the arguments; only the "gz" compression type counts as
    compression here.
    """
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
178
179#---------------------------------------------------------
7584f5c9
ERE
180# tarfile constants
181#---------------------------------------------------------
182# File types that tarfile supports:
183SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
184 SYMTYPE, DIRTYPE, FIFOTYPE,
185 CONTTYPE, CHRTYPE, BLKTYPE,
186 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 187 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
188
189# File types that will be treated as a regular file.
190REGULAR_TYPES = (REGTYPE, AREGTYPE,
191 CONTTYPE, GNUTYPE_SPARSE)
192
193# File types that are part of the GNU tar format.
194GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 195 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
196
197# Fields from a pax header that override a TarInfo attribute.
198PAX_FIELDS = ("path", "linkpath", "size", "mtime",
199 "uid", "gid", "uname", "gname")
200
be60ffd0
ERE
201# Fields from a pax header that are affected by hdrcharset.
202PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
203
7584f5c9
ERE
204# Fields in a pax header that are numbers, all other fields
205# are treated as strings.
206PAX_NUMBER_FIELDS = {
207 "atime": float,
208 "ctime": float,
209 "mtime": float,
210 "uid": int,
211 "gid": int,
212 "size": int
213}
214
215#---------------------------------------------------------
7584f5c9
ERE
216# initialization
217#---------------------------------------------------------
be60ffd0
ERE
218
219if os.name in ("nt", "ce"):
220 ENCODING = "utf-8"
221else:
222 ENCODING = sys.getfilesystemencoding()
7584f5c9
ERE
223
224#---------------------------------------------------------
225# Some useful functions
226#---------------------------------------------------------
227
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Convert a string to a null-padded bytes object of exactly
    *length* bytes (truncating if the encoded form is longer).
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
233
be60ffd0
ERE
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string: everything
    up to the first NUL byte (or the whole buffer) is decoded.
    """
    head, _, _ = s.partition(b"\0")
    return head.decode(encoding, errors)
241
def sbtn(s, length, encoding, errors):
    """Convert a string or bytes object to a null-padded bytes object
    of exactly *length* bytes (truncating when necessary).
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length].ljust(length, b"\0")
7584f5c9
ERE
249
def nti(s):
    """Convert a tar number field to a Python number.

    Two encodings exist (see itn()): a leading 0o200/0o377 byte marks
    the GNU base-256 form (0o377 meaning negative); anything else is a
    NUL-terminated octal string.

    Raises InvalidHeaderError on a malformed octal field.
    """
    if s[0] in (0o200, 0o377):
        # GNU base-256: big-endian payload after the marker byte.
        n = int.from_bytes(s[1:], "big")
        if s[0] == 0o377:
            # negative: payload is the two's-complement representation
            n -= 256 ** (len(s) - 1)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
268
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a Python number to a tar number field.

    POSIX 1003.1-1988 requires numbers to be encoded as a string of
    octal digits followed by a null byte, which limits values to
    (8**(digits-1))-1.  GNU tar extends this: a leading 0o200 (positive)
    or 0o377 (negative) byte introduces a big-endian base-256
    representation in the remaining digits-1 bytes, covering
    +/- 256**(digits-1).

    Raises ValueError when the value fits neither encoding.
    """
    if 0 <= n < 8 ** (digits - 1):
        # Classic NUL-terminated octal representation.
        return bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        marker = 0o200 if n >= 0 else 0o377
        if n < 0:
            # two's-complement within the full digits-wide field
            n = 256 ** digits + n
        out = bytearray([marker])
        # keep only the low digits-1 bytes, big-endian
        out += (n % 256 ** (digits - 1)).to_bytes(digits - 1, "big")
        return out
    raise ValueError("overflow in number field")
7584f5c9
ERE
296
def calc_chksums(buf):
    """Calculate both checksum variants of a 512-byte header block.

    The 8-byte chksum field (offsets 148..155) is treated as if it were
    filled with spaces (hence the constant 256 = 8 * 0x20).  Some tars
    (Sun, NeXT) sum signed chars, so both an unsigned and a signed
    variant are returned as a tuple.
    """
    fmt_unsigned = "148B8x356B"   # skip the chksum field itself
    fmt_signed = "148b8x356b"
    unsigned_chksum = 256 + sum(struct.unpack_from(fmt_unsigned, buf))
    signed_chksum = 256 + sum(struct.unpack_from(fmt_signed, buf))
    return unsigned_chksum, signed_chksum
309
def copyfileobj(src, dst, length=None, bufsize=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    If *length* is None, the entire remaining content is copied.

    :param bufsize: chunk size for the copy loop; generalizes the
        previously hard-coded module-level BUFSIZE and defaults to the
        same 16 KiB value, so existing callers see identical behavior.

    Raises OSError when *src* is exhausted before *length* bytes could
    be read.
    """
    if bufsize is None:
        bufsize = 16 * 1024  # same value as module-level BUFSIZE
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    blocks, remainder = divmod(length, bufsize)
    for _ in range(blocks):
        buf = src.read(bufsize)
        dst.write(buf)
        if len(buf) < bufsize:
            # short read: the source ended prematurely
            raise OSError("end of file reached")
    if remainder != 0:
        buf = src.read(remainder)
        dst.write(buf)
        if len(buf) < remainder:
            raise OSError("end of file reached")
c7c736b6 331
7584f5c9 332
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    from warnings import warn
    warn("deprecated in favor of stat.filemode",
         DeprecationWarning, stacklevel=2)
    return stat.filemode(mode)
7584f5c9
ERE
339
# Exception hierarchy of this module.  TarError is the common base for
# archive-related failures; HeaderError subtypes distinguish the states
# a 512-byte header block can be in; the crypto errors extend TarError
# for the encryption layer.  EndOfFile deliberately derives from plain
# Exception: it signals a normal condition, not an archive error.
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for error during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for error during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
7584f5c9
ERE
384
385#---------------------------
386# internal stream interface
387#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
    It is used instead of a regular file object for streaming
    access.
    """

    def __init__(self, name, mode):
        # "w" opens O_RDWR rather than O_WRONLY: the stream reads back
        # the dummy crypto header in _finalize_write_encrypt() below —
        # NOTE(review): confirmed by the seek/read dance in _Stream.
        _mode = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        # logical "high-water" position, maintained manually so that
        # positional writes (pos=...) do not disturb it
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        """Read up to *size* bytes, advancing the tracked offset."""
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        """Write *s*; when *pos* is given, write at that absolute
        position and restore the previous file position afterwards.

        The tracked offset only grows by the amount (if any) the
        positional write extended past the previous offset.
        """
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            # extension beyond the old high-water mark, may be <= 0
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)

    def tell(self):
        """Return the tracked logical offset (not the OS position)."""
        return self.offset

    def seek_set (self, pos):
        """Absolute seek; keeps the tracked offset in sync."""
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos
431
8ab8fac5 432
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952).

    When *name* is given, the FNAME flag is set and the name is stored
    NUL-terminated after the fixed header, with any ".pdtcrypt" and
    ".gz" suffixes stripped first.
    """
    flags = 0x0
    fname = b""

    if name is not None:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            # RFC1952 says we must use ISO-8859-1 for the FNAME field.
            name = name.encode("iso-8859-1", "replace")
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        fname = name + NUL

    fixed = struct.pack (GZ_FMT_HEADER,
                         GZ_MAGIC [0], GZ_MAGIC [1],
                         GZ_METHOD_DEFLATE, flags,
                         int (time.time ()),
                         GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return fixed + fname
457
d601d33b 458
7584f5c9
ERE
459class _Stream:
460 """Class that serves as an adapter between TarFile and
461 a stream-like object. The stream-like object only
462 needs to have a read() or write() method and is accessed
463 blockwise. Use of gzip or bzip2 compression is possible.
464 A stream-like object could be for example: sys.stdin,
465 sys.stdout, a socket, a tape device etc.
466
3031b7ae
PG
467 _Stream is intended to be used only internally but is
468 nevertherless used externally by Deltatar.
469
470 When encrypting, the ``enccounter`` will be used for
471 initializing the first cryptographic context. When
472 decrypting, its value will be compared to the decrypted
473 object. Decryption fails if the value does not match.
474 In effect, this means that a ``_Stream`` whose ctor was
475 passed ``enccounter`` can only be used to encrypt or
476 decrypt a single object.
7584f5c9
ERE
477 """
478
c7c736b6 479 remainder = -1 # track size in encrypted entries
04f4c7ab 480 tolerance = TOLERANCE_STRICT
c7c736b6 481
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        :param name:          archive name; also used for the gzip FNAME field
        :param mode:          "r" or "w"
        :param comptype:      "tar", "gz", "bz2", "xz" or "*" (autodetect)
        :param fileobj:       underlying file object, or None to open *name*
                              via _LowLevelFile
        :param bufsize:       size of the internal write buffer
        :param concat:        enable concatenated multi-object archives
        :param encryption:    crypto context or None
        :param enccounter:    expected/initial IV counter (see class docstring)
        :param compresslevel: zlib level, gz mode only
        :param tolerance:     one of the TOLERANCE_* policies
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        # only meaningful with encryption; compared against decrypted headers
        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None              # (de)compressor object, set below
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""               # pending output bytes
        self.pos = 0                 # logical (uncompressed) position
        self.concat_pos = 0          # position within current gz member
        self.closed = False
        self.flags = 0               # gzip FLG byte seen on read
        self.last_block_offset = 0   # file offset of last object start
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None          # offset of pending crypto header

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode the first member is opened lazily by
                    # next(); otherwise start encryption/compression now
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                    self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # plain tar: encryption (if any) still needs its header
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # clean up the fd we opened ourselves, then re-raise as-is
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 591
7584f5c9
ERE
592 def __del__(self):
593 if hasattr(self, "closed") and not self.closed:
fac2cfe1
PG
594 try:
595 self.close()
596 except crypto.InternalError:
597 # context already finalized due to abort but close() tried
598 # to use it
599 pass
7584f5c9 600
c7c736b6 601
d1c38f40
PG
    def next (self, name):
        """Finalize the current object and start a new one named *name*.

        Order matters: flush and close the current gzip member first,
        sync buffered bytes out, then rotate the encryption object, then
        open the next gzip member.  Returns the file offset at which the
        new object begins (self.last_block_offset).
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # any mode bit besides ENCRYPT/COMPRESS (i.e. CONCAT): the next
        # object starts right at the current file position
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when encrypting, the crypto header (above) already fixed
            # the block offset
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
616
617
    def next_volume (self, name):
        """Rotate compression/encryption state at a volume boundary.

        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
629
c7c736b6 630
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.

        :param entry: name of the new object, passed to the crypto context
        :param set_last_block_offset: record the header position as the
            start of the new block
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
647
648
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            # re-read the dummy header written by _init_write_encrypt()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            # restore position before handing off to the crypto layer
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            # overwrite the dummy with the finalized header in place
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
669
670
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and emit the gzip member trailer
        (CRC32 + size of the member, both little-endian)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    # ISIZE: uncompressed size of this member, mod 2**32
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
                self.buf = b""
57db1546
PG
686
687
a0873dcc 688 def _init_write_gz (self, set_last_block_offset=False):
5fdff89f
ERE
689 '''
690 Add a new gzip block, closing last one
691 '''
be60ffd0 692 self.concat_pos = 0
c2ffe2ec 693 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
6de9444a 694 first = self.cmp is None
2b82f50c
ERE
695 self.cmp = self.zlib.compressobj(self.compresslevel,
696 self.zlib.DEFLATED,
697 -self.zlib.MAX_WBITS,
698 self.zlib.DEF_MEM_LEVEL,
699 0)
6e812ad9
DGM
700
701 # if aes, we encrypt after compression
6de9444a 702 if set_last_block_offset is True:
ad4402e8 703 self.last_block_offset = self.fileobj.tell()
6e812ad9 704
15a81fc0 705 self.__write(gz_header (self.name if first is True else None))
5fdff89f 706
ac5e4184 707
7584f5c9
ERE
708 def write(self, s):
709 """Write string s to the stream.
710 """
711 if self.comptype == "gz":
c2ffe2ec 712 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
7584f5c9 713 self.pos += len(s)
5fdff89f 714 self.concat_pos += len(s)
53732900 715 if self.cmp is not None:
7584f5c9
ERE
716 s = self.cmp.compress(s)
717 self.__write(s)
718
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        # push the remaining partial chunk through the encryption layer
        self.__enc_write (self.buf)
        self.buf = b""
724
    def __write(self, s):
        """Writes (and encodes) string s to the stream blockwise

        will wait with encoding/writing until block is complete
        """
        self.buf += s
        # flush only whole bufsize-sized chunks; a trailing partial
        # chunk stays buffered until __sync() or the next write
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]
734
867f75f7 735
    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.

        Note: relies on the extended write(s, pos=None) signature of
        _LowLevelFile, not the builtin file protocol.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)
867f75f7 745
6e812ad9
DGM
746
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            # the crypto context may consume less than the full input
            # when the per-object size limit is reached
            while len (buf) > 0:
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
768
6e812ad9 769
784175ba
CH
770 def estim_file_size(self):
771 """ estimates size of file if closing it now
772
773 The result may differ greatly from the amount of data sent to write()
774 due to compression, encryption and buffering.
775
776 In tests the result (before calling close()) was up to 12k smaller than
777 the final file size if compression is being used because zlib/bz2
778 compressors do not allow inspection of their buffered data :-(
779
ba5a449e
CH
780 Still, we add what close() would add: 8 bytes for gz checksum, one
781 encryption block size if encryption is used and the size of our own
782 buffer
784175ba
CH
783 """
784 if self.closed:
785 return self.bytes_written
786
787 result = self.bytes_written
788 if self.buf:
789 result += len(self.buf)
790 if self.comptype == 'gz':
ba5a449e 791 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
792 return result
793
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        :param close_fileobj: when False, only the gzip trailer of a
            read stream is verified and the underlying file object is
            left open.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
                # NOTE(review): read_length (the gzip ISIZE field) is
                # read but never compared against anything — confirm
                # whether a length check was intended here.
        self.closed = True
825
54128a00 826
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses one gzip member header (RFC 1952): magic, method, flags,
        then skips the optional FEXTRA, FNAME, FCOMMENT and FHCRC
        fields.  Raises EndOfFile at a clean EOF, ReadError on a bad
        magic and CompressionError on an unsupported method.
        """
        # raw-deflate decompressor; header/trailer are handled here
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = ord (self.__read(1))
        if read1 != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            # two-byte little-endian length, then the payload
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # NOTE(review): the FEXTRA payload is skipped via self.read(),
            # which goes through the decompression path, not the raw
            # self.__read() — confirm this is intended.
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            # 16-bit header CRC, not verified
            self.__read(2)
862
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False when the crypto layer reports end of file, True
        otherwise (including the no-encryption case).  Raises
        DecryptionError on invalid headers or parameters.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
895
896
8de91f4f
PG
897 def _read_encrypt (self, buf):
898 """
899 Demote a program error to a decryption error in tolerant mode. This
900 allows recovery from corrupted headers and invalid data.
901 """
902 try:
903 return self.encryption.process (buf)
904 except RuntimeError as exn:
04f4c7ab 905 if self.tolerance != TOLERANCE_STRICT:
8de91f4f
PG
906 raise DecryptionError (exn)
907 raise
908
909
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Verifies the GCM tag via encryption.done(); any remaining
        undecrypted bytes of the current object are discarded first.
        Returns the final plaintext chunk produced by done(), or None
        when no encrypted object was open.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
            and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # unread ciphertext of the current object is dropped
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
924
925
7584f5c9
ERE
    def tell(self):
        """Return the stream's file pointer position.
        """
        # `pos` counts the decompressed/decrypted bytes handed out by read().
        return self.pos
930
931 def seek(self, pos=0):
932 """Set the stream's file pointer to pos. Negative seeking
933 is forbidden.
934 """
935 if pos - self.pos >= 0:
936 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 937 for i in range(blocks):
7584f5c9
ERE
938 self.read(self.bufsize)
939 self.read(remainder)
940 else:
941 raise StreamError("seeking backwards is not allowed")
942 return self.pos
943
944 def read(self, size=None):
945 """Return the next size number of bytes from the stream.
946 If size is not defined, return all bytes of the stream
947 up to EOF.
948 """
949 if size is None:
950 t = []
951 while True:
952 buf = self._read(self.bufsize)
953 if not buf:
954 break
955 t.append(buf)
9dc7ac5c 956 buf = b"".join(t)
7584f5c9
ERE
957 else:
958 buf = self._read(size)
959 self.pos += len(buf)
960 return buf
961
3a7e1a50
ERE
    def readline(self):
        """Reads just one line, new line character included
        """
        # if \n is already in dbuf, no read needs to be done
        if b'\n' in self.dbuf:
            pos = self.dbuf.index(b'\n') + 1
            ret = self.dbuf[:pos]
            self.dbuf = self.dbuf[pos:]
            return ret

        buf = []
        while True:
            chunk = self._read(self.bufsize)

            # nothing more to read, so return the buffer
            if not chunk:
                return b''.join(buf)

            buf.append(chunk)

            # if \n found, return the new line
            if b'\n' in chunk:
                dbuf = b''.join(buf)
                pos = dbuf.index(b'\n') + 1
                # _read() may itself have refilled self.dbuf, so the unread
                # tail of this line's buffer must be pushed back *in front*
                # of whatever _read() left there.
                self.dbuf = dbuf[pos:] + self.dbuf
                return dbuf[:pos]
988
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Reads raw (possibly encrypted) data via __read(), decompresses it
        when a decompressor is active, and maintains the running CRC for
        compressed writes.  Surplus decompressed bytes are kept in
        ``self.dbuf`` for the next call.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    # Concatenated gzip members: push the unused tail back and
                    # restart the gz reader on the next member.
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]
1039
e4e5d0b8 1040
    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        # index into t up to which all data came from validated crypto objects
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail if
                        # the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                self.encryption.drop ()
                if good_crypto == 0:
                    raise
                # this may occur at any of the three crypto operations above.
                # some objects did validate; discard all data after it; next
                # call will start with the bad object and error out immediately
                self.buf = b"".join (t [good_crypto:])
                return b"".join (t [:good_crypto])

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]

        return t[:size]
7d372216 1101
7584f5c9
ERE
1102
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Pre-read one block so the magic bytes are available for sniffing.
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # The first call hands out the sniffed block; afterwards this method
        # is shadowed by the underlying file object's read().
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression scheme from the sniffed magic bytes."""
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        if self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
1128# class StreamProxy
1129
7584f5c9
ERE
1130#------------------------
1131# Extraction file object
1132#------------------------
1133class _FileInFile(object):
1134 """A thin wrapper around an existing file object that
1135 provides a part of its data as an individual file
1136 object.
1137 """
1138
be60ffd0 1139 def __init__(self, fileobj, offset, size, blockinfo=None):
7584f5c9
ERE
1140 self.fileobj = fileobj
1141 self.offset = offset
1142 self.size = size
7584f5c9 1143 self.position = 0
be60ffd0
ERE
1144 self.name = getattr(fileobj, "name", None)
1145 self.closed = False
1146
1147 if blockinfo is None:
1148 blockinfo = [(0, size)]
1149
1150 # Construct a map with data and zero blocks.
1151 self.map_index = 0
1152 self.map = []
1153 lastpos = 0
1154 realpos = self.offset
1155 for offset, size in blockinfo:
1156 if offset > lastpos:
1157 self.map.append((False, lastpos, offset, None))
1158 self.map.append((True, offset, offset + size, realpos))
1159 realpos += size
1160 lastpos = offset + size
1161 if lastpos < self.size:
1162 self.map.append((False, lastpos, self.size, None))
1163
1164 def flush(self):
1165 pass
1166
1167 def readable(self):
1168 return True
1169
1170 def writable(self):
1171 return False
1172
1173 def seekable(self):
1174 return self.fileobj.seekable()
7584f5c9
ERE
1175
1176 def tell(self):
1177 """Return the current file position.
1178 """
1179 return self.position
1180
be60ffd0 1181 def seek(self, position, whence=io.SEEK_SET):
7584f5c9
ERE
1182 """Seek to a position in the file.
1183 """
be60ffd0
ERE
1184 if whence == io.SEEK_SET:
1185 self.position = min(max(position, 0), self.size)
1186 elif whence == io.SEEK_CUR:
1187 if position < 0:
1188 self.position = max(self.position + position, 0)
1189 else:
1190 self.position = min(self.position + position, self.size)
1191 elif whence == io.SEEK_END:
1192 self.position = max(min(self.size + position, self.size), 0)
1193 else:
1194 raise ValueError("Invalid argument")
1195 return self.position
7584f5c9
ERE
1196
1197 def read(self, size=None):
1198 """Read data from the file.
1199 """
1200 if size is None:
1201 size = self.size - self.position
1202 else:
1203 size = min(size, self.size - self.position)
1204
be60ffd0 1205 buf = b""
7584f5c9 1206 while size > 0:
7584f5c9 1207 while True:
be60ffd0
ERE
1208 data, start, stop, offset = self.map[self.map_index]
1209 if start <= self.position < stop:
7584f5c9 1210 break
be60ffd0
ERE
1211 else:
1212 self.map_index += 1
1213 if self.map_index == len(self.map):
1214 self.map_index = 0
1215 length = min(size, stop - self.position)
1216 if data:
1217 self.fileobj.seek(offset + (self.position - start))
1218 buf += self.fileobj.read(length)
7584f5c9 1219 else:
be60ffd0
ERE
1220 buf += NUL * length
1221 size -= length
1222 self.position += length
1223 return buf
7584f5c9 1224
be60ffd0
ERE
1225 def readinto(self, b):
1226 buf = self.read(len(b))
1227 b[:len(buf)] = buf
1228 return len(buf)
7584f5c9
ERE
1229
1230 def close(self):
7584f5c9 1231 self.closed = True
be60ffd0 1232#class _FileInFile
7584f5c9 1233
be60ffd0
ERE
1234
class ExFileObject(io.BufferedReader):
    """Buffered, read-only view of one member's data area."""

    def __init__(self, tarfile, tarinfo):
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
7584f5c9
ERE
1241#class ExFileObject
1242
1243#------------------
1244# Exported Classes
1245#------------------
class TarInfo(object):
    """Informational class which holds the details about an
    archive member given by a tar header block.
    TarInfo objects are returned by TarFile.getmember(),
    TarFile.getmembers() and TarFile.gettarinfo() and are
    usually created internally.
    """

    # __slots__ keeps per-member memory low — an archive can hold a very
    # large number of members.  volume_offset/_link_target support the
    # multivolume extension of this fork.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
1259
7584f5c9
ERE
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
        of the member.

        All other fields start at their ustar defaults and are filled in
        by frombuf()/gettarinfo().
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1285
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath", so expose aliases under those names.
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
1299
1300 def __repr__(self):
1301 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1302
    def get_info(self, encoding=None, errors=None):
        """Return the TarInfo's attributes as a dictionary.

        `encoding' and `errors' are unused here; they are accepted for
        signature compatibility with the header-creation helpers.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,  # permission bits only
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor,
            "offset_data": self.offset_data,
            "volume_offset": self.volume_offset
        }

        # Directories conventionally carry a trailing slash in the header.
        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info
1328
be60ffd0
ERE
1329 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1330 errors="surrogateescape"):
7584f5c9
ERE
1331 """Return a tar header as a string of 512 byte blocks.
1332 """
1333 info = self.get_info(encoding, errors)
1334
1335 if format == USTAR_FORMAT:
be60ffd0 1336 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1337 elif format == GNU_FORMAT:
be60ffd0 1338 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1339 elif format == PAX_FORMAT:
1340 return self.create_pax_header(info, encoding, errors)
1341 else:
1342 raise ValueError("invalid format")
1343
    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.

        Raises ValueError when the link name cannot fit; an over-long
        member name is split into the ustar prefix/name pair.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1356
be60ffd0 1357 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1358 """Return the object as a GNU header block sequence.
1359 """
1360 info["magic"] = GNU_MAGIC
1361
2f854e77
ERE
1362 if self.ismultivol():
1363 prefix = [
1364 itn(info.get("atime", 0), 12, GNU_FORMAT),
1365 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1366 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1367 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1368 ]
be60ffd0 1369 info['prefix'] = b"".join(prefix)
0eb5048f 1370 info['size'] = info['size'] - self.volume_offset
2f854e77 1371
be60ffd0 1372 buf = b""
7584f5c9 1373 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1374 buf += self._create_gnu_long_header(info["linkname"],
1375 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1376
1377 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1378 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1379 encoding, errors)
7584f5c9 1380
be60ffd0 1381 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1382
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # only the part stored in this volume counts as payload
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # octal fields hold digits-1 characters plus a terminator
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1435
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

        pax_headers: dict mapping str keywords to str values that apply
        to all following members.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1441
1442 def _posix_split_name(self, name):
1443 """Split a name longer than 100 chars into a prefix
1444 and a name part.
1445 """
1446 prefix = name[:LENGTH_PREFIX + 1]
1447 while prefix and prefix[-1] != "/":
1448 prefix = prefix[:-1]
1449
1450 name = name[len(prefix):]
1451 prefix = prefix[:-1]
1452
1453 if not prefix or len(name) > LENGTH_NAME:
1454 raise ValueError("name is too long")
1455 return prefix, name
1456
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the checksum into its field at offset 148 of the 512-byte
        # block (512 - 364 == 148; the field is 8 bytes wide).
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1484
1485 @staticmethod
1486 def _create_payload(payload):
1487 """Return the string payload filled with zero bytes
1488 up to the next 512 byte border.
1489 """
1490 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1491 if remainder > 0:
1492 payload += (BLOCKSIZE - remainder) * NUL
1493 return payload
1494
1495 @classmethod
be60ffd0 1496 def _create_gnu_long_header(cls, name, type, encoding, errors):
7584f5c9
ERE
1497 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1498 for name.
1499 """
be60ffd0 1500 name = name.encode(encoding, errors) + NUL
7584f5c9
ERE
1501
1502 info = {}
1503 info["name"] = "././@LongLink"
1504 info["type"] = type
1505 info["size"] = len(name)
1506 info["magic"] = GNU_MAGIC
1507
1508 # create extended header + name blocks.
be60ffd0 1509 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
7584f5c9
ERE
1510 cls._create_payload(name)
1511
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            # The record length includes its own decimal representation, so
            # iterate to the fixed point where the length stops changing.
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1562
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on what is wrong with the block.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # Multivolume extension of this fork: for GNU-type members the
            # prefix area holds atime/ctime/offset (see create_gnu_header),
            # with the data offset at bytes 369..381.
            obj.offset_data = nti(buf[369:381])
        return obj
1627
1628 @classmethod
1629 def fromtarfile(cls, tarfile):
1630 """Return the next TarInfo object from TarFile object
1631 tarfile.
1632 """
1633 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1634 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1635 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1636 return obj._proc_member(tarfile)
1637
1638 #--------------------------------------------------------------------------
1639 # The following are methods that are called depending on the type of a
1640 # member. The entry point is _proc_member() which can be overridden in a
1641 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1642 # implement the following
1643 # operations:
1644 # 1. Set self.offset_data to the position where the data blocks begin,
1645 # if there is data that follows.
1646 # 2. Set tarfile.offset to the position where the next member's header will
1647 # begin.
1648 # 3. Return self or another valid TarInfo object.
1649 def _proc_member(self, tarfile):
1650 """Choose the right processing method depending on
1651 the type and call it.
1652 """
1653 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1654 return self._proc_gnulong(tarfile)
1655 elif self.type == GNUTYPE_SPARSE:
1656 return self._proc_sparse(tarfile)
1657 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1658 return self._proc_pax(tarfile)
1659 else:
1660 return self._proc_builtin(tarfile)
1661
1662 def _proc_builtin(self, tarfile):
1663 """Process a builtin type or an unknown type which
1664 will be treated as a regular file.
1665 """
1666 self.offset_data = tarfile.fileobj.tell()
1667 offset = self.offset_data
00c34a12 1668 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
7584f5c9
ERE
1669 # Skip the following data blocks.
1670 offset += self._block(self.size)
1671 tarfile.offset = offset
1672
1673 # Patch the TarInfo object with saved global
1674 # header information.
1675 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1676
1677 return self
1678
1679 def _proc_gnulong(self, tarfile):
1680 """Process the blocks that hold a GNU longname
1681 or longlink member.
1682 """
1683 buf = tarfile.fileobj.read(self._block(self.size))
1684
1685 # Fetch the next header and process it.
1686 try:
1687 next = self.fromtarfile(tarfile)
1688 except HeaderError:
1689 raise SubsequentHeaderError("missing or bad subsequent header")
1690
1691 # Patch the TarInfo object from the next header with
1692 # the longname information.
1693 next.offset = self.offset
1694 if self.type == GNUTYPE_LONGNAME:
be60ffd0 1695 next.name = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9 1696 elif self.type == GNUTYPE_LONGLINK:
be60ffd0 1697 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1698
1699 return next
1700
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # 21 (offset, numbytes) pairs of 24 bytes fill 504 bytes;
            # byte 504 flags whether another extension block follows.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # the header's size field counted only the stored data; restore the
        # real (expanded) file size
        self.size = origsize
        return self
1728
1729 def _proc_pax(self, tarfile):
1730 """Process an extended or global header as described in
be60ffd0 1731 POSIX.1-2008.
7584f5c9
ERE
1732 """
1733 # Read the header information.
1734 buf = tarfile.fileobj.read(self._block(self.size))
1735
1736 # A pax header stores supplemental information for either
1737 # the following file (extended) or all following files
1738 # (global).
1739 if self.type == XGLTYPE:
1740 pax_headers = tarfile.pax_headers
1741 else:
1742 pax_headers = tarfile.pax_headers.copy()
1743
be60ffd0
ERE
1744 # Check if the pax header contains a hdrcharset field. This tells us
1745 # the encoding of the path, linkpath, uname and gname fields. Normally,
1746 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1747 # implementations are allowed to store them as raw binary strings if
1748 # the translation to UTF-8 fails.
1749 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1750 if match is not None:
1751 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1752
1753 # For the time being, we don't care about anything other than "BINARY".
1754 # The only other value that is currently allowed by the standard is
1755 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1756 hdrcharset = pax_headers.get("hdrcharset")
1757 if hdrcharset == "BINARY":
1758 encoding = tarfile.encoding
1759 else:
1760 encoding = "utf-8"
1761
7584f5c9
ERE
1762 # Parse pax header information. A record looks like that:
1763 # "%d %s=%s\n" % (length, keyword, value). length is the size
1764 # of the complete record including the length field itself and
1765 # the newline. keyword and value are both UTF-8 encoded strings.
be60ffd0 1766 regex = re.compile(br"(\d+) ([^=]+)=")
7584f5c9
ERE
1767 pos = 0
1768 while True:
1769 match = regex.match(buf, pos)
1770 if not match:
1771 break
1772
1773 length, keyword = match.groups()
1774 length = int(length)
1775 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1776
be60ffd0
ERE
1777 # Normally, we could just use "utf-8" as the encoding and "strict"
1778 # as the error handler, but we better not take the risk. For
1779 # example, GNU tar <= 1.23 is known to store filenames it cannot
1780 # translate to UTF-8 as raw strings (unfortunately without a
1781 # hdrcharset=BINARY header).
1782 # We first try the strict standard encoding, and if that fails we
1783 # fall back on the user's encoding and error handler.
1784 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1785 tarfile.errors)
1786 if keyword in PAX_NAME_FIELDS:
1787 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1788 tarfile.errors)
1789 else:
1790 value = self._decode_pax_field(value, "utf-8", "utf-8",
1791 tarfile.errors)
7584f5c9
ERE
1792
1793 pax_headers[keyword] = value
1794 pos += length
1795
36a315a0 1796
7584f5c9
ERE
1797 # Fetch the next header.
1798 try:
1799 next = self.fromtarfile(tarfile)
1800 except HeaderError:
1801 raise SubsequentHeaderError("missing or bad subsequent header")
1802
be60ffd0
ERE
1803 # Process GNU sparse information.
1804 if "GNU.sparse.map" in pax_headers:
1805 # GNU extended sparse format version 0.1.
1806 self._proc_gnusparse_01(next, pax_headers)
1807
1808 elif "GNU.sparse.size" in pax_headers:
1809 # GNU extended sparse format version 0.0.
1810 self._proc_gnusparse_00(next, pax_headers, buf)
1811
1812 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1813 # GNU extended sparse format version 1.0.
1814 self._proc_gnusparse_10(next, pax_headers, tarfile)
1815
7584f5c9
ERE
1816 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1817 # Patch the TarInfo object with the extended header info.
1818 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1819 next.offset = self.offset
1820
1821 if "size" in pax_headers:
1822 # If the extended header replaces the size field,
1823 # we need to recalculate the offset where the next
1824 # header starts.
1825 offset = next.offset_data
1826 if next.isreg() or next.type not in SUPPORTED_TYPES:
1827 offset += next._block(next.size)
1828 tarfile.offset = offset
1829
c04e0751
ERE
1830 if next is not None:
1831 if "GNU.volume.filename" in pax_headers:
1832 if pax_headers["GNU.volume.filename"] == next.name:
1833 if "GNU.volume.size" in pax_headers:
1834 next.size = int(pax_headers["GNU.volume.size"])
1835 if "GNU.volume.offset" in pax_headers:
1836 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1837
1838 for key in pax_headers.keys():
1839 if key.startswith("GNU.volume"):
1840 del tarfile.pax_headers[key]
0eb5048f 1841
7584f5c9
ERE
1842 return next
1843
be60ffd0
ERE
1844 def _proc_gnusparse_00(self, next, pax_headers, buf):
1845 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1846 """
be60ffd0
ERE
1847 offsets = []
1848 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1849 offsets.append(int(match.group(1)))
1850 numbytes = []
1851 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1852 numbytes.append(int(match.group(1)))
1853 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1854
be60ffd0
ERE
1855 def _proc_gnusparse_01(self, next, pax_headers):
1856 """Process a GNU tar extended sparse header, version 0.1.
1857 """
1858 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1859 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1860
be60ffd0
ERE
1861 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1862 """Process a GNU tar extended sparse header, version 1.0.
1863 """
1864 fields = None
1865 sparse = []
1866 buf = tarfile.fileobj.read(BLOCKSIZE)
1867 fields, buf = buf.split(b"\n", 1)
1868 fields = int(fields)
1869 while len(sparse) < fields * 2:
1870 if b"\n" not in buf:
1871 buf += tarfile.fileobj.read(BLOCKSIZE)
1872 number, buf = buf.split(b"\n", 1)
1873 sparse.append(int(number))
1874 next.offset_data = tarfile.fileobj.tell()
1875 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1876
be60ffd0
ERE
1877 def _apply_pax_info(self, pax_headers, encoding, errors):
1878 """Replace fields with supplemental information from a previous
1879 pax extended or global header.
1880 """
1881 for keyword, value in pax_headers.items():
1882 if keyword == "GNU.sparse.name":
1883 setattr(self, "path", value)
1884 elif keyword == "GNU.sparse.size":
1885 setattr(self, "size", int(value))
1886 elif keyword == "GNU.sparse.realsize":
1887 setattr(self, "size", int(value))
1888 elif keyword in PAX_FIELDS:
1889 if keyword in PAX_NUMBER_FIELDS:
1890 try:
1891 value = PAX_NUMBER_FIELDS[keyword](value)
1892 except ValueError:
1893 value = 0
1894 if keyword == "path":
f0287fb7 1895 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1896 setattr(self, keyword, value)
7584f5c9
ERE
1897
1898 self.pax_headers = pax_headers.copy()
1899
be60ffd0
ERE
1900 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1901 """Decode a single field from a pax record.
1902 """
1903 try:
1904 return value.decode(encoding, "strict")
1905 except UnicodeDecodeError:
1906 return value.decode(fallback_encoding, fallback_errors)
1907
7584f5c9
ERE
1908 def _block(self, count):
1909 """Round up a byte count by BLOCKSIZE and return it,
1910 e.g. _block(834) => 1024.
1911 """
1912 blocks, remainder = divmod(count, BLOCKSIZE)
1913 if remainder:
1914 blocks += 1
1915 return blocks * BLOCKSIZE
1916
    def isreg(self):
        """Return True if the member is a regular file (any type in
        REGULAR_TYPES)."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Alias for isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if a sparse map has been attached to this member."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a device of any kind."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        """Return True if the member spans multiple volumes: explicit
        GNUTYPE_MULTIVOL type, a nonzero start offset inside the logical
        file, or a pax GNU.volume.offset record."""
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1940# class TarInfo
1941
1942class TarFile(object):
1943 """The TarFile Class provides an interface to tar archives.
1944 """
1945
    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE: these caches are class-level attributes, i.e. deliberately
    # shared by every TarFile instance in the process, so uid/gid lookups
    # (which hit /etc/passwd and /etc/group) happen at most once per id.
    cache_uid2user = {}         # cache to avoid getpwuid calls
    cache_gid2group = {}        # same cache for groups
7584f5c9
ERE
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        Delta-tar extensions over the stdlib interface:
          concat             -- enable concatenated-object archive mode
                                (mapped onto self.arcmode via arcmode_set()).
          nacl               -- encryption context handed to the stream layer;
                                NOTE(review): presumably a NaCl key/handle,
                                confirm against _Stream.
          max_volume_size    -- maximum size of a volume in bytes; must be at
                                least 3*BLOCKSIZE and requires a callable
                                new_volume_handler.
          new_volume_handler -- callable invoked whenever a new volume must
                                be started.
          save_to_members    -- if false, members are not recorded in
                                self.members (lower memory usage for huge
                                archives).
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        # base_name is kept separately: multi-volume support derives each
        # volume's file name from it.
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        # A volume must hold at least one header block plus the two
        # terminating zero blocks.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Initialization failed part-way: close the file only if WE
            # opened it, then propagate the original error.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2101
7584f5c9
ERE
2102 #--------------------------------------------------------------------------
2103 # Below are the classmethods which act as alternate constructors to the
2104 # TarFile class. The open() method is the only one that is needed for
2105 # public use; it is the "super"-constructor and is able to select an
2106 # adequate "sub"-constructor for a particular compression using the mapping
2107 # from OPEN_METH.
2108 #
2109 # This concept allows one to subclass TarFile without losing the comfort of
2110 # the super-constructor. A sub-constructor is registered and made available
2111 # by adding it to the mapping in OPEN_METH.
2112
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing

        `encryption' and `tolerance' apply only to the '#' (concat) modes;
        both are handed through to the underlying _Stream.
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    # Remember the position so a failed probe can rewind
                    # before the next decompressor is tried.
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            if 'max_volume_size' in kwargs:
                # Compressed multi-volume writing cannot recompress
                # follow-up volumes; warn the caller up front.
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: a sequence of independently compressed (and
            # possibly encrypted) objects in one file.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2227
d39d4cbf
PG
2228
2229 @classmethod
2230 def open_at_offset(cls, offset, *a, **kwa):
2231 """
2232 Same as ``.open()``, but start reading at the given offset. Assumes a
2233 seekable file object.
2234 """
2235 fileobj = kwa.get ("fileobj")
2236 if fileobj is not None:
2237 fileobj.seek (offset)
2238 return cls.open (*a, **kwa)
2239
2240
7584f5c9
ERE
2241 @classmethod
2242 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2243 """Open uncompressed tar archive name for reading or writing.
2244 """
2245 if len(mode) > 1 or mode not in "raw":
2246 raise ValueError("mode must be 'r', 'a' or 'w'")
2247 return cls(name, mode, fileobj, **kwargs)
2248
2249 @classmethod
2250 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2251 """Open gzip compressed tar archive name for reading or writing.
2252 Appending is not allowed.
2253 """
2254 if len(mode) > 1 or mode not in "rw":
2255 raise ValueError("mode must be 'r' or 'w'")
2256
2257 try:
2258 import gzip
2259 gzip.GzipFile
2260 except (ImportError, AttributeError):
2261 raise CompressionError("gzip module is not available")
2262
be60ffd0 2263 extfileobj = fileobj is not None
7584f5c9 2264 try:
be60ffd0
ERE
2265 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2266 t = cls.taropen(name, mode, fileobj, **kwargs)
2267 except OSError:
2268 if not extfileobj and fileobj is not None:
2269 fileobj.close()
2270 if fileobj is None:
2271 raise
7584f5c9 2272 raise ReadError("not a gzip file")
be60ffd0
ERE
2273 except:
2274 if not extfileobj and fileobj is not None:
2275 fileobj.close()
2276 raise
2277 t._extfileobj = extfileobj
7584f5c9
ERE
2278 return t
2279
2280 @classmethod
2281 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2282 """Open bzip2 compressed tar archive name for reading or writing.
2283 Appending is not allowed.
2284 """
2285 if len(mode) > 1 or mode not in "rw":
2286 raise ValueError("mode must be 'r' or 'w'.")
2287
2288 try:
2289 import bz2
2290 except ImportError:
2291 raise CompressionError("bz2 module is not available")
2292
be60ffd0
ERE
2293 fileobj = bz2.BZ2File(fileobj or name, mode,
2294 compresslevel=compresslevel)
7584f5c9
ERE
2295
2296 try:
2297 t = cls.taropen(name, mode, fileobj, **kwargs)
be60ffd0
ERE
2298 except (OSError, EOFError):
2299 fileobj.close()
7584f5c9
ERE
2300 raise ReadError("not a bzip2 file")
2301 t._extfileobj = False
2302 return t
2303
be60ffd0
ERE
2304 @classmethod
2305 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2306 """Open lzma compressed tar archive name for reading or writing.
2307 Appending is not allowed.
2308 """
2309 if mode not in ("r", "w"):
2310 raise ValueError("mode must be 'r' or 'w'")
2311
2312 try:
2313 import lzma
2314 except ImportError:
2315 raise CompressionError("lzma module is not available")
2316
2317 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2318
2319 try:
2320 t = cls.taropen(name, mode, fileobj, **kwargs)
2321 except (lzma.LZMAError, EOFError):
2322 fileobj.close()
2323 raise ReadError("not an lzma file")
2324 t._extfileobj = False
2325 return t
2326
7584f5c9
ERE
    # All *open() methods are registered here: maps the compression-type
    # token used in mode strings ('r:gz', 'w|bz2', ...) to the name of the
    # classmethod that handles it. Subclasses may extend this mapping.
    OPEN_METH = {
        "tar": "taropen", # uncompressed tar
        "gz": "gzopen", # gzip compressed tar
        "bz2": "bz2open", # bzip2 compressed tar
        "xz": "xzopen" # lzma compressed tar
    }
2334
2335 #--------------------------------------------------------------------------
2336 # The public methods which TarFile provides:
2337
2338 def close(self):
2339 """Close the TarFile. In write-mode, two finishing zero blocks are
fd2f01f2
PG
2340 appended to the archive. A special case are empty archives which are
2341 initialized accordingly so the two mandatory blocks of zeros are
2342 written abiding by the requested encryption and compression settings.
7584f5c9
ERE
2343 """
2344 if self.closed:
2345 return
2346
2347 if self.mode in "aw":
fd2f01f2
PG
2348 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2349 self.fileobj.next ("")
7584f5c9
ERE
2350 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2351 self.offset += (BLOCKSIZE * 2)
2352 # fill up the end with zero-blocks
2353 # (like option -b20 for tar does)
2354 blocks, remainder = divmod(self.offset, RECORDSIZE)
2355 if remainder > 0:
2356 self.fileobj.write(NUL * (RECORDSIZE - remainder))
7584f5c9
ERE
2357 if not self._extfileobj:
2358 self.fileobj.close()
2359 self.closed = True
2360
2361 def getmember(self, name):
2362 """Return a TarInfo object for member `name'. If `name' can not be
2363 found in the archive, KeyError is raised. If a member occurs more
2364 than once in the archive, its last occurrence is assumed to be the
2365 most up-to-date version.
2366 """
2367 tarinfo = self._getmember(name)
2368 if tarinfo is None:
2369 raise KeyError("filename %r not found" % name)
2370 return tarinfo
2371
2372 def getmembers(self):
2373 """Return the members of the archive as a list of TarInfo objects. The
2374 list has the same order as the members in the archive.
2375 """
2376 self._check()
2377 if not self._loaded: # if we want to obtain a list of
2378 self._load() # all members, we first have to
2379 # scan the whole archive.
2380 return self.members
2381
ad4402e8
ERE
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.

        The value is maintained by addfile(), which records the block
        offset of the most recently written member header.
        """
        return self.last_block_offset
ad4402e8 2388
7584f5c9
ERE
2389 def getnames(self):
2390 """Return the members of the archive as a list of their names. It has
2391 the same order as the list returned by getmembers().
2392 """
2393 return [tarinfo.name for tarinfo in self.getmembers()]
2394
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types that cannot be represented in a tar
        archive (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (socket, door, ...).
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # uid/gid -> name resolution goes through class-level caches so
        # /etc/passwd and /etc/group are hit at most once per id.
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2504
2505 def list(self, verbose=True):
2506 """Print a table of contents to sys.stdout. If `verbose' is False, only
2507 the names of the members are printed. If it is True, an `ls -l'-like
2508 output is produced.
2509 """
2510 self._check()
2511
2512 for tarinfo in self:
2513 if verbose:
be60ffd0
ERE
2514 print(stat.filemode(tarinfo.mode), end=' ')
2515 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2516 tarinfo.gname or tarinfo.gid), end=' ')
7584f5c9 2517 if tarinfo.ischr() or tarinfo.isblk():
be60ffd0
ERE
2518 print("%10s" % ("%d,%d" \
2519 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
7584f5c9 2520 else:
be60ffd0
ERE
2521 print("%10d" % tarinfo.size, end=' ')
2522 print("%d-%02d-%02d %02d:%02d:%02d" \
2523 % time.localtime(tarinfo.mtime)[:6], end=' ')
7584f5c9 2524
be60ffd0 2525 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
7584f5c9
ERE
2526
2527 if verbose:
2528 if tarinfo.issym():
be60ffd0 2529 print("->", tarinfo.linkname, end=' ')
7584f5c9 2530 if tarinfo.islnk():
be60ffd0
ERE
2531 print("link to", tarinfo.linkname, end=' ')
2532 print()
7584f5c9 2533
be60ffd0 2534 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
7584f5c9
ERE
2535 """Add the file `name' to the archive. `name' may be any type of file
2536 (directory, fifo, symbolic link, etc.). If given, `arcname'
2537 specifies an alternative name for the file in the archive.
2538 Directories are added recursively by default. This can be avoided by
2539 setting `recursive' to False. `exclude' is a function that should
2540 return True for each filename to be excluded. `filter' is a function
2541 that expects a TarInfo object argument and returns the changed
2542 TarInfo object, if it returns None the TarInfo object will be
2543 excluded from the archive.
2544 """
2545 self._check("aw")
2546
2547 if arcname is None:
2548 arcname = name
2549
2550 # Exclude pathnames.
2551 if exclude is not None:
2552 import warnings
2553 warnings.warn("use the filter argument instead",
2554 DeprecationWarning, 2)
2555 if exclude(name):
2556 self._dbg(2, "tarfile: Excluded %r" % name)
2557 return
2558
2559 # Skip if somebody tries to archive the archive...
2560 if self.name is not None and os.path.abspath(name) == self.name:
2561 self._dbg(2, "tarfile: Skipped %r" % name)
2562 return
2563
2564 self._dbg(1, name)
2565
2566 # Create a TarInfo object from the file.
2567 tarinfo = self.gettarinfo(name, arcname)
2568
2569 if tarinfo is None:
2570 self._dbg(1, "tarfile: Unsupported type %r" % name)
2571 return
2572
2573 # Change or exclude the TarInfo object.
2574 if filter is not None:
2575 tarinfo = filter(tarinfo)
2576 if tarinfo is None:
2577 self._dbg(2, "tarfile: Excluded %r" % name)
2578 return
2579
2580 # Append the tar header and data to the archive.
2581 if tarinfo.isreg():
2582 with bltn_open(name, "rb") as f:
2583 self.addfile(tarinfo, f)
2584
2585 elif tarinfo.isdir():
2586 self.addfile(tarinfo)
2587 if recursive:
2588 for f in os.listdir(name):
2589 self.add(os.path.join(name, f), os.path.join(arcname, f),
be60ffd0 2590 recursive, exclude, filter=filter)
7584f5c9
ERE
2591
2592 else:
2593 self.addfile(tarinfo)
2594
defc9a22 2595 def _size_left_file(self):
be60ffd0 2596 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2597
be60ffd0 2598 Assumes self.max_volume_size is set.
ba5a449e 2599 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2600 """
ba5a449e 2601 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2602 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2603 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2604 # write only half a block when writting the end of a volume
ae48acc8 2605 # and filling with zeros
defc9a22
CH
2606 return BLOCKSIZE * (size_left // BLOCKSIZE)
2607
2608 def _size_left_stream(self):
ba5a449e
CH
2609 """ Calculates size left in a volume if using comression/encryption
2610
2611 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2612 (otherwise use _size_left_file)
2613 """
2614 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2615 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2616 - 2*BLOCKSIZE
2617 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2618
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        When self.max_volume_size is set, the member's data may be split
        across volumes; the split continues through new_volume_handler.
        """
        self._check("aw")

        # Work on a copy: the multivolume path mutates type/volume_offset.
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # In concat mode the stream starts a new compressed/encrypted
            # object per member and reports its block offset.
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # Pick the appropriate "remaining space in this volume" oracle.
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we're continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the "new_volume_handler" is supposed to call .close() on the
                # "fileobj" _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
7584f5c9 2732
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Either *name* (a path) or *fileobj* (an open file-like object) selects
        the new volume. When the current file object is a _Stream, a new
        _Stream is created with the same mode/compression parameters;
        otherwise a plain file is opened (compression/encryption information
        is lost in that case). Afterwards the bookkeeping structures
        (members, offset, inode cache) are reset for the new volume, and for
        PAX archives a global header describing the continued member is
        written. On any failure the volume is closed and the exception is
        re-raised.
        '''
        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # Clone the stream parameters of the previous volume; an
                # explicit *encryption* argument overrides the inherited one.
                fileobj = _Stream(name=name,
                                  mode=self.fileobj.mode,
                                  comptype=self.fileobj.comptype,
                                  fileobj=None,
                                  bufsize=self.fileobj.bufsize,
                                  encryption=encryption or self.fileobj.encryption,
                                  concat=self.fileobj.arcmode & ARCMODE_CONCAT)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Prime firstmember so next()/makefile() can resume reading
                # the continued member from the start of this volume.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # Record which member continues into this volume and at
                    # which offset, as mandated for multi-volume PAX archives.
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # Leave the object in a consistent closed state before
            # propagating the error.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2818
e5f5681b 2819 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2820 """Extract all members from the archive to the current working
2821 directory and set owner, modification time and permissions on
2822 directories afterwards. `path' specifies a different directory
2823 to extract to. `members' is optional and must be a subset of the
2824 list returned by getmembers().
2825 """
2826 directories = []
2827
2828 if members is None:
2829 members = self
2830
2831 for tarinfo in members:
c474439c
ERE
2832 if self.volume_number > 0 and tarinfo.ismultivol():
2833 continue
2834
974408b5 2835 if filter and not filter(tarinfo):
e5f5681b
ERE
2836 continue
2837
7584f5c9
ERE
2838 if tarinfo.isdir():
2839 # Extract directories with a safe mode.
2840 directories.append(tarinfo)
2841 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2842 tarinfo.mode = 0o0700
2843 # Do not set_attrs directories, as we will do that further down
2844 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2845
2846 # Reverse sort directories.
be60ffd0 2847 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2848 directories.reverse()
2849
2850 # Set correct owner, mtime and filemode on directories.
2851 for tarinfo in directories:
2852 dirpath = os.path.join(path, tarinfo.name)
2853 try:
2854 self.chown(tarinfo, dirpath)
2855 self.utime(tarinfo, dirpath)
2856 self.chmod(tarinfo, dirpath)
be60ffd0 2857 except ExtractError as e:
7584f5c9
ERE
2858 if self.errorlevel > 1:
2859 raise
2860 else:
2861 self._dbg(1, "tarfile: %s" % e)
2862
786addd6 2863 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2864 """Extract a member from the archive to the current working directory,
2865 using its full name. Its file information is extracted as accurately
2866 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2867 specify a different directory using `path'. File attributes (owner,
2868 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2869 ``symlink_cb`` is a hook accepting a function that is passed the
2870 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2871 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2872 passed will be applied, skipping the actual extraction. In case the
2873 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2874 """
2875 self._check("r")
2876
be60ffd0 2877 if isinstance(member, str):
7584f5c9
ERE
2878 tarinfo = self.getmember(member)
2879 else:
2880 tarinfo = member
2881
2882 # Prepare the link target for makelink().
2883 if tarinfo.islnk():
2884 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2885
9b13f5c4 2886 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2887 return symlink_cb(member, path, set_attrs)
786addd6 2888
7584f5c9 2889 try:
be60ffd0
ERE
2890 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2891 set_attrs=set_attrs)
2892 except EnvironmentError as e:
7584f5c9
ERE
2893 if self.errorlevel > 0:
2894 raise
2895 else:
2896 if e.filename is None:
2897 self._dbg(1, "tarfile: %s" % e.strerror)
2898 else:
2899 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2900 except ExtractError as e:
7584f5c9
ERE
2901 if self.errorlevel > 1:
2902 raise
2903 else:
2904 self._dbg(1, "tarfile: %s" % e)
2905
2906 def extractfile(self, member):
2907 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2908 a filename or a TarInfo object. If `member' is a regular file or a
2909 link, an io.BufferedReader object is returned. Otherwise, None is
2910 returned.
7584f5c9
ERE
2911 """
2912 self._check("r")
2913
be60ffd0 2914 if isinstance(member, str):
7584f5c9
ERE
2915 tarinfo = self.getmember(member)
2916 else:
2917 tarinfo = member
2918
be60ffd0
ERE
2919 if tarinfo.isreg() or tarinfo.ismultivol() or\
2920 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2921 # If a member's type is unknown, it is treated as a
2922 # regular file.
2923 return self.fileobject(self, tarinfo)
2924
2925 elif tarinfo.islnk() or tarinfo.issym():
2926 if isinstance(self.fileobj, _Stream):
2927 # A small but ugly workaround for the case that someone tries
2928 # to extract a (sym)link as a file-object from a non-seekable
2929 # stream of tar blocks.
2930 raise StreamError("cannot extract (sym)link as file object")
2931 else:
2932 # A (sym)link's file object is its target's file object.
2933 return self.extractfile(self._find_link_target(tarinfo))
2934 else:
2935 # If there's no data associated with the member (directory, chrdev,
2936 # blkdev, etc.), return None instead of a file object.
2937 return None
2938
be60ffd0 2939 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2940 """Extract the TarInfo object tarinfo to a physical
2941 file called targetpath.
2942 """
2943 # Fetch the TarInfo object for the given name
2944 # and build the destination pathname, replacing
2945 # forward slashes to platform specific separators.
2946 targetpath = targetpath.rstrip("/")
2947 targetpath = targetpath.replace("/", os.sep)
2948
2949 # Create all upper directories.
2950 upperdirs = os.path.dirname(targetpath)
2951 if upperdirs and not os.path.exists(upperdirs):
2952 # Create directories that are not part of the archive with
2953 # default permissions.
2954 os.makedirs(upperdirs)
2955
2956 if tarinfo.islnk() or tarinfo.issym():
2957 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2958 else:
2959 self._dbg(1, tarinfo.name)
2960
2961 if tarinfo.isreg():
2962 self.makefile(tarinfo, targetpath)
2963 elif tarinfo.isdir():
2964 self.makedir(tarinfo, targetpath)
2965 elif tarinfo.isfifo():
2966 self.makefifo(tarinfo, targetpath)
2967 elif tarinfo.ischr() or tarinfo.isblk():
2968 self.makedev(tarinfo, targetpath)
2969 elif tarinfo.islnk() or tarinfo.issym():
2970 self.makelink(tarinfo, targetpath)
2971 elif tarinfo.type not in SUPPORTED_TYPES:
2972 self.makeunknown(tarinfo, targetpath)
2973 else:
2974 self.makefile(tarinfo, targetpath)
2975
be60ffd0
ERE
2976 if set_attrs:
2977 self.chown(tarinfo, targetpath)
2978 if not tarinfo.issym():
2979 self.chmod(tarinfo, targetpath)
2980 self.utime(tarinfo, targetpath)
7584f5c9
ERE
2981
2982 #--------------------------------------------------------------------------
2983 # Below are the different file methods. They are called via
2984 # _extract_member() when extract() is called. They can be replaced in a
2985 # subclass to implement other functionality.
2986
2987 def makedir(self, tarinfo, targetpath):
2988 """Make a directory called targetpath.
2989 """
2990 try:
2991 # Use a safe mode for the directory, the real mode is set
2992 # later in _extract_member().
be60ffd0
ERE
2993 os.mkdir(targetpath, 0o0700)
2994 except FileExistsError:
2995 pass
7584f5c9
ERE
2996
    def makefile(self, tarinfo, targetpath):
        """Extract the regular-file member *tarinfo* to *targetpath*.

        Handles sparse members as well as members continued on a later
        volume: when copying runs into the end of the current volume
        (signalled by OSError from copyfileobj), ``new_volume_handler``
        is invoked to switch volumes and copying resumes with the
        continuation member.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        decrypt = False  # NOTE(review): assigned but never read in this method
        iterate = True
        target = bltn_open(targetpath, "wb")

        if tarinfo.sparse is not None:
            # Sparse member: write only the recorded data regions, then
            # extend the file to its nominal size via truncate().
            try:
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
                target.seek(tarinfo.size)
                target.truncate()
            finally:
                target.close()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    target.close()
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                # open_volume() primed firstmember with the continuation
                # member; resume copying from the new volume's file object.
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
        target.close()
3038
7584f5c9
ERE
3039
3040 def makeunknown(self, tarinfo, targetpath):
3041 """Make a file from a TarInfo object with an unknown type
3042 at targetpath.
3043 """
3044 self.makefile(tarinfo, targetpath)
3045 self._dbg(1, "tarfile: Unknown file type %r, " \
3046 "extracted as regular file." % tarinfo.type)
3047
3048 def makefifo(self, tarinfo, targetpath):
3049 """Make a fifo called targetpath.
3050 """
3051 if hasattr(os, "mkfifo"):
3052 os.mkfifo(targetpath)
3053 else:
3054 raise ExtractError("fifo not supported by system")
3055
3056 def makedev(self, tarinfo, targetpath):
3057 """Make a character or block device called targetpath.
3058 """
3059 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3060 raise ExtractError("special devices not supported by system")
3061
3062 mode = tarinfo.mode
3063 if tarinfo.isblk():
3064 mode |= stat.S_IFBLK
3065 else:
3066 mode |= stat.S_IFCHR
3067
3068 os.mknod(targetpath, mode,
3069 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3070
3071 def makelink(self, tarinfo, targetpath):
3072 """Make a (symbolic) link called targetpath. If it cannot be created
3073 (platform limitation), we try to make a copy of the referenced file
3074 instead of a link.
3075 """
be60ffd0 3076 try:
7584f5c9
ERE
3077 # For systems that support symbolic and hard links.
3078 if tarinfo.issym():
7584f5c9
ERE
3079 os.symlink(tarinfo.linkname, targetpath)
3080 else:
3081 # See extract().
3082 if os.path.exists(tarinfo._link_target):
7584f5c9
ERE
3083 os.link(tarinfo._link_target, targetpath)
3084 else:
be60ffd0
ERE
3085 self._extract_member(self._find_link_target(tarinfo),
3086 targetpath)
3087 except symlink_exception:
7584f5c9 3088 try:
be60ffd0
ERE
3089 self._extract_member(self._find_link_target(tarinfo),
3090 targetpath)
7584f5c9
ERE
3091 except KeyError:
3092 raise ExtractError("unable to resolve link inside archive")
3093
3094 def chown(self, tarinfo, targetpath):
3095 """Set owner of targetpath according to tarinfo.
3096 """
3097 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3098 # We have to be root to do so.
3099 try:
3100 g = grp.getgrnam(tarinfo.gname)[2]
3101 except KeyError:
3102 g = tarinfo.gid
3103 try:
3104 u = pwd.getpwnam(tarinfo.uname)[2]
3105 except KeyError:
3106 u = tarinfo.uid
3107 try:
3108 if tarinfo.issym() and hasattr(os, "lchown"):
3109 os.lchown(targetpath, u, g)
3110 else:
be60ffd0
ERE
3111 os.chown(targetpath, u, g)
3112 except OSError as e:
7584f5c9
ERE
3113 raise ExtractError("could not change owner")
3114
3115 def chmod(self, tarinfo, targetpath):
3116 """Set file permissions of targetpath according to tarinfo.
3117 """
3118 if hasattr(os, 'chmod'):
3119 try:
3120 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3121 except OSError as e:
7584f5c9
ERE
3122 raise ExtractError("could not change mode")
3123
3124 def utime(self, tarinfo, targetpath):
3125 """Set modification time of targetpath according to tarinfo.
3126 """
3127 if not hasattr(os, 'utime'):
3128 return
3129 try:
3130 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3131 except OSError as e:
7584f5c9
ERE
3132 raise ExtractError("could not change modification time")
3133
3134 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        Header problems are handled according to their kind: zero blocks
        and invalid headers may be skipped when ignore_zeros is set; any
        unreadable header at offset 0 turns into ReadError because the
        whole file is then unusable.
        """
        self._check("ra")
        # A member may have been pre-read (e.g. by open_volume); hand it
        # out once and clear the slot.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block; with ignore_zeros we skip it and retry at
                # the next block boundary.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # First header already unreadable: not a tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # No further members: mark the archive as fully scanned.
            self._loaded = True

        return tarinfo
3181
3182 #--------------------------------------------------------------------------
3183 # Little helper methods:
3184
3185 def _getmember(self, name, tarinfo=None, normalize=False):
3186 """Find an archive member by name from bottom to top.
3187 If tarinfo is given, it is used as the starting point.
3188 """
3189 # Ensure that all members have been loaded.
3190 members = self.getmembers()
3191
3192 # Limit the member search list up to tarinfo.
3193 if tarinfo is not None:
3194 members = members[:members.index(tarinfo)]
3195
3196 if normalize:
3197 name = os.path.normpath(name)
3198
3199 for member in reversed(members):
3200 if normalize:
3201 member_name = os.path.normpath(member.name)
3202 else:
3203 member_name = member.name
3204
3205 if name == member_name:
3206 return member
3207
3208 def _load(self):
3209 """Read through the entire archive file and look for readable
3210 members.
3211 """
3212 while True:
3213 tarinfo = self.next()
3214 if tarinfo is None:
3215 break
3216 self._loaded = True
3217
3218 def _check(self, mode=None):
3219 """Check if TarFile is still open, and if the operation's mode
3220 corresponds to TarFile's mode.
3221 """
3222 if self.closed:
be60ffd0 3223 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3224 if mode is not None and self.mode not in mode:
be60ffd0 3225 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3226
3227 def _find_link_target(self, tarinfo):
3228 """Find the target member of a symlink or hardlink member in the
3229 archive.
3230 """
3231 if tarinfo.issym():
3232 # Always search the entire archive.
3233 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3234 limit = None
3235 else:
3236 # Search the archive before the link, because a hard link is
3237 # just a reference to an already archived file.
3238 linkname = tarinfo.linkname
3239 limit = tarinfo
3240
3241 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3242 if member is None:
3243 raise KeyError("linkname %r not found" % linkname)
3244 return member
3245
3246 def __iter__(self):
3247 """Provide an iterator object.
3248 """
3249 if self._loaded:
3250 return iter(self.members)
3251 else:
3252 return TarIter(self)
3253
1027433a 3254 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3255 """Write debugging output to sys.stderr.
3256 """
3257 if level <= self.debug:
1027433a 3258 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3259
    def __enter__(self):
        # Context-manager entry: verify the archive is still open.
        self._check()
        return self
3263
3264 def __exit__(self, type, value, traceback):
3265 if type is None:
3266 self.close()
3267 else:
3268 # An exception occurred. We must not call close() because
3269 # it would try to write end-of-archive blocks and padding.
3270 if not self._extfileobj:
3271 self.fileobj.close()
3272 self.closed = True
3273# class TarFile
3274
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Remember the TarFile and start at its first member."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """An iterator is its own iterator."""
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        # SF #1100429: getmembers() may run during iteration and grow the
        # member cache; consult the cache before asking for fresh members
        # so iteration does not stop prematurely.
        tf = self.tarfile
        if self.index == 0 and tf.firstmember is not None:
            entry = tf.next()
        elif self.index < len(tf.members):
            entry = tf.members[self.index]
        elif not tf._loaded:
            entry = tf.next()
            if not entry:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return entry
3313
6690f5e0
PG
3314#---------------------------------------------------------
3315# support functionality for rescue mode
3316#---------------------------------------------------------
3317
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    cands = []

    # Close the read-only mapping deterministically instead of leaking it
    # until garbage collection.
    with mmap.mmap (fd, 0, mmap.MAP_SHARED, mmap.PROT_READ) as mm:
        pos = 0
        while True:
            pos = mm.find (GZ_MAGIC_BYTES, pos)
            if pos == -1:
                break
            cands.append (pos)
            pos += len (GZ_MAGIC_BYTES)

    return cands
3341
3342
# Verdicts assigned to gzip header candidates by inspect_gz_hdr():
HDR_CAND_GOOD = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK = 2 # not a header / object unreadable
3346
3347
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if c == NUL:
            break
        if c == b"":
            # EOF before the terminating NUL: the string is truncated, so
            # the parse fails. (Without this check, os.read() keeps
            # returning b"" and the loop never terminates for max < 0.)
            return None
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3373
3374
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname = None
    flags = 0x00
    dflags = 0x00
    mtime = 0x00000000
    oscode = 0x00
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    flags = 0x0
    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        # struct.unpack() yields a tuple: extract the scalar length, and
        # guard against a short read at EOF.
        raw_xlen = os.read (fd, 2)
        if len (raw_xlen) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        (xlen, ) = struct.unpack ("<H", raw_xlen)
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # the comment must parse but is discarded; do not clobber *fname*
        fcomment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                 encoding="iso-8859-1")
        if fcomment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname" : fname
        , "flags" : flags
        , "dflags" : dflags
        , "mtime" : mtime
        , "oscode" : oscode
        , "hlen" : hlen
        }
3457
3458
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    inflator = zlib.decompressobj (-zlib.MAX_WBITS)
    cursor = off
    total_out = 0 # size of decompressed data

    os.lseek (ifd, cursor, os.SEEK_SET)
    while True:
        chunk = os.read (ifd, BUFSIZE)
        cursor += len (chunk)
        try:
            plain = inflator.decompress (chunk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break # fishy
        total_out += len (plain)
        if inflator.eof:
            break # end of the deflate stream reached cleanly
        if len (chunk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return total_out, cursor - off
3489
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    Keeps only those offsets whose header is readable and whose payload
    actually inflates to something.
    """
    usable = []

    for off in cands:
        verdict, hdr = inspect_gz_hdr (ifd, off)
        if verdict == HDR_CAND_JUNK:
            continue # unreadable candidate: skip silently
        # GOOD or FISHY: prove it by decompressing the payload.
        payload_off = off + hdr ["hlen"]
        dlen, clen = try_decompress (ifd, payload_off, hdr)
        if dlen > 0 and clen > 0:
            usable.append (off)

    return usable
3509
3510
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as
    compressed data.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        candidates = locate_gz_hdr_candidates (ifd)
        return readable_gz_objects_offsets (ifd, candidates)
    finally:
        os.close (ifd)
3524
3525
d39d4cbf
PG
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
    """
    Open *fileobj* as a tar archive at *offset* (rescue tolerance) and
    return its first member.

    *secret*, when given, is a (kind, value) pair selecting password- or
    key-based decryption.
    """
    decr = None

    if secret is not None:
        kind = secret [0]
        if kind == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1])
        elif kind == crypto.PDTCRYPT_SECRET_KEY:
            decr = crypto.Decrypt (key=binascii.unhexlify (secret [1]))
        else:
            # unknown secret kind
            raise RuntimeError

    tarobj = TarFile.open_at_offset (offset,
                                     mode=mode,
                                     fileobj=fileobj,
                                     format=GNU_FORMAT,
                                     concat='#' in mode,
                                     encryption=decr,
                                     save_to_members=False,
                                     tolerance=TOLERANCE_RESCUE)

    return tarobj.next ()
3551
3552
2d50b7f7
PG
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.
    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        , "gid"    : 0
        }

    """
    entry = {
        "inode"  : 0,                       # ignored when reading the index
        "uid"    : tarinfo.uid,
        "gid"    : tarinfo.gid,
        "path"   : tarinfo.name,            # keeping URI scheme
        "offset" : 0,                       # to be added by the caller
        "volume" : tarinfo.volume_offset,
        "mode"   : tarinfo.mode,
        "ctime"  : tarinfo.mtime,           # no ctime in TarInfo; reuse mtime
        "mtime"  : tarinfo.mtime,
        "size"   : tarinfo.size,
        "type"   : tarinfo.type,
    }
    return entry
3588
3589
d39d4cbf 3590def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
6690f5e0
PG
3591 psidx = [] # pseudo index, return value
3592 offsets = None
3593 secret = None
3594
3595 if password is not None:
3596 secret = (crypto.PDTCRYPT_SECRET_PW, password)
3597 elif key is not None:
3598 secret = (crypto.PDTCRYPT_SECRET_KEY, key)
3599
3600 if secret is not None:
3601 offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
dfd7865e
PG
3602 elif mode == "#gz":
3603 offsets = reconstruct_offsets_gz (backup_tar_path)
3604
3605 fileobj = bltn_open (backup_tar_path, "rb")
3606 infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
3607 for off in offsets ]
3608 def aux (o, ti):
3609 ie = idxent_of_tarinfo (ti)
3610 ie ["offset"] = o
3611 return ie
3612 psidx = [ aux (o, ti) for o, ti in infos ]
6690f5e0
PG
3613
3614 return psidx
7584f5c9
ERE
3615
3616#--------------------
3617# exported functions
3618#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # "open" is TarFile.open at module level (rebound below).
        archive = open(name)
        archive.close()
    except TarError:
        return False
    return True
3629
# Keep a reference to the builtin open() under "bltn_open"; the module-level
# name "open" is then rebound to TarFile.open so that "tarfile.open(...)"
# works as the public entry point.
bltn_open = open
open = TarFile.open