guard invocations of tar from interactive mode
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
5bd2d4b5 47import functools
be60ffd0 48import io
dfd7865e
PG
49import mmap
50import operator
51import os
52import re
7584f5c9
ERE
53import shutil
54import stat
7584f5c9 55import struct
dfd7865e
PG
56import sys
57import time
7584f5c9 58
c7c736b6
PG
59import traceback # XXX
60
8ab8fac5 61from . import crypto
6e812ad9 62
7584f5c9
ERE
63try:
64 import grp, pwd
65except ImportError:
66 grp = pwd = None
67
be60ffd0
ERE
68# os.symlink on Windows prior to 6.0 raises NotImplementedError
69symlink_exception = (AttributeError, NotImplementedError)
70try:
71 # OSError (winerror=1314) will be raised if the caller does not hold the
72 # SeCreateSymbolicLinkPrivilege privilege
73 symlink_exception += (OSError,)
74except NameError:
75 pass
76
7584f5c9
ERE
77# from tarfile import *
78__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79
be60ffd0
ERE
80from builtins import open as _open # Since 'open' is TarFile.open
81
7584f5c9
ERE
82#---------------------------------------------------------
83# tar constants
84#---------------------------------------------------------
be60ffd0 85NUL = b"\0" # the null character
7584f5c9
ERE
86BLOCKSIZE = 512 # length of processing blocks
87RECORDSIZE = BLOCKSIZE * 20 # length of records
be60ffd0
ERE
88GNU_MAGIC = b"ustar \0" # magic gnu tar string
89POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
7584f5c9
ERE
90
91LENGTH_NAME = 100 # maximum length of a filename
92LENGTH_LINK = 100 # maximum length of a linkname
93LENGTH_PREFIX = 155 # maximum length of the prefix field
94
be60ffd0
ERE
95REGTYPE = b"0" # regular file
96AREGTYPE = b"\0" # regular file
97LNKTYPE = b"1" # link (inside tarfile)
98SYMTYPE = b"2" # symbolic link
99CHRTYPE = b"3" # character special device
100BLKTYPE = b"4" # block special device
101DIRTYPE = b"5" # directory
102FIFOTYPE = b"6" # fifo special device
103CONTTYPE = b"7" # contiguous file
104
105GNUTYPE_LONGNAME = b"L" # GNU tar longname
106GNUTYPE_LONGLINK = b"K" # GNU tar longlink
107GNUTYPE_SPARSE = b"S" # GNU tar sparse file
108GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
68ddf955 109 # another volume
7584f5c9 110
be60ffd0
ERE
111XHDTYPE = b"x" # POSIX.1-2001 extended header
112XGLTYPE = b"g" # POSIX.1-2001 global header
113SOLARIS_XHDTYPE = b"X" # Solaris extended header
7584f5c9
ERE
114
115USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
116GNU_FORMAT = 1 # GNU tar format
117PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
118DEFAULT_FORMAT = GNU_FORMAT
119
15a81fc0 120GZ_FMT_HEADER = b"<BBBBLBB"
203cb25e 121GZ_HEADER_SIZE = 10 # not including the name
15a81fc0
PG
122GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
123GZ_METHOD_DEFLATE = 0x08 # 0o10
dfd7865e
PG
124GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
125GZ_FLAG_FHCRC = 1 << 1 # CRC16
126GZ_FLAG_FEXTRA = 1 << 2 # extra field
127GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
128GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
129GZ_FLAG_RESERVED = 7 << 5 # unassigned
15a81fc0
PG
130GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
131GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
d601d33b
PG
132GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
133GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
134 GZ_METHOD_DEFLATE)
15a81fc0 135
04f4c7ab
PG
136TOLERANCE_STRICT = 0
137TOLERANCE_RECOVER = 1 # rely on offsets in index
138TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
139
dfd7865e
PG
140BUFSIZE = 16 * 1024
141
7584f5c9 142#---------------------------------------------------------
d1c38f40
PG
143# archive handling mode
144#---------------------------------------------------------
145
146ARCMODE_PLAIN = 0
147ARCMODE_ENCRYPT = 1 << 0
148ARCMODE_COMPRESS = 1 << 1
149ARCMODE_CONCAT = 1 << 2
150
def arcmode_fmt (m):
    """
    Render the archive-mode bitmask *m* as a human-readable string,
    e.g. ``"[ ENCRYPT | COMPRESS ]"``; ``ARCMODE_PLAIN`` yields ``"PLAIN"``.
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = []
    if m & ARCMODE_ENCRYPT:
        labels.append ("ENCRYPT")
    if m & ARCMODE_COMPRESS:
        labels.append ("COMPRESS")
    if m & ARCMODE_CONCAT:
        labels.append ("CONCAT")
    if len (labels) == 0:
        # unknown bits only: same empty-bracket form the accumulator produced
        return "[ ]"
    return "[ " + " | ".join (labels) + " ]"
168
169
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """
    Combine the archive options into an ARCMODE_* bitmask, starting
    from *init*: concatenation, presence of encryption, and gzip
    compression each contribute one flag.
    """
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
179
180#---------------------------------------------------------
7584f5c9
ERE
181# tarfile constants
182#---------------------------------------------------------
183# File types that tarfile supports:
184SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
185 SYMTYPE, DIRTYPE, FIFOTYPE,
186 CONTTYPE, CHRTYPE, BLKTYPE,
187 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 188 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
189
190# File types that will be treated as a regular file.
191REGULAR_TYPES = (REGTYPE, AREGTYPE,
192 CONTTYPE, GNUTYPE_SPARSE)
193
194# File types that are part of the GNU tar format.
195GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
68ddf955 196 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
7584f5c9
ERE
197
198# Fields from a pax header that override a TarInfo attribute.
199PAX_FIELDS = ("path", "linkpath", "size", "mtime",
200 "uid", "gid", "uname", "gname")
201
be60ffd0
ERE
202# Fields from a pax header that are affected by hdrcharset.
203PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
204
7584f5c9
ERE
205# Fields in a pax header that are numbers, all other fields
206# are treated as strings.
207PAX_NUMBER_FIELDS = {
208 "atime": float,
209 "ctime": float,
210 "mtime": float,
211 "uid": int,
212 "gid": int,
213 "size": int
214}
215
216#---------------------------------------------------------
7584f5c9
ERE
217# initialization
218#---------------------------------------------------------
be60ffd0
ERE
219
220if os.name in ("nt", "ce"):
221 ENCODING = "utf-8"
222else:
223 ENCODING = sys.getfilesystemencoding()
7584f5c9
ERE
224
225#---------------------------------------------------------
226# Some useful functions
227#---------------------------------------------------------
228
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Encode string *s* and fit it into a NUL-padded field of *length* bytes."""
    encoded = s.encode(encoding, errors)
    # truncate if too long; a negative pad count simply yields b""
    return encoded[:length] + b"\0" * (length - len(encoded))
234
be60ffd0
ERE
def nts(s, encoding, errors):
    """Decode bytes *s* up to (and excluding) the first NUL byte."""
    cut = s.find(b"\0")
    head = s if cut == -1 else s[:cut]
    return head.decode(encoding, errors)
242
def sbtn(s, length, encoding, errors):
    """Coerce *s* (str or bytes) into a NUL-padded bytes field of *length*."""
    raw = s.encode(encoding, errors) if isinstance(s, str) else s
    return raw[:length] + b"\0" * (length - len(raw))
7584f5c9
ERE
250
def nti(s):
    """Convert a tar number field (bytes) to a Python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    if s[0] in (0o200, 0o377):
        # GNU base-256 encoding: the bytes after the marker form a
        # big-endian integer; a 0o377 marker means the value is
        # negative (two's complement).
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            n = -(256 ** (len(s) - 1) - n)
    else:
        # Classic POSIX encoding: NUL-terminated ASCII octal digits.
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
269
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        # Fits the POSIX octal representation.
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n  # two's-complement representation

        # Emit digits-1 bytes least-significant first; inserting each at
        # index 1 (after the marker byte) makes the result big-endian.
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
7584f5c9
ERE
297
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
    characters except for the chksum field which is treated as if
    it was filled with spaces. According to the GNU tar sources,
    some tars (Sun and NeXT) calculate chksum with signed char,
    which will be different if there are chars in the buffer with
    the high bit set. So we calculate two checksums, unsigned and
    signed.
    """
    # "8x" skips the 8-byte chksum field (offsets 148..155); the constant
    # 256 stands in for eight ASCII spaces (8 * 0x20).
    def field_sum (fmt):
        return 256 + sum(struct.unpack_from(fmt, buf))
    return field_sum("148B8x356B"), field_sum("148b8x356b")
310
def copyfileobj(src, dst, length=None):
    """Copy exactly *length* bytes from fileobj *src* to fileobj *dst*.

    If *length* is None the entire remaining content is copied. Raises
    OSError when *src* runs dry before *length* bytes were transferred
    (any partial chunk is written out first).
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    full_blocks, tail = divmod(length, BUFSIZE)
    sizes = [BUFSIZE] * full_blocks
    if tail:
        sizes.append(tail)
    for want in sizes:
        chunk = src.read(want)
        dst.write(chunk)
        if len(chunk) < want:
            raise OSError("end of file reached")
c7c736b6 332
7584f5c9 333
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    # Kept only for backwards compatibility; delegates to the stdlib
    # and warns at the caller's frame (stacklevel 2).
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
340
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for error during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for error during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
    pass
7584f5c9
ERE
386
387#---------------------------
388# internal stream interface
389#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
    It is used instead of a regular file object for streaming
    access.
    """

    def __init__(self, name, mode):
        # Map the tarfile-style mode letter onto os.open() flags.
        _mode = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        self.offset = 0  # logical stream position mirroring the fd offset

    def close(self):
        os.close(self.fd)

    def read(self, size):
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        # When *pos* is given, write at that position (used to patch
        # headers in place) and restore the fd to the saved offset; the
        # logical offset only advances if the write extended the file
        # beyond it.
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)

    def tell(self):
        # Report the logical offset, not the kernel's fd position.
        return self.offset

    def seek_set (self, pos):
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos
433
8ab8fac5 434
15a81fc0
PG
def gz_header (name=None):
    """
    Build a gzip member header (RFC 1952). When *name* is given it is
    stored in the FNAME field (ISO-8859-1), with any ".pdtcrypt" or
    ".gz" suffix stripped first; otherwise FNAME is omitted.
    """
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return hdr + name
459
d601d33b 460
7584f5c9
ERE
461class _Stream:
462 """Class that serves as an adapter between TarFile and
463 a stream-like object. The stream-like object only
464 needs to have a read() or write() method and is accessed
465 blockwise. Use of gzip or bzip2 compression is possible.
466 A stream-like object could be for example: sys.stdin,
467 sys.stdout, a socket, a tape device etc.
468
3031b7ae
PG
469 _Stream is intended to be used only internally but is
 470 nevertheless used externally by Deltatar.
471
472 When encrypting, the ``enccounter`` will be used for
473 initializing the first cryptographic context. When
474 decrypting, its value will be compared to the decrypted
475 object. Decryption fails if the value does not match.
476 In effect, this means that a ``_Stream`` whose ctor was
477 passed ``enccounter`` can only be used to encrypt or
478 decrypt a single object.
7584f5c9
ERE
479 """
480
c7c736b6 481 remainder = -1 # track size in encrypted entries
04f4c7ab 482 tolerance = TOLERANCE_STRICT
c7c736b6 483
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        *concat*, *encryption* and *comptype* together determine the
        archive-mode bitmask; *enccounter* pins the IV counter of the
        first crypto context; *tolerance* selects the error-recovery
        behavior used when reading.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
            if comptype == '':
                comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.concat_pos = 0
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode the first member is opened lazily by
                    # next(); otherwise start crypto and gzip contexts now
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed: only an encryption header may be due here
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except: # noqa: E722 -- don't leak the fd on any setup failure
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 593
7584f5c9
ERE
    def __del__(self):
        # Best-effort cleanup; a destructor must never propagate errors.
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass
7584f5c9 602
c7c736b6 603
d1c38f40
PG
    def next (self, name):
        """
        Start the next archive object: finalize the running gzip and
        crypto contexts, flush buffered data, then open fresh contexts
        as dictated by the archive mode. Returns the offset of the new
        object's first block.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # any mode bit besides ENCRYPT/COMPRESS (i.e. CONCAT): record the
        # raw file offset directly
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when encrypting, the crypto header (above) already fixed the
            # block offset
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
618
619
    def next_volume (self, name):
        """Restart the compression/encryption contexts at a volume boundary."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
631
c7c736b6 632
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
649
650
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # finalize the crypto context; it returns trailing ciphertext
            # plus the real header to patch over the dummy one
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
671
672
57db1546
PG
    def _finalize_write_gz (self):
        """
        Flush the compressor and, for gzip, emit the member trailer
        (little-endian CRC-32 and uncompressed size).
        """
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
                self.buf = b""
57db1546
PG
688
689
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        # only the very first member of the stream carries the FNAME field
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 708
ac5e4184 709
7584f5c9
ERE
    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            # maintain the running CRC over the *uncompressed* payload
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)
720
c7c736b6 721 def __sync(self):
cb7a3911 722 """Write what’s left in the buffer to the stream."""
c7c736b6
PG
723 self.__write (b"") # → len (buf) <= bufsiz
724 self.__enc_write (self.buf)
725 self.buf = b""
726
7584f5c9 727 def __write(self, s):
548bb8d5
CH
728 """Writes (and encodes) string s to the stream blockwise
729
730 will wait with encoding/writing until block is complete
7584f5c9
ERE
731 """
732 self.buf += s
733 while len(self.buf) > self.bufsize:
6e812ad9 734 self.__enc_write(self.buf[:self.bufsize])
7584f5c9
ERE
735 self.buf = self.buf[self.bufsize:]
736
867f75f7 737
    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)
867f75f7 747
6e812ad9
DGM
748
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                # process() may consume only part of the input (n bytes)
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
770
6e812ad9 771
784175ba
CH
    def estim_file_size(self):
        """ estimates size of file if closing it now

        The result may differ greatly from the amount of data sent to write()
        due to compression, encryption and buffering.

        In tests the result (before calling close()) was up to 12k smaller than
        the final file size if compression is being used because zlib/bz2
        compressors do not allow inspection of their buffered data :-(

        Still, we add what close() would add: 8 bytes for gz checksum, one
        encryption block size if encryption is used and the size of our own
        buffer
        """
        # NOTE(review): despite the docstring, no encryption block size is
        # added below — only the buffer and the 8-byte gzip trailer. Confirm
        # whether that is intentional.
        if self.closed:
            return self.bytes_written

        result = self.bytes_written
        if self.buf:
            result += len(self.buf)
        if self.comptype == 'gz':
            result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
        return result
795
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        With close_fileobj=False only the gzip CRC check (read mode) is
        performed and the underlying file object is left open.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                # NOTE(review): read_length (the gzip ISIZE field) is read
                # but never validated — confirm whether a size check was
                # intended here.
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
827
54128a00 828
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses the RFC 1952 member header and skips all optional fields
        so decompression starts at the deflate payload.
        """
        # raw deflate stream: negative wbits suppresses the zlib wrapper
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
                             "gzip header at pos %d" % self.fileobj.tell())
        if ord (read1) != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            # two-byte little-endian length, then the extra field itself
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            self.__read(2)
867
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False when the end of the archive was reached instead of
        a header, True otherwise.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
900
901
8de91f4f
PG
    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        try:
            return self.encryption.process (buf)
        except RuntimeError as exn:
            # in non-strict mode, surface this as a catchable DecryptionError
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)
            raise
913
914
c7c736b6
PG
915 def _finalize_read_encrypt (self):
916 """
917 Finalize decryption.
918 """
d1c38f40
PG
919 if self.arcmode & ARCMODE_ENCRYPT \
920 and self.lasthdr is not None :
c7c736b6
PG
921 assert self.remainder >= 0
922 if self.remainder > 0:
c7c736b6 923 self.remainder = 0
b0078f26
PG
924 try:
925 data = self.encryption.done ()
926 except crypto.InvalidGCMTag as exn:
927 raise DecryptionError ("decryption failed: %s" % exn)
c7c736b6
PG
928 return data
929
930
7584f5c9
ERE
931 def tell(self):
932 """Return the stream's file pointer position.
933 """
934 return self.pos
935
936 def seek(self, pos=0):
937 """Set the stream's file pointer to pos. Negative seeking
938 is forbidden.
939 """
940 if pos - self.pos >= 0:
941 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 942 for i in range(blocks):
7584f5c9
ERE
943 self.read(self.bufsize)
944 self.read(remainder)
945 else:
946 raise StreamError("seeking backwards is not allowed")
947 return self.pos
948
949 def read(self, size=None):
950 """Return the next size number of bytes from the stream.
951 If size is not defined, return all bytes of the stream
952 up to EOF.
953 """
954 if size is None:
955 t = []
956 while True:
957 buf = self._read(self.bufsize)
958 if not buf:
959 break
960 t.append(buf)
9dc7ac5c 961 buf = b"".join(t)
7584f5c9
ERE
962 else:
963 buf = self._read(size)
964 self.pos += len(buf)
965 return buf
966
3a7e1a50
ERE
967 def readline(self):
968 """Reads just one line, new line character included
969 """
f0fd5e3a 970 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
971 if b'\n' in self.dbuf:
972 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
973 ret = self.dbuf[:pos]
974 self.dbuf = self.dbuf[pos:]
975 return ret
976
1215b602 977 buf = []
3a7e1a50
ERE
978 while True:
979 chunk = self._read(self.bufsize)
980
f0fd5e3a 981 # nothing more to read, so return the buffer
3a7e1a50 982 if not chunk:
be60ffd0 983 return b''.join(buf)
3a7e1a50
ERE
984
985 buf.append(chunk)
f0fd5e3a
ERE
986
987 # if \n found, return the new line
be60ffd0
ERE
988 if b'\n' in chunk:
989 dbuf = b''.join(buf)
990 pos = dbuf.index(b'\n') + 1
1215b602 991 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
992 return dbuf[:pos]
993
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Operates on `self.dbuf`, the buffer of already-decompressed data;
        raw input comes from `self.__read()` and is run through the
        decompressor (`self.cmp`) when one is configured. In concatenated
        (ARCMODE_CONCAT) archives, leftover compressed bytes after a member
        boundary trigger re-initialization of the gzip/crypto reader.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                # Maintain the running CRC over the decompressed stream.
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                # Unused data past the end of the member means another
                # concatenated object follows: push it back and restart the
                # gzip reader on it.
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    # Fresh member: reset the CRC and reopen the stream.
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # Anything beyond `size` stays buffered for the next call.
        self.dbuf = t[size:]
        return t[:size]
1051
e4e5d0b8 1052
    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        # Index into `t` up to which all chunks belong to successfully
        # finalized (authenticated) crypto objects.
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail if
                        # the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                # Tolerant modes: discard the broken crypto state first.
                self.encryption.drop ()
                if self.tolerance == TOLERANCE_RECOVER:
                    if good_crypto == 0:
                        raise
                    # this may occur at any of the three crypto operations above.
                    # some objects did validate; discard all data after it; next
                    # call will start with the bad object and error out immediately
                    self.buf = b"".join (t [good_crypto:])
                    return b"".join (t [:good_crypto])
                elif self.tolerance == TOLERANCE_RESCUE:
                    # keep what we have so far despite the finalization issue
                    t.append (buf)
                    c += len (buf)
                    break
                else:
                    raise RuntimeError("internal error: bad tolerance level")

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # Surplus bytes stay buffered for the next call.
        self.buf = t[size:]

        return t[:size]
7d372216 1121
7584f5c9
ERE
1122
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').

    The first block is read eagerly so its magic bytes can be sniffed;
    the first read() hands that block back and then delegates directly
    to the underlying file object.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # Replace ourselves with the raw read after serving the
        # buffered first block once.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Sniff the compression type from the buffered magic bytes."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1149
7584f5c9
ERE
1150#------------------------
1151# Extraction file object
1152#------------------------
1153class _FileInFile(object):
1154 """A thin wrapper around an existing file object that
1155 provides a part of its data as an individual file
1156 object.
1157 """
1158
be60ffd0 1159 def __init__(self, fileobj, offset, size, blockinfo=None):
7584f5c9
ERE
1160 self.fileobj = fileobj
1161 self.offset = offset
1162 self.size = size
7584f5c9 1163 self.position = 0
be60ffd0
ERE
1164 self.name = getattr(fileobj, "name", None)
1165 self.closed = False
1166
1167 if blockinfo is None:
1168 blockinfo = [(0, size)]
1169
1170 # Construct a map with data and zero blocks.
1171 self.map_index = 0
1172 self.map = []
1173 lastpos = 0
1174 realpos = self.offset
1175 for offset, size in blockinfo:
1176 if offset > lastpos:
1177 self.map.append((False, lastpos, offset, None))
1178 self.map.append((True, offset, offset + size, realpos))
1179 realpos += size
1180 lastpos = offset + size
1181 if lastpos < self.size:
1182 self.map.append((False, lastpos, self.size, None))
1183
1184 def flush(self):
1185 pass
1186
1187 def readable(self):
1188 return True
1189
1190 def writable(self):
1191 return False
1192
1193 def seekable(self):
1194 return self.fileobj.seekable()
7584f5c9
ERE
1195
1196 def tell(self):
1197 """Return the current file position.
1198 """
1199 return self.position
1200
be60ffd0 1201 def seek(self, position, whence=io.SEEK_SET):
7584f5c9
ERE
1202 """Seek to a position in the file.
1203 """
be60ffd0
ERE
1204 if whence == io.SEEK_SET:
1205 self.position = min(max(position, 0), self.size)
1206 elif whence == io.SEEK_CUR:
1207 if position < 0:
1208 self.position = max(self.position + position, 0)
1209 else:
1210 self.position = min(self.position + position, self.size)
1211 elif whence == io.SEEK_END:
1212 self.position = max(min(self.size + position, self.size), 0)
1213 else:
1214 raise ValueError("Invalid argument")
1215 return self.position
7584f5c9
ERE
1216
1217 def read(self, size=None):
1218 """Read data from the file.
1219 """
1220 if size is None:
1221 size = self.size - self.position
1222 else:
1223 size = min(size, self.size - self.position)
1224
be60ffd0 1225 buf = b""
7584f5c9 1226 while size > 0:
7584f5c9 1227 while True:
be60ffd0
ERE
1228 data, start, stop, offset = self.map[self.map_index]
1229 if start <= self.position < stop:
7584f5c9 1230 break
be60ffd0
ERE
1231 else:
1232 self.map_index += 1
1233 if self.map_index == len(self.map):
1234 self.map_index = 0
1235 length = min(size, stop - self.position)
1236 if data:
1237 self.fileobj.seek(offset + (self.position - start))
1238 buf += self.fileobj.read(length)
7584f5c9 1239 else:
be60ffd0
ERE
1240 buf += NUL * length
1241 size -= length
1242 self.position += length
1243 return buf
7584f5c9 1244
be60ffd0
ERE
1245 def readinto(self, b):
1246 buf = self.read(len(b))
1247 b[:len(buf)] = buf
1248 return len(buf)
7584f5c9
ERE
1249
1250 def close(self):
7584f5c9 1251 self.closed = True
be60ffd0 1252#class _FileInFile
7584f5c9 1253
be60ffd0
ERE
1254
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member."""

    def __init__(self, tarfile, tarinfo):
        # Wrap the member's byte range (honoring sparse maps) and let
        # BufferedReader supply the stream interface on top of it.
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
#class ExFileObject
1262
1263#------------------
1264# Exported Classes
1265#------------------
1266class TarInfo(object):
1267 """Informational class which holds the details about an
1268 archive member given by a tar header block.
1269 TarInfo objects are returned by TarFile.getmember(),
1270 TarFile.getmembers() and TarFile.gettarinfo() and are
1271 usually created internally.
1272 """
1273
be60ffd0
ERE
1274 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1275 "chksum", "type", "linkname", "uname", "gname",
1276 "devmajor", "devminor", "volume_offset",
1277 "offset", "offset_data", "pax_headers", "sparse",
1278 "tarfile", "_sparse_structs", "_link_target")
1279
7584f5c9
ERE
1280 def __init__(self, name=""):
1281 """Construct a TarInfo object. name is the optional name
1282 of the member.
1283 """
1284 self.name = name # member name
be60ffd0 1285 self.mode = 0o644 # file permissions
7584f5c9
ERE
1286 self.uid = 0 # user id
1287 self.gid = 0 # group id
1288 self.size = 0 # file size
1289 self.mtime = 0 # modification time
1290 self.chksum = 0 # header checksum
1291 self.type = REGTYPE # member type
1292 self.linkname = "" # link name
1293 self.uname = "" # user name
1294 self.gname = "" # group name
1295 self.devmajor = 0 # device major number
1296 self.devminor = 0 # device minor number
1297
1298 self.offset = 0 # the tar header starts here
1299 self.offset_data = 0 # the file's data starts here
0eb5048f
ERE
1300 self.volume_offset = 0 # the file's data corresponds with the data
1301 # starting at this position
7584f5c9 1302
be60ffd0 1303 self.sparse = None # sparse member information
7584f5c9
ERE
1304 self.pax_headers = {} # pax header information
1305
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    # `path` is an alias for `name` under its pax-header spelling.
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # `linkpath` is an alias for `linkname` under its pax-header spelling.
    linkpath = property(_getlinkpath, _setlinkpath)
1319
1320 def __repr__(self):
1321 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1322
be60ffd0 1323 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1324 """Return the TarInfo's attributes as a dictionary.
1325 """
1326 info = {
1327 "name": self.name,
be60ffd0 1328 "mode": self.mode & 0o7777,
7584f5c9
ERE
1329 "uid": self.uid,
1330 "gid": self.gid,
1331 "size": self.size,
1332 "mtime": self.mtime,
1333 "chksum": self.chksum,
1334 "type": self.type,
1335 "linkname": self.linkname,
1336 "uname": self.uname,
1337 "gname": self.gname,
1338 "devmajor": self.devmajor,
36a315a0 1339 "devminor": self.devminor,
0eb5048f
ERE
1340 "offset_data": self.offset_data,
1341 "volume_offset": self.volume_offset
7584f5c9
ERE
1342 }
1343
1344 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1345 info["name"] += "/"
1346
7584f5c9
ERE
1347 return info
1348
be60ffd0
ERE
1349 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1350 errors="surrogateescape"):
7584f5c9
ERE
1351 """Return a tar header as a string of 512 byte blocks.
1352 """
1353 info = self.get_info(encoding, errors)
1354
1355 if format == USTAR_FORMAT:
be60ffd0 1356 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1357 elif format == GNU_FORMAT:
be60ffd0 1358 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1359 elif format == PAX_FORMAT:
1360 return self.create_pax_header(info, encoding, errors)
1361 else:
1362 raise ValueError("invalid format")
1363
be60ffd0 1364 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1365 """Return the object as a ustar header block.
1366 """
1367 info["magic"] = POSIX_MAGIC
1368
1369 if len(info["linkname"]) > LENGTH_LINK:
1370 raise ValueError("linkname is too long")
1371
1372 if len(info["name"]) > LENGTH_NAME:
1373 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1374
be60ffd0 1375 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1376
    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.

        For multivolume members, the otherwise-unused prefix area carries
        atime/ctime and the offset of this volume's data within the
        original file, and the stored size is reduced to this volume's
        share of the payload.
        """
        info["magic"] = GNU_MAGIC

        if self.ismultivol():
            prefix = [
                itn(info.get("atime", 0), 12, GNU_FORMAT),
                itn(info.get("ctime", 0), 12, GNU_FORMAT),
                itn(self.volume_offset, 12, GNU_FORMAT),
                itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
            ]
            info['prefix'] = b"".join(prefix)
            info['size'] = info['size'] - self.volume_offset

        buf = b""
        # Overlong link targets and names travel in extra pseudo-members
        # prepended in front of the real header.
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"],
                GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                                encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1402
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        # Multivolume members store only this volume's share of the data.
        if self.ismultivol():
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # Octal fields hold at most (digits - 1) digits plus a NUL.
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # The ustar header that follows the pax records holds only the
        # ASCII-representable remainder.
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1455
1456 @classmethod
1457 def create_pax_global_header(cls, pax_headers):
1458 """Return the object as a pax global header block sequence.
1459 """
be60ffd0 1460 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1461
1462 def _posix_split_name(self, name):
1463 """Split a name longer than 100 chars into a prefix
1464 and a name part.
1465 """
1466 prefix = name[:LENGTH_PREFIX + 1]
1467 while prefix and prefix[-1] != "/":
1468 prefix = prefix[:-1]
1469
1470 name = name[len(prefix):]
1471 prefix = prefix[:-1]
1472
1473 if not prefix or len(name) > LENGTH_NAME:
1474 raise ValueError("name is too long")
1475 return prefix, name
1476
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field (8 blanks while summing)
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad the assembled fields out to one 512-byte block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the computed checksum into its field (offset 148, i.e.
        # 364 bytes from the end of the block).
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1504
1505 @staticmethod
1506 def _create_payload(payload):
1507 """Return the string payload filled with zero bytes
1508 up to the next 512 byte border.
1509 """
1510 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1511 if remainder > 0:
1512 payload += (BLOCKSIZE - remainder) * NUL
1513 return payload
1514
1515 @classmethod
be60ffd0 1516 def _create_gnu_long_header(cls, name, type, encoding, errors):
7584f5c9
ERE
1517 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1518 for name.
1519 """
be60ffd0 1520 name = name.encode(encoding, errors) + NUL
7584f5c9
ERE
1521
1522 info = {}
1523 info["name"] = "././@LongLink"
1524 info["type"] = type
1525 info["size"] = len(name)
1526 info["magic"] = GNU_MAGIC
1527
1528 # create extended header + name blocks.
be60ffd0 1529 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
7584f5c9
ERE
1530 cls._create_payload(name)
1531
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            # Each record is prefixed with its own total length, which in
            # turn changes the record length: iterate until the value is a
            # fixed point.
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
               cls._create_payload(records)
1582
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError for the respective malformed-input cases; these
        are all subclasses of HeaderError.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Decode the fixed-offset ustar fields.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # GNU-style members reuse part of the prefix area for the
            # data offset (multivolume support in this implementation).
            obj.offset_data = nti(buf[369:381])
        return obj
1647
1648 @classmethod
1649 def fromtarfile(cls, tarfile):
1650 """Return the next TarInfo object from TarFile object
1651 tarfile.
1652 """
1653 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1654 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1655 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1656 return obj._proc_member(tarfile)
1657
1658 #--------------------------------------------------------------------------
1659 # The following are methods that are called depending on the type of a
1660 # member. The entry point is _proc_member() which can be overridden in a
1661 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1662 # implement the following
1663 # operations:
1664 # 1. Set self.offset_data to the position where the data blocks begin,
1665 # if there is data that follows.
1666 # 2. Set tarfile.offset to the position where the next member's header will
1667 # begin.
1668 # 3. Return self or another valid TarInfo object.
1669 def _proc_member(self, tarfile):
1670 """Choose the right processing method depending on
1671 the type and call it.
1672 """
1673 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1674 return self._proc_gnulong(tarfile)
1675 elif self.type == GNUTYPE_SPARSE:
1676 return self._proc_sparse(tarfile)
1677 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1678 return self._proc_pax(tarfile)
1679 else:
1680 return self._proc_builtin(tarfile)
1681
1682 def _proc_builtin(self, tarfile):
1683 """Process a builtin type or an unknown type which
1684 will be treated as a regular file.
1685 """
1686 self.offset_data = tarfile.fileobj.tell()
1687 offset = self.offset_data
00c34a12 1688 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
7584f5c9
ERE
1689 # Skip the following data blocks.
1690 offset += self._block(self.size)
1691 tarfile.offset = offset
1692
1693 # Patch the TarInfo object with saved global
1694 # header information.
1695 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1696
1697 return self
1698
1699 def _proc_gnulong(self, tarfile):
1700 """Process the blocks that hold a GNU longname
1701 or longlink member.
1702 """
1703 buf = tarfile.fileobj.read(self._block(self.size))
1704
1705 # Fetch the next header and process it.
1706 try:
1707 next = self.fromtarfile(tarfile)
1708 except HeaderError:
1709 raise SubsequentHeaderError("missing or bad subsequent header")
1710
1711 # Patch the TarInfo object from the next header with
1712 # the longname information.
1713 next.offset = self.offset
1714 if self.type == GNUTYPE_LONGNAME:
be60ffd0 1715 next.name = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9 1716 elif self.type == GNUTYPE_LONGLINK:
be60ffd0 1717 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1718
1719 return next
1720
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block holds up to 21 (offset, numbytes) pairs
            # of 12 octal digits each.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether yet another extension block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # The header's size field covered the stored (compacted) data;
        # expose the real file size instead.
        self.size = origsize
        return self
1748
1749 def _proc_pax(self, tarfile):
1750 """Process an extended or global header as described in
be60ffd0 1751 POSIX.1-2008.
7584f5c9
ERE
1752 """
1753 # Read the header information.
1754 buf = tarfile.fileobj.read(self._block(self.size))
1755
1756 # A pax header stores supplemental information for either
1757 # the following file (extended) or all following files
1758 # (global).
1759 if self.type == XGLTYPE:
1760 pax_headers = tarfile.pax_headers
1761 else:
1762 pax_headers = tarfile.pax_headers.copy()
1763
be60ffd0
ERE
1764 # Check if the pax header contains a hdrcharset field. This tells us
1765 # the encoding of the path, linkpath, uname and gname fields. Normally,
1766 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1767 # implementations are allowed to store them as raw binary strings if
1768 # the translation to UTF-8 fails.
1769 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1770 if match is not None:
1771 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1772
1773 # For the time being, we don't care about anything other than "BINARY".
1774 # The only other value that is currently allowed by the standard is
1775 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1776 hdrcharset = pax_headers.get("hdrcharset")
1777 if hdrcharset == "BINARY":
1778 encoding = tarfile.encoding
1779 else:
1780 encoding = "utf-8"
1781
7584f5c9
ERE
1782 # Parse pax header information. A record looks like that:
1783 # "%d %s=%s\n" % (length, keyword, value). length is the size
1784 # of the complete record including the length field itself and
1785 # the newline. keyword and value are both UTF-8 encoded strings.
be60ffd0 1786 regex = re.compile(br"(\d+) ([^=]+)=")
7584f5c9
ERE
1787 pos = 0
1788 while True:
1789 match = regex.match(buf, pos)
1790 if not match:
1791 break
1792
1793 length, keyword = match.groups()
1794 length = int(length)
1795 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1796
be60ffd0
ERE
1797 # Normally, we could just use "utf-8" as the encoding and "strict"
1798 # as the error handler, but we better not take the risk. For
1799 # example, GNU tar <= 1.23 is known to store filenames it cannot
1800 # translate to UTF-8 as raw strings (unfortunately without a
1801 # hdrcharset=BINARY header).
1802 # We first try the strict standard encoding, and if that fails we
1803 # fall back on the user's encoding and error handler.
1804 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1805 tarfile.errors)
1806 if keyword in PAX_NAME_FIELDS:
1807 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1808 tarfile.errors)
1809 else:
1810 value = self._decode_pax_field(value, "utf-8", "utf-8",
1811 tarfile.errors)
7584f5c9
ERE
1812
1813 pax_headers[keyword] = value
1814 pos += length
1815
36a315a0 1816
7584f5c9
ERE
1817 # Fetch the next header.
1818 try:
1819 next = self.fromtarfile(tarfile)
1820 except HeaderError:
1821 raise SubsequentHeaderError("missing or bad subsequent header")
1822
be60ffd0
ERE
1823 # Process GNU sparse information.
1824 if "GNU.sparse.map" in pax_headers:
1825 # GNU extended sparse format version 0.1.
1826 self._proc_gnusparse_01(next, pax_headers)
1827
1828 elif "GNU.sparse.size" in pax_headers:
1829 # GNU extended sparse format version 0.0.
1830 self._proc_gnusparse_00(next, pax_headers, buf)
1831
1832 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1833 # GNU extended sparse format version 1.0.
1834 self._proc_gnusparse_10(next, pax_headers, tarfile)
1835
7584f5c9
ERE
1836 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1837 # Patch the TarInfo object with the extended header info.
1838 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1839 next.offset = self.offset
1840
1841 if "size" in pax_headers:
1842 # If the extended header replaces the size field,
1843 # we need to recalculate the offset where the next
1844 # header starts.
1845 offset = next.offset_data
1846 if next.isreg() or next.type not in SUPPORTED_TYPES:
1847 offset += next._block(next.size)
1848 tarfile.offset = offset
1849
c04e0751
ERE
1850 if next is not None:
1851 if "GNU.volume.filename" in pax_headers:
1852 if pax_headers["GNU.volume.filename"] == next.name:
1853 if "GNU.volume.size" in pax_headers:
1854 next.size = int(pax_headers["GNU.volume.size"])
1855 if "GNU.volume.offset" in pax_headers:
1856 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1857
1858 for key in pax_headers.keys():
1859 if key.startswith("GNU.volume"):
1860 del tarfile.pax_headers[key]
0eb5048f 1861
7584f5c9
ERE
1862 return next
1863
be60ffd0
ERE
1864 def _proc_gnusparse_00(self, next, pax_headers, buf):
1865 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1866 """
be60ffd0
ERE
1867 offsets = []
1868 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1869 offsets.append(int(match.group(1)))
1870 numbytes = []
1871 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1872 numbytes.append(int(match.group(1)))
1873 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1874
be60ffd0
ERE
1875 def _proc_gnusparse_01(self, next, pax_headers):
1876 """Process a GNU tar extended sparse header, version 0.1.
1877 """
1878 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1879 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1880
be60ffd0
ERE
1881 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1882 """Process a GNU tar extended sparse header, version 1.0.
1883 """
1884 fields = None
1885 sparse = []
1886 buf = tarfile.fileobj.read(BLOCKSIZE)
1887 fields, buf = buf.split(b"\n", 1)
1888 fields = int(fields)
1889 while len(sparse) < fields * 2:
1890 if b"\n" not in buf:
1891 buf += tarfile.fileobj.read(BLOCKSIZE)
1892 number, buf = buf.split(b"\n", 1)
1893 sparse.append(int(number))
1894 next.offset_data = tarfile.fileobj.tell()
1895 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1896
be60ffd0
ERE
1897 def _apply_pax_info(self, pax_headers, encoding, errors):
1898 """Replace fields with supplemental information from a previous
1899 pax extended or global header.
1900 """
1901 for keyword, value in pax_headers.items():
1902 if keyword == "GNU.sparse.name":
1903 setattr(self, "path", value)
1904 elif keyword == "GNU.sparse.size":
1905 setattr(self, "size", int(value))
1906 elif keyword == "GNU.sparse.realsize":
1907 setattr(self, "size", int(value))
1908 elif keyword in PAX_FIELDS:
1909 if keyword in PAX_NUMBER_FIELDS:
1910 try:
1911 value = PAX_NUMBER_FIELDS[keyword](value)
1912 except ValueError:
1913 value = 0
1914 if keyword == "path":
f0287fb7 1915 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1916 setattr(self, keyword, value)
7584f5c9
ERE
1917
1918 self.pax_headers = pax_headers.copy()
1919
be60ffd0
ERE
    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.

        `value` (bytes) is first decoded strictly with `encoding`; if the
        bytes do not conform, decode again with `fallback_encoding` and
        the `fallback_errors` handler.  The fallback exists because some
        producers (e.g. old GNU tar, per the note in _proc_pax) store raw
        non-UTF-8 names in pax records.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)
1927
7584f5c9
ERE
1928 def _block(self, count):
1929 """Round up a byte count by BLOCKSIZE and return it,
1930 e.g. _block(834) => 1024.
1931 """
1932 blocks, remainder = divmod(count, BLOCKSIZE)
1933 if remainder:
1934 blocks += 1
1935 return blocks * BLOCKSIZE
1936
    # Convenience predicates over the member's `type` field.
    def isreg(self):
        """Return True if the member is a regular file."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Alias for isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if a sparse map was attached to this member."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a device or FIFO of any kind."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        """Return True if the member spans volumes: it carries the GNU
        multivolume type, starts at a non-zero offset into the original
        file, or the pax headers record a volume offset for it."""
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1960# class TarInfo
1961
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    # Class-level defaults; __init__ overrides them per instance when the
    # corresponding keyword argument is given.

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE: these two caches are deliberately class-level, i.e. shared by
    # all TarFile instances in the process.
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2006
7584f5c9
ERE
2007 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2008 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
be60ffd0 2009 errors="surrogateescape", pax_headers=None, debug=None,
548bb8d5 2010 errorlevel=None, max_volume_size=None, new_volume_handler=None,
d1c38f40 2011 concat=False, nacl=None,
c7c736b6 2012 save_to_members=True):
7584f5c9
ERE
2013 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2014 read from an existing archive, 'a' to append data to an existing
2015 file or 'w' to create a new file overwriting an existing one. `mode'
2016 defaults to 'r'.
2017 If `fileobj' is given, it is used for reading or writing data. If it
2018 can be determined, `mode' is overridden by `fileobj's mode.
2019 `fileobj' is not closed, when TarFile is closed.
2020 """
2021 if len(mode) > 1 or mode not in "raw":
2022 raise ValueError("mode must be 'r', 'a' or 'w'")
2023 self.mode = mode
d1c38f40 2024 self.arcmode = arcmode_set (concat)
c7c736b6 2025 self.nacl = nacl
7584f5c9
ERE
2026 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2027
2028 if not fileobj:
2029 if self.mode == "a" and not os.path.exists(name):
2030 # Create nonexistent files in append mode.
2031 self.mode = "w"
2032 self._mode = "wb"
2033 fileobj = bltn_open(name, self._mode)
2034 self._extfileobj = False
2035 else:
2036 if name is None and hasattr(fileobj, "name"):
2037 name = fileobj.name
d5361dac 2038 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
be60ffd0 2039 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
7584f5c9
ERE
2040 self._mode = fileobj.mode
2041 self._extfileobj = True
be60ffd0 2042 self.name = os.path.abspath(name) if name else None
2f854e77 2043 self.base_name = self.name = os.path.abspath(name) if name else None
7584f5c9
ERE
2044 self.fileobj = fileobj
2045
2046 # Init attributes.
2047 if format is not None:
2048 self.format = format
2049 if tarinfo is not None:
2050 self.tarinfo = tarinfo
2051 if dereference is not None:
2052 self.dereference = dereference
2053 if ignore_zeros is not None:
2054 self.ignore_zeros = ignore_zeros
2055 if encoding is not None:
2056 self.encoding = encoding
2057
be60ffd0 2058 self.errors = errors
7584f5c9
ERE
2059
2060 if pax_headers is not None and self.format == PAX_FORMAT:
2061 self.pax_headers = pax_headers
2062 else:
2063 self.pax_headers = {}
2064
2065 if debug is not None:
2066 self.debug = debug
2067 if errorlevel is not None:
2068 self.errorlevel = errorlevel
2069
2070 # Init datastructures.
ae48acc8 2071 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
0c818a18 2072 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
ae48acc8
ERE
2073 if max_volume_size and not callable(new_volume_handler):
2074 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
5ab3f8f9
CH
2075 if max_volume_size:
2076 self.max_volume_size = int(max_volume_size)
2077 else:
2078 self.max_volume_size = None
ae48acc8 2079
ea625b04 2080 self.save_to_members = save_to_members
68ddf955 2081 self.new_volume_handler = new_volume_handler
7584f5c9
ERE
2082 self.closed = False
2083 self.members = [] # list of members as TarInfo objects
2084 self._loaded = False # flag if all members have been read
2085 self.offset = self.fileobj.tell()
2086 # current position in the archive file
2087 self.inodes = {} # dictionary caching the inodes of
2088 # archive members already added
2089
2090 try:
2091 if self.mode == "r":
2092 self.firstmember = None
2093 self.firstmember = self.next()
2094
2095 if self.mode == "a":
2096 # Move to the end of the archive,
2097 # before the first empty block.
2098 while True:
2099 self.fileobj.seek(self.offset)
2100 try:
2101 tarinfo = self.tarinfo.fromtarfile(self)
2102 self.members.append(tarinfo)
2103 except EOFHeaderError:
2104 self.fileobj.seek(self.offset)
2105 break
be60ffd0 2106 except HeaderError as e:
7584f5c9
ERE
2107 raise ReadError(str(e))
2108
2109 if self.mode in "aw":
2110 self._loaded = True
2111
2112 if self.pax_headers:
2113 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2114 self.fileobj.write(buf)
2115 self.offset += len(buf)
2116 except:
2117 if not self._extfileobj:
2118 self.fileobj.close()
2119 self.closed = True
2120 raise
2121
7584f5c9
ERE
2122 #--------------------------------------------------------------------------
2123 # Below are the classmethods which act as alternate constructors to the
2124 # TarFile class. The open() method is the only one that is needed for
2125 # public use; it is the "super"-constructor and is able to select an
2126 # adequate "sub"-constructor for a particular compression using the mapping
2127 # from OPEN_METH.
2128 #
2129 # This concept allows one to subclass TarFile without losing the comfort of
2130 # the super-constructor. A sub-constructor is registered and made available
2131 # by adding it to the mapping in OPEN_METH.
2132
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing

        `encryption' and `tolerance' only take effect for '#' (concat)
        modes; `compresslevel' is forwarded to gzip/bzip2.
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            # Each registered opener is probed in turn; on failure the
            # file position is restored before trying the next one.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            if 'max_volume_size' in kwargs:
                # Compressed "w:"-style archives cannot restart the
                # compressor per volume, so only volume one is compressed.
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            # The _Stream we created belongs to us and is closed with the
            # TarFile.
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: a stream of independently compressed/encrypted
            # tar objects.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2247
d39d4cbf
PG
2248
    @classmethod
    def open_at_offset(cls, offset, *a, **kwa):
        """
        Same as ``.open()``, but start reading at the given offset. Assumes a
        seekable file object. Returns *None* if opening failed due to a read
        problem.

        NOTE(review): this wrapper itself only seeks and delegates; the
        *None* return described above must come from ``cls.open`` (e.g.
        in tolerant modes) — confirm against ``open``'s behavior.
        """
        fileobj = kwa.get ("fileobj")
        if fileobj is not None:
            fileobj.seek (offset)

        return cls.open (*a, **kwa)
2261
2262
7584f5c9
ERE
2263 @classmethod
2264 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2265 """Open uncompressed tar archive name for reading or writing.
2266 """
2267 if len(mode) > 1 or mode not in "raw":
2268 raise ValueError("mode must be 'r', 'a' or 'w'")
2269 return cls(name, mode, fileobj, **kwargs)
2270
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.

        Raises CompressionError if the gzip module is unusable and
        ReadError if the input is not a gzip stream.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller owns fileobj: only wrappers we
        # create ourselves may be closed on error.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                # GzipFile never got assigned, i.e. opening `name' itself
                # failed: propagate the original error unmodified.
                raise
            raise ReadError("not a gzip file")
        except:
            # Any other failure: close our own wrapper, then propagate.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2301
2302 @classmethod
2303 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2304 """Open bzip2 compressed tar archive name for reading or writing.
2305 Appending is not allowed.
2306 """
2307 if len(mode) > 1 or mode not in "rw":
2308 raise ValueError("mode must be 'r' or 'w'.")
2309
2310 try:
2311 import bz2
2312 except ImportError:
2313 raise CompressionError("bz2 module is not available")
2314
be60ffd0
ERE
2315 fileobj = bz2.BZ2File(fileobj or name, mode,
2316 compresslevel=compresslevel)
7584f5c9
ERE
2317
2318 try:
2319 t = cls.taropen(name, mode, fileobj, **kwargs)
be60ffd0
ERE
2320 except (OSError, EOFError):
2321 fileobj.close()
7584f5c9
ERE
2322 raise ReadError("not a bzip2 file")
2323 t._extfileobj = False
2324 return t
2325
be60ffd0
ERE
2326 @classmethod
2327 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2328 """Open lzma compressed tar archive name for reading or writing.
2329 Appending is not allowed.
2330 """
2331 if mode not in ("r", "w"):
2332 raise ValueError("mode must be 'r' or 'w'")
2333
2334 try:
2335 import lzma
2336 except ImportError:
2337 raise CompressionError("lzma module is not available")
2338
2339 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2340
2341 try:
2342 t = cls.taropen(name, mode, fileobj, **kwargs)
2343 except (lzma.LZMAError, EOFError):
2344 fileobj.close()
2345 raise ReadError("not an lzma file")
2346 t._extfileobj = False
2347 return t
2348
7584f5c9
ERE
    # All *open() methods are registered here.  Maps the compression
    # suffix accepted in mode strings (e.g. "r:gz", "w|bz2") to the name
    # of the classmethod implementing it; open() resolves entries with
    # getattr() and, for "r"/"r:*", probes them in registration order.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz": "gzopen",     # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz": "xzopen"      # lzma compressed tar
    }
2356
2357 #--------------------------------------------------------------------------
2358 # The public methods which TarFile provides:
2359
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # Nothing was ever written in concat mode: ask the stream to
            # start a (nameless) object first, so the trailing zero
            # blocks pass through the same encryption/compression path.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        # Only close file objects we opened ourselves.
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2382
2383 def getmember(self, name):
2384 """Return a TarInfo object for member `name'. If `name' can not be
2385 found in the archive, KeyError is raised. If a member occurs more
2386 than once in the archive, its last occurrence is assumed to be the
2387 most up-to-date version.
2388 """
2389 tarinfo = self._getmember(name)
2390 if tarinfo is None:
2391 raise KeyError("filename %r not found" % name)
2392 return tarinfo
2393
2394 def getmembers(self):
2395 """Return the members of the archive as a list of TarInfo objects. The
2396 list has the same order as the members in the archive.
2397 """
2398 self._check()
2399 if not self._loaded: # if we want to obtain a list of
2400 self._load() # all members, we first have to
2401 # scan the whole archive.
2402 return self.members
2403
ad4402e8
ERE
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.

        `last_block_offset` is recorded by addfile() when the member's
        header block is emitted.
        """
        return self.last_block_offset
ad4402e8 2410
7584f5c9
ERE
2411 def getnames(self):
2412 """Return the members of the archive as a list of their names. It has
2413 the same order as the list returned by getmembers().
2414 """
2415 return [tarinfo.name for tarinfo in self.getmembers()]
2416
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for inode types that cannot be represented in a tar
        archive (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Sockets and other exotic inode types have no tar
            # representation.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            # Resolve uid -> user name once and cache the result
            # (including negative lookups) to avoid repeated getpwuid()
            # calls.
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            # Same caching scheme for gid -> group name.
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2526
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    # Devices show "major,minor" in place of a size.
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash appended to their name.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
7584f5c9 2555
be60ffd0 2556 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
7584f5c9
ERE
2557 """Add the file `name' to the archive. `name' may be any type of file
2558 (directory, fifo, symbolic link, etc.). If given, `arcname'
2559 specifies an alternative name for the file in the archive.
2560 Directories are added recursively by default. This can be avoided by
2561 setting `recursive' to False. `exclude' is a function that should
2562 return True for each filename to be excluded. `filter' is a function
2563 that expects a TarInfo object argument and returns the changed
2564 TarInfo object, if it returns None the TarInfo object will be
2565 excluded from the archive.
2566 """
2567 self._check("aw")
2568
2569 if arcname is None:
2570 arcname = name
2571
2572 # Exclude pathnames.
2573 if exclude is not None:
2574 import warnings
2575 warnings.warn("use the filter argument instead",
2576 DeprecationWarning, 2)
2577 if exclude(name):
2578 self._dbg(2, "tarfile: Excluded %r" % name)
2579 return
2580
2581 # Skip if somebody tries to archive the archive...
2582 if self.name is not None and os.path.abspath(name) == self.name:
2583 self._dbg(2, "tarfile: Skipped %r" % name)
2584 return
2585
2586 self._dbg(1, name)
2587
2588 # Create a TarInfo object from the file.
2589 tarinfo = self.gettarinfo(name, arcname)
2590
2591 if tarinfo is None:
2592 self._dbg(1, "tarfile: Unsupported type %r" % name)
2593 return
2594
2595 # Change or exclude the TarInfo object.
2596 if filter is not None:
2597 tarinfo = filter(tarinfo)
2598 if tarinfo is None:
2599 self._dbg(2, "tarfile: Excluded %r" % name)
2600 return
2601
2602 # Append the tar header and data to the archive.
2603 if tarinfo.isreg():
2604 with bltn_open(name, "rb") as f:
2605 self.addfile(tarinfo, f)
2606
2607 elif tarinfo.isdir():
2608 self.addfile(tarinfo)
2609 if recursive:
2610 for f in os.listdir(name):
2611 self.add(os.path.join(name, f), os.path.join(arcname, f),
be60ffd0 2612 recursive, exclude, filter=filter)
7584f5c9
ERE
2613
2614 else:
2615 self.addfile(tarinfo)
2616
defc9a22 2617 def _size_left_file(self):
be60ffd0 2618 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2619
be60ffd0 2620 Assumes self.max_volume_size is set.
ba5a449e 2621 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2622 """
ba5a449e 2623 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2624 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2625 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2626 # write only half a block when writting the end of a volume
ae48acc8 2627 # and filling with zeros
defc9a22
CH
2628 return BLOCKSIZE * (size_left // BLOCKSIZE)
2629
2630 def _size_left_stream(self):
ba5a449e
CH
2631 """ Calculates size left in a volume if using comression/encryption
2632
2633 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2634 (otherwise use _size_left_file)
2635 """
2636 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2637 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2638 - 2*BLOCKSIZE
2639 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2640
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        When `max_volume_size' is set, the file data may be split across
        several volumes; `new_volume_handler' is invoked for each switch and
        the member is continued with type GNUTYPE_MULTIVOL.
        """
        self._check("aw")

        # Work on a copy so volume bookkeeping (type, volume_offset) does not
        # mutate the caller's TarInfo.
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # Concat mode: the stream starts a new object per member and
            # reports where its first block landed.
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # Pick the estimator for how many payload bytes still fit into the
        # current volume; without a size limit the whole member always fits.
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possible from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the “new_volume_handler” is supposed to call .close() on the
                # “fileobj” _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
7584f5c9 2754
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Either a *name* to open, or an already opened *fileobj*, may be
        given; *encryption* optionally overrides the previous stream's
        encryption context. On failure the archive is closed.
        '''

        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # Carry the previous stream's parameters over to the new
                # volume so compression/encryption settings stay consistent.
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT,
                            tolerance=self.fileobj.tolerance)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        # Swap the underlying file object for the new volume.
        self.fileobj.close()
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # Record which member continues into this volume and at
                    # which offset, for the PAX global header.
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # Leave the object in a consistent closed state on any error.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2843
e5f5681b 2844 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2845 """Extract all members from the archive to the current working
2846 directory and set owner, modification time and permissions on
2847 directories afterwards. `path' specifies a different directory
2848 to extract to. `members' is optional and must be a subset of the
2849 list returned by getmembers().
2850 """
2851 directories = []
2852
2853 if members is None:
2854 members = self
2855
2856 for tarinfo in members:
c474439c
ERE
2857 if self.volume_number > 0 and tarinfo.ismultivol():
2858 continue
2859
974408b5 2860 if filter and not filter(tarinfo):
e5f5681b
ERE
2861 continue
2862
7584f5c9
ERE
2863 if tarinfo.isdir():
2864 # Extract directories with a safe mode.
2865 directories.append(tarinfo)
2866 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2867 tarinfo.mode = 0o0700
2868 # Do not set_attrs directories, as we will do that further down
2869 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2870
2871 # Reverse sort directories.
be60ffd0 2872 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2873 directories.reverse()
2874
2875 # Set correct owner, mtime and filemode on directories.
2876 for tarinfo in directories:
2877 dirpath = os.path.join(path, tarinfo.name)
2878 try:
2879 self.chown(tarinfo, dirpath)
2880 self.utime(tarinfo, dirpath)
2881 self.chmod(tarinfo, dirpath)
be60ffd0 2882 except ExtractError as e:
7584f5c9
ERE
2883 if self.errorlevel > 1:
2884 raise
2885 else:
2886 self._dbg(1, "tarfile: %s" % e)
2887
786addd6 2888 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2889 """Extract a member from the archive to the current working directory,
2890 using its full name. Its file information is extracted as accurately
2891 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2892 specify a different directory using `path'. File attributes (owner,
2893 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2894 ``symlink_cb`` is a hook accepting a function that is passed the
2895 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2896 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2897 passed will be applied, skipping the actual extraction. In case the
2898 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2899 """
2900 self._check("r")
2901
be60ffd0 2902 if isinstance(member, str):
7584f5c9
ERE
2903 tarinfo = self.getmember(member)
2904 else:
2905 tarinfo = member
2906
2907 # Prepare the link target for makelink().
2908 if tarinfo.islnk():
2909 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2910
9b13f5c4 2911 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2912 return symlink_cb(member, path, set_attrs)
786addd6 2913
7584f5c9 2914 try:
be60ffd0
ERE
2915 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2916 set_attrs=set_attrs)
2917 except EnvironmentError as e:
7584f5c9
ERE
2918 if self.errorlevel > 0:
2919 raise
2920 else:
2921 if e.filename is None:
2922 self._dbg(1, "tarfile: %s" % e.strerror)
2923 else:
2924 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2925 except ExtractError as e:
7584f5c9
ERE
2926 if self.errorlevel > 1:
2927 raise
2928 else:
2929 self._dbg(1, "tarfile: %s" % e)
2930
2931 def extractfile(self, member):
2932 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2933 a filename or a TarInfo object. If `member' is a regular file or a
2934 link, an io.BufferedReader object is returned. Otherwise, None is
2935 returned.
7584f5c9
ERE
2936 """
2937 self._check("r")
2938
be60ffd0 2939 if isinstance(member, str):
7584f5c9
ERE
2940 tarinfo = self.getmember(member)
2941 else:
2942 tarinfo = member
2943
be60ffd0
ERE
2944 if tarinfo.isreg() or tarinfo.ismultivol() or\
2945 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2946 # If a member's type is unknown, it is treated as a
2947 # regular file.
2948 return self.fileobject(self, tarinfo)
2949
2950 elif tarinfo.islnk() or tarinfo.issym():
2951 if isinstance(self.fileobj, _Stream):
2952 # A small but ugly workaround for the case that someone tries
2953 # to extract a (sym)link as a file-object from a non-seekable
2954 # stream of tar blocks.
2955 raise StreamError("cannot extract (sym)link as file object")
2956 else:
2957 # A (sym)link's file object is its target's file object.
2958 return self.extractfile(self._find_link_target(tarinfo))
2959 else:
2960 # If there's no data associated with the member (directory, chrdev,
2961 # blkdev, etc.), return None instead of a file object.
2962 return None
2963
be60ffd0 2964 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2965 """Extract the TarInfo object tarinfo to a physical
2966 file called targetpath.
2967 """
2968 # Fetch the TarInfo object for the given name
2969 # and build the destination pathname, replacing
2970 # forward slashes to platform specific separators.
2971 targetpath = targetpath.rstrip("/")
2972 targetpath = targetpath.replace("/", os.sep)
2973
2974 # Create all upper directories.
2975 upperdirs = os.path.dirname(targetpath)
2976 if upperdirs and not os.path.exists(upperdirs):
2977 # Create directories that are not part of the archive with
2978 # default permissions.
2979 os.makedirs(upperdirs)
2980
2981 if tarinfo.islnk() or tarinfo.issym():
2982 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2983 else:
2984 self._dbg(1, tarinfo.name)
2985
2986 if tarinfo.isreg():
2987 self.makefile(tarinfo, targetpath)
2988 elif tarinfo.isdir():
2989 self.makedir(tarinfo, targetpath)
2990 elif tarinfo.isfifo():
2991 self.makefifo(tarinfo, targetpath)
2992 elif tarinfo.ischr() or tarinfo.isblk():
2993 self.makedev(tarinfo, targetpath)
2994 elif tarinfo.islnk() or tarinfo.issym():
2995 self.makelink(tarinfo, targetpath)
2996 elif tarinfo.type not in SUPPORTED_TYPES:
2997 self.makeunknown(tarinfo, targetpath)
2998 else:
2999 self.makefile(tarinfo, targetpath)
3000
be60ffd0
ERE
3001 if set_attrs:
3002 self.chown(tarinfo, targetpath)
3003 if not tarinfo.issym():
3004 self.chmod(tarinfo, targetpath)
3005 self.utime(tarinfo, targetpath)
7584f5c9
ERE
3006
3007 #--------------------------------------------------------------------------
3008 # Below are the different file methods. They are called via
3009 # _extract_member() when extract() is called. They can be replaced in a
3010 # subclass to implement other functionality.
3011
3012 def makedir(self, tarinfo, targetpath):
3013 """Make a directory called targetpath.
3014 """
3015 try:
3016 # Use a safe mode for the directory, the real mode is set
3017 # later in _extract_member().
be60ffd0
ERE
3018 os.mkdir(targetpath, 0o0700)
3019 except FileExistsError:
3020 pass
7584f5c9
ERE
3021
3022 def makefile(self, tarinfo, targetpath):
3023 """Make a file called targetpath.
3024 """
be60ffd0
ERE
3025 source = self.fileobj
3026 source.seek(tarinfo.offset_data)
c7c736b6 3027 decrypt = False
c474439c
ERE
3028 iterate = True
3029 target = bltn_open(targetpath, "wb")
3030
be60ffd0
ERE
3031 if tarinfo.sparse is not None:
3032 try:
3033 for offset, size in tarinfo.sparse:
3034 target.seek(offset)
3035 copyfileobj(source, target, size)
3036 target.seek(tarinfo.size)
3037 target.truncate()
3038 finally:
3039 target.close()
3040 return
3041
c474439c
ERE
3042 while iterate:
3043 iterate = False
3044 try:
3045 copyfileobj(source, target, tarinfo.size)
aa828cd1 3046 except OSError:
c474439c
ERE
3047 source.close()
3048 # only if we are extracting a multivolume this can be treated
3049 if not self.new_volume_handler:
c474439c
ERE
3050 raise Exception("We need to read a new volume and you"
3051 " didn't supply a new_volume_handler")
3052
3053 # the new volume handler should do everything needed to
3054 # start working in a new volume. usually, the handler calls
3055 # to self.open_volume
3056 self.volume_number += 1
3057 self.new_volume_handler(self, self.base_name, self.volume_number)
be60ffd0
ERE
3058 tarinfo = self.firstmember
3059 source = self.fileobj
c474439c 3060 iterate = True
bcc8b174
PG
3061 finally:
3062 if iterate is False: target.close()
c474439c 3063
7584f5c9
ERE
3064
3065 def makeunknown(self, tarinfo, targetpath):
3066 """Make a file from a TarInfo object with an unknown type
3067 at targetpath.
3068 """
3069 self.makefile(tarinfo, targetpath)
3070 self._dbg(1, "tarfile: Unknown file type %r, " \
3071 "extracted as regular file." % tarinfo.type)
3072
3073 def makefifo(self, tarinfo, targetpath):
3074 """Make a fifo called targetpath.
3075 """
3076 if hasattr(os, "mkfifo"):
3077 os.mkfifo(targetpath)
3078 else:
3079 raise ExtractError("fifo not supported by system")
3080
3081 def makedev(self, tarinfo, targetpath):
3082 """Make a character or block device called targetpath.
3083 """
3084 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3085 raise ExtractError("special devices not supported by system")
3086
3087 mode = tarinfo.mode
3088 if tarinfo.isblk():
3089 mode |= stat.S_IFBLK
3090 else:
3091 mode |= stat.S_IFCHR
3092
3093 os.mknod(targetpath, mode,
3094 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3095
3096 def makelink(self, tarinfo, targetpath):
3097 """Make a (symbolic) link called targetpath. If it cannot be created
3098 (platform limitation), we try to make a copy of the referenced file
3099 instead of a link.
3100 """
be60ffd0 3101 try:
7584f5c9
ERE
3102 # For systems that support symbolic and hard links.
3103 if tarinfo.issym():
7584f5c9
ERE
3104 os.symlink(tarinfo.linkname, targetpath)
3105 else:
3106 # See extract().
3107 if os.path.exists(tarinfo._link_target):
7584f5c9
ERE
3108 os.link(tarinfo._link_target, targetpath)
3109 else:
be60ffd0
ERE
3110 self._extract_member(self._find_link_target(tarinfo),
3111 targetpath)
3112 except symlink_exception:
7584f5c9 3113 try:
be60ffd0
ERE
3114 self._extract_member(self._find_link_target(tarinfo),
3115 targetpath)
7584f5c9
ERE
3116 except KeyError:
3117 raise ExtractError("unable to resolve link inside archive")
3118
3119 def chown(self, tarinfo, targetpath):
3120 """Set owner of targetpath according to tarinfo.
3121 """
3122 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3123 # We have to be root to do so.
3124 try:
3125 g = grp.getgrnam(tarinfo.gname)[2]
3126 except KeyError:
3127 g = tarinfo.gid
3128 try:
3129 u = pwd.getpwnam(tarinfo.uname)[2]
3130 except KeyError:
3131 u = tarinfo.uid
3132 try:
3133 if tarinfo.issym() and hasattr(os, "lchown"):
3134 os.lchown(targetpath, u, g)
3135 else:
be60ffd0
ERE
3136 os.chown(targetpath, u, g)
3137 except OSError as e:
7584f5c9
ERE
3138 raise ExtractError("could not change owner")
3139
3140 def chmod(self, tarinfo, targetpath):
3141 """Set file permissions of targetpath according to tarinfo.
3142 """
3143 if hasattr(os, 'chmod'):
3144 try:
3145 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3146 except OSError as e:
7584f5c9
ERE
3147 raise ExtractError("could not change mode")
3148
3149 def utime(self, tarinfo, targetpath):
3150 """Set modification time of targetpath according to tarinfo.
3151 """
3152 if not hasattr(os, 'utime'):
3153 return
3154 try:
3155 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3156 except OSError as e:
7584f5c9
ERE
3157 raise ExtractError("could not change modification time")
3158
3159 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.

        Raises ReadError when the archive is empty, truncated, or corrupt
        at its very beginning.
        """
        self._check("ra")
        # A member read ahead (e.g. by open_volume) is served first.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # All-zero block: with ignore_zeros, skip it and keep
                # scanning; otherwise treat it as end of archive.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                # Unparsable block: skippable with ignore_zeros; fatal only
                # if it is the very first block of the archive.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                # A later header is broken: the archive is corrupt.
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # No further members could be read: mark the list complete.
            self._loaded = True

        return tarinfo
3206
3207 #--------------------------------------------------------------------------
3208 # Little helper methods:
3209
3210 def _getmember(self, name, tarinfo=None, normalize=False):
3211 """Find an archive member by name from bottom to top.
3212 If tarinfo is given, it is used as the starting point.
3213 """
3214 # Ensure that all members have been loaded.
3215 members = self.getmembers()
3216
3217 # Limit the member search list up to tarinfo.
3218 if tarinfo is not None:
3219 members = members[:members.index(tarinfo)]
3220
3221 if normalize:
3222 name = os.path.normpath(name)
3223
3224 for member in reversed(members):
3225 if normalize:
3226 member_name = os.path.normpath(member.name)
3227 else:
3228 member_name = member.name
3229
3230 if name == member_name:
3231 return member
3232
3233 def _load(self):
3234 """Read through the entire archive file and look for readable
3235 members.
3236 """
3237 while True:
3238 tarinfo = self.next()
3239 if tarinfo is None:
3240 break
3241 self._loaded = True
3242
3243 def _check(self, mode=None):
3244 """Check if TarFile is still open, and if the operation's mode
3245 corresponds to TarFile's mode.
3246 """
3247 if self.closed:
be60ffd0 3248 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3249 if mode is not None and self.mode not in mode:
be60ffd0 3250 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3251
3252 def _find_link_target(self, tarinfo):
3253 """Find the target member of a symlink or hardlink member in the
3254 archive.
3255 """
3256 if tarinfo.issym():
3257 # Always search the entire archive.
3258 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3259 limit = None
3260 else:
3261 # Search the archive before the link, because a hard link is
3262 # just a reference to an already archived file.
3263 linkname = tarinfo.linkname
3264 limit = tarinfo
3265
3266 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3267 if member is None:
3268 raise KeyError("linkname %r not found" % linkname)
3269 return member
3270
3271 def __iter__(self):
3272 """Provide an iterator object.
3273 """
3274 if self._loaded:
3275 return iter(self.members)
3276 else:
3277 return TarIter(self)
3278
1027433a 3279 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3280 """Write debugging output to sys.stderr.
3281 """
3282 if level <= self.debug:
1027433a 3283 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3284
    def __enter__(self):
        # Context-manager entry: refuse to operate on a closed archive,
        # then hand back the TarFile itself.
        self._check()
        return self
3288
3289 def __exit__(self, type, value, traceback):
3290 if type is None:
3291 self.close()
3292 else:
3293 # An exception occurred. We must not call close() because
3294 # it would try to write end-of-archive blocks and padding.
3295 if not self._extfileobj:
3296 self.fileobj.close()
3297 self.closed = True
3298# class TarFile
3299
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object for *tarfile*."""
        self.tarfile = tarfile
        self.index = 0          # position of the next member to yield

    def __iter__(self):
        """An iterator is its own iterator."""
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        tf = self.tarfile
        if self.index == 0 and tf.firstmember is not None:
            tarinfo = tf.next()
        elif self.index < len(tf.members):
            # Member already cached by a concurrent getmembers() call.
            tarinfo = tf.members[self.index]
        elif not tf._loaded:
            tarinfo = tf.next()
            if not tarinfo:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
3338
6690f5e0
PG
3339#---------------------------------------------------------
3340# support functionality for rescue mode
3341#---------------------------------------------------------
3342
8fc6040c
PG
# struct(3) format string describing one 512-byte old-GNU tar header block;
# consumed by read_gnu_tar_hdr() below.
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100]; /* 100 */
    "8s"   # ← char mode[8]; /* 108 */
    "8s"   # ← char uid[8]; /* 116 */
    "8s"   # ← char gid[8]; /* 124 */
    "12s"  # ← char size[12]; /* 136 */
    "12s"  # ← char mtime[12]; /* 148 */
    "8s"   # ← char checksum[8]; /* 156 */
    "B"    # ← char typeflag[1]; /* 157 */
    "100s" # ← char linkname[100]; /* 257 */
    "6s"   # ← char magic[6]; /* 263 */
    "2s"   # ← char version[2]; /* 265 */
    "32s"  # ← char uname[32]; /* 297 */
    "32s"  # ← char gname[32]; /* 329 */
    "8s"   # ← char devmajor[8]; /* 337 */
    "8s"   # ← char devminor[8]; /* 345 */
    "12s"  # ← char atime[12]; /* 357 */
    "12s"  # ← char ctime[12]; /* 369 */
    "12s"  # ← char offset[12]; /* 381 */
    "4s"   # ← char longnames[4]; /* 385 */
    "B"    # ← char unused[1]; /* 386 */
    ""     #   struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     #   } sparse[4]; /* 482 */
    "B"    # ← char isextended[1]; /* 483 */
    "12s"  # ← char realsize[12]; /* 495 */
    "17s"  # ← char pad[17]; /* 512 */
)

# The “magic” and “version” fields are special:
#
# tar(5)
#    magic   The magic field holds the five characters “ustar” followed by a
#            space.  Note that POSIX ustar archives have a trailing null.
#
# however, “tar.h”:
#
#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
#      Found in an archive, it indicates an old GNU header format, which will be
#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
#      valid, though the header is not truly POSIX conforming.  */
#
#
# Byte offset of the magic field inside a header block (see layout above).
TAR_HDR_OFF_MAGIC    = 257
TAR_FMT_OLDGNU_MAGIC = b"ustar "
3396
def read_gnu_tar_hdr (data):
    """
    Parse *data* as one old-GNU tar header block.

    :returns: *None* unless *data* is exactly one block carrying the old-GNU
              magic; otherwise a dict of all header fields except the
              “unused” and “pad” bytes.
    """
    if len (data) != BLOCKSIZE: # header requires one complete block
        return None

    # Field names in the exact order of TAR_FMT_HDR.
    fields = \
        ( "name", "mode", "uid", "gid", "size", "mtime"
        , "checksum", "typeflag", "linkname", "magic", "version"
        , "uname", "gname", "devmajor", "devminor"
        , "atime", "ctime", "offset", "longnames", "unused"
        , "offset1", "numbytes1", "offset2", "numbytes2"
        , "offset3", "numbytes3", "offset4", "numbytes4"
        , "isextended", "realsize", "pad"
        )

    try:
        hdr = dict (zip (fields, struct.unpack (TAR_FMT_HDR, data)))
    except struct.error:
        return None

    if hdr ["magic"] != TAR_FMT_OLDGNU_MAGIC:
        return None

    # drop “unused” and “pad” from the result
    del hdr ["unused"]
    del hdr ["pad"]
    return hdr
3454
3455
a793ee30
PG
def tar_hdr_check_chksum (data):
    """
    Return True iff *data* parses as an old-GNU tar header whose stored
    checksum matches one of the recomputed checksum variants.
    """
    hdr = read_gnu_tar_hdr (data)
    if hdr is None:
        return False
    return nti (hdr ["checksum"]) in calc_chksums (data)
3462
3463
8fc6040c
PG
def readable_tar_objects_offsets (ifd):
    """
    Traverse blocks in file, trying to extract tar headers.
    """
    mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    offsets = []
    # The magic sits TAR_HDR_OFF_MAGIC bytes into a header block, so no
    # valid hit can occur before that position.
    cursor = TAR_HDR_OFF_MAGIC

    while True:
        hit = mm.find (TAR_FMT_OLDGNU_MAGIC, cursor)
        if hit == -1:
            break
        candidate = hit - TAR_HDR_OFF_MAGIC
        mm.seek (candidate)
        # Keep only candidates whose whole block checksums correctly.
        if tar_hdr_check_chksum (mm.read (BLOCKSIZE)) is True:
            offsets.append (candidate)
        cursor = hit + 1

    return offsets
65b35c42
PG
3486
3487
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    hits = []
    at = mm.find (GZ_MAGIC_BYTES, 0)

    while at != -1:
        hits.append (at)
        # Resume the search right after this occurrence of the magic.
        at = mm.find (GZ_MAGIC_BYTES, at + len (GZ_MAGIC_BYTES))

    return hits
3511
3512
# Verdicts assigned by inspect_gz_hdr() to a header candidate:
HDR_CAND_GOOD  = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable
3516
3517
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if c == NUL:
            break
        if not c:
            # EOF before the terminating NUL: parse failure. Without this
            # check the loop would spin forever when *max* is negative,
            # because os.read() returns b"" on every call at EOF.
            return None
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3543
3544
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname   = None
    flags   = 0x00
    dflags  = 0x00
    mtime   = 0x00000000
    oscode  = 0x00
    verdict = HDR_CAND_GOOD

    # Seek to the candidate; a position mismatch disqualifies it.
    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    flags = 0x0
    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        # RFC1952 §2.3.1: XLEN is a little-endian uint16 followed by XLEN
        # bytes of extra data. Guard against EOF inside either part (the
        # unguarded 2-byte read previously let struct.error escape).
        raw_xlen = os.read (fd, 2)
        if len (raw_xlen) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        xlen = struct.unpack ("<H", raw_xlen)[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # Bugfix: the comment used to be read into *fname*, clobbering any
        # file name parsed above. Read it into its own variable and discard.
        comment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                encoding="iso-8859-1")
        if comment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }
3627
3628
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    inflator = zlib.decompressobj (-zlib.MAX_WBITS)
    cursor   = off
    out_len  = 0 # size of decompressed data

    os.lseek (ifd, cursor, os.SEEK_SET)
    while True:
        chunk = os.read (ifd, BUFSIZE)
        cursor += len (chunk)
        try:
            piece = inflator.decompress (chunk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break # fishy
        out_len += len (piece)
        if inflator.eof is True:
            break
        if len (chunk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return out_len, cursor - off
3659
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :param ifd:   readable file descriptor of the archive being scanned
    :param cands: iterable of candidate byte offsets into *ifd*
    :returns:     list of those offsets whose gzip header parses and whose
                  payload yields at least one byte of decompressible data

    Note: the original kept an ``nobj`` counter that was never read; it has
    been removed as dead code.
    """
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
            # compressed payload starts right after the variable-length header
            off0 = cand + hdr ["hlen"]
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:
                good.append (cand)

    return good
3679
3680
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as
    compressed data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_gz_objects_offsets (fd, locate_gz_hdr_candidates (fd))
    finally:
        os.close (fd) # always release the descriptor, even on error
3694
3695
65b35c42
PG
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as tar
    data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_tar_objects_offsets (fd)
    finally:
        os.close (fd) # always release the descriptor, even on error
3708
3709
d39d4cbf
PG
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
    """
    Open *fileobj* as a tar archive at *offset* and return its first member.

    :param secret: optional pair of (secret kind, secret value) used to set
                   up decryption; kind is one of the crypto module's
                   ``PDTCRYPT_SECRET_*`` constants
    :returns: the first object found at *offset*, or None if no valid
              archive could be opened there
    :raises RuntimeError: if the secret kind is not recognized
    """
    decr = None

    if secret is not None:
        kind = secret [0]
        if kind == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1])
        elif kind == crypto.PDTCRYPT_SECRET_KEY:
            # key is transported hex-encoded
            decr = crypto.Decrypt (key=binascii.unhexlify (secret [1]))
        else:
            raise RuntimeError

    try:
        tarobj = TarFile.open_at_offset (offset,
                                         mode=mode,
                                         fileobj=fileobj,
                                         format=GNU_FORMAT,
                                         concat='#' in mode,
                                         encryption=decr,
                                         save_to_members=False,
                                         tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None

    return tarobj.next ()
3738
3739
2d50b7f7
PG
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.

    Fields without a counterpart in TarInfo (inode) and fields only the
    caller can know (offset) are filled with neutral zeroes.

    :returns: a dict with the keys *inode*, *uid*, *gid*, *path*, *offset*,
              *volume*, *mode*, *ctime*, *mtime*, *size* and *type*.
    """
    return dict \
        ( inode  = 0                     # ignored when reading the index
        , uid    = tarinfo.uid
        , gid    = tarinfo.gid
        , path   = tarinfo.name          # keeping URI scheme
        , offset = 0                     # to be added by the caller
        , volume = tarinfo.volume_offset
        , mode   = tarinfo.mode
        , ctime  = tarinfo.mtime         # TarInfo carries no ctime; reuse mtime
        , mtime  = tarinfo.mtime
        , size   = tarinfo.size
        , type   = tarinfo.type
        )
3775
3776
27ee4dd4
PG
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Reconstruct a pseudo-index for a (possibly multi-volume) archive by
    locating object offsets in each volume and reading the tar members
    found there.

    :param gen_volume_name: callable mapping a volume number to its path
    :param mode:            deltatar mode string; "#" for plain concat tar,
                            "#gz" for gzipped concat (ignored for the
                            offset scan when a secret is given)
    :param maxvol:          optional highest volume number to try; volumes
                            missing below this number are skipped instead
                            of ending the scan
    :param password:        optional encryption password
    :param key:             optional hex-encoded encryption key
    :returns:  list of index-entry dicts (cf. *idxent_of_tarinfo*) with
               *offset* and *volume* filled in
    :raises TarError: for unencrypted modes other than "#" and "#gz"
    """
    infos  = []
    secret = crypto.make_secret (password=password, key=key)
    nvol   = 0

    while True:
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError:
            # volume does not exist
            if maxvol is not None and nvol < maxvol:
                # explicit volume number specified, ignore missing ones;
                # advance the counter or we would retry the same path forever
                nvol += 1
                continue
            break

        fileobj = bltn_open (vpath, "rb")
        try:
            def collect (acc, off):
                # keep only offsets at which a member could actually be read
                obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret)
                if obj is not None:
                    acc.append ((off, nvol, obj))
                return acc
            infos += functools.reduce (collect, offsets, [])
        finally:
            fileobj.close () # don't leak the handle if a read blows up

        nvol += 1

    def entry (off, vol, ti):
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = off
        ie ["volume"] = vol
        return ie

    return [ entry (o, v, ti) for o, v, ti in infos ]
7584f5c9
ERE
3825
3826#--------------------
3827# exported functions
3828#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # ``open`` is the module-level TarFile.open rebinding, not the builtin
        open(name).close()
    except TarError:
        return False
    return True
3839
# Preserve the builtin ``open`` before shadowing it: from here on the
# module-level name ``open`` is the TarFile constructor (mirroring the
# stdlib tarfile module), while ``bltn_open`` keeps plain file opening
# available to the rest of this module.
bltn_open = open
open = TarFile.open