enable strict IV checking by default during decryption
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
"""Read from and write to tar format archives.
"""

# VCS keyword metadata (expanded by the original CVS/SVN checkout).
__version__ = "$Revision: 85213 $"
# $Source$

version     = "0.9.0"
__author__  = "Lars Gustäbel (lars@gustaebel.de)"
__date__    = "$Date$"
__cvsid__   = "$Id$"
__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
5bd2d4b5 47import functools
be60ffd0 48import io
dfd7865e
PG
49import mmap
50import operator
51import os
52import re
7584f5c9
ERE
53import shutil
54import stat
7584f5c9 55import struct
dfd7865e
PG
56import sys
57import time
7584f5c9 58
c7c736b6
PG
59import traceback # XXX
60
8ab8fac5 61from . import crypto
6e812ad9 62
7584f5c9
ERE
63try:
64 import grp, pwd
65except ImportError:
66 grp = pwd = None
67
be60ffd0
ERE
# Exceptions that may be raised when symlink creation is unsupported.
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass
76
7584f5c9
ERE
# Public API re-exported by "from tarfile import *"
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79
be60ffd0
ERE
80from builtins import open as _open # Since 'open' is TarFile.open
81
7584f5c9
ERE
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

# gzip member header layout (RFC 1952): magic[2], method, flags,
# mtime[4], extra deflate flags, OS code -- 10 bytes total.
GZ_FMT_HEADER = b"<BBBBLBB"
GZ_HEADER_SIZE = 10             # not including the name
GZ_MAGIC = (0x1f, 0x8b)         # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08        # 0o10
GZ_FLAG_FTEXT = 1 << 0          # ASCII payload
GZ_FLAG_FHCRC = 1 << 1          # CRC16
GZ_FLAG_FEXTRA = 1 << 2         # extra field
GZ_FLAG_FNAME = 1 << 3          # set by default in gzip
GZ_FLAG_FCOMMENT = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03               # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                GZ_METHOD_DEFLATE)

# How to react to defective archives when reading.
TOLERANCE_STRICT = 0
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE = 2 # deduce metadata from archive contents

BUFSIZE = 16 * 1024             # default chunk size for copy operations

#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2

def arcmode_fmt (m):
    """Render the archive-mode bitmask *m* as a human-readable string."""
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = []
    for bit, label in ((ARCMODE_ENCRYPT, "ENCRYPT"),
                       (ARCMODE_COMPRESS, "COMPRESS"),
                       (ARCMODE_CONCAT, "CONCAT")):
        if m & bit:
            labels.append (label)
    if not labels:
        # non-zero mask with none of the known bits set
        return "[ ]"
    return "[ " + " | ".join (labels) + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Derive an archive-mode bitmask from the given stream options."""
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------

# On Windows the reported filesystem encoding is not suitable for
# member names; fall back to UTF-8 there.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
7584f5c9
ERE
224
225#---------------------------------------------------------
226# Some useful functions
227#---------------------------------------------------------
228
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Convert a string to a NUL-padded bytes object of exactly *length*
    bytes, truncating when the encoded form is longer.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
234
be60ffd0
ERE
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string, dropping the
    terminator and anything following it.
    """
    head, _sep, _rest = s.partition(b"\0")
    return head.decode(encoding, errors)
242
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a NUL-padded bytes object
    of exactly *length* bytes.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length].ljust(length, NUL)
7584f5c9
ERE
250
def nti(s):
    """Convert a tar number field to a Python int.

    Two encodings are possible (see itn()): GNU base-256, flagged by a
    leading 0o200 (positive) or 0o377 (negative) byte, and plain
    NUL-terminated octal digits. Raises InvalidHeaderError when the
    octal form cannot be parsed.
    """
    if s[0] in (0o200, 0o377):
        # GNU base-256: remaining bytes are a big-endian magnitude.
        n = 0
        for byte in s[1:]:
            n = (n << 8) | byte
        if s[0] == 0o377:
            # two's-complement style negative value
            n -= 256 ** (len(s) - 1)
        return n
    try:
        # Octal digits, possibly NUL-terminated; empty field counts as 0.
        text = s.split(b"\0", 1)[0].decode("ascii", "strict")
        return int(text or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")
269
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a Python number to a tar number field.

    POSIX 1003.1-1988 requires numbers to be encoded as NUL-terminated
    octal digit strings, which limits values to (8**(digits-1))-1. For
    larger (or negative) values GNU tar uses a base-256 encoding: a
    leading 0o200 (positive) or 0o377 (negative) marker byte followed
    by digits-1 big-endian bytes, covering +/-(256**(digits-1))-1.
    Raises ValueError when the value fits neither representation.
    """
    if 0 <= n < 8 ** (digits - 1):
        # Plain octal: digits-1 octal characters plus terminating NUL.
        return bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL

    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            marker = 0o200
        else:
            marker = 0o377
            n += 256 ** digits          # two's-complement adjustment

        field = bytearray([marker])
        # Append the low digits-1 bytes of n, most significant first.
        for shift in range(8 * (digits - 2), -8, -8):
            field.append((n >> shift) & 0o377)
        return field

    raise ValueError("overflow in number field")
7584f5c9
ERE
297
def calc_chksums(buf):
    """Calculate unsigned and signed checksums over a 512-byte header.

    The 8-byte chksum field (offsets 148-155) is treated as if it were
    filled with spaces; 8 * 0x20 = 256, hence the constant added to each
    sum. According to the GNU tar sources, some historic tars (Sun and
    NeXT) summed signed chars, so both variants are returned for
    comparison against the stored value.
    """
    unsigned_sum = sum(struct.unpack_from("148B8x356B", buf))
    signed_sum = sum(struct.unpack_from("148b8x356b", buf))
    return 256 + unsigned_sum, 256 + signed_sum
310
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*; the
    entire content when *length* is None.

    Raises OSError if *src* runs out of data before *length* bytes
    could be read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    full_blocks, remainder = divmod(length, BUFSIZE)
    chunk_sizes = [BUFSIZE] * full_blocks
    if remainder:
        chunk_sizes.append(remainder)
    for wanted in chunk_sizes:
        chunk = src.read(wanted)
        dst.write(chunk)
        if len(chunk) < wanted:
            raise OSError("end of file reached")

c7c736b6 332
7584f5c9 333
def filemode(mode):
    """Return an ls-style mode string for *mode*.

    Deprecated in this location -- stat.filemode() is the canonical
    implementation and is called directly.
    """
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
340
class TarError(Exception):
    """Base exception for all tarfile errors."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end-of-file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Exception for errors during decryption."""

class EncryptionError(TarError):
    """Exception for errors during encryption."""

class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
7584f5c9
ERE
386
387#---------------------------
388# internal stream interface
389#---------------------------
390class _LowLevelFile:
391 """Low-level file object. Supports reading and writing.
392 It is used instead of a regular file object for streaming
393 access.
394 """
395
396 def __init__(self, name, mode):
ad4402e8 397 _mode = {
7584f5c9 398 "r": os.O_RDONLY,
c7c736b6 399 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
7584f5c9
ERE
400 }[mode]
401 if hasattr(os, "O_BINARY"):
f0287fb7 402 _mode |= os.O_BINARY # pylint: disable=no-member
be60ffd0 403 self.fd = os.open(name, _mode, 0o666)
ad4402e8 404 self.offset = 0
7584f5c9
ERE
405
406 def close(self):
407 os.close(self.fd)
408
409 def read(self, size):
ad4402e8
ERE
410 ret = os.read(self.fd, size)
411 self.offset += len(ret)
412 return ret
7584f5c9 413
867f75f7
PG
414 def write(self, s, pos=None):
415 if pos is not None:
416 p0 = self.offset
417 os.lseek (self.fd, pos, os.SEEK_SET)
418 n = os.write(self.fd, s)
419 if pos is None:
420 self.offset += len(s)
421 else:
422 append = pos + n - p0
423 if append > 0:
424 self.offset += append
425 os.lseek (self.fd, p0, os.SEEK_SET)
7584f5c9 426
ad4402e8
ERE
427 def tell(self):
428 return self.offset
429
c7c736b6
PG
430 def seek_set (self, pos):
431 os.lseek (self.fd, pos, os.SEEK_SET)
432 self.offset = pos
433
8ab8fac5 434
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952).

    When *name* is given it is stored in the FNAME field (ISO-8859-1
    encoded, NUL-terminated) after stripping any ".pdtcrypt" and ".gz"
    suffixes; otherwise the FNAME flag stays clear.
    """
    flags = 0x0
    mtime = int(time.time())

    if name is None:
        fname = b""
    else:
        flags |= GZ_FLAG_FNAME
        fname = name
        if type(fname) is str:
            # RFC1952 says we must use ISO-8859-1 for the FNAME field.
            fname = fname.encode("iso-8859-1", "replace")
        for suffix in (b".pdtcrypt", b".gz"):
            if fname.endswith(suffix):
                fname = fname[:-len(suffix)]
        fname += NUL

    fixed = struct.pack (GZ_FMT_HEADER,
                         GZ_MAGIC [0], GZ_MAGIC [1],
                         GZ_METHOD_DEFLATE, flags,
                         mtime,
                         GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return fixed + fname
459
d601d33b 460
7584f5c9
ERE
461class _Stream:
462 """Class that serves as an adapter between TarFile and
463 a stream-like object. The stream-like object only
464 needs to have a read() or write() method and is accessed
465 blockwise. Use of gzip or bzip2 compression is possible.
466 A stream-like object could be for example: sys.stdin,
467 sys.stdout, a socket, a tape device etc.
468
3031b7ae
PG
469 _Stream is intended to be used only internally but is
470 nevertherless used externally by Deltatar.
471
472 When encrypting, the ``enccounter`` will be used for
473 initializing the first cryptographic context. When
474 decrypting, its value will be compared to the decrypted
475 object. Decryption fails if the value does not match.
476 In effect, this means that a ``_Stream`` whose ctor was
477 passed ``enccounter`` can only be used to encrypt or
478 decrypt a single object.
7584f5c9
ERE
479 """
480
c7c736b6 481 remainder = -1 # track size in encrypted entries
04f4c7ab 482 tolerance = TOLERANCE_STRICT
c7c736b6 483
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        name          -- file name; used to open a _LowLevelFile when no
                         *fileobj* is given and embedded in gzip headers
        mode          -- "r" or "w"
        comptype      -- "tar", "gz", "bz2", "xz", or "*" for transparent
                         detection via _StreamProxy
        fileobj       -- underlying stream-like object, created from
                         *name* when None
        bufsize       -- blocking factor for buffered writes
        concat        -- enable concatenated-object mode (see arcmode_set)
        encryption    -- crypto context, or None for plaintext archives
        enccounter    -- IV counter for the single object handled by this
                         stream (see class docstring)
        compresslevel -- zlib compression level ("gz" only)
        tolerance     -- TOLERANCE_{STRICT,RECOVER,RESCUE} for reading
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None                 # compressor / decompressor object
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""                  # pending output (write side)
        self.pos = 0                    # logical (uncompressed) position
        self.concat_pos = 0             # uncompressed size of current member
        self.closed = False
        self.flags = 0                  # gzip header flags (read side)
        self.last_block_offset = 0      # physical offset of last object start
        self.dbuf = b""                 # buffered decompressed data (read side)
        self.exception = None           # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0          # physical bytes committed to fileobj
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None             # offset of pending encryption header

        if encryption is not None:
            # IV succession only holds within one stream
            encryption.reset_last_iv ()

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode the first member is started lazily by
                    # next(); otherwise emit the leading headers right away
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                    self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed tar: only the encryption header (if any)
                # needs to be emitted up front
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # close the fd we opened ourselves, then re-raise
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 596
7584f5c9
ERE
597 def __del__(self):
598 if hasattr(self, "closed") and not self.closed:
fac2cfe1
PG
599 try:
600 self.close()
601 except crypto.InternalError:
602 # context already finalized due to abort but close() tried
603 # to use it
604 pass
7584f5c9 605
c7c736b6 606
d1c38f40
PG
    def next (self, name):
        """Begin a new archive object named *name* within the same stream.

        Finalizes the current compression/encryption object (if any),
        starts a fresh one, and returns the physical offset at which the
        new object begins.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            # no wrapper emits its own leading header here, so the new
            # object starts at the current physical offset
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # with encryption active the crypto header -- not the gzip
            # header -- marks the start of the block
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
621
622
    def next_volume (self, name):
        """Re-emit the leading encryption/compression headers on a newly
        started volume in concat mode.
        """
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
634
c7c736b6 635
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Begin a new encrypted object named *entry*: save the current
        position for the delayed header write and fill the header
        location with dummy bytes (patched later by
        _finalize_write_encrypt()).
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
652
653
    def _finalize_write_encrypt (self):
        """
        Seek back to the saved header position, read the dummy bytes
        written by _init_write_encrypt(), finalize the crypto context to
        obtain the actual header plus any trailing ciphertext, patch the
        header in place, and append the remaining data at the current
        position.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            # overwrite the placeholder header in place ...
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
674
675
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and append the gzip member trailer:
        little-endian CRC32 and uncompressed size of the current member.
        """
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
            self.buf = b""
57db1546
PG
691
692
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one: reset the per-member CRC
        and size counters, create a fresh raw-deflate compressor and emit
        a new gzip header (carrying the archive name only for the very
        first member).
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 711
ac5e4184 712
7584f5c9
ERE
713 def write(self, s):
714 """Write string s to the stream.
715 """
716 if self.comptype == "gz":
c2ffe2ec 717 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
7584f5c9 718 self.pos += len(s)
5fdff89f 719 self.concat_pos += len(s)
53732900 720 if self.cmp is not None:
7584f5c9
ERE
721 s = self.cmp.compress(s)
722 self.__write(s)
723
    def __sync(self):
        """Write what's left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
729
7584f5c9 730 def __write(self, s):
548bb8d5
CH
731 """Writes (and encodes) string s to the stream blockwise
732
733 will wait with encoding/writing until block is complete
7584f5c9
ERE
734 """
735 self.buf += s
736 while len(self.buf) > self.bufsize:
6e812ad9 737 self.__enc_write(self.buf[:self.bufsize])
7584f5c9
ERE
738 self.buf = self.buf[self.bufsize:]
739
867f75f7 740
5f38bff6 741 def __write_to_file(self, s, pos=None):
6e812ad9 742 '''
5f38bff6 743 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
cb7a3911 744 given, the stream will seek to that position first and back afterwards,
5f38bff6 745 and the total of bytes written is not updated.
6e812ad9 746 '''
867f75f7 747 self.fileobj.write(s, pos)
5f38bff6
PG
748 if pos is None:
749 self.bytes_written += len(s)
867f75f7 750
6e812ad9
DGM
751
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file; when the per-object ciphertext size limit is reached,
        a new encrypted object is started transparently.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                # process() consumes at most n bytes of the plaintext
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
773
6e812ad9 774
784175ba
CH
775 def estim_file_size(self):
776 """ estimates size of file if closing it now
777
778 The result may differ greatly from the amount of data sent to write()
779 due to compression, encryption and buffering.
780
781 In tests the result (before calling close()) was up to 12k smaller than
782 the final file size if compression is being used because zlib/bz2
783 compressors do not allow inspection of their buffered data :-(
784
ba5a449e
CH
785 Still, we add what close() would add: 8 bytes for gz checksum, one
786 encryption block size if encryption is used and the size of our own
787 buffer
784175ba
CH
788 """
789 if self.closed:
790 return self.bytes_written
791
792 result = self.bytes_written
793 if self.buf:
794 result += len(self.buf)
795 if self.comptype == 'gz':
ba5a449e 796 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
797 return result
798
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        With close_fileobj=False nothing is flushed or closed; only the
        gzip CRC trailer of a compressed read stream is verified.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)    # gzip ISIZE field
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
                # NOTE(review): read_length (ISIZE) is consumed but never
                # compared against the actual member size.
        self.closed = True
830
54128a00 831
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses the member header (RFC 1952) and skips the optional
        FEXTRA, FNAME, FCOMMENT and FHCRC fields so that raw deflate
        data is next on the stream. Raises EndOfFile on a clean EOF,
        ReadError when the magic does not match, and CompressionError
        for members that are not deflate-compressed.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
                             "gzip header at pos %d" % self.fileobj.tell())
        if ord (read1) != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # NOTE(review): this drains the extra field through the
            # decompressing read() rather than the raw __read() --
            # confirm whether FEXTRA-bearing members are handled at all.
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # skip the NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # skip the NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            self.__read(2)  # discard the 16-bit header CRC
870
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False when the archive ended cleanly at the expected
        header position, True otherwise. Raises DecryptionError for
        malformed headers or parameters, and -- when the ctor was given
        an explicit ``enccounter`` -- for IV counter mismatches.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
903
904
8de91f4f
PG
905 def _read_encrypt (self, buf):
906 """
907 Demote a program error to a decryption error in tolerant mode. This
908 allows recovery from corrupted headers and invalid data.
909 """
910 try:
911 return self.encryption.process (buf)
912 except RuntimeError as exn:
04f4c7ab 913 if self.tolerance != TOLERANCE_STRICT:
8de91f4f
PG
914 raise DecryptionError (exn)
915 raise
916
917
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Completes the current crypto object and returns any trailing
        plaintext produced by finalization.  Raises DecryptionError when the
        GCM tag check fails.  NOTE(review): when encryption is not active or
        no header has been read yet, the function falls through and
        implicitly returns None — callers appear to only invoke it at the
        end of a crypto object, so the bytes return path is the normal one.
        """
        if      self.arcmode & ARCMODE_ENCRYPT \
            and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # leftover ciphertext of the current object is abandoned
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                # authentication failure: ciphertext or tag was tampered
                # with / corrupted
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
932
933
7584f5c9
ERE
934 def tell(self):
935 """Return the stream's file pointer position.
936 """
937 return self.pos
938
939 def seek(self, pos=0):
940 """Set the stream's file pointer to pos. Negative seeking
941 is forbidden.
942 """
b750b280
PG
943 if pos == self.pos:
944 pass # nothing to do
945 elif pos - self.pos >= 0:
7584f5c9 946 blocks, remainder = divmod(pos - self.pos, self.bufsize)
b750b280
PG
947 if self.encryption is not None:
948 # IV succession is only preserved between successive objects.
949 self.encryption.reset_last_iv ()
be60ffd0 950 for i in range(blocks):
7584f5c9
ERE
951 self.read(self.bufsize)
952 self.read(remainder)
953 else:
954 raise StreamError("seeking backwards is not allowed")
955 return self.pos
956
957 def read(self, size=None):
958 """Return the next size number of bytes from the stream.
959 If size is not defined, return all bytes of the stream
960 up to EOF.
961 """
962 if size is None:
963 t = []
964 while True:
965 buf = self._read(self.bufsize)
966 if not buf:
967 break
968 t.append(buf)
9dc7ac5c 969 buf = b"".join(t)
7584f5c9
ERE
970 else:
971 buf = self._read(size)
972 self.pos += len(buf)
973 return buf
974
3a7e1a50
ERE
975 def readline(self):
976 """Reads just one line, new line character included
977 """
f0fd5e3a 978 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
979 if b'\n' in self.dbuf:
980 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
981 ret = self.dbuf[:pos]
982 self.dbuf = self.dbuf[pos:]
983 return ret
984
1215b602 985 buf = []
3a7e1a50
ERE
986 while True:
987 chunk = self._read(self.bufsize)
988
f0fd5e3a 989 # nothing more to read, so return the buffer
3a7e1a50 990 if not chunk:
be60ffd0 991 return b''.join(buf)
3a7e1a50
ERE
992
993 buf.append(chunk)
f0fd5e3a
ERE
994
995 # if \n found, return the new line
be60ffd0
ERE
996 if b'\n' in chunk:
997 dbuf = b''.join(buf)
998 pos = dbuf.index(b'\n') + 1
1215b602 999 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
1000 return dbuf[:pos]
1001
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Pulls raw (possibly encrypted) data via self.__read and, when a
        compressor is configured, decompresses it.  Handles concatenated
        (ARCMODE_CONCAT) gzip objects by re-initializing the decompressor
        whenever unused trailing data remains, honoring the configured
        tolerance level on errors.  Leftover decompressed bytes are kept in
        self.dbuf for the next call.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                # track the running CRC of the decompressed payload, but
                # only while compressing archives that carry one
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                    and len(self.cmp.unused_data) != 0:
                    # data past the end of the gzip member: push it back and
                    # start a fresh member
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            # rescue mode: salvage what we have and stop
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    # fresh member: restart the CRC and reopen the stream
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # anything beyond the requested size is buffered for the next call
        self.dbuf = t[size:]
        return t[:size]
1059
e4e5d0b8 1060
    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        # index into t up to which all crypto objects finalized cleanly;
        # used by TOLERANCE_RECOVER below to discard unverified data
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail if
                        # the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                # discard the half-processed crypto state before recovering
                self.encryption.drop ()
                if self.tolerance == TOLERANCE_RECOVER:
                    if good_crypto == 0:
                        raise
                    # this may occur at any of the three crypto operations above.
                    # some objects did validate; discard all data after it; next
                    # call will start with the bad object and error out immediately
                    self.buf = b"".join (t [good_crypto:])
                    return b"".join (t [:good_crypto])
                elif self.tolerance == TOLERANCE_RESCUE:
                    # keep what we have so far despite the finalization issue
                    t.append (buf)
                    c += len (buf)
                    break
                else:
                    raise RuntimeError("internal error: bad tolerance level")

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # keep the surplus for the next call
        self.buf = t[size:]

        return t[:size]
7d372216 1129
7584f5c9
ERE
1130
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').

    The first block of the wrapped file object is buffered so its magic
    bytes can be inspected before any data is consumed.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # First call returns the buffered block; afterwards this method is
        # shadowed by the underlying object's read.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression scheme from the buffered magic bytes."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1157
7584f5c9
ERE
1158#------------------------
1159# Extraction file object
1160#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.

    Supports sparse members: *blockinfo* lists (offset, size) pairs of
    data blocks; gaps between them read back as NUL bytes.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset          # start of this member inside fileobj
        self.size = size              # logical size of the member
        self.position = 0             # current logical read position
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            # dense member: one contiguous data block
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry is (is_data, logical_start, logical_stop, real_offset);
        # real_offset is None for zero-filled (hole) regions.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

        Positions are clamped into [0, size]; an unknown *whence* raises
        ValueError.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.

        Reads at most *size* bytes (all remaining bytes when size is None),
        synthesizing NUL bytes for sparse holes.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # find the map entry covering the current position; the index
            # wraps around because reads may seek backwards between calls
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                # real data: position the underlying file and copy
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                # hole: produce zeros without touching the file
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
#class _FileInFile
7584f5c9 1261
be60ffd0
ERE
1262
class ExFileObject(io.BufferedReader):
    """File-like object used to read an archive member's data.

    Wraps a _FileInFile view of the member's (possibly sparse) data range
    in a buffered reader.
    """

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject
1270
1271#------------------
1272# Exported Classes
1273#------------------
1274class TarInfo(object):
1275 """Informational class which holds the details about an
1276 archive member given by a tar header block.
1277 TarInfo objects are returned by TarFile.getmember(),
1278 TarFile.getmembers() and TarFile.gettarinfo() and are
1279 usually created internally.
1280 """
1281
be60ffd0
ERE
1282 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1283 "chksum", "type", "linkname", "uname", "gname",
1284 "devmajor", "devminor", "volume_offset",
1285 "offset", "offset_data", "pax_headers", "sparse",
1286 "tarfile", "_sparse_structs", "_link_target")
1287
7584f5c9
ERE
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.

        All fields start out with neutral defaults describing an empty
        regular file owned by root.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1313
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        # accessor backing the ``path`` property (pax alias of ``name``)
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)
1321
    def _getlinkpath(self):
        # accessor backing the ``linkpath`` property (pax alias of
        # ``linkname``)
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
1327
1328 def __repr__(self):
1329 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1330
be60ffd0 1331 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1332 """Return the TarInfo's attributes as a dictionary.
1333 """
1334 info = {
1335 "name": self.name,
be60ffd0 1336 "mode": self.mode & 0o7777,
7584f5c9
ERE
1337 "uid": self.uid,
1338 "gid": self.gid,
1339 "size": self.size,
1340 "mtime": self.mtime,
1341 "chksum": self.chksum,
1342 "type": self.type,
1343 "linkname": self.linkname,
1344 "uname": self.uname,
1345 "gname": self.gname,
1346 "devmajor": self.devmajor,
36a315a0 1347 "devminor": self.devminor,
0eb5048f
ERE
1348 "offset_data": self.offset_data,
1349 "volume_offset": self.volume_offset
7584f5c9
ERE
1350 }
1351
1352 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1353 info["name"] += "/"
1354
7584f5c9
ERE
1355 return info
1356
be60ffd0
ERE
1357 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1358 errors="surrogateescape"):
7584f5c9
ERE
1359 """Return a tar header as a string of 512 byte blocks.
1360 """
1361 info = self.get_info(encoding, errors)
1362
1363 if format == USTAR_FORMAT:
be60ffd0 1364 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1365 elif format == GNU_FORMAT:
be60ffd0 1366 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1367 elif format == PAX_FORMAT:
1368 return self.create_pax_header(info, encoding, errors)
1369 else:
1370 raise ValueError("invalid format")
1371
be60ffd0 1372 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1373 """Return the object as a ustar header block.
1374 """
1375 info["magic"] = POSIX_MAGIC
1376
1377 if len(info["linkname"]) > LENGTH_LINK:
1378 raise ValueError("linkname is too long")
1379
1380 if len(info["name"]) > LENGTH_NAME:
1381 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1382
be60ffd0 1383 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1384
    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.

        For multivolume members, the otherwise-unused prefix area carries
        atime, ctime and the volume offset, and the recorded size is
        reduced to the part stored in this volume.  Overlong names and link
        names are emitted as GNU long-name/long-link pseudo members before
        the real header.
        """
        info["magic"] = GNU_MAGIC

        if self.ismultivol():
            # pack atime/ctime/volume_offset into the prefix field as
            # three 12-byte numeric fields, padding the rest with zeros
            prefix = [
                itn(info.get("atime", 0), 12, GNU_FORMAT),
                itn(info.get("ctime", 0), 12, GNU_FORMAT),
                itn(self.volume_offset, 12, GNU_FORMAT),
                itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
            ]
            info['prefix'] = b"".join(prefix)
            # only the data stored in this volume counts toward size
            info['size'] = info['size'] - self.volume_offset

        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"],
                GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                                encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1410
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # only the data stored in this volume counts toward size
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # non-ASCII value: move it into the extended header
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                # overlong value: move it into the extended header
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                # value does not fit the octal field (or is a float):
                # store the decimal string in the extended header instead
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # the ustar header that follows the pax records uses plain ASCII
        # with replacement, since real values live in the pax header
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1463
1464 @classmethod
1465 def create_pax_global_header(cls, pax_headers):
1466 """Return the object as a pax global header block sequence.
1467 """
be60ffd0 1468 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1469
1470 def _posix_split_name(self, name):
1471 """Split a name longer than 100 chars into a prefix
1472 and a name part.
1473 """
1474 prefix = name[:LENGTH_PREFIX + 1]
1475 while prefix and prefix[-1] != "/":
1476 prefix = prefix[:-1]
1477
1478 name = name[len(prefix):]
1479 prefix = prefix[:-1]
1480
1481 if not prefix or len(name) > LENGTH_NAME:
1482 raise ValueError("name is too long")
1483 return prefix, name
1484
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.

        The checksum is computed over the block with the checksum field
        set to eight spaces, then spliced back in as a 6-digit octal
        number followed by NUL (the tar convention).
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # the checksum field occupies bytes 148..155 of the 512-byte block:
        # 512-364 == 148 and 512-357 == 155, so this writes 7 bytes
        # ("%06o\0") into the field, leaving the trailing space intact
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1512
1513 @staticmethod
1514 def _create_payload(payload):
1515 """Return the string payload filled with zero bytes
1516 up to the next 512 byte border.
1517 """
1518 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1519 if remainder > 0:
1520 payload += (BLOCKSIZE - remainder) * NUL
1521 return payload
1522
1523 @classmethod
be60ffd0 1524 def _create_gnu_long_header(cls, name, type, encoding, errors):
7584f5c9
ERE
1525 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1526 for name.
1527 """
be60ffd0 1528 name = name.encode(encoding, errors) + NUL
7584f5c9
ERE
1529
1530 info = {}
1531 info["name"] = "././@LongLink"
1532 info["type"] = type
1533 info["size"] = len(name)
1534 info["magic"] = GNU_MAGIC
1535
1536 # create extended header + name blocks.
be60ffd0 1537 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
7584f5c9
ERE
1538 cls._create_payload(name)
1539
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "%d %s=%s\n" where the leading number is the
            # total record length *including itself*; iterate until the
            # length of the length field stabilizes.
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
               cls._create_payload(records)
1590
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on how the block is malformed.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            # an all-zero block marks the end of the archive
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # slice the fixed-offset ustar fields out of the block
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # GNU/multivolume members reuse the prefix area; bytes 369..381
            # are the third 12-byte numeric field written there by
            # create_gnu_header (the volume data offset)
            obj.offset_data = nti(buf[369:381])
        return obj
1655
1656 @classmethod
1657 def fromtarfile(cls, tarfile):
1658 """Return the next TarInfo object from TarFile object
1659 tarfile.
1660 """
1661 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1662 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1663 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1664 return obj._proc_member(tarfile)
1665
1666 #--------------------------------------------------------------------------
1667 # The following are methods that are called depending on the type of a
1668 # member. The entry point is _proc_member() which can be overridden in a
1669 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1670 # implement the following
1671 # operations:
1672 # 1. Set self.offset_data to the position where the data blocks begin,
1673 # if there is data that follows.
1674 # 2. Set tarfile.offset to the position where the next member's header will
1675 # begin.
1676 # 3. Return self or another valid TarInfo object.
1677 def _proc_member(self, tarfile):
1678 """Choose the right processing method depending on
1679 the type and call it.
1680 """
1681 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1682 return self._proc_gnulong(tarfile)
1683 elif self.type == GNUTYPE_SPARSE:
1684 return self._proc_sparse(tarfile)
1685 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1686 return self._proc_pax(tarfile)
1687 else:
1688 return self._proc_builtin(tarfile)
1689
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.

        Records where the member's data begins and advances
        tarfile.offset past any data blocks that follow.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
1706
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.

        Reads the long-name payload, then parses the member header that
        follows it and patches that member's name or linkname.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next
1728
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.

        Extends the sparse map collected in frombuf() with any extended
        sparse header blocks, then restores the member's original size.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # each extension block carries up to 21 (offset, numbytes) pairs
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # byte 504 flags whether yet another extension block follows
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # size reported in the header is the on-disk (compacted) size;
        # expose the original logical size instead
        self.size = origsize
        return self
1756
1757 def _proc_pax(self, tarfile):
1758 """Process an extended or global header as described in
be60ffd0 1759 POSIX.1-2008.
7584f5c9
ERE
1760 """
1761 # Read the header information.
1762 buf = tarfile.fileobj.read(self._block(self.size))
1763
1764 # A pax header stores supplemental information for either
1765 # the following file (extended) or all following files
1766 # (global).
1767 if self.type == XGLTYPE:
1768 pax_headers = tarfile.pax_headers
1769 else:
1770 pax_headers = tarfile.pax_headers.copy()
1771
be60ffd0
ERE
1772 # Check if the pax header contains a hdrcharset field. This tells us
1773 # the encoding of the path, linkpath, uname and gname fields. Normally,
1774 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1775 # implementations are allowed to store them as raw binary strings if
1776 # the translation to UTF-8 fails.
1777 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1778 if match is not None:
1779 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1780
1781 # For the time being, we don't care about anything other than "BINARY".
1782 # The only other value that is currently allowed by the standard is
1783 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1784 hdrcharset = pax_headers.get("hdrcharset")
1785 if hdrcharset == "BINARY":
1786 encoding = tarfile.encoding
1787 else:
1788 encoding = "utf-8"
1789
7584f5c9
ERE
1790 # Parse pax header information. A record looks like that:
1791 # "%d %s=%s\n" % (length, keyword, value). length is the size
1792 # of the complete record including the length field itself and
1793 # the newline. keyword and value are both UTF-8 encoded strings.
be60ffd0 1794 regex = re.compile(br"(\d+) ([^=]+)=")
7584f5c9
ERE
1795 pos = 0
1796 while True:
1797 match = regex.match(buf, pos)
1798 if not match:
1799 break
1800
1801 length, keyword = match.groups()
1802 length = int(length)
1803 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1804
be60ffd0
ERE
1805 # Normally, we could just use "utf-8" as the encoding and "strict"
1806 # as the error handler, but we better not take the risk. For
1807 # example, GNU tar <= 1.23 is known to store filenames it cannot
1808 # translate to UTF-8 as raw strings (unfortunately without a
1809 # hdrcharset=BINARY header).
1810 # We first try the strict standard encoding, and if that fails we
1811 # fall back on the user's encoding and error handler.
1812 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1813 tarfile.errors)
1814 if keyword in PAX_NAME_FIELDS:
1815 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1816 tarfile.errors)
1817 else:
1818 value = self._decode_pax_field(value, "utf-8", "utf-8",
1819 tarfile.errors)
7584f5c9
ERE
1820
1821 pax_headers[keyword] = value
1822 pos += length
1823
36a315a0 1824
7584f5c9
ERE
1825 # Fetch the next header.
1826 try:
1827 next = self.fromtarfile(tarfile)
1828 except HeaderError:
1829 raise SubsequentHeaderError("missing or bad subsequent header")
1830
be60ffd0
ERE
1831 # Process GNU sparse information.
1832 if "GNU.sparse.map" in pax_headers:
1833 # GNU extended sparse format version 0.1.
1834 self._proc_gnusparse_01(next, pax_headers)
1835
1836 elif "GNU.sparse.size" in pax_headers:
1837 # GNU extended sparse format version 0.0.
1838 self._proc_gnusparse_00(next, pax_headers, buf)
1839
1840 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1841 # GNU extended sparse format version 1.0.
1842 self._proc_gnusparse_10(next, pax_headers, tarfile)
1843
7584f5c9
ERE
1844 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1845 # Patch the TarInfo object with the extended header info.
1846 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1847 next.offset = self.offset
1848
1849 if "size" in pax_headers:
1850 # If the extended header replaces the size field,
1851 # we need to recalculate the offset where the next
1852 # header starts.
1853 offset = next.offset_data
1854 if next.isreg() or next.type not in SUPPORTED_TYPES:
1855 offset += next._block(next.size)
1856 tarfile.offset = offset
1857
c04e0751
ERE
1858 if next is not None:
1859 if "GNU.volume.filename" in pax_headers:
1860 if pax_headers["GNU.volume.filename"] == next.name:
1861 if "GNU.volume.size" in pax_headers:
1862 next.size = int(pax_headers["GNU.volume.size"])
1863 if "GNU.volume.offset" in pax_headers:
1864 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1865
1866 for key in pax_headers.keys():
1867 if key.startswith("GNU.volume"):
1868 del tarfile.pax_headers[key]
0eb5048f 1869
7584f5c9
ERE
1870 return next
1871
be60ffd0
ERE
1872 def _proc_gnusparse_00(self, next, pax_headers, buf):
1873 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1874 """
be60ffd0
ERE
1875 offsets = []
1876 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1877 offsets.append(int(match.group(1)))
1878 numbytes = []
1879 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1880 numbytes.append(int(match.group(1)))
1881 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1882
be60ffd0
ERE
1883 def _proc_gnusparse_01(self, next, pax_headers):
1884 """Process a GNU tar extended sparse header, version 0.1.
1885 """
1886 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1887 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1888
be60ffd0
ERE
    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

        In this format the sparse map is stored in the member's data area:
        a decimal entry count on the first line, followed by one decimal
        number per line (alternating offset / numbytes values).
        """
        fields = None
        sparse = []
        # Read the map block-wise; the first newline-terminated field is
        # the number of (offset, numbytes) pairs that follow.
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                # Current block exhausted mid-number: pull in the next one.
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        # File data starts right after the map blocks just consumed.
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1904
be60ffd0
ERE
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
        pax extended or global header.

        ``GNU.sparse.*`` keywords are mapped onto ``path``/``size``
        directly; any other keyword listed in PAX_FIELDS overwrites the
        corresponding TarInfo attribute.  A copy of the full header dict
        is kept on ``self.pax_headers``.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    # Numeric fields are converted by a per-keyword
                    # converter; malformed values fall back to 0 rather
                    # than aborting the whole header.
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")   # pylint: disable=no-member
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()
1927
be60ffd0
ERE
1928 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1929 """Decode a single field from a pax record.
1930 """
1931 try:
1932 return value.decode(encoding, "strict")
1933 except UnicodeDecodeError:
1934 return value.decode(fallback_encoding, fallback_errors)
1935
7584f5c9
ERE
1936 def _block(self, count):
1937 """Round up a byte count by BLOCKSIZE and return it,
1938 e.g. _block(834) => 1024.
1939 """
1940 blocks, remainder = divmod(count, BLOCKSIZE)
1941 if remainder:
1942 blocks += 1
1943 return blocks * BLOCKSIZE
1944
    def isreg(self):
        # Any kind of regular file (plain, contiguous, GNU sparse, ...).
        return self.type in REGULAR_TYPES
    def isfile(self):
        # Alias of isreg(), kept for API compatibility.
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        # A sparse map is attached by the header-parsing code only for
        # sparse members.
        return self.sparse is not None
    def isdev(self):
        # Character device, block device or FIFO.
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        # Member continued from (or into) another volume: explicit GNU
        # multivolume type, a nonzero continuation offset, or a
        # GNU.volume.offset pax record.
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1968# class TarInfo
1969
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.

    Class attributes below are defaults; most can be overridden per
    instance through the __init__ keyword arguments of the same name.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2014
7584f5c9
ERE
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        `max_volume_size' enables multivolume support; it must be at least
        3*BLOCKSIZE and requires a callable `new_volume_handler'.
        `concat' selects concat archive processing (see arcmode_set);
        `nacl' is stored on the instance for the encryption layer.
        `save_to_members' controls whether newly added members are kept in
        self.members (disable to reduce memory usage on huge archives).
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any failure, release the file handle we own before
            # propagating so callers don't leak it.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2129
7584f5c9
ERE
2130 #--------------------------------------------------------------------------
2131 # Below are the classmethods which act as alternate constructors to the
2132 # TarFile class. The open() method is the only one that is needed for
2133 # public use; it is the "super"-constructor and is able to select an
2134 # adequate "sub"-constructor for a particular compression using the mapping
2135 # from OPEN_METH.
2136 #
2137 # This concept allows one to subclass TarFile without losing the comfort of
2138 # the super-constructor. A sub-constructor is registered and made available
2139 # by adding it to the mapping in OPEN_METH.
2140
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing

        `encryption' and `tolerance' only apply to the '#' (concat) modes.
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    # Rewind so the next candidate opener sees the file
                    # from the same position.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            if 'max_volume_size' in kwargs:
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            # The _Stream is owned by the TarFile and closed with it.
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat mode: a stream of individually compressed/encrypted
            # tar blocks.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2255
d39d4cbf
PG
2256
2257 @classmethod
2258 def open_at_offset(cls, offset, *a, **kwa):
2259 """
2260 Same as ``.open()``, but start reading at the given offset. Assumes a
5bd2d4b5
PG
2261 seekable file object. Returns *None* if opening failed due to a read
2262 problem.
d39d4cbf
PG
2263 """
2264 fileobj = kwa.get ("fileobj")
2265 if fileobj is not None:
2266 fileobj.seek (offset)
5bd2d4b5 2267
d39d4cbf
PG
2268 return cls.open (*a, **kwa)
2269
2270
7584f5c9
ERE
2271 @classmethod
2272 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2273 """Open uncompressed tar archive name for reading or writing.
2274 """
2275 if len(mode) > 1 or mode not in "raw":
2276 raise ValueError("mode must be 'r', 'a' or 'w'")
2277 return cls(name, mode, fileobj, **kwargs)
2278
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        # Remember whether the caller supplied the file object: if so we
        # must not close it on failure.
        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                # GzipFile construction itself failed; propagate as-is.
                raise
            raise ReadError("not a gzip file")
        except:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2309
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        # BZ2File accepts either an existing file object or a path.
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
        return t
2333
be60ffd0
ERE
    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import lzma
        except ImportError:
            raise CompressionError("lzma module is not available")

        # LZMAFile accepts either an existing file object or a path.
        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            fileobj.close()
            raise ReadError("not an lzma file")
        t._extfileobj = False
        return t
2356
7584f5c9
ERE
    # All *open() methods are registered here.
    # Maps the compression suffix used in the mode string (e.g. "r:gz")
    # to the name of the classmethod that handles it.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
2364
2365 #--------------------------------------------------------------------------
2366 # The public methods which TarFile provides:
2367
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # Empty concat archive: start a (nameless) object first so the
            # trailing zero blocks go through the compression/encryption
            # machinery of the _Stream.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2390
2391 def getmember(self, name):
2392 """Return a TarInfo object for member `name'. If `name' can not be
2393 found in the archive, KeyError is raised. If a member occurs more
2394 than once in the archive, its last occurrence is assumed to be the
2395 most up-to-date version.
2396 """
2397 tarinfo = self._getmember(name)
2398 if tarinfo is None:
2399 raise KeyError("filename %r not found" % name)
2400 return tarinfo
2401
2402 def getmembers(self):
2403 """Return the members of the archive as a list of TarInfo objects. The
2404 list has the same order as the members in the archive.
2405 """
2406 self._check()
2407 if not self._loaded: # if we want to obtain a list of
2408 self._load() # all members, we first have to
2409 # scan the whole archive.
2410 return self.members
2411
ad4402e8
ERE
2412 def get_last_member_offset(self):
2413 """Return the last member offset. Usually this is self.fileobj.tell(),
2414 but when there's encryption or concat compression going on it's more
2415 complicated than that.
2416 """
b8fc2f5d 2417 return self.last_block_offset
ad4402e8 2418
7584f5c9
ERE
2419 def getnames(self):
2420 """Return the members of the archive as a list of their names. It has
2421 the same order as the list returned by getmembers().
2422 """
2423 return [tarinfo.name for tarinfo in self.getmembers()]
2424
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types that cannot be represented in a tar
        archive (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): signal by returning None.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names, caching results to avoid repeated
        # passwd/group database lookups.
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2534
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                # mode, owner/group, size (or major,minor for devices), mtime
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash, like `ls -l`.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
7584f5c9 2563
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded. `filter' is a function
        that expects a TarInfo object argument and returns the changed
        TarInfo object, if it returns None the TarInfo object will be
        excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        # `exclude' is deprecated in favour of `filter'.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Recurse into the directory, carrying the same exclude /
                # filter settings along.
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)
2624
defc9a22 2625 def _size_left_file(self):
be60ffd0 2626 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2627
be60ffd0 2628 Assumes self.max_volume_size is set.
ba5a449e 2629 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2630 """
ba5a449e 2631 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2632 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2633 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2634 # write only half a block when writting the end of a volume
ae48acc8 2635 # and filling with zeros
defc9a22
CH
2636 return BLOCKSIZE * (size_left // BLOCKSIZE)
2637
2638 def _size_left_stream(self):
ba5a449e
CH
2639 """ Calculates size left in a volume if using comression/encryption
2640
2641 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2642 (otherwise use _size_left_file)
2643 """
2644 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2645 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2646 - 2*BLOCKSIZE
2647 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2648
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.

        When self.max_volume_size is set, the member's data may be split
        across several volumes: whenever the current volume fills up, the
        registered new_volume_handler is invoked and the member is continued
        on the next volume as a GNUTYPE_MULTIVOL entry.
        """
        self._check("aw")

        # work on a copy so the caller's TarInfo is not mutated (type and
        # volume_offset are rewritten below when a volume change happens)
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # concat mode: each member starts its own compression/encryption
            # object; the stream returns the offset of the new block
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        # write the member header
        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # pick the "remaining space in volume" estimator; with no volume
        # limit, pretend there is always room for the whole member
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                # re-estimate: compression makes the mapping from input
                # bytes to volume bytes unpredictable
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the “new_volume_handler” is supposed to call .close() on the
                # “fileobj” _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
7584f5c9 2762
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        Replaces self.fileobj with a freshly opened file (or the supplied
        *fileobj*), resets the per-volume bookkeeping (members, offset,
        inodes) and, for PAX archives opened for writing, emits a global
        header recording which member continues into this volume.
        An *encryption* context may be passed to re-key the new volume;
        otherwise the previous volume's context is reused.
        '''

        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # carry over all stream parameters from the previous volume
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT,
                            tolerance=self.fileobj.tolerance)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        # NOTE: the previous volume's fileobj is closed even when the
        # new_volume_handler already called .close() on it
        self.fileobj.close()
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # read the continuation header ahead; makefile() picks it
                # up through self.firstmember
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # record which member continues into this volume and at
                    # which offset, so split archives can be rejoined
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # leave the object in a consistent closed state on any failure
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2851
c650acfa 2852 def extractall(self, path=".", members=None, filter=None, unlink=False):
7584f5c9
ERE
2853 """Extract all members from the archive to the current working
2854 directory and set owner, modification time and permissions on
2855 directories afterwards. `path' specifies a different directory
2856 to extract to. `members' is optional and must be a subset of the
2857 list returned by getmembers().
2858 """
2859 directories = []
2860
2861 if members is None:
2862 members = self
2863
2864 for tarinfo in members:
c474439c
ERE
2865 if self.volume_number > 0 and tarinfo.ismultivol():
2866 continue
2867
974408b5 2868 if filter and not filter(tarinfo):
e5f5681b
ERE
2869 continue
2870
7584f5c9
ERE
2871 if tarinfo.isdir():
2872 # Extract directories with a safe mode.
2873 directories.append(tarinfo)
2874 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2875 tarinfo.mode = 0o0700
2876 # Do not set_attrs directories, as we will do that further down
c650acfa 2877 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), unlink=unlink)
7584f5c9
ERE
2878
2879 # Reverse sort directories.
be60ffd0 2880 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2881 directories.reverse()
2882
2883 # Set correct owner, mtime and filemode on directories.
2884 for tarinfo in directories:
2885 dirpath = os.path.join(path, tarinfo.name)
2886 try:
2887 self.chown(tarinfo, dirpath)
2888 self.utime(tarinfo, dirpath)
2889 self.chmod(tarinfo, dirpath)
be60ffd0 2890 except ExtractError as e:
7584f5c9
ERE
2891 if self.errorlevel > 1:
2892 raise
2893 else:
2894 self._dbg(1, "tarfile: %s" % e)
2895
c650acfa
PG
2896 def extract(self, member, path="", set_attrs=True, symlink_cb=None,
2897 unlink=False):
7584f5c9
ERE
2898 """Extract a member from the archive to the current working directory,
2899 using its full name. Its file information is extracted as accurately
2900 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2901 specify a different directory using `path'. File attributes (owner,
2902 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2903 ``symlink_cb`` is a hook accepting a function that is passed the
2904 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2905 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2906 passed will be applied, skipping the actual extraction. In case the
2907 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2908 """
2909 self._check("r")
2910
be60ffd0 2911 if isinstance(member, str):
7584f5c9
ERE
2912 tarinfo = self.getmember(member)
2913 else:
2914 tarinfo = member
2915
2916 # Prepare the link target for makelink().
2917 if tarinfo.islnk():
2918 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2919
9b13f5c4 2920 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2921 return symlink_cb(member, path, set_attrs)
786addd6 2922
7584f5c9 2923 try:
be60ffd0 2924 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
c650acfa 2925 set_attrs=set_attrs, unlink=unlink)
be60ffd0 2926 except EnvironmentError as e:
7584f5c9
ERE
2927 if self.errorlevel > 0:
2928 raise
2929 else:
2930 if e.filename is None:
2931 self._dbg(1, "tarfile: %s" % e.strerror)
2932 else:
2933 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2934 except ExtractError as e:
7584f5c9
ERE
2935 if self.errorlevel > 1:
2936 raise
2937 else:
2938 self._dbg(1, "tarfile: %s" % e)
2939
2940 def extractfile(self, member):
2941 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2942 a filename or a TarInfo object. If `member' is a regular file or a
2943 link, an io.BufferedReader object is returned. Otherwise, None is
2944 returned.
7584f5c9
ERE
2945 """
2946 self._check("r")
2947
be60ffd0 2948 if isinstance(member, str):
7584f5c9
ERE
2949 tarinfo = self.getmember(member)
2950 else:
2951 tarinfo = member
2952
be60ffd0
ERE
2953 if tarinfo.isreg() or tarinfo.ismultivol() or\
2954 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2955 # If a member's type is unknown, it is treated as a
2956 # regular file.
2957 return self.fileobject(self, tarinfo)
2958
2959 elif tarinfo.islnk() or tarinfo.issym():
2960 if isinstance(self.fileobj, _Stream):
2961 # A small but ugly workaround for the case that someone tries
2962 # to extract a (sym)link as a file-object from a non-seekable
2963 # stream of tar blocks.
2964 raise StreamError("cannot extract (sym)link as file object")
2965 else:
2966 # A (sym)link's file object is its target's file object.
2967 return self.extractfile(self._find_link_target(tarinfo))
2968 else:
2969 # If there's no data associated with the member (directory, chrdev,
2970 # blkdev, etc.), return None instead of a file object.
2971 return None
2972
c650acfa 2973 def _extract_member(self, tarinfo, targetpath, set_attrs=True, unlink=False):
7584f5c9
ERE
2974 """Extract the TarInfo object tarinfo to a physical
2975 file called targetpath.
2976 """
2977 # Fetch the TarInfo object for the given name
2978 # and build the destination pathname, replacing
2979 # forward slashes to platform specific separators.
2980 targetpath = targetpath.rstrip("/")
2981 targetpath = targetpath.replace("/", os.sep)
2982
2983 # Create all upper directories.
2984 upperdirs = os.path.dirname(targetpath)
2985 if upperdirs and not os.path.exists(upperdirs):
2986 # Create directories that are not part of the archive with
2987 # default permissions.
2988 os.makedirs(upperdirs)
2989
2990 if tarinfo.islnk() or tarinfo.issym():
2991 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2992 else:
2993 self._dbg(1, tarinfo.name)
2994
c650acfa
PG
2995 if unlink is True:
2996 _unlinkfirst(targetpath)
2997
7584f5c9
ERE
2998 if tarinfo.isreg():
2999 self.makefile(tarinfo, targetpath)
3000 elif tarinfo.isdir():
3001 self.makedir(tarinfo, targetpath)
3002 elif tarinfo.isfifo():
3003 self.makefifo(tarinfo, targetpath)
3004 elif tarinfo.ischr() or tarinfo.isblk():
3005 self.makedev(tarinfo, targetpath)
3006 elif tarinfo.islnk() or tarinfo.issym():
3007 self.makelink(tarinfo, targetpath)
3008 elif tarinfo.type not in SUPPORTED_TYPES:
3009 self.makeunknown(tarinfo, targetpath)
3010 else:
3011 self.makefile(tarinfo, targetpath)
3012
be60ffd0
ERE
3013 if set_attrs:
3014 self.chown(tarinfo, targetpath)
3015 if not tarinfo.issym():
3016 self.chmod(tarinfo, targetpath)
3017 self.utime(tarinfo, targetpath)
7584f5c9
ERE
3018
3019 #--------------------------------------------------------------------------
3020 # Below are the different file methods. They are called via
3021 # _extract_member() when extract() is called. They can be replaced in a
3022 # subclass to implement other functionality.
3023
3024 def makedir(self, tarinfo, targetpath):
3025 """Make a directory called targetpath.
3026 """
3027 try:
3028 # Use a safe mode for the directory, the real mode is set
3029 # later in _extract_member().
be60ffd0
ERE
3030 os.mkdir(targetpath, 0o0700)
3031 except FileExistsError:
3032 pass
7584f5c9
ERE
3033
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.

        Handles sparse members by writing only their data regions, and
        multivolume members by switching to the next volume (via
        new_volume_handler) whenever the current source runs out.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        decrypt = False
        iterate = True
        target = bltn_open(targetpath, "wb")

        if tarinfo.sparse is not None:
            # sparse member: write the data regions at their offsets, then
            # extend the file to its declared size
            try:
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
                target.seek(tarinfo.size)
                target.truncate()
            finally:
                target.close()
            return

        # copyfileobj raises OSError when the current volume ends before
        # tarinfo.size bytes were read; for multivolume archives we then
        # open the next volume and keep appending to the same target
        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    raise Exception("We need to read a new volume and you"
                        " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                # open_volume() read the continuation header ahead into
                # firstmember; continue copying from the new volume
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
            finally:
                # keep the target open while another volume still follows
                if iterate is False: target.close()
c474439c 3075
7584f5c9
ERE
3076
3077 def makeunknown(self, tarinfo, targetpath):
3078 """Make a file from a TarInfo object with an unknown type
3079 at targetpath.
3080 """
3081 self.makefile(tarinfo, targetpath)
3082 self._dbg(1, "tarfile: Unknown file type %r, " \
3083 "extracted as regular file." % tarinfo.type)
3084
3085 def makefifo(self, tarinfo, targetpath):
3086 """Make a fifo called targetpath.
3087 """
3088 if hasattr(os, "mkfifo"):
3089 os.mkfifo(targetpath)
3090 else:
3091 raise ExtractError("fifo not supported by system")
3092
3093 def makedev(self, tarinfo, targetpath):
3094 """Make a character or block device called targetpath.
3095 """
3096 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3097 raise ExtractError("special devices not supported by system")
3098
3099 mode = tarinfo.mode
3100 if tarinfo.isblk():
3101 mode |= stat.S_IFBLK
3102 else:
3103 mode |= stat.S_IFCHR
3104
3105 os.mknod(targetpath, mode,
3106 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3107
3108 def makelink(self, tarinfo, targetpath):
3109 """Make a (symbolic) link called targetpath. If it cannot be created
3110 (platform limitation), we try to make a copy of the referenced file
3111 instead of a link.
3112 """
be60ffd0 3113 try:
7584f5c9
ERE
3114 # For systems that support symbolic and hard links.
3115 if tarinfo.issym():
7584f5c9
ERE
3116 os.symlink(tarinfo.linkname, targetpath)
3117 else:
3118 # See extract().
3119 if os.path.exists(tarinfo._link_target):
7584f5c9
ERE
3120 os.link(tarinfo._link_target, targetpath)
3121 else:
be60ffd0
ERE
3122 self._extract_member(self._find_link_target(tarinfo),
3123 targetpath)
3124 except symlink_exception:
7584f5c9 3125 try:
be60ffd0
ERE
3126 self._extract_member(self._find_link_target(tarinfo),
3127 targetpath)
7584f5c9
ERE
3128 except KeyError:
3129 raise ExtractError("unable to resolve link inside archive")
3130
3131 def chown(self, tarinfo, targetpath):
3132 """Set owner of targetpath according to tarinfo.
3133 """
3134 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3135 # We have to be root to do so.
3136 try:
3137 g = grp.getgrnam(tarinfo.gname)[2]
3138 except KeyError:
3139 g = tarinfo.gid
3140 try:
3141 u = pwd.getpwnam(tarinfo.uname)[2]
3142 except KeyError:
3143 u = tarinfo.uid
3144 try:
3145 if tarinfo.issym() and hasattr(os, "lchown"):
3146 os.lchown(targetpath, u, g)
3147 else:
be60ffd0
ERE
3148 os.chown(targetpath, u, g)
3149 except OSError as e:
7584f5c9
ERE
3150 raise ExtractError("could not change owner")
3151
3152 def chmod(self, tarinfo, targetpath):
3153 """Set file permissions of targetpath according to tarinfo.
3154 """
3155 if hasattr(os, 'chmod'):
3156 try:
3157 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3158 except OSError as e:
7584f5c9
ERE
3159 raise ExtractError("could not change mode")
3160
3161 def utime(self, tarinfo, targetpath):
3162 """Set modification time of targetpath according to tarinfo.
3163 """
3164 if not hasattr(os, 'utime'):
3165 return
3166 try:
3167 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3168 except OSError as e:
7584f5c9
ERE
3169 raise ExtractError("could not change modification time")
3170
3171 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        available.
        """
        self._check("ra")
        if self.firstmember is not None:
            # a member was already read ahead (e.g. by open_volume)
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    # skip the zero block and keep scanning
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    # skip the unreadable block and keep scanning
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # corruption at the very start: not a tar file at all
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # end of archive reached
            self._loaded = True

        return tarinfo
3218
3219 #--------------------------------------------------------------------------
3220 # Little helper methods:
3221
3222 def _getmember(self, name, tarinfo=None, normalize=False):
3223 """Find an archive member by name from bottom to top.
3224 If tarinfo is given, it is used as the starting point.
3225 """
3226 # Ensure that all members have been loaded.
3227 members = self.getmembers()
3228
3229 # Limit the member search list up to tarinfo.
3230 if tarinfo is not None:
3231 members = members[:members.index(tarinfo)]
3232
3233 if normalize:
3234 name = os.path.normpath(name)
3235
3236 for member in reversed(members):
3237 if normalize:
3238 member_name = os.path.normpath(member.name)
3239 else:
3240 member_name = member.name
3241
3242 if name == member_name:
3243 return member
3244
3245 def _load(self):
3246 """Read through the entire archive file and look for readable
3247 members.
3248 """
3249 while True:
3250 tarinfo = self.next()
3251 if tarinfo is None:
3252 break
3253 self._loaded = True
3254
3255 def _check(self, mode=None):
3256 """Check if TarFile is still open, and if the operation's mode
3257 corresponds to TarFile's mode.
3258 """
3259 if self.closed:
be60ffd0 3260 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3261 if mode is not None and self.mode not in mode:
be60ffd0 3262 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3263
3264 def _find_link_target(self, tarinfo):
3265 """Find the target member of a symlink or hardlink member in the
3266 archive.
3267 """
3268 if tarinfo.issym():
3269 # Always search the entire archive.
3270 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3271 limit = None
3272 else:
3273 # Search the archive before the link, because a hard link is
3274 # just a reference to an already archived file.
3275 linkname = tarinfo.linkname
3276 limit = tarinfo
3277
3278 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3279 if member is None:
3280 raise KeyError("linkname %r not found" % linkname)
3281 return member
3282
3283 def __iter__(self):
3284 """Provide an iterator object.
3285 """
3286 if self._loaded:
3287 return iter(self.members)
3288 else:
3289 return TarIter(self)
3290
1027433a 3291 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3292 """Write debugging output to sys.stderr.
3293 """
3294 if level <= self.debug:
1027433a 3295 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3296
3297 def __enter__(self):
3298 self._check()
3299 return self
3300
3301 def __exit__(self, type, value, traceback):
3302 if type is None:
3303 self.close()
3304 else:
3305 # An exception occurred. We must not call close() because
3306 # it would try to write end-of-archive blocks and padding.
3307 if not self._extfileobj:
3308 self.fileobj.close()
3309 self.closed = True
c650acfa
PG
3310
3311def _unlinkfirst(targetpath):
3312 try:
3313 os.unlink(targetpath)
3314 except OSError as e:
3315 if e.errno == errno.ENOENT or e.errno == errno.EISDIR:
3316 pass
3317
3318
7584f5c9
ERE
3319# class TarFile
3320
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Remember the TarFile and position before its first member."""
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object."""
        return self

    def __next__(self):
        """Return the next TarInfo, reading ahead via TarFile.next() until
        the archive is exhausted, then raise StopIteration.
        """
        # SF bug #1100429: getmembers() may be called during iteration and
        # grow tarfile.members — serve already-cached entries by index so
        # the iteration does not stop prematurely.
        if self.index == 0 and self.tarfile.firstmember is not None:
            entry = self.tarfile.next()
        elif self.index < len(self.tarfile.members):
            entry = self.tarfile.members[self.index]
        else:
            if self.tarfile._loaded:
                raise StopIteration
            entry = self.tarfile.next()
            if not entry:
                self.tarfile._loaded = True
                raise StopIteration

        self.index += 1
        return entry
3359
6690f5e0
PG
3360#---------------------------------------------------------
3361# support functionality for rescue mode
3362#---------------------------------------------------------
3363
8fc6040c
PG
# struct format for one 512-byte old-GNU tar header block; used by the
# rescue-mode scanner below to re-parse headers found in damaged archives.
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100]; /* 100 */
    "8s"   # ← char mode[8]; /* 108 */
    "8s"   # ← char uid[8]; /* 116 */
    "8s"   # ← char gid[8]; /* 124 */
    "12s"  # ← char size[12]; /* 136 */
    "12s"  # ← char mtime[12]; /* 148 */
    "8s"   # ← char checksum[8]; /* 156 */
    "B"    # ← char typeflag[1]; /* 157 */
    "100s" # ← char linkname[100]; /* 257 */
    "6s"   # ← char magic[6]; /* 263 */
    "2s"   # ← char version[2]; /* 265 */
    "32s"  # ← char uname[32]; /* 297 */
    "32s"  # ← char gname[32]; /* 329 */
    "8s"   # ← char devmajor[8]; /* 337 */
    "8s"   # ← char devminor[8]; /* 345 */
    "12s"  # ← char atime[12]; /* 357 */
    "12s"  # ← char ctime[12]; /* 369 */
    "12s"  # ← char offset[12]; /* 381 */
    "4s"   # ← char longnames[4]; /* 385 */
    "B"    # ← char unused[1]; /* 386 */
    ""     # struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     # } sparse[4]; /* 482 */
    "B"    # ← char isextended[1]; /* 483 */
    "12s"  # ← char realsize[12]; /* 495 */
    "17s"  # ← char pad[17]; /* 512 */
)

# The “magic” and “version” fields are special:
#
# tar(5)
#    magic   The magic field holds the five characters “ustar” followed by a
#            space.  Note that POSIX ustar archives have a trailing null.
#
# however, “tar.h”:
#
#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
#      Found in an archive, it indicates an old GNU header format, which will be
#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
#      valid, though the header is not truly POSIX conforming.  */
#
#
# byte offset of the magic field within a header block (see layout above)
TAR_HDR_OFF_MAGIC    = 257
TAR_FMT_OLDGNU_MAGIC = b"ustar "
3417
def read_gnu_tar_hdr (data):
    """
    Parse one 512-byte block as an old-GNU tar header.

    :returns: a dict of the raw (bytes) header fields, or *None* when the
              block has the wrong size, cannot be unpacked, or lacks the
              old-GNU magic.
    """
    if len (data) != BLOCKSIZE: # header requires one complete block
        return None

    try:
        fields = struct.unpack (TAR_FMT_HDR, data)
    except struct.error:
        return None

    # field names in struct order; “unused” and “pad” are dropped below
    names = ( "name", "mode", "uid", "gid", "size", "mtime", "checksum"
            , "typeflag", "linkname", "magic", "version", "uname", "gname"
            , "devmajor", "devminor", "atime", "ctime", "offset"
            , "longnames", "unused"
            , "offset1", "numbytes1", "offset2", "numbytes2"
            , "offset3", "numbytes3", "offset4", "numbytes4"
            , "isextended", "realsize", "pad"
            )
    hdr = dict (zip (names, fields))

    if hdr ["magic"] != TAR_FMT_OLDGNU_MAGIC:
        return None

    # return all except “unused” and “pad”
    del hdr ["unused"]
    del hdr ["pad"]
    return hdr
3475
3476
a793ee30
PG
def tar_hdr_check_chksum (data):
    """
    Validate the checksum of a tar header block.

    :returns: *True* only when *data* parses as an old-GNU header and the
              stored checksum matches one of the computed variants.
    """
    hdr = read_gnu_tar_hdr (data)
    if hdr is None:
        return False
    return nti (hdr ["checksum"]) in calc_chksums (data)
3483
3484
8fc6040c
PG
def readable_tar_objects_offsets (ifd):
    """
    Traverse blocks in file, trying to extract tar headers.

    The file is memory-mapped and scanned for the old-GNU magic; every hit
    whose surrounding block has a valid checksum contributes the block's
    offset to the result.
    """
    offsets = []

    mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    # the magic sits TAR_HDR_OFF_MAGIC bytes into a header block, so no
    # hit can occur before that position
    pos = TAR_HDR_OFF_MAGIC
    while True:
        pos = mm.find (TAR_FMT_OLDGNU_MAGIC, pos)
        if pos == -1:
            break
        candidate = pos - TAR_HDR_OFF_MAGIC
        mm.seek (candidate)
        if tar_hdr_check_chksum (mm.read (BLOCKSIZE)) is True:
            offsets.append (candidate)
        pos += 1

    return offsets
65b35c42
PG
3507
3508
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    cands = []
    pos = mm.find (GZ_MAGIC_BYTES, 0)
    while pos != -1:
        cands.append (pos)
        pos = mm.find (GZ_MAGIC_BYTES, pos + len (GZ_MAGIC_BYTES))

    return cands
3532
3533
# verdicts returned by inspect_gz_hdr() for a header candidate
HDR_CAND_GOOD  = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable
3537
3538
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed, EOF was reached before the
              terminating NUL, or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if not c:
            # EOF before the terminating NUL: the string is unterminated.
            # (Previously this looped forever for negative *max*, because
            # os.read() keeps returning b"" at EOF.)
            return None
        if c == NUL:
            break
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3564
3565
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname = None
    flags = 0x00
    dflags = 0x00
    mtime = 0x00000000
    oscode = 0x00
    verdict = HDR_CAND_GOOD

    # verify the seek actually landed on *off* (detects bad offsets)
    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    flags = 0x0
    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        # timestamp in the future — suspicious but not fatal
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        # skip over the variable-length extra field
        xlen = struct.unpack ("<H", os.read (fd, 2))[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # NOTE(review): the comment is read into *fname*, clobbering any
        # file name parsed above — looks unintentional, preserved as-is.
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    # total header length = current file position minus the start offset
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }
3648
3649
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    inflator = zlib.decompressobj (-zlib.MAX_WBITS) # raw deflate, no header
    cur      = off
    total    = 0 # bytes of plaintext recovered so far

    os.lseek (ifd, cur, os.SEEK_SET)
    while True:
        chunk = os.read (ifd, BUFSIZE)
        cur += len (chunk)
        try:
            plain = inflator.decompress (chunk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break # fishy
        total += len (plain)
        if inflator.eof:
            break # clean end of the deflate stream
        if len (chunk) < BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return total, cur - off
3680
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :type  ifd:   int
    :param ifd:   readable, seekable file descriptor
    :type  cands: list of int
    :param cands: byte offsets of gzip header candidates
    :returns:     the subset of *cands* whose header parses (good or merely
                  fishy) and whose payload yields at least one byte of
                  decompressed data
    """
    # NOTE(review): the original kept an object counter (`nobj`) that was
    # never read; it has been dropped as dead code.
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if vdt in (HDR_CAND_GOOD, HDR_CAND_FISHY):
            off0 = cand + hdr ["hlen"] # payload starts right after the header
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:
                good.append (cand)

    return good
3700
3701
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as
    compressed data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        # first pass: locate header-shaped byte sequences; second pass:
        # keep only those that actually decompress
        candidates = locate_gz_hdr_candidates (fd)
        return readable_gz_objects_offsets (fd, candidates)
    finally:
        os.close (fd)
3715
3716
65b35c42
PG
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as tar
    data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_tar_objects_offsets (fd)
    finally:
        os.close (fd)
3729
3730
b750b280
PG
def read_tarobj_at_offset (fileobj, offset, mode, secret=None,
                           strict_validation=True):
    """
    Read the first tar member found at byte *offset* of *fileobj*.

    :param fileobj: seekable file object containing the archive.
    :type  offset:  int
    :param offset:  byte position at which the tar (or encrypted) object
                    starts.
    :param mode:    tarfile mode string; a “#” in it selects concatenated
                    multi-object handling.
    :type  secret:  tuple or None
    :param secret:  crypto secret as a (kind, value) pair where kind is one
                    of the ``crypto.PDTCRYPT_SECRET_*`` constants; ``None``
                    for unencrypted archives.
    :type  strict_validation: bool
    :param strict_validation: Enable strict IV checking in the crypto
                              layer. Should be disabled when dealing with
                              potentially corrupted data.
    :returns: the first *TarInfo* at *offset*, or ``None`` if the data
              there cannot be opened as a tar object.
    :raises RuntimeError: if *secret* carries an unrecognized kind.
    """
    decr = None

    if secret is not None:
        ks = secret [0]

        if ks == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1],
                                   strict_ivs=strict_validation)
        elif ks == crypto.PDTCRYPT_SECRET_KEY:
            # raw key is transported hex-encoded
            key = binascii.unhexlify (secret [1])
            decr = crypto.Decrypt (key=key,
                                   strict_ivs=strict_validation)
        else:
            # original raised a bare RuntimeError; include the offending
            # kind so failures are diagnosable
            raise RuntimeError ("invalid secret kind %r" % (ks, ))

    try:
        tarobj = \
            TarFile.open_at_offset (offset,
                                    mode=mode,
                                    fileobj=fileobj,
                                    format=GNU_FORMAT,
                                    concat='#' in mode,
                                    encryption=decr,
                                    save_to_members=False,
                                    tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None

    return tarobj.next ()
3768
3769
2d50b7f7
PG
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.
    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        , "gid"    : 0
        }

    """
    # fields absent from TarInfo start out neutral
    entry = { "inode"  : 0   # ignored when reading the index
            , "offset" : 0 } # to be added by the caller

    entry ["uid"]    = tarinfo.uid
    entry ["gid"]    = tarinfo.gid
    entry ["path"]   = tarinfo.name # keeping URI scheme
    entry ["volume"] = tarinfo.volume_offset
    entry ["mode"]   = tarinfo.mode
    entry ["ctime"]  = tarinfo.mtime # TarInfo carries no ctime of its own
    entry ["mtime"]  = tarinfo.mtime
    entry ["size"]   = tarinfo.size
    entry ["type"]   = tarinfo.type

    return entry
3805
3806
27ee4dd4
PG
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Reconstruct a pseudo index from the volumes of a (possibly damaged)
    backup set.

    :param gen_volume_name: callable mapping a volume number to its path.
    :param mode:            tarfile mode string: “#” for plain concatenated
                            archives, “#gz” for gzipped ones; ignored when a
                            secret is present (the crypto layer handles
                            offset recovery itself).
    :type  maxvol:          int or None
    :param maxvol:          when given, missing volumes below this number
                            are skipped instead of ending the scan.
    :param password:        optional password for encrypted volumes.
    :param key:             optional raw key for encrypted volumes.
    :returns: list of index entry dicts (see *idxent_of_tarinfo*), each
              annotated with its offset and volume number.
    :raises TarError: if *mode* has no rescue handling.
    """
    infos   = []
    offsets = None
    secret  = crypto.make_secret (password=password, key=key)

    nvol = 0

    while True:
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError as exn:
            # volume does not exist
            if maxvol is not None and nvol < maxvol:
                # explicit volume number specified: ignore missing volumes.
                # BUGFIX: advance the counter before continuing, otherwise
                # the same missing path would be probed forever.
                nvol += 1
                continue
            else:
                break

        fileobj = bltn_open (vpath, "rb")

        def aux (acc, off):
            # rescue operation: strict IV validation is disabled since the
            # data may well be corrupted
            obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret,
                                         strict_validation=False)
            if obj is not None:
                acc.append ((off, nvol, obj))
            return acc
        infos += functools.reduce (aux, offsets, [])

        fileobj.close()

        nvol += 1

    def aux (o, nvol, ti):
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = nvol
        return ie

    psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]

    return psidx
7584f5c9
ERE
3856
3857#--------------------
3858# exported functions
3859#--------------------
def is_tarfile(name):
    """Return True if *name* points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # opening succeeds only for archives this module can parse
        archive = open(name)
        archive.close()
    except TarError:
        return False
    return True
3870
# Keep a handle on the builtin ``open`` before shadowing it: module code
# uses ``bltn_open`` to read plain files, while the public module-level
# ``open`` becomes the TarFile entry point (mirroring stdlib tarfile).
bltn_open = open
open = TarFile.open