bpo-32713: Fix tarfile.itn for large/negative float values. (GH-5434)
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
5bd2d4b5 47import functools
be60ffd0 48import io
dfd7865e
PG
49import mmap
50import operator
51import os
52import re
7584f5c9
ERE
53import shutil
54import stat
7584f5c9 55import struct
dfd7865e
PG
56import sys
57import time
7584f5c9 58
c7c736b6
PG
59import traceback # XXX
60
8ab8fac5 61from . import crypto
6e812ad9 62
7584f5c9
ERE
63try:
64 import grp, pwd
65except ImportError:
66 grp = pwd = None
67
be60ffd0
ERE
68# os.symlink on Windows prior to 6.0 raises NotImplementedError
69symlink_exception = (AttributeError, NotImplementedError)
70try:
71 # OSError (winerror=1314) will be raised if the caller does not hold the
72 # SeCreateSymbolicLinkPrivilege privilege
73 symlink_exception += (OSError,)
74except NameError:
75 pass
76
7584f5c9
ERE
77# from tarfile import *
78__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79
be60ffd0
ERE
80from builtins import open as _open # Since 'open' is TarFile.open
81
7584f5c9
ERE
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Type flag bytes of the ustar header (single-byte bytes objects).
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

# gzip (RFC 1952) member header constants.
GZ_FMT_HEADER        = b"<BBBBLBB"  # struct format: magic[2], method, flags,
                                    # mtime, deflate flags, OS code
GZ_HEADER_SIZE       = 10           # not including the name
GZ_MAGIC             = (0x1f, 0x8b) # 0o37, 0o213
GZ_METHOD_DEFLATE    = 0x08         # 0o10
GZ_FLAG_FTEXT        = 1 << 0       # ASCII payload
GZ_FLAG_FHCRC        = 1 << 1       # CRC16
GZ_FLAG_FEXTRA       = 1 << 2       # extra field
GZ_FLAG_FNAME        = 1 << 3       # set by default in gzip
GZ_FLAG_FCOMMENT     = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED     = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS     = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE           = 0x03         # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES       = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE     = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                    GZ_METHOD_DEFLATE)

# Error-handling strictness levels used by _Stream and readers.
TOLERANCE_STRICT  = 0
TOLERANCE_RECOVER = 1  # rely on offsets in index
TOLERANCE_RESCUE  = 2  # deduce metadata from archive contents

BUFSIZE = 16 * 1024

#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

# Bitmask describing how an archive is processed; combined via arcmode_set().
ARCMODE_PLAIN    = 0
ARCMODE_ENCRYPT  = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT   = 1 << 2
def arcmode_fmt (m):
    """Render the archive-mode bitmask *m* as a human-readable string.

    ``ARCMODE_PLAIN`` yields ``"PLAIN"``; any combination of set bits
    yields e.g. ``"[ ENCRYPT | COMPRESS ]"``.
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = [name
              for bit, name in ((ARCMODE_ENCRYPT,  "ENCRYPT"),
                                (ARCMODE_COMPRESS, "COMPRESS"),
                                (ARCMODE_CONCAT,   "CONCAT"))
              if m & bit]
    if not labels:
        # non-zero mask with only unknown bits set
        return "[ ]"
    return "[ " + " | ".join (labels) + " ]"
168
169
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Derive an archive-mode bitmask from the given options.

    Starts from *init* and ORs in CONCAT for a truthy *concat*, ENCRYPT
    when an *encryption* context is supplied, and COMPRESS for gzip.
    """
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
179
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps field name -> conversion callable.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------

# Default encoding for header text fields; Windows builds always use
# UTF-8, elsewhere the filesystem encoding applies.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
228
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Encode string *s* and fit it into a NUL-padded field of *length* bytes.

    The encoded text is truncated if too long and padded with NUL bytes
    if too short; the result is always exactly *length* bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
234
be60ffd0
ERE
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte onwards is discarded; the
    remainder is decoded with *encoding*/*errors*.
    """
    end = s.find(b"\0")
    trimmed = s if end < 0 else s[:end]
    return trimmed.decode(encoding, errors)
242
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
    of specific size.

    str input is encoded first; the result is truncated or NUL-padded to
    exactly *length* bytes.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length].ljust(length, NUL)
7584f5c9
ERE
250
def nti(s):
    """Convert a number field to a python number.

    Two encodings exist (cf. itn()): GNU base-256, marked by a leading
    0o200 (positive) or 0o377 (negative) byte, and NUL/space-terminated
    octal ASCII.  Raises InvalidHeaderError for malformed octal fields.
    """
    marker = s[0]
    if marker in (0o200, 0o377):
        # big-endian base-256 payload in the remaining bytes
        value = int.from_bytes(s[1:], byteorder="big")
        if marker == 0o377:
            # negative: stored as two's complement of the payload width
            value -= 256 ** (len(s) - 1)
        return value
    try:
        return int(nts(s, "ascii", "strict") or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")
269
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    # bpo-32713: coerce to int up front so that large or negative float
    # values (e.g. mtimes) are truncated instead of failing below.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        # fits as octal ASCII: digits-1 octal digits plus terminating NUL
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        # GNU base-256; negatives are stored two's-complement style
        # relative to 256**digits, marker byte 0o377 instead of 0o200.
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            # insert right after the marker -> big-endian byte order
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
7584f5c9
ERE
298
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a 512-byte header *buf*.

    The chksum field (bytes 148-155) is treated as if filled with spaces;
    the constant 256 accounts for those eight 0x20 bytes.  Some historic
    tars (Sun and NeXT) summed signed chars, so both variants are
    computed and returned as a 2-tuple.
    """
    unsigned_sum, signed_sum = (
        256 + sum(struct.unpack_from(fmt, buf))
        for fmt in ("148B8x356B", "148b8x356b")
    )
    return unsigned_sum, signed_sum
311
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    With length=None the entire remaining content is copied; with
    length=0 nothing happens.  Raises OSError when *src* runs dry
    before *length* bytes were transferred.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    full_blocks, tail = divmod(length, BUFSIZE)
    for _ in range(full_blocks):
        chunk = src.read(BUFSIZE)
        dst.write(chunk)
        if len(chunk) < BUFSIZE:
            raise OSError("end of file reached")
    if tail:
        chunk = src.read(tail)
        dst.write(chunk)
        if len(chunk) < tail:
            raise OSError("end of file reached")
c7c736b6 333
7584f5c9 334
def filemode(mode):
    """Deprecated here; ``stat.filemode`` is the canonical implementation."""
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    result = stat.filemode(mode)
    return result
7584f5c9
ERE
341
# Exception hierarchy: everything tar-related derives from TarError;
# HeaderError groups the per-header parse failures raised by frombuf()
# and friends; the crypto errors cover the pdtcrypt encryption layer.
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for error during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for error during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
    pass
7584f5c9
ERE
387
388#---------------------------
389# internal stream interface
390#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
    It is used instead of a regular file object for streaming
    access.
    """

    def __init__(self, name, mode):
        # Only "r" and "w" are accepted.  "w" opens the fd read-write
        # because headers are later rewritten in place (cf. _Stream's
        # delayed encryption-header writes).
        _mode = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        # Logical stream position, maintained manually alongside the fd.
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        """Write *s* at the current position, or at *pos* (restoring the
        previous position afterwards).  In the *pos* case ``offset`` only
        grows by however far the write extended past the saved position.
        """
        # NOTE(review): os.write may perform a short write; the returned
        # count n is only honoured in the pos branch — confirm intent.
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)

    def tell(self):
        return self.offset

    def seek_set (self, pos):
        # absolute seek; keeps the manual offset in sync with the fd
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos
434
8ab8fac5 435
15a81fc0
PG
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952).

    When *name* is given it is stored in the FNAME field (ISO-8859-1,
    NUL-terminated) with any ".pdtcrypt"/".gz" suffix stripped first;
    otherwise the FNAME flag stays clear and only the fixed 10-byte
    header is returned.
    """
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        # strip container suffixes so the stored name is the payload's
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return hdr + name
460
d601d33b 461
7584f5c9
ERE
class _Stream:
    """Class that serves as an adapter between TarFile and
    a stream-like object. The stream-like object only
    needs to have a read() or write() method and is accessed
    blockwise. Use of gzip or bzip2 compression is possible.
    A stream-like object could be for example: sys.stdin,
    sys.stdout, a socket, a tape device etc.

    _Stream is intended to be used only internally but is
    nevertherless used externally by Deltatar.

    When encrypting, the ``enccounter`` will be used for
    initializing the first cryptographic context. When
    decrypting, its value will be compared to the decrypted
    object. Decryption fails if the value does not match.
    In effect, this means that a ``_Stream`` whose ctor was
    passed ``enccounter`` can only be used to encrypt or
    decrypt a single object.
    """

    remainder = -1 # track size in encrypted entries
    tolerance = TOLERANCE_STRICT  # error handling strictness (see TOLERANCE_*)
c7c736b6 484
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        ``concat``, ``encryption`` and ``comptype`` jointly determine the
        archive mode (cf. arcmode_set()); ``enccounter`` seeds the first
        crypto context and ``tolerance`` selects error-handling
        strictness.  When ``fileobj`` is None a _LowLevelFile is opened
        from *name* and owned by this stream.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None                 # compressor/decompressor object
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""                  # pending output bytes
        self.pos = 0                    # logical (uncompressed) position
        self.concat_pos = 0             # bytes written into current gz member
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None             # offset of pending crypto header

        if encryption is not None:
            encryption.reset_last_iv ()

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    # in concat mode gzip/crypto setup is deferred to next()
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed; only the crypto layer may need setup here
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # broad on purpose: release the fd we opened, then re-raise
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 597
7584f5c9
ERE
    def __del__(self):
        # Best-effort cleanup: only attempt close() if __init__ got far
        # enough to set self.closed.
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass
7584f5c9 606
c7c736b6 607
d1c38f40
PG
    def next (self, name):
        """Start a new archive object *name*, finalizing the current
        compression/encryption layers first.  Returns the file offset at
        which the new object begins (``last_block_offset``).
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # plain/concat-only modes: the new object starts right here
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when encrypting, the crypto header already fixed the offset
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
622
623
    def next_volume (self, name):
        """Re-initialize the compression/encryption layers for a new
        volume in concat mode."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
635
c7c736b6 636
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
653
654
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # finalize the context: yields trailing ciphertext plus the
            # real header to overwrite the dummy one
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
675
676
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and emit the gzip member trailer
        (CRC32 + uncompressed size)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
            self.buf = b""
57db1546
PG
692
693
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        # only the very first member carries the archive name in FNAME
        first = self.cmp is None
        # raw deflate stream (negative wbits); header/trailer are managed
        # by this class itself
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 712
ac5e4184 713
7584f5c9
ERE
    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            # running CRC over the *uncompressed* payload
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)
724
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
730
    def __write(self, s):
        """Writes (and encodes) string s to the stream blockwise

        will wait with encoding/writing until block is complete
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]
740
867f75f7 741
    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)
867f75f7 751
6e812ad9
DGM
752
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                # process() may consume only part of the input (n bytes)
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
774
6e812ad9 775
784175ba
CH
    def estim_file_size(self):
        """ estimates size of file if closing it now

        The result may differ greatly from the amount of data sent to write()
        due to compression, encryption and buffering.

        In tests the result (before calling close()) was up to 12k smaller than
        the final file size if compression is being used because zlib/bz2
        compressors do not allow inspection of their buffered data :-(

        Still, we add what close() would add: 8 bytes for gz checksum, one
        encryption block size if encryption is used and the size of our own
        buffer
        """
        if self.closed:
            return self.bytes_written

        result = self.bytes_written
        if self.buf:
            result += len(self.buf)
        if self.comptype == 'gz':
            result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
        return result
799
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                # NOTE(review): ISIZE is consumed but never validated here;
                # only the CRC is checked — confirm that is intentional.
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
831
54128a00 832
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        # raw deflate decompressor; the RFC 1952 header is parsed below
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
                             "gzip header at pos %d" % self.fileobj.tell())
        if ord (read1) != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # NOTE(review): the extra field is skipped via the decompressing
            # read() rather than the raw __read() used for the rest of the
            # header — looks suspect, confirm (CPython later switched this
            # to the raw read).
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # skip NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # skip NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            self.__read(2)
871
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False when the underlying stream is exhausted (no more
        headers), True otherwise (including when encryption is off).
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
904
905
8de91f4f
PG
    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        try:
            return self.encryption.process (buf)
        except RuntimeError as exn:
            # non-strict tolerance: surface as DecryptionError so callers
            # can recover; strict mode re-raises the original error
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)
            raise
917
918
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Completes the current encrypted object: asks the crypto context for
        any remaining plaintext and verifies the GCM tag. Returns the
        trailing plaintext bytes (possibly empty). Note that when the stream
        is not in encrypted mode or no header was read yet, the method falls
        through and implicitly returns None — callers only invoke it after a
        successful _init_read_encrypt().

        Raises DecryptionError when the GCM tag check fails.
        """
        if      self.arcmode & ARCMODE_ENCRYPT \
            and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # Finalizing early: discard the unread remainder of the
                # current object.
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
933
934
7584f5c9
ERE
935 def tell(self):
936 """Return the stream's file pointer position.
937 """
938 return self.pos
939
940 def seek(self, pos=0):
941 """Set the stream's file pointer to pos. Negative seeking
942 is forbidden.
943 """
b750b280
PG
944 if pos == self.pos:
945 pass # nothing to do
946 elif pos - self.pos >= 0:
7584f5c9 947 blocks, remainder = divmod(pos - self.pos, self.bufsize)
b750b280
PG
948 if self.encryption is not None:
949 # IV succession is only preserved between successive objects.
950 self.encryption.reset_last_iv ()
be60ffd0 951 for i in range(blocks):
7584f5c9
ERE
952 self.read(self.bufsize)
953 self.read(remainder)
954 else:
955 raise StreamError("seeking backwards is not allowed")
956 return self.pos
957
958 def read(self, size=None):
959 """Return the next size number of bytes from the stream.
960 If size is not defined, return all bytes of the stream
961 up to EOF.
962 """
963 if size is None:
964 t = []
965 while True:
966 buf = self._read(self.bufsize)
967 if not buf:
968 break
969 t.append(buf)
9dc7ac5c 970 buf = b"".join(t)
7584f5c9
ERE
971 else:
972 buf = self._read(size)
973 self.pos += len(buf)
974 return buf
975
3a7e1a50
ERE
976 def readline(self):
977 """Reads just one line, new line character included
978 """
f0fd5e3a 979 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
980 if b'\n' in self.dbuf:
981 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
982 ret = self.dbuf[:pos]
983 self.dbuf = self.dbuf[pos:]
984 return ret
985
1215b602 986 buf = []
3a7e1a50
ERE
987 while True:
988 chunk = self._read(self.bufsize)
989
f0fd5e3a 990 # nothing more to read, so return the buffer
3a7e1a50 991 if not chunk:
be60ffd0 992 return b''.join(buf)
3a7e1a50
ERE
993
994 buf.append(chunk)
f0fd5e3a
ERE
995
996 # if \n found, return the new line
be60ffd0
ERE
997 if b'\n' in chunk:
998 dbuf = b''.join(buf)
999 pos = dbuf.index(b'\n') + 1
1215b602 1000 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
1001 return dbuf[:pos]
1002
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Decompression layer: pulls raw (possibly decrypted) data from
        __read(), feeds it through self.cmp when compression is active,
        and handles gzip member boundaries in concatenated (ARCMODE_CONCAT)
        archives. Leftover decompressed bytes are kept in self.dbuf.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        # NOTE(review): in this branch `buf` keeps the raw,
                        # undecompressed bytes and is still appended below —
                        # presumably tolerated for concat recovery; confirm.
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    # Maintain the running CRC of the decompressed payload.
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    # A gzip member ended mid-buffer: push the unused tail
                    # back and re-initialize for the next member.
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    # Fresh CRC for the new gzip member.
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # Keep any surplus for the next call, return exactly `size` bytes
        # (fewer only at EOF).
        self.dbuf = t[size:]
        return t[:size]
1060
e4e5d0b8 1061
    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.

        Decryption layer: reads ciphertext from self.fileobj in portions no
        larger than the current encrypted object (self.remainder tracks the
        distance to its end), decrypts, and finalizes/verifies each object
        at its boundary. Raw surplus bytes are kept in self.buf.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        # Index into t up to which all data belongs to successfully
        # finalized (GCM-verified) crypto objects.
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail if
                        # the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                # Tolerant modes: abandon the broken crypto context.
                self.encryption.drop ()
                if self.tolerance == TOLERANCE_RECOVER:
                    if good_crypto == 0:
                        raise
                    # this may occur at any of the three crypto operations above.
                    # some objects did validate; discard all data after it; next
                    # call will start with the bad object and error out immediately
                    self.buf = b"".join (t [good_crypto:])
                    return b"".join (t [:good_crypto])
                elif self.tolerance == TOLERANCE_RESCUE:
                    # keep what we have so far despite the finalization issue
                    t.append (buf)
                    c += len (buf)
                    break
                else:
                    raise RuntimeError("internal error: bad tolerance level")

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # Surplus beyond `size` is buffered for the next call.
        self.buf = t[size:]

        return t[:size]
7d372216 1130
7584f5c9
ERE
1131
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').

    The first block is read eagerly so its magic bytes can be sniffed;
    the first read() call replays that block, after which reads go
    straight to the underlying file object.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # Replay the sniffed block once, then rebind read() to the raw
        # file object's read for all subsequent calls.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the buffered magic bytes."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1158
7584f5c9
ERE
1159#------------------------
1160# Extraction file object
1161#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
    provides a part of its data as an individual file
    object.

    The region [offset, offset+size) of `fileobj` is exposed as a file of
    length `size`. `blockinfo` optionally describes sparse members as a
    list of (offset, size) data extents; gaps between extents read back
    as NUL bytes.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset        # start of the member's data in fileobj
        self.size = size            # logical size of this pseudo-file
        self.position = 0           # current logical read position
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            # Non-sparse member: one contiguous data extent.
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry is (is_data, logical_start, logical_stop, real_offset);
        # real_offset is None for zero-filled holes.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                # Hole before this extent.
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            # Trailing hole up to the logical end of the member.
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        # Read-only object: nothing to flush.
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

        The resulting position is clamped to [0, self.size].
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.

        Walks the extent map, serving data extents from the underlying
        file object and holes as NUL bytes.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry covering the current position; the scan
            # wraps around since reads usually progress monotonically.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        # Fill buffer `b` in place; returns the number of bytes stored.
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
#class _FileInFile
7584f5c9 1262
be60ffd0
ERE
1263
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member.

    Wraps the member's data region (including sparse maps) in a
    _FileInFile and layers standard buffered-reader behavior on top.
    """

    def __init__(self, tarfile, tarinfo):
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
#class ExFileObject
1271
1272#------------------
1273# Exported Classes
1274#------------------
1275class TarInfo(object):
1276 """Informational class which holds the details about an
1277 archive member given by a tar header block.
1278 TarInfo objects are returned by TarFile.getmember(),
1279 TarFile.getmembers() and TarFile.gettarinfo() and are
1280 usually created internally.
1281 """
1282
be60ffd0
ERE
1283 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1284 "chksum", "type", "linkname", "uname", "gname",
1285 "devmajor", "devminor", "volume_offset",
1286 "offset", "offset_data", "pax_headers", "sparse",
1287 "tarfile", "_sparse_structs", "_link_target")
1288
7584f5c9
ERE
1289 def __init__(self, name=""):
1290 """Construct a TarInfo object. name is the optional name
1291 of the member.
1292 """
1293 self.name = name # member name
be60ffd0 1294 self.mode = 0o644 # file permissions
7584f5c9
ERE
1295 self.uid = 0 # user id
1296 self.gid = 0 # group id
1297 self.size = 0 # file size
1298 self.mtime = 0 # modification time
1299 self.chksum = 0 # header checksum
1300 self.type = REGTYPE # member type
1301 self.linkname = "" # link name
1302 self.uname = "" # user name
1303 self.gname = "" # group name
1304 self.devmajor = 0 # device major number
1305 self.devminor = 0 # device minor number
1306
1307 self.offset = 0 # the tar header starts here
1308 self.offset_data = 0 # the file's data starts here
0eb5048f
ERE
1309 self.volume_offset = 0 # the file's data corresponds with the data
1310 # starting at this position
7584f5c9 1311
be60ffd0 1312 self.sparse = None # sparse member information
7584f5c9
ERE
1313 self.pax_headers = {} # pax header information
1314
    # In pax headers the "name" and "linkname" fields are called
    # "path" and "linkpath"; expose both spellings on the object.
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    # Read/write alias: tarinfo.path <-> tarinfo.name.
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # Read/write alias: tarinfo.linkpath <-> tarinfo.linkname.
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        # Debug representation: class name, member name and object id.
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1331
be60ffd0 1332 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1333 """Return the TarInfo's attributes as a dictionary.
1334 """
1335 info = {
1336 "name": self.name,
be60ffd0 1337 "mode": self.mode & 0o7777,
7584f5c9
ERE
1338 "uid": self.uid,
1339 "gid": self.gid,
1340 "size": self.size,
1341 "mtime": self.mtime,
1342 "chksum": self.chksum,
1343 "type": self.type,
1344 "linkname": self.linkname,
1345 "uname": self.uname,
1346 "gname": self.gname,
1347 "devmajor": self.devmajor,
36a315a0 1348 "devminor": self.devminor,
0eb5048f
ERE
1349 "offset_data": self.offset_data,
1350 "volume_offset": self.volume_offset
7584f5c9
ERE
1351 }
1352
1353 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1354 info["name"] += "/"
1355
7584f5c9
ERE
1356 return info
1357
be60ffd0
ERE
1358 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1359 errors="surrogateescape"):
7584f5c9
ERE
1360 """Return a tar header as a string of 512 byte blocks.
1361 """
1362 info = self.get_info(encoding, errors)
1363
1364 if format == USTAR_FORMAT:
be60ffd0 1365 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1366 elif format == GNU_FORMAT:
be60ffd0 1367 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1368 elif format == PAX_FORMAT:
1369 return self.create_pax_header(info, encoding, errors)
1370 else:
1371 raise ValueError("invalid format")
1372
be60ffd0 1373 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1374 """Return the object as a ustar header block.
1375 """
1376 info["magic"] = POSIX_MAGIC
1377
1378 if len(info["linkname"]) > LENGTH_LINK:
1379 raise ValueError("linkname is too long")
1380
1381 if len(info["name"]) > LENGTH_NAME:
1382 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1383
be60ffd0 1384 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1385
be60ffd0 1386 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1387 """Return the object as a GNU header block sequence.
1388 """
1389 info["magic"] = GNU_MAGIC
1390
2f854e77
ERE
1391 if self.ismultivol():
1392 prefix = [
1393 itn(info.get("atime", 0), 12, GNU_FORMAT),
1394 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1395 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1396 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1397 ]
be60ffd0 1398 info['prefix'] = b"".join(prefix)
0eb5048f 1399 info['size'] = info['size'] - self.volume_offset
2f854e77 1400
be60ffd0 1401 buf = b""
7584f5c9 1402 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1403 buf += self._create_gnu_long_header(info["linkname"],
1404 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1405
1406 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1407 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1408 encoding, errors)
7584f5c9 1409
be60ffd0 1410 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1411
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            # Advertise only the portion stored in this volume.
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # Non-ASCII: must go into the pax extended header.
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                # Too long for the fixed-width ustar field.
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            # Octal fields hold at most (digits - 1) digits plus a NUL;
            # floats (e.g. fractional mtime) cannot be stored in octal.
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # The ustar fallback header is always ASCII with replacement, since
        # anything unrepresentable was moved into the pax records above.
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1464
1465 @classmethod
1466 def create_pax_global_header(cls, pax_headers):
1467 """Return the object as a pax global header block sequence.
1468 """
be60ffd0 1469 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1470
1471 def _posix_split_name(self, name):
1472 """Split a name longer than 100 chars into a prefix
1473 and a name part.
1474 """
1475 prefix = name[:LENGTH_PREFIX + 1]
1476 while prefix and prefix[-1] != "/":
1477 prefix = prefix[:-1]
1478
1479 name = name[len(prefix):]
1480 prefix = prefix[:-1]
1481
1482 if not prefix or len(name) > LENGTH_NAME:
1483 raise ValueError("name is too long")
1484 return prefix, name
1485
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.

        Fields are laid out per the ustar specification; stn/itn/sbtn
        serialize string, integer and binary-capable string fields to
        their fixed widths. The checksum is computed over the block with
        the checksum field set to spaces, then spliced in.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad the joined fields out to one 512-byte block.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Overwrite the 8-byte checksum field (offset 148) in place.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1513
1514 @staticmethod
1515 def _create_payload(payload):
1516 """Return the string payload filled with zero bytes
1517 up to the next 512 byte border.
1518 """
1519 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1520 if remainder > 0:
1521 payload += (BLOCKSIZE - remainder) * NUL
1522 return payload
1523
1524 @classmethod
be60ffd0 1525 def _create_gnu_long_header(cls, name, type, encoding, errors):
7584f5c9
ERE
1526 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1527 for name.
1528 """
be60ffd0 1529 name = name.encode(encoding, errors) + NUL
7584f5c9
ERE
1530
1531 info = {}
1532 info["name"] = "././@LongLink"
1533 info["type"] = type
1534 info["size"] = len(name)
1535 info["magic"] = GNU_MAGIC
1536
1537 # create extended header + name blocks.
be60ffd0 1538 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
7584f5c9
ERE
1539 cls._create_payload(name)
1540
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "%d %s=%s\n" where the leading decimal length
            # counts the whole record *including itself*; iterate to the
            # fixed point since adding digits can change the length.
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1591
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on what is wrong with the block.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            # An all-zero block marks the end of the archive.
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Decode the fixed-offset ustar fields.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # NOTE(review): delta-tar extension — for GNU-typed members the
            # prefix area presumably carries the multivolume data offset at
            # bytes 369:381; confirm against create_gnu_header().
            obj.offset_data = nti(buf[369:381])
        return obj
1656
1657 @classmethod
1658 def fromtarfile(cls, tarfile):
1659 """Return the next TarInfo object from TarFile object
1660 tarfile.
1661 """
1662 buf = tarfile.fileobj.read(BLOCKSIZE)
be60ffd0 1663 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1664 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1665 return obj._proc_member(tarfile)
1666
1667 #--------------------------------------------------------------------------
1668 # The following are methods that are called depending on the type of a
1669 # member. The entry point is _proc_member() which can be overridden in a
1670 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1671 # implement the following
1672 # operations:
1673 # 1. Set self.offset_data to the position where the data blocks begin,
1674 # if there is data that follows.
1675 # 2. Set tarfile.offset to the position where the next member's header will
1676 # begin.
1677 # 3. Return self or another valid TarInfo object.
1678 def _proc_member(self, tarfile):
1679 """Choose the right processing method depending on
1680 the type and call it.
1681 """
1682 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1683 return self._proc_gnulong(tarfile)
1684 elif self.type == GNUTYPE_SPARSE:
1685 return self._proc_sparse(tarfile)
1686 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1687 return self._proc_pax(tarfile)
1688 else:
1689 return self._proc_builtin(tarfile)
1690
1691 def _proc_builtin(self, tarfile):
1692 """Process a builtin type or an unknown type which
1693 will be treated as a regular file.
1694 """
1695 self.offset_data = tarfile.fileobj.tell()
1696 offset = self.offset_data
00c34a12 1697 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
7584f5c9
ERE
1698 # Skip the following data blocks.
1699 offset += self._block(self.size)
1700 tarfile.offset = offset
1701
1702 # Patch the TarInfo object with saved global
1703 # header information.
1704 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1705
1706 return self
1707
1708 def _proc_gnulong(self, tarfile):
1709 """Process the blocks that hold a GNU longname
1710 or longlink member.
1711 """
1712 buf = tarfile.fileobj.read(self._block(self.size))
1713
1714 # Fetch the next header and process it.
1715 try:
1716 next = self.fromtarfile(tarfile)
1717 except HeaderError:
1718 raise SubsequentHeaderError("missing or bad subsequent header")
1719
1720 # Patch the TarInfo object from the next header with
1721 # the longname information.
1722 next.offset = self.offset
1723 if self.type == GNUTYPE_LONGNAME:
be60ffd0 1724 next.name = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9 1725 elif self.type == GNUTYPE_LONGLINK:
be60ffd0 1726 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
7584f5c9
ERE
1727
1728 return next
1729
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.

        Extends the sparse extent list started in frombuf() with any
        continuation blocks, then restores the member's real (original)
        size.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each continuation block holds up to 21 (offset, numbytes)
            # pairs of 12 octal digits each.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags yet another continuation block.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # self.size was the on-disk (compacted) size; report the logical one.
        self.size = origsize
        return self
1757
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
        POSIX.1-2008.

        Parses the keyword/value records, fetches the member they apply
        to, and patches that member (extended) or the archive-wide
        defaults (global). Also honors delta-tar's GNU.volume.* keywords
        for multivolume members.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length


        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        if next is not None:
            # delta-tar multivolume support: the GNU.volume.* keywords
            # describe the continued member's true size and offset.
            if "GNU.volume.filename" in pax_headers:
                if pax_headers["GNU.volume.filename"] == next.name:
                    if "GNU.volume.size" in pax_headers:
                        next.size = int(pax_headers["GNU.volume.size"])
                    if "GNU.volume.offset" in pax_headers:
                        next.volume_offset = int(pax_headers["GNU.volume.offset"])

                # NOTE(review): when self.type == XGLTYPE, pax_headers *is*
                # tarfile.pax_headers, so deleting from the latter while
                # iterating the former mutates the dict under iteration —
                # confirm this path is only reached for extended headers.
                for key in pax_headers.keys():
                    if key.startswith("GNU.volume"):
                        del tarfile.pax_headers[key]

        return next
1872
be60ffd0
ERE
1873 def _proc_gnusparse_00(self, next, pax_headers, buf):
1874 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1875 """
be60ffd0
ERE
1876 offsets = []
1877 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1878 offsets.append(int(match.group(1)))
1879 numbytes = []
1880 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1881 numbytes.append(int(match.group(1)))
1882 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1883
be60ffd0
ERE
1884 def _proc_gnusparse_01(self, next, pax_headers):
1885 """Process a GNU tar extended sparse header, version 0.1.
1886 """
1887 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1888 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1889
be60ffd0
ERE
1890 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1891 """Process a GNU tar extended sparse header, version 1.0.
1892 """
1893 fields = None
1894 sparse = []
1895 buf = tarfile.fileobj.read(BLOCKSIZE)
1896 fields, buf = buf.split(b"\n", 1)
1897 fields = int(fields)
1898 while len(sparse) < fields * 2:
1899 if b"\n" not in buf:
1900 buf += tarfile.fileobj.read(BLOCKSIZE)
1901 number, buf = buf.split(b"\n", 1)
1902 sparse.append(int(number))
1903 next.offset_data = tarfile.fileobj.tell()
1904 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1905
be60ffd0
ERE
1906 def _apply_pax_info(self, pax_headers, encoding, errors):
1907 """Replace fields with supplemental information from a previous
1908 pax extended or global header.
1909 """
1910 for keyword, value in pax_headers.items():
1911 if keyword == "GNU.sparse.name":
1912 setattr(self, "path", value)
1913 elif keyword == "GNU.sparse.size":
1914 setattr(self, "size", int(value))
1915 elif keyword == "GNU.sparse.realsize":
1916 setattr(self, "size", int(value))
1917 elif keyword in PAX_FIELDS:
1918 if keyword in PAX_NUMBER_FIELDS:
1919 try:
1920 value = PAX_NUMBER_FIELDS[keyword](value)
1921 except ValueError:
1922 value = 0
1923 if keyword == "path":
f0287fb7 1924 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1925 setattr(self, keyword, value)
7584f5c9
ERE
1926
1927 self.pax_headers = pax_headers.copy()
1928
be60ffd0
ERE
1929 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1930 """Decode a single field from a pax record.
1931 """
1932 try:
1933 return value.decode(encoding, "strict")
1934 except UnicodeDecodeError:
1935 return value.decode(fallback_encoding, fallback_errors)
1936
7584f5c9
ERE
1937 def _block(self, count):
1938 """Round up a byte count by BLOCKSIZE and return it,
1939 e.g. _block(834) => 1024.
1940 """
1941 blocks, remainder = divmod(count, BLOCKSIZE)
1942 if remainder:
1943 blocks += 1
1944 return blocks * BLOCKSIZE
1945
    # Convenience predicates classifying this member by its `type' field.
    def isreg(self):
        # True for any regular-file member type.
        return self.type in REGULAR_TYPES
    def isfile(self):
        # Alias for isreg().
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        # A sparse map is attached by the _proc_gnusparse_* helpers.
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        # A member continues in another volume if it carries the GNU
        # multivolume type, a nonzero volume offset, or the pax keyword.
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
            "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1969# class TarInfo
1970
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2015
7584f5c9
ERE
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        `max_volume_size' (multivolume support) requires a callable
        `new_volume_handler'; `concat' selects concat archive mode and
        `nacl' carries the encryption context — TODO confirm semantics
        against the _Stream implementation (not visible here).
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        # Map the logical mode onto the binary file-open mode.
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        # NOTE(review): the next assignment is immediately overwritten by
        # the combined base_name/name assignment below; kept for fidelity.
        self.name = os.path.abspath(name) if name else None
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes: explicit arguments override the class defaults.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        # Global pax headers are only honoured for PAX_FORMAT archives.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        # A volume must at least hold one data block plus the two
        # end-of-archive zero blocks.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Prefetch the first member so next() has something to hand out.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any failure, close a file we opened ourselves and mark
            # the object closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2130
7584f5c9
ERE
2131 #--------------------------------------------------------------------------
2132 # Below are the classmethods which act as alternate constructors to the
2133 # TarFile class. The open() method is the only one that is needed for
2134 # public use; it is the "super"-constructor and is able to select an
2135 # adequate "sub"-constructor for a particular compression using the mapping
2136 # from OPEN_METH.
2137 #
2138 # This concept allows one to subclass TarFile without losing the comfort of
2139 # the super-constructor. A sub-constructor is registered and made available
2140 # by adding it to the mapping in OPEN_METH.
2141
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing

        `encryption' and `tolerance' only apply to the '#' (concat) modes.
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    # Remember the position so a failed probe can rewind.
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            # Multivolume with whole-file compression only compresses the
            # first volume; warn the caller about it.
            if 'max_volume_size' in kwargs:
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            # The _Stream belongs to us, so close() must close it.
            t._extfileobj = False
            return t

        elif "#" in mode:
            # Concat archive mode: every member is its own compressed /
            # encrypted object inside the stream.
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2256
d39d4cbf
PG
2257
    @classmethod
    def open_at_offset(cls, offset, *a, **kwa):
        """
        Same as ``.open()``, but start reading at the given offset.  Assumes
        a seekable file object passed via the ``fileobj`` keyword; when no
        ``fileobj`` is supplied the offset is silently ignored.

        NOTE(review): errors raised by ``.open()`` propagate unchanged —
        nothing here returns *None* on failure.
        """
        fileobj = kwa.get ("fileobj")
        if fileobj is not None:
            fileobj.seek (offset)

        return cls.open (*a, **kwa)
2270
2271
7584f5c9
ERE
2272 @classmethod
2273 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2274 """Open uncompressed tar archive name for reading or writing.
2275 """
2276 if len(mode) > 1 or mode not in "raw":
2277 raise ValueError("mode must be 'r', 'a' or 'w'")
2278 return cls(name, mode, fileobj, **kwargs)
2279
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            # Only close a GzipFile we created ourselves; a pre-existing
            # fileobj stays open for the caller.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                raise
            raise ReadError("not a gzip file")
        except:
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2310
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        # BZ2File wraps either the supplied file object or the named file.
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
        return t
2334
be60ffd0
ERE
    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import lzma
        except ImportError:
            raise CompressionError("lzma module is not available")

        # LZMAFile wraps either the supplied file object or the named file.
        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            fileobj.close()
            raise ReadError("not an lzma file")
        t._extfileobj = False
        return t
2357
7584f5c9
ERE
    # All *open() methods are registered here; open() dispatches through
    # this mapping when probing or when an explicit compression is named.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
2365
2366 #--------------------------------------------------------------------------
2367 # The public methods which TarFile provides:
2368
    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
        appended to the archive. A special case are empty archives which are
        initialized accordingly so the two mandatory blocks of zeros are
        written abiding by the requested encryption and compression settings.
        """
        if self.closed:
            return

        if self.mode in "aw":
            # Empty concat archive: open an (unnamed) object first so the
            # trailing zero blocks pass through the stream's encryption /
            # compression machinery.
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
2391
2392 def getmember(self, name):
2393 """Return a TarInfo object for member `name'. If `name' can not be
2394 found in the archive, KeyError is raised. If a member occurs more
2395 than once in the archive, its last occurrence is assumed to be the
2396 most up-to-date version.
2397 """
2398 tarinfo = self._getmember(name)
2399 if tarinfo is None:
2400 raise KeyError("filename %r not found" % name)
2401 return tarinfo
2402
2403 def getmembers(self):
2404 """Return the members of the archive as a list of TarInfo objects. The
2405 list has the same order as the members in the archive.
2406 """
2407 self._check()
2408 if not self._loaded: # if we want to obtain a list of
2409 self._load() # all members, we first have to
2410 # scan the whole archive.
2411 return self.members
2412
ad4402e8
ERE
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.
        """
        # last_block_offset is recorded by addfile() when the member's
        # header block is written.
        return self.last_block_offset
ad4402e8 2419
7584f5c9
ERE
2420 def getnames(self):
2421 """Return the members of the archive as a list of their names. It has
2422 the same order as the list returned by getmembers().
2423 """
2424 return [tarinfo.name for tarinfo in self.getmembers()]
2425
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.

        Returns None for file types that cannot be archived (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): nothing to archive.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names through the per-class caches to
        # avoid repeated getpwuid/getgrgid calls.
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2535
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                # Device members show major,minor instead of a size.
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
7584f5c9 2564
be60ffd0 2565 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
7584f5c9
ERE
2566 """Add the file `name' to the archive. `name' may be any type of file
2567 (directory, fifo, symbolic link, etc.). If given, `arcname'
2568 specifies an alternative name for the file in the archive.
2569 Directories are added recursively by default. This can be avoided by
2570 setting `recursive' to False. `exclude' is a function that should
2571 return True for each filename to be excluded. `filter' is a function
2572 that expects a TarInfo object argument and returns the changed
2573 TarInfo object, if it returns None the TarInfo object will be
2574 excluded from the archive.
2575 """
2576 self._check("aw")
2577
2578 if arcname is None:
2579 arcname = name
2580
2581 # Exclude pathnames.
2582 if exclude is not None:
2583 import warnings
2584 warnings.warn("use the filter argument instead",
2585 DeprecationWarning, 2)
2586 if exclude(name):
2587 self._dbg(2, "tarfile: Excluded %r" % name)
2588 return
2589
2590 # Skip if somebody tries to archive the archive...
2591 if self.name is not None and os.path.abspath(name) == self.name:
2592 self._dbg(2, "tarfile: Skipped %r" % name)
2593 return
2594
2595 self._dbg(1, name)
2596
2597 # Create a TarInfo object from the file.
2598 tarinfo = self.gettarinfo(name, arcname)
2599
2600 if tarinfo is None:
2601 self._dbg(1, "tarfile: Unsupported type %r" % name)
2602 return
2603
2604 # Change or exclude the TarInfo object.
2605 if filter is not None:
2606 tarinfo = filter(tarinfo)
2607 if tarinfo is None:
2608 self._dbg(2, "tarfile: Excluded %r" % name)
2609 return
2610
2611 # Append the tar header and data to the archive.
2612 if tarinfo.isreg():
2613 with bltn_open(name, "rb") as f:
2614 self.addfile(tarinfo, f)
2615
2616 elif tarinfo.isdir():
2617 self.addfile(tarinfo)
2618 if recursive:
2619 for f in os.listdir(name):
2620 self.add(os.path.join(name, f), os.path.join(arcname, f),
be60ffd0 2621 recursive, exclude, filter=filter)
7584f5c9
ERE
2622
2623 else:
2624 self.addfile(tarinfo)
2625
defc9a22 2626 def _size_left_file(self):
be60ffd0 2627 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2628
be60ffd0 2629 Assumes self.max_volume_size is set.
ba5a449e 2630 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2631 """
ba5a449e 2632 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2633 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2634 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2635 # write only half a block when writting the end of a volume
ae48acc8 2636 # and filling with zeros
defc9a22
CH
2637 return BLOCKSIZE * (size_left // BLOCKSIZE)
2638
2639 def _size_left_stream(self):
ba5a449e
CH
2640 """ Calculates size left in a volume if using comression/encryption
2641
2642 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2643 (otherwise use _size_left_file)
2644 """
2645 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2646 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2647 - 2*BLOCKSIZE
2648 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2649
7584f5c9
ERE
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, tarinfo.size bytes are read from it and added to the archive.
           You can create TarInfo objects using gettarinfo().
           On Windows platforms, `fileobj' should always be opened with mode
           'rb' to avoid irritation about the file size.

           When `self.max_volume_size' is set, the data may be split across
           several volumes; `self.new_volume_handler' is invoked at each
           volume boundary and the member continues as GNUTYPE_MULTIVOL.
        """
        self._check("aw")

        # work on a private copy: we mutate type/volume_offset below
        tarinfo = copy.copy(tarinfo)

        if self.arcmode & ARCMODE_CONCAT:
            # concat mode: the stream starts a new encrypted/compressed object
            self.last_block_offset = self.fileobj.next (tarinfo.name)
        else:
            self.last_block_offset = self.fileobj.tell()

        # write the member header
        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # choose how to measure remaining room in the current volume
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream
            else:
                _size_left = self._size_left_file
        else:
            # unlimited volume: pretend exactly the member's size is left
            _size_left = lambda: tarinfo.size

        # If there's no data to follow, finish
        if not fileobj:
            if self.save_to_members:
                self.members.append(tarinfo)
            return

        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0

        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE

        # loop over multiple volumes
        while source_size_left > 0:

            # Write as much data as possible from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)

            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)

            # now target_size_left == 0 or source_size_left == 0

            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                # padding below.
                tarinfo.type = GNUTYPE_MULTIVOL

                if not self.new_volume_handler or\
                    not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")


                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1

                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left
                self.volume_tarinfo = tarinfo

                # the “new_volume_handler” is supposed to call .close() on the
                # “fileobj” _Stream
                self.new_volume_handler(self, self.base_name, self.volume_number)

                self.volume_tarinfo = None

                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)

                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)

                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')

        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
        if remainder > 0:
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))
            self.offset += BLOCKSIZE - remainder

        if self.save_to_members:
            self.members.append(tarinfo)
7584f5c9 2763
    def open_volume(self, name="", fileobj=None, encryption=None):
        '''
        Called by the user to change this tar file to point to a new volume.

        `name' is the path of the new volume; alternatively an open
        `fileobj' may be supplied.  `encryption' optionally overrides the
        encryption context of the previous stream.  Resets the member
        cache and, for PAX archives in write modes, emits a global header
        describing the member continued from the previous volume
        (self.volume_tarinfo, set by addfile()).
        '''

        # open the file using either fileobj or name
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            self._extfileobj = False

            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                # clone the previous stream's parameters so compression and
                # encryption carry over to the new volume
                fileobj = _Stream(name=name,
                            mode=self.fileobj.mode,
                            comptype=self.fileobj.comptype,
                            fileobj=None,
                            bufsize=self.fileobj.bufsize,
                            encryption=encryption or self.fileobj.encryption,
                            concat=self.fileobj.arcmode & ARCMODE_CONCAT,
                            tolerance=self.fileobj.tolerance)
            else:
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        # release the previous volume before switching over
        self.fileobj.close()
        self.fileobj = fileobj

        # init data structures
        self.closed = False
        self.members = [] # list of members as TarInfo objects
        self._loaded = False # flag if all members have been read
        self.offset = self.fileobj.tell()
        # current position in the archive file
        self.inodes = {} # dictionary caching the inodes of
        # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.format == PAX_FORMAT:
                    # record where the continued member resumes so a reader
                    # of this volume alone can locate the data
                    volume_info = {
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    }

                    self.pax_headers.update(volume_info)

                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            # leave the object in a consistent closed state on any failure
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2852
c650acfa 2853 def extractall(self, path=".", members=None, filter=None, unlink=False):
7584f5c9
ERE
2854 """Extract all members from the archive to the current working
2855 directory and set owner, modification time and permissions on
2856 directories afterwards. `path' specifies a different directory
2857 to extract to. `members' is optional and must be a subset of the
2858 list returned by getmembers().
2859 """
2860 directories = []
2861
2862 if members is None:
2863 members = self
2864
2865 for tarinfo in members:
c474439c
ERE
2866 if self.volume_number > 0 and tarinfo.ismultivol():
2867 continue
2868
974408b5 2869 if filter and not filter(tarinfo):
e5f5681b
ERE
2870 continue
2871
7584f5c9
ERE
2872 if tarinfo.isdir():
2873 # Extract directories with a safe mode.
2874 directories.append(tarinfo)
2875 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2876 tarinfo.mode = 0o0700
2877 # Do not set_attrs directories, as we will do that further down
c650acfa 2878 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), unlink=unlink)
7584f5c9
ERE
2879
2880 # Reverse sort directories.
be60ffd0 2881 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2882 directories.reverse()
2883
2884 # Set correct owner, mtime and filemode on directories.
2885 for tarinfo in directories:
2886 dirpath = os.path.join(path, tarinfo.name)
2887 try:
2888 self.chown(tarinfo, dirpath)
2889 self.utime(tarinfo, dirpath)
2890 self.chmod(tarinfo, dirpath)
be60ffd0 2891 except ExtractError as e:
7584f5c9
ERE
2892 if self.errorlevel > 1:
2893 raise
2894 else:
2895 self._dbg(1, "tarfile: %s" % e)
2896
c650acfa
PG
2897 def extract(self, member, path="", set_attrs=True, symlink_cb=None,
2898 unlink=False):
7584f5c9
ERE
2899 """Extract a member from the archive to the current working directory,
2900 using its full name. Its file information is extracted as accurately
2901 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2902 specify a different directory using `path'. File attributes (owner,
2903 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2904 ``symlink_cb`` is a hook accepting a function that is passed the
2905 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2906 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2907 passed will be applied, skipping the actual extraction. In case the
2908 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2909 """
2910 self._check("r")
2911
be60ffd0 2912 if isinstance(member, str):
7584f5c9
ERE
2913 tarinfo = self.getmember(member)
2914 else:
2915 tarinfo = member
2916
2917 # Prepare the link target for makelink().
2918 if tarinfo.islnk():
2919 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2920
9b13f5c4 2921 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2922 return symlink_cb(member, path, set_attrs)
786addd6 2923
7584f5c9 2924 try:
be60ffd0 2925 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
c650acfa 2926 set_attrs=set_attrs, unlink=unlink)
be60ffd0 2927 except EnvironmentError as e:
7584f5c9
ERE
2928 if self.errorlevel > 0:
2929 raise
2930 else:
2931 if e.filename is None:
2932 self._dbg(1, "tarfile: %s" % e.strerror)
2933 else:
2934 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2935 except ExtractError as e:
7584f5c9
ERE
2936 if self.errorlevel > 1:
2937 raise
2938 else:
2939 self._dbg(1, "tarfile: %s" % e)
2940
2941 def extractfile(self, member):
2942 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2943 a filename or a TarInfo object. If `member' is a regular file or a
2944 link, an io.BufferedReader object is returned. Otherwise, None is
2945 returned.
7584f5c9
ERE
2946 """
2947 self._check("r")
2948
be60ffd0 2949 if isinstance(member, str):
7584f5c9
ERE
2950 tarinfo = self.getmember(member)
2951 else:
2952 tarinfo = member
2953
be60ffd0
ERE
2954 if tarinfo.isreg() or tarinfo.ismultivol() or\
2955 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2956 # If a member's type is unknown, it is treated as a
2957 # regular file.
2958 return self.fileobject(self, tarinfo)
2959
2960 elif tarinfo.islnk() or tarinfo.issym():
2961 if isinstance(self.fileobj, _Stream):
2962 # A small but ugly workaround for the case that someone tries
2963 # to extract a (sym)link as a file-object from a non-seekable
2964 # stream of tar blocks.
2965 raise StreamError("cannot extract (sym)link as file object")
2966 else:
2967 # A (sym)link's file object is its target's file object.
2968 return self.extractfile(self._find_link_target(tarinfo))
2969 else:
2970 # If there's no data associated with the member (directory, chrdev,
2971 # blkdev, etc.), return None instead of a file object.
2972 return None
2973
c650acfa 2974 def _extract_member(self, tarinfo, targetpath, set_attrs=True, unlink=False):
7584f5c9
ERE
2975 """Extract the TarInfo object tarinfo to a physical
2976 file called targetpath.
2977 """
2978 # Fetch the TarInfo object for the given name
2979 # and build the destination pathname, replacing
2980 # forward slashes to platform specific separators.
2981 targetpath = targetpath.rstrip("/")
2982 targetpath = targetpath.replace("/", os.sep)
2983
2984 # Create all upper directories.
2985 upperdirs = os.path.dirname(targetpath)
2986 if upperdirs and not os.path.exists(upperdirs):
2987 # Create directories that are not part of the archive with
2988 # default permissions.
2989 os.makedirs(upperdirs)
2990
2991 if tarinfo.islnk() or tarinfo.issym():
2992 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2993 else:
2994 self._dbg(1, tarinfo.name)
2995
c650acfa
PG
2996 if unlink is True:
2997 _unlinkfirst(targetpath)
2998
7584f5c9
ERE
2999 if tarinfo.isreg():
3000 self.makefile(tarinfo, targetpath)
3001 elif tarinfo.isdir():
3002 self.makedir(tarinfo, targetpath)
3003 elif tarinfo.isfifo():
3004 self.makefifo(tarinfo, targetpath)
3005 elif tarinfo.ischr() or tarinfo.isblk():
3006 self.makedev(tarinfo, targetpath)
3007 elif tarinfo.islnk() or tarinfo.issym():
3008 self.makelink(tarinfo, targetpath)
3009 elif tarinfo.type not in SUPPORTED_TYPES:
3010 self.makeunknown(tarinfo, targetpath)
3011 else:
3012 self.makefile(tarinfo, targetpath)
3013
be60ffd0
ERE
3014 if set_attrs:
3015 self.chown(tarinfo, targetpath)
3016 if not tarinfo.issym():
3017 self.chmod(tarinfo, targetpath)
3018 self.utime(tarinfo, targetpath)
7584f5c9
ERE
3019
3020 #--------------------------------------------------------------------------
3021 # Below are the different file methods. They are called via
3022 # _extract_member() when extract() is called. They can be replaced in a
3023 # subclass to implement other functionality.
3024
3025 def makedir(self, tarinfo, targetpath):
3026 """Make a directory called targetpath.
3027 """
3028 try:
3029 # Use a safe mode for the directory, the real mode is set
3030 # later in _extract_member().
be60ffd0
ERE
3031 os.mkdir(targetpath, 0o0700)
3032 except FileExistsError:
3033 pass
7584f5c9
ERE
3034
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.

           Copies tarinfo.size bytes from the archive starting at
           tarinfo.offset_data.  If the copy hits an OSError and a
           new_volume_handler is installed, the next volume is opened and
           the copy restarts from that volume's first member.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        decrypt = False  # NOTE(review): never read afterwards — looks vestigial
        iterate = True
        target = bltn_open(targetpath, "wb")

        if tarinfo.sparse is not None:
            # sparse member: write only the recorded data extents, then
            # truncate to the logical size
            try:
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
                target.seek(tarinfo.size)
                target.truncate()
            finally:
                target.close()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                # continue with the continuation member at the head of the
                # freshly opened volume
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
            finally:
                # keep the target open across volume switches; close it only
                # once the copy has completed
                if iterate is False: target.close()
c474439c 3076
7584f5c9
ERE
3077
3078 def makeunknown(self, tarinfo, targetpath):
3079 """Make a file from a TarInfo object with an unknown type
3080 at targetpath.
3081 """
3082 self.makefile(tarinfo, targetpath)
3083 self._dbg(1, "tarfile: Unknown file type %r, " \
3084 "extracted as regular file." % tarinfo.type)
3085
3086 def makefifo(self, tarinfo, targetpath):
3087 """Make a fifo called targetpath.
3088 """
3089 if hasattr(os, "mkfifo"):
3090 os.mkfifo(targetpath)
3091 else:
3092 raise ExtractError("fifo not supported by system")
3093
3094 def makedev(self, tarinfo, targetpath):
3095 """Make a character or block device called targetpath.
3096 """
3097 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3098 raise ExtractError("special devices not supported by system")
3099
3100 mode = tarinfo.mode
3101 if tarinfo.isblk():
3102 mode |= stat.S_IFBLK
3103 else:
3104 mode |= stat.S_IFCHR
3105
3106 os.mknod(targetpath, mode,
3107 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3108
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
           (platform limitation), we try to make a copy of the referenced file
           instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract().
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # hard-link target not on disk: extract the archived
                    # target member in its place
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # platform cannot create the link: fall back to copying the
            # referenced member's contents
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
3131
3132 def chown(self, tarinfo, targetpath):
3133 """Set owner of targetpath according to tarinfo.
3134 """
3135 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3136 # We have to be root to do so.
3137 try:
3138 g = grp.getgrnam(tarinfo.gname)[2]
3139 except KeyError:
3140 g = tarinfo.gid
3141 try:
3142 u = pwd.getpwnam(tarinfo.uname)[2]
3143 except KeyError:
3144 u = tarinfo.uid
3145 try:
3146 if tarinfo.issym() and hasattr(os, "lchown"):
3147 os.lchown(targetpath, u, g)
3148 else:
be60ffd0
ERE
3149 os.chown(targetpath, u, g)
3150 except OSError as e:
7584f5c9
ERE
3151 raise ExtractError("could not change owner")
3152
3153 def chmod(self, tarinfo, targetpath):
3154 """Set file permissions of targetpath according to tarinfo.
3155 """
3156 if hasattr(os, 'chmod'):
3157 try:
3158 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3159 except OSError as e:
7584f5c9
ERE
3160 raise ExtractError("could not change mode")
3161
3162 def utime(self, tarinfo, targetpath):
3163 """Set modification time of targetpath according to tarinfo.
3164 """
3165 if not hasattr(os, 'utime'):
3166 return
3167 try:
3168 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3169 except OSError as e:
7584f5c9
ERE
3170 raise ExtractError("could not change modification time")
3171
3172 #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        # a member pre-read by open()/open_volume() takes precedence
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # all-zero block: end marker, unless we tolerate zeros
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # garbage at the very start: not a tar file at all
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                # a later header is broken: the archive is corrupt
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
        else:
            # no further members: mark the archive fully parsed
            self._loaded = True

        return tarinfo
3219
3220 #--------------------------------------------------------------------------
3221 # Little helper methods:
3222
3223 def _getmember(self, name, tarinfo=None, normalize=False):
3224 """Find an archive member by name from bottom to top.
3225 If tarinfo is given, it is used as the starting point.
3226 """
3227 # Ensure that all members have been loaded.
3228 members = self.getmembers()
3229
3230 # Limit the member search list up to tarinfo.
3231 if tarinfo is not None:
3232 members = members[:members.index(tarinfo)]
3233
3234 if normalize:
3235 name = os.path.normpath(name)
3236
3237 for member in reversed(members):
3238 if normalize:
3239 member_name = os.path.normpath(member.name)
3240 else:
3241 member_name = member.name
3242
3243 if name == member_name:
3244 return member
3245
    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        # exhaust next() so that self.members caches every entry
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True
3255
    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
            raise OSError("%s is closed" % self.__class__.__name__)
        # `mode' is a string of acceptable mode characters, e.g. "aw"
        if mode is not None and self.mode not in mode:
            raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3264
3265 def _find_link_target(self, tarinfo):
3266 """Find the target member of a symlink or hardlink member in the
3267 archive.
3268 """
3269 if tarinfo.issym():
3270 # Always search the entire archive.
3271 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3272 limit = None
3273 else:
3274 # Search the archive before the link, because a hard link is
3275 # just a reference to an already archived file.
3276 linkname = tarinfo.linkname
3277 limit = tarinfo
3278
3279 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3280 if member is None:
3281 raise KeyError("linkname %r not found" % linkname)
3282 return member
3283
3284 def __iter__(self):
3285 """Provide an iterator object.
3286 """
3287 if self._loaded:
3288 return iter(self.members)
3289 else:
3290 return TarIter(self)
3291
1027433a 3292 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3293 """Write debugging output to sys.stderr.
3294 """
3295 if level <= self.debug:
1027433a 3296 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3297
3298 def __enter__(self):
3299 self._check()
3300 return self
3301
3302 def __exit__(self, type, value, traceback):
3303 if type is None:
3304 self.close()
3305 else:
3306 # An exception occurred. We must not call close() because
3307 # it would try to write end-of-archive blocks and padding.
3308 if not self._extfileobj:
3309 self.fileobj.close()
3310 self.closed = True
c650acfa
PG
3311
3312def _unlinkfirst(targetpath):
3313 try:
3314 os.unlink(targetpath)
3315 except OSError as e:
3316 if e.errno == errno.ENOENT or e.errno == errno.EISDIR:
3317 pass
3318
3319
7584f5c9
ERE
3320# class TarFile
3321
class TarIter:
    """Iterator over the members of a TarFile.

    for tarinfo in TarFile(...):
        suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        tf = self.tarfile
        # Fix for SF #1100429: getmembers() may be called during iteration
        # and grow the member cache; consult the cache before reading more.
        if self.index == 0 and tf.firstmember is not None:
            tarinfo = tf.next()
        elif self.index < len(tf.members):
            tarinfo = tf.members[self.index]
        elif not tf._loaded:
            tarinfo = tf.next()
            if not tarinfo:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
3360
6690f5e0
PG
3361#---------------------------------------------------------
3362# support functionality for rescue mode
3363#---------------------------------------------------------
3364
8fc6040c
PG
# struct format for one complete old-GNU tar header block; the field
# widths below sum to exactly 512 bytes (BLOCKSIZE).  All fields are raw
# byte strings, so the "<" prefix matters only for suppressing alignment.
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100]; /* 100 */
    "8s"   # ← char mode[8]; /* 108 */
    "8s"   # ← char uid[8]; /* 116 */
    "8s"   # ← char gid[8]; /* 124 */
    "12s"  # ← char size[12]; /* 136 */
    "12s"  # ← char mtime[12]; /* 148 */
    "8s"   # ← char checksum[8]; /* 156 */
    "B"    # ← char typeflag[1]; /* 157 */
    "100s" # ← char linkname[100]; /* 257 */
    "6s"   # ← char magic[6]; /* 263 */
    "2s"   # ← char version[2]; /* 265 */
    "32s"  # ← char uname[32]; /* 297 */
    "32s"  # ← char gname[32]; /* 329 */
    "8s"   # ← char devmajor[8]; /* 337 */
    "8s"   # ← char devminor[8]; /* 345 */
    "12s"  # ← char atime[12]; /* 357 */
    "12s"  # ← char ctime[12]; /* 369 */
    "12s"  # ← char offset[12]; /* 381 */
    "4s"   # ← char longnames[4]; /* 385 */
    "B"    # ← char unused[1]; /* 386 */
    ""     #   struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     #   } sparse[4]; /* 482 */
    "B"    # ← char isextended[1]; /* 483 */
    "12s"  # ← char realsize[12]; /* 495 */
    "17s"  # ← char pad[17]; /* 512 */
)
3401
3402# The “magic” and “version” fields are special:
3403#
3404# tar(5)
3405# magic The magic field holds the five characters “ustar” followed by a
3406# space. Note that POSIX ustar archives have a trailing null.
3407#
3408# however, “tar.h”:
3409#
3410# /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3411# Found in an archive, it indicates an old GNU header format, which will be
3412# hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3413# valid, though the header is not truly POSIX conforming. */
3414#
3415#
# byte offset of the “magic” field within a tar header block (see the
# struct layout in TAR_FMT_HDR: linkname ends at 257)
TAR_HDR_OFF_MAGIC = 257
# old-GNU magic: “ustar” followed by a space, no trailing NUL (see above)
TAR_FMT_OLDGNU_MAGIC = b"ustar "
3418
def read_gnu_tar_hdr (data):
    """
    Parse one 512-byte block as an old-GNU tar header.

    :returns: *None* if *data* is not exactly one block, does not unpack,
              or does not carry the old-GNU magic; otherwise a dict of all
              header fields except the “unused” and “pad” filler.
    """
    if len (data) != BLOCKSIZE: # header requires one complete block
        return None

    try:
        fields = struct.unpack (TAR_FMT_HDR, data)
    except struct.error:
        return None

    # field names in the exact order of TAR_FMT_HDR
    names = \
        ( "name", "mode"
        , "uid", "gid"
        , "size", "mtime"
        , "checksum"
        , "typeflag"
        , "linkname"
        , "magic"
        , "version"
        , "uname"
        , "gname"
        , "devmajor"
        , "devminor"
        , "atime"
        , "ctime"
        , "offset"
        , "longnames"
        , "unused"
        , "offset1", "numbytes1"
        , "offset2", "numbytes2"
        , "offset3", "numbytes3"
        , "offset4", "numbytes4"
        , "isextended"
        , "realsize"
        , "pad"
        )
    hdr = dict (zip (names, fields))

    if hdr ["magic"] != TAR_FMT_OLDGNU_MAGIC:
        return None

    # drop the filler fields before handing the header out
    del hdr ["unused"]
    del hdr ["pad"]
    return hdr
3476
3477
a793ee30
PG
def tar_hdr_check_chksum (data):
    """
    Return True iff *data* parses as an old-GNU tar header block whose
    stored checksum matches one of the checksums computed over the block.
    """
    hdr = read_gnu_tar_hdr (data)
    if hdr is None:
        return False
    return nti (hdr ["checksum"]) in calc_chksums (data)
3484
3485
8fc6040c
PG
def readable_tar_objects_offsets (ifd):
    """
    Traverse blocks in file, trying to extract tar headers.
    """
    offsets = []

    mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    # Scan for the old-GNU magic; each hit marks a potential header whose
    # block begins TAR_HDR_OFF_MAGIC bytes earlier.
    pos = TAR_HDR_OFF_MAGIC
    while True:
        pos = mm.find (TAR_FMT_OLDGNU_MAGIC, pos)
        if pos == -1:
            break
        off = pos - TAR_HDR_OFF_MAGIC
        mm.seek (off)
        candidate = mm.read (BLOCKSIZE)
        # keep only candidates whose checksum verifies
        if tar_hdr_check_chksum (candidate) is True:
            offsets.append (off)
        pos += 1

    return offsets
65b35c42
PG
3508
3509
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    cands = []
    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    # collect every occurrence of the magic, resuming the search right
    # after each hit
    pos = mm.find (GZ_MAGIC_BYTES, 0)
    while pos != -1:
        cands.append (pos)
        pos = mm.find (GZ_MAGIC_BYTES, pos + len (GZ_MAGIC_BYTES))

    return cands
3533
3534
# verdicts returned by inspect_gz_hdr() for gzip header candidates
HDR_CAND_GOOD = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK = 2 # not a header / object unreadable
3538
3539
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if not c:
            # Bug fix: EOF before the terminating NUL. The original looped
            # forever here when *max* was negative, because b"" never
            # compares equal to NUL.
            return None
        if c == NUL:
            break
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3565
3566
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname = None
    flags = 0x00
    dflags = 0x00
    mtime = 0x00000000
    oscode = 0x00
    verdict = HDR_CAND_GOOD

    # verify the descriptor is actually seekable to *off*
    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    flags = 0x0
    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None

    # a timestamp in the future cannot come from our own writer
    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        # optional extra field: length-prefixed, skipped entirely
        xlen = struct.unpack ("<H", os.read (fd, 2))[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # NOTE(review): this branch reuses ``fname`` for the comment field,
        # so a header carrying both FNAME and FCOMMENT loses its file name —
        # looks unintended; confirm before relying on hdr["fname"].
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    # total header length actually consumed, including optional fields
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname" : fname
        , "flags" : flags
        , "dflags" : dflags
        , "mtime" : mtime
        , "oscode" : oscode
        , "hlen" : hlen
        }
3649
3650
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    inflator = zlib.decompressobj (-zlib.MAX_WBITS)
    cursor   = off
    out_len  = 0 # size of decompressed data

    os.lseek (ifd, cursor, os.SEEK_SET)
    while True:
        chunk   = os.read (ifd, BUFSIZE)
        cursor += len (chunk)
        try:
            plain = inflator.decompress (chunk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break # fishy
        out_len += len (plain)
        if inflator.eof is True:
            break # end of object reached cleanly
        if len (chunk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return out_len, cursor - off
3681
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :param ifd:   readable file descriptor positioned anywhere; seeking is
                  performed by the helpers.
    :param cands: iterable of byte offsets of gzip header candidates.
    :returns:     list of those offsets whose header parsed (good or merely
                  fishy) and whose payload decompressed to at least one byte.
    """
    # NOTE: the original kept an object counter (`nobj`) that was never
    # read; it has been dropped.
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if vdt in (HDR_CAND_GOOD, HDR_CAND_FISHY):
            off0 = cand + hdr ["hlen"]
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:
                good.append (cand)

    return good
3701
3702
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as
    compressed data.
    """
    ifd = os.open (fname, os.O_RDONLY)

    try:
        # locate candidate headers first, then keep only those that
        # actually decompress
        return readable_gz_objects_offsets (ifd, locate_gz_hdr_candidates (ifd))
    finally:
        os.close (ifd)
3716
3717
65b35c42
PG
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as tar
    data.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        offsets = readable_tar_objects_offsets (ifd)
    finally:
        os.close (ifd)
    return offsets
3730
3731
b750b280
PG
def read_tarobj_at_offset (fileobj, offset, mode, secret=None,
                           strict_validation=True):
    """
    Read one tar member starting at *offset* of *fileobj*; returns None
    when no object can be parsed there.

    :type strict_validation: bool
    :param strict_validation: Enable strict IV checking in the crypto
                              layer. Should be disabled when dealing with
                              potentially corrupted data.
    """
    decr = None

    if secret is not None:
        kind = secret [0]
        if kind == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1],
                                   strict_ivs=strict_validation)
        elif kind == crypto.PDTCRYPT_SECRET_KEY:
            decr = crypto.Decrypt (key=binascii.unhexlify (secret [1]),
                                   strict_ivs=strict_validation)
        else:
            raise RuntimeError

    try:
        tarobj = TarFile.open_at_offset (offset,
                                         mode=mode,
                                         fileobj=fileobj,
                                         format=GNU_FORMAT,
                                         concat='#' in mode,
                                         encryption=decr,
                                         save_to_members=False,
                                         tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None

    return tarobj.next ()
3769
3770
2d50b7f7
PG
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.
    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        , "gid"    : 0
        }

    """
    ent = {}
    ent ["inode"]  = 0                      # ignored when reading the index
    ent ["uid"]    = tarinfo.uid
    ent ["gid"]    = tarinfo.gid
    ent ["path"]   = tarinfo.name           # keeping URI scheme
    ent ["offset"] = 0                      # to be added by the caller
    ent ["volume"] = tarinfo.volume_offset
    ent ["mode"]   = tarinfo.mode
    ent ["ctime"]  = tarinfo.mtime          # TarInfo has no ctime field
    ent ["mtime"]  = tarinfo.mtime
    ent ["size"]   = tarinfo.size
    ent ["type"]   = tarinfo.type
    return ent
3806
3807
27ee4dd4
PG
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Walk the volumes of a (possibly damaged) archive and assemble a pseudo
    index from every tar object that can still be located and read.

    :param gen_volume_name: callable mapping a volume number to its path
    :param mode: stream mode; "#" (plain tar) and "#gz" are handled here,
                 encrypted archives are dispatched via the crypto layer,
                 anything else raises *TarError*
    :param maxvol: if not None, volumes missing below this number are
                   skipped instead of terminating the scan
    :param password: optional password, forwarded to crypto.make_secret
    :param key: optional key, forwarded to crypto.make_secret
    :returns: list of index entry dicts (cf. *idxent_of_tarinfo*) with
              "offset" and "volume" filled in
    """
    infos   = []
    psidx   = [] # pseudo index, return value
    offsets = None
    secret  = crypto.make_secret (password=password, key=key)

    nvol = 0

    while True:
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError as exn:
            # volume does not exist
            if maxvol is not None and nvol < maxvol:
                # explicit volume number specified, ignore missing ones;
                # BUGFIX: advance the counter, otherwise the same missing
                # path is generated again and the loop never terminates
                nvol += 1
                continue
            else:
                break

        fileobj = bltn_open (vpath, "rb")

        def aux (acc, off):
            obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret,
                                         strict_validation=False)
            if obj is not None:
                acc.append ((off, nvol, obj))
            return acc
        infos += functools.reduce (aux, offsets, [])

        fileobj.close()

        nvol += 1

    def aux (o, nvol, ti):
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = nvol
        return ie

    psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]

    return psidx
7584f5c9
ERE
3857
3858#--------------------
3859# exported functions
3860#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # any TarError during open/close means we cannot handle it
        archive = open(name)
    except TarError:
        return False
    archive.close()
    return True
3871
# Keep a reference to the builtin open() before shadowing the module-level
# name: after this, ``open`` is the canonical tarfile entry point
# (TarFile.open), while internal code uses ``bltn_open`` for plain files.
bltn_open = open
open = TarFile.open