implement volume handling for rescue mode
[python-delta-tar] / deltatar / tarfile.py
CommitLineData
be60ffd0 1#!/usr/bin/env python3
7584f5c9
ERE
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission is hereby granted, free of charge, to any person
9# obtaining a copy of this software and associated documentation
10# files (the "Software"), to deal in the Software without
11# restriction, including without limitation the rights to use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies of the Software, and to permit persons to whom the
14# Software is furnished to do so, subject to the following
15# conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version = "0.9.0"
36__author__ = "Lars Gustäbel (lars@gustaebel.de)"
37__date__ = "$Date$"
38__cvsid__ = "$Id$"
5fdff89f 39__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
7584f5c9
ERE
40
41#---------
42# Imports
43#---------
c7c736b6 44import binascii
dfd7865e
PG
45import copy
46import errno
be60ffd0 47import io
dfd7865e
PG
48import mmap
49import operator
50import os
51import re
7584f5c9
ERE
52import shutil
53import stat
7584f5c9 54import struct
dfd7865e
PG
55import sys
56import time
7584f5c9 57
c7c736b6
PG
58import traceback # XXX
59
8ab8fac5 60from . import crypto
6e812ad9 61
7584f5c9
ERE
62try:
63 import grp, pwd
64except ImportError:
65 grp = pwd = None
66
be60ffd0
ERE
# os.symlink on Windows prior to 6.0 raises NotImplementedError
# Exceptions that indicate "symlinks unsupported on this platform/user".
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass
75
7584f5c9
ERE
76# from tarfile import *
77__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
78
be60ffd0
ERE
79from builtins import open as _open # Since 'open' is TarFile.open
80
7584f5c9
ERE
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

# gzip member header constants (RFC 1952).
GZ_FMT_HEADER = b"<BBBBLBB"     # struct format of the fixed 10-byte header
GZ_HEADER_SIZE = 10             # not including the name
GZ_MAGIC = (0x1f, 0x8b)         # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08        # 0o10
GZ_FLAG_FTEXT = 1 << 0          # ASCII payload
GZ_FLAG_FHCRC = 1 << 1          # CRC16
GZ_FLAG_FEXTRA = 1 << 2         # extra field
GZ_FLAG_FNAME = 1 << 3          # set by default in gzip
GZ_FLAG_FCOMMENT = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03               # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                GZ_METHOD_DEFLATE)

# Error-tolerance levels for reading damaged archives.
TOLERANCE_STRICT = 0
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE = 2 # deduce metadata from archive contents

BUFSIZE = 16 * 1024

#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2
def arcmode_fmt (m):
    """Render an archive-mode bitmask *m* as a human-readable string.

    ARCMODE_PLAIN yields "PLAIN"; any other value yields the known flag
    names joined with " | " inside brackets, e.g. "[ ENCRYPT | COMPRESS ]".
    """
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    labels = ((ARCMODE_ENCRYPT, "ENCRYPT"),
              (ARCMODE_COMPRESS, "COMPRESS"),
              (ARCMODE_CONCAT, "CONCAT"))
    present = [name for bit, name in labels if m & bit]
    if not present:
        # unknown bits only: keep the empty-bracket form
        return "[ ]"
    return "[ " + " | ".join (present) + " ]"
167
168
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Assemble an archive-mode bitmask from the individual options.

    Starts from *init* and ORs in CONCAT, ENCRYPT and COMPRESS as implied
    by the arguments (only "gz" compression counts as COMPRESS).
    """
    mode = init
    if concat:
        mode |= ARCMODE_CONCAT
    if encryption is not None:
        mode |= ARCMODE_ENCRYPT
    if comptype == "gz":
        mode |= ARCMODE_COMPRESS
    return mode
178
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------

# On Windows/CE the filesystem encoding query is unreliable; use UTF-8.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()

#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------
be60ffd0
ERE
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object of exactly
    *length* bytes (truncating or NUL-padding as needed).
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
233
be60ffd0
ERE
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Bytes at and after the first NUL (if any) are discarded.
    """
    end = s.find(b"\0")
    if end == -1:
        return s.decode(encoding, errors)
    return s[:end].decode(encoding, errors)
241
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes
    object of exactly *length* bytes.
    """
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length] + NUL * (length - len(data))
7584f5c9
ERE
249
def nti(s):
    """Convert a tar number field to a Python number.

    Two encodings exist (see itn()): GNU base-256, flagged by a leading
    0o200 (positive) or 0o377 (negative) byte, and NUL-terminated octal.

    Raises InvalidHeaderError when the octal form cannot be parsed.
    """
    if s[0] in (0o200, 0o377):
        # base-256: remaining bytes are big-endian digits
        n = 0
        for byte in s[1:]:
            n = (n << 8) + byte
        if s[0] == 0o377:
            # negative numbers are stored in two's-complement-like form
            n -= 256 ** (len(s) - 1)
        return n
    try:
        return int(nts(s, "ascii", "strict") or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")
268
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a Python number to a tar number field of *digits* bytes.

    POSIX 1003.1-1988 requires octal digits followed by a null byte,
    limiting values to (8**(digits-1))-1. GNU tar additionally allows a
    base-256 form: a leading 0o200 (positive) or 0o377 (negative) marker
    byte followed by digits-1 big-endian base-256 digits, covering
    -(256**(digits-1)) .. (256**(digits-1))-1.

    Raises ValueError when the value fits neither representation.
    """
    if 0 <= n < 8 ** (digits - 1):
        # plain NUL-terminated octal
        return bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        marker = 0o200 if n >= 0 else 0o377
        if n < 0:
            n += 256 ** digits
        payload = bytearray()
        for _ in range(digits - 1):
            payload.insert(0, n & 0o377)
            n >>= 8
        return bytearray([marker]) + payload
    raise ValueError("overflow in number field")
7584f5c9
ERE
296
def calc_chksums(buf):
    """Calculate both checksums of a 512-byte member header.

    The 8-byte chksum field at offset 148 is treated as if filled with
    spaces (the "8x" skip plus the constant 256 == 8 * ord(' ')). Some
    historic tars (Sun, NeXT) summed signed chars, which differs when
    bytes have the high bit set — so both variants are returned.
    """
    def _sum(fmt):
        return 256 + sum(struct.unpack_from(fmt, buf))
    return _sum("148B8x356B"), _sum("148b8x356b")
309
def copyfileobj(src, dst, length=None):
    """Copy *length* bytes from fileobj *src* to fileobj *dst*.

    With length=None the entire remaining content is copied. Raises
    OSError when *src* runs out of data before *length* bytes were read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    full_blocks, leftover = divmod(length, BUFSIZE)
    for _ in range(full_blocks):
        chunk = src.read(BUFSIZE)
        dst.write(chunk)
        if len(chunk) < BUFSIZE:
            raise OSError("end of file reached")
    if leftover:
        chunk = src.read(leftover)
        dst.write(chunk)
        if len(chunk) < leftover:
            raise OSError("end of file reached")
c7c736b6 331
7584f5c9 332
def filemode(mode):
    """Deprecated in this location; use stat.filemode instead."""
    import warnings
    warnings.warn("deprecated in favor of stat.filemode", DeprecationWarning, 2)
    return stat.filemode(mode)
7584f5c9
ERE
339
class TarError(Exception):
    """Root of the tarfile exception hierarchy."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Exception for errors during decryption."""

class EncryptionError(TarError):
    """Exception for errors during encryption."""

class EndOfFile(Exception):
    """Signals a plain end-of-file condition that is not an error."""
7584f5c9
ERE
385
386#---------------------------
387# internal stream interface
388#---------------------------
389class _LowLevelFile:
390 """Low-level file object. Supports reading and writing.
391 It is used instead of a regular file object for streaming
392 access.
393 """
394
395 def __init__(self, name, mode):
ad4402e8 396 _mode = {
7584f5c9 397 "r": os.O_RDONLY,
c7c736b6 398 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
7584f5c9
ERE
399 }[mode]
400 if hasattr(os, "O_BINARY"):
f0287fb7 401 _mode |= os.O_BINARY # pylint: disable=no-member
be60ffd0 402 self.fd = os.open(name, _mode, 0o666)
ad4402e8 403 self.offset = 0
7584f5c9
ERE
404
405 def close(self):
406 os.close(self.fd)
407
408 def read(self, size):
ad4402e8
ERE
409 ret = os.read(self.fd, size)
410 self.offset += len(ret)
411 return ret
7584f5c9 412
867f75f7
PG
413 def write(self, s, pos=None):
414 if pos is not None:
415 p0 = self.offset
416 os.lseek (self.fd, pos, os.SEEK_SET)
417 n = os.write(self.fd, s)
418 if pos is None:
419 self.offset += len(s)
420 else:
421 append = pos + n - p0
422 if append > 0:
423 self.offset += append
424 os.lseek (self.fd, p0, os.SEEK_SET)
7584f5c9 425
ad4402e8
ERE
426 def tell(self):
427 return self.offset
428
c7c736b6
PG
429 def seek_set (self, pos):
430 os.lseek (self.fd, pos, os.SEEK_SET)
431 self.offset = pos
432
8ab8fac5 433
15a81fc0
PG
def gz_header (name=None):
    """Build a gzip member header (RFC 1952), optionally carrying FNAME.

    When *name* is given it is stored NUL-terminated in ISO-8859-1 (as
    the RFC requires), with any ".pdtcrypt" or ".gz" suffix stripped so
    the stored name matches the payload.
    """
    stamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    fixed = struct.pack (GZ_FMT_HEADER,
                         GZ_MAGIC [0], GZ_MAGIC [1],
                         GZ_METHOD_DEFLATE, flags,
                         stamp,
                         GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return fixed + name
458
d601d33b 459
7584f5c9
ERE
460class _Stream:
461 """Class that serves as an adapter between TarFile and
462 a stream-like object. The stream-like object only
463 needs to have a read() or write() method and is accessed
464 blockwise. Use of gzip or bzip2 compression is possible.
465 A stream-like object could be for example: sys.stdin,
466 sys.stdout, a socket, a tape device etc.
467
3031b7ae
PG
468 _Stream is intended to be used only internally but is
469 nevertherless used externally by Deltatar.
470
471 When encrypting, the ``enccounter`` will be used for
472 initializing the first cryptographic context. When
473 decrypting, its value will be compared to the decrypted
474 object. Decryption fails if the value does not match.
475 In effect, this means that a ``_Stream`` whose ctor was
476 passed ``enccounter`` can only be used to encrypt or
477 decrypt a single object.
7584f5c9
ERE
478 """
479
c7c736b6 480 remainder = -1 # track size in encrypted entries
04f4c7ab 481 tolerance = TOLERANCE_STRICT
c7c736b6 482
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.

        name          -- archive name (also used for the gzip FNAME field)
        mode          -- "r" or "w"
        comptype      -- "tar", "gz", "bz2", "xz" or "*" (auto-detect)
        fileobj       -- underlying stream; a _LowLevelFile is opened if None
        bufsize       -- chunk size for blockwise writing
        concat        -- enable concatenated (multi-object) archive mode
        encryption    -- crypto context; only valid with "tar"/"gz" (the
                         other compressors raise InvalidEncryptionError)
        enccounter    -- IV counter for the first cryptographic context
        compresslevel -- zlib compression level (gz only)
        tolerance     -- one of the TOLERANCE_* levels
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        # whether the file object was handed in (and must not be closed here)
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None                   # de-/compressor object, if any
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""                    # pending write data
        self.pos = 0                      # logical (uncompressed) position
        self.concat_pos = 0               # bytes fed to current gz member
        self.closed = False
        self.flags = 0                    # gzip header flags seen on read
        self.last_block_offset = 0
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None               # file offset of pending crypto header

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    if not (self.arcmode & ARCMODE_CONCAT):
                        # single-object archive: emit headers right away;
                        # concat mode defers this to next()
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                    self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                # uncompressed output: only a crypto header may need to be
                # written up front (non-concat, write, encrypted)
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # close a self-opened file object before propagating; the bare
            # except is deliberate so even BaseException triggers cleanup
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
ac5e4184 592
7584f5c9
ERE
593 def __del__(self):
594 if hasattr(self, "closed") and not self.closed:
fac2cfe1
PG
595 try:
596 self.close()
597 except crypto.InternalError:
598 # context already finalized due to abort but close() tried
599 # to use it
600 pass
7584f5c9 601
c7c736b6 602
d1c38f40
PG
    def next (self, name):
        """Start a new archive object *name* within the same stream.

        Finalizes the running gzip member (if any), flushes the write
        buffer, then rolls over the encryption and/or compression state.
        Returns the file offset at which the new object begins.
        """
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        # if a mode bit other than ENCRYPT/COMPRESS is set (i.e. CONCAT),
        # the new object starts at the current raw file position
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when also encrypting, the crypto header position wins
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
617
618
    def next_volume (self, name):
        """Continue the current object on a fresh volume: re-establish the
        per-object encryption/compression state on the new file.
        """
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
630
c7c736b6 631
d1c38f40
PG
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.

        The real header can only be computed once the object's ciphertext
        is complete, so _finalize_write_encrypt() later seeks back and
        overwrites the dummy in place.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr
c7c736b6
PG
648
649
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            # re-read the dummy header written by _init_write_encrypt()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            # finalize the crypto context; yields trailing ciphertext plus
            # the real header to patch in at the saved offset
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
670
671
57db1546
PG
    def _finalize_write_gz (self):
        """Flush the compressor and, for gzip, append the member trailer
        (CRC32 and uncompressed size of the current member)."""
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long.  So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
                self.buf = b""
57db1546
PG
687
688
    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one

        Resets the per-member CRC and size counters and creates a fresh
        raw-deflate compressor. The FNAME field is only populated for the
        very first member of the stream.
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        first = self.cmp is None
        # raw deflate (negative wbits): the gzip framing is written manually
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))
5fdff89f 707
ac5e4184 708
7584f5c9
ERE
709 def write(self, s):
710 """Write string s to the stream.
711 """
712 if self.comptype == "gz":
c2ffe2ec 713 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
7584f5c9 714 self.pos += len(s)
5fdff89f 715 self.concat_pos += len(s)
53732900 716 if self.cmp is not None:
7584f5c9
ERE
717 s = self.cmp.compress(s)
718 self.__write(s)
719
    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        # first drain full blocks through the normal path, then push the
        # (short) remainder straight to the encryption layer
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""
725
7584f5c9 726 def __write(self, s):
548bb8d5
CH
727 """Writes (and encodes) string s to the stream blockwise
728
729 will wait with encoding/writing until block is complete
7584f5c9
ERE
730 """
731 self.buf += s
732 while len(self.buf) > self.bufsize:
6e812ad9 733 self.__enc_write(self.buf[:self.bufsize])
7584f5c9
ERE
734 self.buf = self.buf[self.bufsize:]
735
867f75f7 736
5f38bff6 737 def __write_to_file(self, s, pos=None):
6e812ad9 738 '''
5f38bff6 739 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
cb7a3911 740 given, the stream will seek to that position first and back afterwards,
5f38bff6 741 and the total of bytes written is not updated.
6e812ad9 742 '''
867f75f7 743 self.fileobj.write(s, pos)
5f38bff6
PG
744 if pos is None:
745 self.bytes_written += len(s)
867f75f7 746
6e812ad9
DGM
747
    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.

        The crypto context may refuse to consume all of *s* when the size
        limit of one encrypted object is hit; in that case a new object is
        opened transparently and the leftover input is fed into it.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
769
6e812ad9 770
784175ba
CH
771 def estim_file_size(self):
772 """ estimates size of file if closing it now
773
774 The result may differ greatly from the amount of data sent to write()
775 due to compression, encryption and buffering.
776
777 In tests the result (before calling close()) was up to 12k smaller than
778 the final file size if compression is being used because zlib/bz2
779 compressors do not allow inspection of their buffered data :-(
780
ba5a449e
CH
781 Still, we add what close() would add: 8 bytes for gz checksum, one
782 encryption block size if encryption is used and the size of our own
783 buffer
784175ba
CH
784 """
785 if self.closed:
786 return self.bytes_written
787
788 result = self.bytes_written
789 if self.buf:
790 result += len(self.buf)
791 if self.comptype == 'gz':
ba5a449e 792 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
784175ba
CH
793 return result
794
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.

        With close_fileobj=False only the gzip trailer verification is
        performed (read mode); nothing is flushed or closed.
        """
        if self.closed:
            return

        if close_fileobj is True:
            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            # only close a file object we opened ourselves
            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            # NOTE(review): read_length (the gzip ISIZE field) is read but
            # never verified — confirm whether that is intentional
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True
826
54128a00 827
7584f5c9
ERE
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.

        Parses and discards the RFC 1952 member header (magic, method,
        flags and the optional FEXTRA/FNAME/FCOMMENT/FHCRC fields) and
        sets up a raw-deflate decompressor.

        Raises EndOfFile at a clean EOF, ReadError on bad magic and
        CompressionError on an unsupported compression method.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = ord (self.__read(1))
        if read1 != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            # two-byte little-endian length, then that many extra bytes
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            # NUL-terminated original file name
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            # NUL-terminated comment
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            # 16-bit header CRC, not verified here
            self.__read(2)
863
c7c736b6
PG
    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context.

        Returns False when the stream ends at an object boundary (clean
        EOF), True otherwise. Raises DecryptionError for malformed headers
        or crypto parameters, and for an IV counter mismatch when an
        explicit ``enccounter`` was requested at construction time.
        """
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True
c7c736b6
PG
896
897
8de91f4f
PG
898 def _read_encrypt (self, buf):
899 """
900 Demote a program error to a decryption error in tolerant mode. This
901 allows recovery from corrupted headers and invalid data.
902 """
903 try:
904 return self.encryption.process (buf)
905 except RuntimeError as exn:
04f4c7ab 906 if self.tolerance != TOLERANCE_STRICT:
8de91f4f
PG
907 raise DecryptionError (exn)
908 raise
909
910
c7c736b6
PG
    def _finalize_read_encrypt (self):
        """
        Finalize decryption.

        Completes the crypto context for the current object and returns any
        remaining plaintext. Raises DecryptionError when the GCM tag does
        not verify. Returns None when encryption is inactive or no object
        header was read.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
            and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                # leftover ciphertext according to the header; reset before
                # finalizing the context
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data
925
926
7584f5c9
ERE
    def tell(self):
        """Return the stream's file pointer position.

        This is the logical (decompressed/decrypted) offset maintained by
        read()/seek(), not the position of the underlying file object.
        """
        return self.pos
931
932 def seek(self, pos=0):
933 """Set the stream's file pointer to pos. Negative seeking
934 is forbidden.
935 """
936 if pos - self.pos >= 0:
937 blocks, remainder = divmod(pos - self.pos, self.bufsize)
be60ffd0 938 for i in range(blocks):
7584f5c9
ERE
939 self.read(self.bufsize)
940 self.read(remainder)
941 else:
942 raise StreamError("seeking backwards is not allowed")
943 return self.pos
944
945 def read(self, size=None):
946 """Return the next size number of bytes from the stream.
947 If size is not defined, return all bytes of the stream
948 up to EOF.
949 """
950 if size is None:
951 t = []
952 while True:
953 buf = self._read(self.bufsize)
954 if not buf:
955 break
956 t.append(buf)
9dc7ac5c 957 buf = b"".join(t)
7584f5c9
ERE
958 else:
959 buf = self._read(size)
960 self.pos += len(buf)
961 return buf
962
3a7e1a50
ERE
963 def readline(self):
964 """Reads just one line, new line character included
965 """
f0fd5e3a 966 # if \n in dbuf, no read neads to be done
be60ffd0
ERE
967 if b'\n' in self.dbuf:
968 pos = self.dbuf.index(b'\n') + 1
f0fd5e3a
ERE
969 ret = self.dbuf[:pos]
970 self.dbuf = self.dbuf[pos:]
971 return ret
972
1215b602 973 buf = []
3a7e1a50
ERE
974 while True:
975 chunk = self._read(self.bufsize)
976
f0fd5e3a 977 # nothing more to read, so return the buffer
3a7e1a50 978 if not chunk:
be60ffd0 979 return b''.join(buf)
3a7e1a50
ERE
980
981 buf.append(chunk)
f0fd5e3a
ERE
982
983 # if \n found, return the new line
be60ffd0
ERE
984 if b'\n' in chunk:
985 dbuf = b''.join(buf)
986 pos = dbuf.index(b'\n') + 1
1215b602 987 self.dbuf = dbuf[pos:] + self.dbuf
3a7e1a50
ERE
988 return dbuf[:pos]
989
7584f5c9
ERE
    def _read(self, size):
        """Return size bytes from the stream.

        Pulls raw (possibly already decrypted) data through __read(),
        feeds it to the decompressor when one is active, and keeps
        self.dbuf as the buffer of decompressed-but-unconsumed bytes.
        For concatenated archives (ARCMODE_CONCAT) a fresh compression
        object is initialized whenever the current one reports unused
        trailing data; in tolerant modes, failures at that boundary
        return the data salvaged so far instead of raising.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    # maintain running CRC of the decompressed payload
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    # boundary of a concatenated object: push the leftover
                    # bytes back and re-initialize the decompressor
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        # bytes beyond the request stay buffered for the next call
        self.dbuf = t[size:]
        return t[:size]
1047
e4e5d0b8 1048
7584f5c9 1049 def __read(self, size):
ef3b4499
PG
1050 """
1051 Return size bytes from stream. If internal buffer is empty, read
1052 another block from the stream.
1053
1054 The function returns up to size bytes of data. When an error occurs
1055 during decryption, everything until the end of the last successfully
1056 finalized object is returned.
7584f5c9
ERE
1057 """
1058 c = len(self.buf)
8de91f4f 1059 t = [self.buf] if c > 0 else []
1ed44e7b 1060 good_crypto = len (t)
8de91f4f 1061
7584f5c9 1062 while c < size:
c7c736b6 1063 todo = size
8de91f4f
PG
1064 try:
1065 if self.arcmode & ARCMODE_ENCRYPT:
1066 if self.remainder <= 0:
1067 # prepare next object
044585c6
PG
1068 if self._init_read_encrypt () is False: # EOF
1069 buf = None
1070 break # while
8de91f4f
PG
1071
1072 # only read up to the end of the encrypted object
1073 todo = min (size, self.remainder)
1074 buf = self.fileobj.read(todo)
1075 if self.arcmode & ARCMODE_ENCRYPT:
1076 # decrypt the thing
1077 buf = self._read_encrypt (buf)
1078 if todo == self.remainder:
1079 # at the end of a crypto object; finalization will fail if
1080 # the GCM tag does not match
ef3b4499 1081 trailing = self._finalize_read_encrypt ()
8de91f4f
PG
1082 good_crypto = len (t) + 1
1083 if len (trailing) > 0:
1084 buf += trailing
1085 self.remainder = 0
1086 else:
1087 self.remainder -= todo
1088 except DecryptionError:
04f4c7ab 1089 if self.tolerance == TOLERANCE_STRICT:
8de91f4f
PG
1090 raise
1091 self.encryption.drop ()
24afaf18
PG
1092 if self.tolerance == TOLERANCE_RECOVER:
1093 if good_crypto == 0:
1094 raise
1095 # this may occur at any of the three crypto operations above.
1096 # some objects did validate; discard all data after it; next
1097 # call will start with the bad object and error out immediately
1098 self.buf = b"".join (t [good_crypto:])
1099 return b"".join (t [:good_crypto])
1100 elif self.tolerance == TOLERANCE_RESCUE:
1101 # keep what we have so far despite the finalization issue
1102 t.append (buf)
1103 c += len (buf)
1104 break
1105 else:
1106 raise RuntimeError("internal error: bad tolerance level")
c7c736b6
PG
1107
1108 if not buf: ## XXX stream terminated prematurely; this should be an error
7584f5c9 1109 break
c7c736b6 1110
7584f5c9
ERE
1111 t.append(buf)
1112 c += len(buf)
be60ffd0 1113 t = b"".join(t)
7584f5c9 1114 self.buf = t[size:]
fb27c6e8 1115
7584f5c9 1116 return t[:size]
7d372216 1117
7584f5c9
ERE
1118
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').

    The first BLOCKSIZE bytes are read eagerly so the magic number can
    be inspected; the first read() hands that block back and afterwards
    delegates directly to the underlying file object.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # Self-replacing method: after the buffered block is returned,
        # reads bypass the proxy entirely.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the buffered magic bytes."""
        head = self.buf
        if head.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy
1145
7584f5c9
ERE
1146#------------------------
1147# Extraction file object
1148#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
    provides a part of its data as an individual file
    object.

    An optional blockinfo (list of (offset, size) data runs) describes
    sparse members; regions between runs read back as NUL bytes.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Build a map of alternating data/hole segments:
        # (is_data, logical start, logical stop, physical offset or None).
        self.map_index = 0
        self.map = []
        logical = 0
        physical = self.offset
        for seg_start, seg_size in blockinfo:
            if seg_start > logical:
                # gap before this data run reads as zeros
                self.map.append((False, logical, seg_start, None))
            self.map.append((True, seg_start, seg_start + seg_size, physical))
            physical += seg_size
            logical = seg_start + seg_size
        if logical < self.size:
            self.map.append((False, logical, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        if whence == io.SEEK_SET:
            target = position
        elif whence == io.SEEK_CUR:
            target = self.position + position
        elif whence == io.SEEK_END:
            target = self.size + position
        else:
            raise ValueError("Invalid argument")
        # clamp into [0, size]
        self.position = min(max(target, 0), self.size)
        return self.position

    def read(self, size=None):
        """Read data from the file.

        Reads up to *size* bytes (default: to EOF), translating logical
        positions through the data/hole map; holes yield NUL bytes.
        """
        remaining = self.size - self.position
        if size is None or size > remaining:
            size = remaining

        parts = []
        while size > 0:
            # locate the map entry containing the current position,
            # scanning circularly from the last hit
            while True:
                isdata, start, stop, physical = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                self.map_index += 1
                if self.map_index == len(self.map):
                    self.map_index = 0
            count = min(size, stop - self.position)
            if isdata:
                self.fileobj.seek(physical + (self.position - start))
                parts.append(self.fileobj.read(count))
            else:
                parts.append(NUL * count)
            size -= count
            self.position += count
        return b"".join(parts)

    def readinto(self, b):
        data = self.read(len(b))
        b[:len(data)] = data
        return len(data)

    def close(self):
        self.closed = True
#class _FileInFile
7584f5c9 1249
be60ffd0
ERE
1250
class ExFileObject(io.BufferedReader):
    """File-like object used to read an archive member's data.

    Wraps a _FileInFile view (honoring offset, size and sparse map of the
    member) in a buffered reader.
    """

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject
1258
1259#------------------
1260# Exported Classes
1261#------------------
1262class TarInfo(object):
1263 """Informational class which holds the details about an
1264 archive member given by a tar header block.
1265 TarInfo objects are returned by TarFile.getmember(),
1266 TarFile.getmembers() and TarFile.gettarinfo() and are
1267 usually created internally.
1268 """
1269
be60ffd0
ERE
1270 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1271 "chksum", "type", "linkname", "uname", "gname",
1272 "devmajor", "devminor", "volume_offset",
1273 "offset", "offset_data", "pax_headers", "sparse",
1274 "tarfile", "_sparse_structs", "_link_target")
1275
7584f5c9
ERE
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
        of the member.

        All fields start with neutral defaults; callers (frombuf(),
        gettarinfo(), ...) fill them in afterwards.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position (multivolume)

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1301
1302 # In pax headers the "name" and "linkname" field are called
1303 # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    # 'path' mirrors 'name'; pax headers call this field "path".
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # 'linkpath' mirrors 'linkname'; pax headers call this field "linkpath".
    linkpath = property(_getlinkpath, _setlinkpath)
1315
1316 def __repr__(self):
1317 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1318
be60ffd0 1319 def get_info(self, encoding=None, errors=None):
7584f5c9
ERE
1320 """Return the TarInfo's attributes as a dictionary.
1321 """
1322 info = {
1323 "name": self.name,
be60ffd0 1324 "mode": self.mode & 0o7777,
7584f5c9
ERE
1325 "uid": self.uid,
1326 "gid": self.gid,
1327 "size": self.size,
1328 "mtime": self.mtime,
1329 "chksum": self.chksum,
1330 "type": self.type,
1331 "linkname": self.linkname,
1332 "uname": self.uname,
1333 "gname": self.gname,
1334 "devmajor": self.devmajor,
36a315a0 1335 "devminor": self.devminor,
0eb5048f
ERE
1336 "offset_data": self.offset_data,
1337 "volume_offset": self.volume_offset
7584f5c9
ERE
1338 }
1339
1340 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1341 info["name"] += "/"
1342
7584f5c9
ERE
1343 return info
1344
be60ffd0
ERE
1345 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1346 errors="surrogateescape"):
7584f5c9
ERE
1347 """Return a tar header as a string of 512 byte blocks.
1348 """
1349 info = self.get_info(encoding, errors)
1350
1351 if format == USTAR_FORMAT:
be60ffd0 1352 return self.create_ustar_header(info, encoding, errors)
7584f5c9 1353 elif format == GNU_FORMAT:
be60ffd0 1354 return self.create_gnu_header(info, encoding, errors)
7584f5c9
ERE
1355 elif format == PAX_FORMAT:
1356 return self.create_pax_header(info, encoding, errors)
1357 else:
1358 raise ValueError("invalid format")
1359
be60ffd0 1360 def create_ustar_header(self, info, encoding, errors):
7584f5c9
ERE
1361 """Return the object as a ustar header block.
1362 """
1363 info["magic"] = POSIX_MAGIC
1364
1365 if len(info["linkname"]) > LENGTH_LINK:
1366 raise ValueError("linkname is too long")
1367
1368 if len(info["name"]) > LENGTH_NAME:
1369 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1370
be60ffd0 1371 return self._create_header(info, USTAR_FORMAT, encoding, errors)
7584f5c9 1372
be60ffd0 1373 def create_gnu_header(self, info, encoding, errors):
7584f5c9
ERE
1374 """Return the object as a GNU header block sequence.
1375 """
1376 info["magic"] = GNU_MAGIC
1377
2f854e77
ERE
1378 if self.ismultivol():
1379 prefix = [
1380 itn(info.get("atime", 0), 12, GNU_FORMAT),
1381 itn(info.get("ctime", 0), 12, GNU_FORMAT),
0eb5048f 1382 itn(self.volume_offset, 12, GNU_FORMAT),
2f854e77
ERE
1383 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1384 ]
be60ffd0 1385 info['prefix'] = b"".join(prefix)
0eb5048f 1386 info['size'] = info['size'] - self.volume_offset
2f854e77 1387
be60ffd0 1388 buf = b""
7584f5c9 1389 if len(info["linkname"]) > LENGTH_LINK:
be60ffd0
ERE
1390 buf += self._create_gnu_long_header(info["linkname"],
1391 GNUTYPE_LONGLINK, encoding, errors)
7584f5c9
ERE
1392
1393 if len(info["name"]) > LENGTH_NAME:
be60ffd0
ERE
1394 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1395 encoding, errors)
7584f5c9 1396
be60ffd0 1397 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
7584f5c9
ERE
1398
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
        represented this way, prepend a pax extended header sequence
        with supplement information.

        For multivolume members only the part stored in this volume is
        counted in the 'size' field.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
7584f5c9
ERE
1451
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

        Global headers (type XGLTYPE) apply to all following members.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
7584f5c9
ERE
1457
1458 def _posix_split_name(self, name):
1459 """Split a name longer than 100 chars into a prefix
1460 and a name part.
1461 """
1462 prefix = name[:LENGTH_PREFIX + 1]
1463 while prefix and prefix[-1] != "/":
1464 prefix = prefix[:-1]
1465
1466 name = name[len(prefix):]
1467 prefix = prefix[:-1]
1468
1469 if not prefix or len(name) > LENGTH_NAME:
1470 raise ValueError("name is too long")
1471 return prefix, name
1472
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.

        Builds the 512-byte ustar layout field by field, then patches the
        checksum (offsets 148..155, i.e. -364..-357 from the block end)
        which is computed with the checksum field set to spaces.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # pad the assembled fields up to a full 512-byte block
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
1500
1501 @staticmethod
1502 def _create_payload(payload):
1503 """Return the string payload filled with zero bytes
1504 up to the next 512 byte border.
1505 """
1506 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1507 if remainder > 0:
1508 payload += (BLOCKSIZE - remainder) * NUL
1509 return payload
1510
    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        for name.

        The overlong string is NUL-terminated and shipped as the payload
        of a pseudo-member named "././@LongLink" preceding the real one.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)
1527
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
        that contains a list of keyword, value pairs. The values
        must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "<len> <keyword>=<value>\n" where <len> counts
            # the entire record including its own digits; iterate until the
            # digit count stabilizes (fixed point).
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1578
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

        Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError or
        InvalidHeaderError depending on the kind of damage found.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save the them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            # deltatar extension: bytes 369:381 lie inside the prefix area;
            # presumably this is the 12-byte volume offset written by
            # create_gnu_header() (345 + 2*12 = 369) -- confirm against
            # the writer before relying on it.
            obj.offset_data = nti(buf[369:381])
        return obj
1643
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
        tarfile.

        Reads one 512-byte header block at the current position, parses
        it, records the header offset of the member and dispatches to
        the type-specific _proc_*() handler via _proc_member().
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)
1653
1654 #--------------------------------------------------------------------------
1655 # The following are methods that are called depending on the type of a
1656 # member. The entry point is _proc_member() which can be overridden in a
1657 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1658 # implement the following
1659 # operations:
1660 # 1. Set self.offset_data to the position where the data blocks begin,
1661 # if there is data that follows.
1662 # 2. Set tarfile.offset to the position where the next member's header will
1663 # begin.
1664 # 3. Return self or another valid TarInfo object.
1665 def _proc_member(self, tarfile):
1666 """Choose the right processing method depending on
1667 the type and call it.
1668 """
1669 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1670 return self._proc_gnulong(tarfile)
1671 elif self.type == GNUTYPE_SPARSE:
1672 return self._proc_sparse(tarfile)
1673 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1674 return self._proc_pax(tarfile)
1675 else:
1676 return self._proc_builtin(tarfile)
1677
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
        will be treated as a regular file.

        Records where the member's data starts and advances
        tarfile.offset past the data blocks for data-carrying members
        (regular files, multivolume parts and unknown types).
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
1694
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
        or longlink member.

        The payload of this pseudo-member is the overlong string; it is
        patched into the *next* real header, which is returned in place
        of this one.  Raises SubsequentHeaderError when the following
        header is missing or damaged.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next
1716
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):  # at most 21 (offset, numbytes) pairs per block
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # byte 504 flags yet another continuation block
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # the header's size field held the on-disk (compacted) size;
        # expose the original logical size instead
        self.size = origsize
        return self
1744
1745 def _proc_pax(self, tarfile):
1746 """Process an extended or global header as described in
be60ffd0 1747 POSIX.1-2008.
7584f5c9
ERE
1748 """
1749 # Read the header information.
1750 buf = tarfile.fileobj.read(self._block(self.size))
1751
1752 # A pax header stores supplemental information for either
1753 # the following file (extended) or all following files
1754 # (global).
1755 if self.type == XGLTYPE:
1756 pax_headers = tarfile.pax_headers
1757 else:
1758 pax_headers = tarfile.pax_headers.copy()
1759
be60ffd0
ERE
1760 # Check if the pax header contains a hdrcharset field. This tells us
1761 # the encoding of the path, linkpath, uname and gname fields. Normally,
1762 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1763 # implementations are allowed to store them as raw binary strings if
1764 # the translation to UTF-8 fails.
1765 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1766 if match is not None:
1767 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1768
1769 # For the time being, we don't care about anything other than "BINARY".
1770 # The only other value that is currently allowed by the standard is
1771 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1772 hdrcharset = pax_headers.get("hdrcharset")
1773 if hdrcharset == "BINARY":
1774 encoding = tarfile.encoding
1775 else:
1776 encoding = "utf-8"
1777
7584f5c9
ERE
1778 # Parse pax header information. A record looks like that:
1779 # "%d %s=%s\n" % (length, keyword, value). length is the size
1780 # of the complete record including the length field itself and
1781 # the newline. keyword and value are both UTF-8 encoded strings.
be60ffd0 1782 regex = re.compile(br"(\d+) ([^=]+)=")
7584f5c9
ERE
1783 pos = 0
1784 while True:
1785 match = regex.match(buf, pos)
1786 if not match:
1787 break
1788
1789 length, keyword = match.groups()
1790 length = int(length)
1791 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1792
be60ffd0
ERE
1793 # Normally, we could just use "utf-8" as the encoding and "strict"
1794 # as the error handler, but we better not take the risk. For
1795 # example, GNU tar <= 1.23 is known to store filenames it cannot
1796 # translate to UTF-8 as raw strings (unfortunately without a
1797 # hdrcharset=BINARY header).
1798 # We first try the strict standard encoding, and if that fails we
1799 # fall back on the user's encoding and error handler.
1800 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1801 tarfile.errors)
1802 if keyword in PAX_NAME_FIELDS:
1803 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1804 tarfile.errors)
1805 else:
1806 value = self._decode_pax_field(value, "utf-8", "utf-8",
1807 tarfile.errors)
7584f5c9
ERE
1808
1809 pax_headers[keyword] = value
1810 pos += length
1811
36a315a0 1812
7584f5c9
ERE
1813 # Fetch the next header.
1814 try:
1815 next = self.fromtarfile(tarfile)
1816 except HeaderError:
1817 raise SubsequentHeaderError("missing or bad subsequent header")
1818
be60ffd0
ERE
1819 # Process GNU sparse information.
1820 if "GNU.sparse.map" in pax_headers:
1821 # GNU extended sparse format version 0.1.
1822 self._proc_gnusparse_01(next, pax_headers)
1823
1824 elif "GNU.sparse.size" in pax_headers:
1825 # GNU extended sparse format version 0.0.
1826 self._proc_gnusparse_00(next, pax_headers, buf)
1827
1828 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1829 # GNU extended sparse format version 1.0.
1830 self._proc_gnusparse_10(next, pax_headers, tarfile)
1831
7584f5c9
ERE
1832 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1833 # Patch the TarInfo object with the extended header info.
1834 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1835 next.offset = self.offset
1836
1837 if "size" in pax_headers:
1838 # If the extended header replaces the size field,
1839 # we need to recalculate the offset where the next
1840 # header starts.
1841 offset = next.offset_data
1842 if next.isreg() or next.type not in SUPPORTED_TYPES:
1843 offset += next._block(next.size)
1844 tarfile.offset = offset
1845
c04e0751
ERE
1846 if next is not None:
1847 if "GNU.volume.filename" in pax_headers:
1848 if pax_headers["GNU.volume.filename"] == next.name:
1849 if "GNU.volume.size" in pax_headers:
1850 next.size = int(pax_headers["GNU.volume.size"])
1851 if "GNU.volume.offset" in pax_headers:
1852 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1853
1854 for key in pax_headers.keys():
1855 if key.startswith("GNU.volume"):
1856 del tarfile.pax_headers[key]
0eb5048f 1857
7584f5c9
ERE
1858 return next
1859
be60ffd0
ERE
1860 def _proc_gnusparse_00(self, next, pax_headers, buf):
1861 """Process a GNU tar extended sparse header, version 0.0.
7584f5c9 1862 """
be60ffd0
ERE
1863 offsets = []
1864 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1865 offsets.append(int(match.group(1)))
1866 numbytes = []
1867 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1868 numbytes.append(int(match.group(1)))
1869 next.sparse = list(zip(offsets, numbytes))
7584f5c9 1870
be60ffd0
ERE
1871 def _proc_gnusparse_01(self, next, pax_headers):
1872 """Process a GNU tar extended sparse header, version 0.1.
1873 """
1874 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1875 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1876
be60ffd0
ERE
1877 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1878 """Process a GNU tar extended sparse header, version 1.0.
1879 """
1880 fields = None
1881 sparse = []
1882 buf = tarfile.fileobj.read(BLOCKSIZE)
1883 fields, buf = buf.split(b"\n", 1)
1884 fields = int(fields)
1885 while len(sparse) < fields * 2:
1886 if b"\n" not in buf:
1887 buf += tarfile.fileobj.read(BLOCKSIZE)
1888 number, buf = buf.split(b"\n", 1)
1889 sparse.append(int(number))
1890 next.offset_data = tarfile.fileobj.tell()
1891 next.sparse = list(zip(sparse[::2], sparse[1::2]))
7584f5c9 1892
be60ffd0
ERE
1893 def _apply_pax_info(self, pax_headers, encoding, errors):
1894 """Replace fields with supplemental information from a previous
1895 pax extended or global header.
1896 """
1897 for keyword, value in pax_headers.items():
1898 if keyword == "GNU.sparse.name":
1899 setattr(self, "path", value)
1900 elif keyword == "GNU.sparse.size":
1901 setattr(self, "size", int(value))
1902 elif keyword == "GNU.sparse.realsize":
1903 setattr(self, "size", int(value))
1904 elif keyword in PAX_FIELDS:
1905 if keyword in PAX_NUMBER_FIELDS:
1906 try:
1907 value = PAX_NUMBER_FIELDS[keyword](value)
1908 except ValueError:
1909 value = 0
1910 if keyword == "path":
f0287fb7 1911 value = value.rstrip("/") # pylint: disable=no-member
be60ffd0 1912 setattr(self, keyword, value)
7584f5c9
ERE
1913
1914 self.pax_headers = pax_headers.copy()
1915
be60ffd0
ERE
1916 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1917 """Decode a single field from a pax record.
1918 """
1919 try:
1920 return value.decode(encoding, "strict")
1921 except UnicodeDecodeError:
1922 return value.decode(fallback_encoding, fallback_errors)
1923
7584f5c9
ERE
1924 def _block(self, count):
1925 """Round up a byte count by BLOCKSIZE and return it,
1926 e.g. _block(834) => 1024.
1927 """
1928 blocks, remainder = divmod(count, BLOCKSIZE)
1929 if remainder:
1930 blocks += 1
1931 return blocks * BLOCKSIZE
1932
    def isreg(self):
        """Return True if the member is a regular file."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Alias for isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member is a directory."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member is a symbolic link."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member is a hard link."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member is a character device."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member is a block device."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member is a FIFO."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if a sparse map was recorded for the member."""
        return self.sparse is not None
    def isdev(self):
        """Return True if the member is a character/block device or FIFO."""
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
68ddf955 1953 def ismultivol(self):
c04e0751
ERE
1954 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1955 "GNU.volume.offset" in self.pax_headers
7584f5c9
ERE
1956# class TarInfo
1957
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    cache_uid2user = {}         # cache to avoid getpwuid calls; shared at class
                                # level across instances
    cache_gid2group = {}        # same cache for groups (getgrgid calls)
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
        read from an existing archive, 'a' to append data to an existing
        file or 'w' to create a new file overwriting an existing one. `mode'
        defaults to 'r'.
        If `fileobj' is given, it is used for reading or writing data. If it
        can be determined, `mode' is overridden by `fileobj's mode.
        `fileobj' is not closed, when TarFile is closed.

        Multivolume extensions: `max_volume_size' limits each volume (at
        least 3*BLOCKSIZE) and then requires a callable
        `new_volume_handler'; `concat' enables concat-object processing
        (see arcmode_set); `nacl' is stored for the encryption layer.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        # Translate the boolean `concat' flag into the internal arcmode bits.
        self.arcmode = arcmode_set (concat)
        self.nacl = nacl
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        # NOTE(review): the first assignment is redundant — the next line
        # assigns self.name again while also recording base_name (used to
        # derive per-volume file names).
        self.name = os.path.abspath(name) if name else None
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes; None means "keep the class-level default".
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        self.errors = errors

        # Explicit pax headers are only honored for PAX_FORMAT archives.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        # A volume must fit at least one header block plus the two
        # terminating zero blocks.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
        if max_volume_size:
            self.max_volume_size = int(max_volume_size)
        else:
            self.max_volume_size = None

        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any setup failure, close a file object we opened ourselves
            # and mark the archive closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
2117
7584f5c9
ERE
2118 #--------------------------------------------------------------------------
2119 # Below are the classmethods which act as alternate constructors to the
2120 # TarFile class. The open() method is the only one that is needed for
2121 # public use; it is the "super"-constructor and is able to select an
2122 # adequate "sub"-constructor for a particular compression using the mapping
2123 # from OPEN_METH.
2124 #
2125 # This concept allows one to subclass TarFile without losing the comfort of
2126 # the super-constructor. A sub-constructor is registered and made available
2127 # by adding it to the mapping in OPEN_METH.
2128
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
             **kwargs):
        """Open a tar archive for reading, writing or appending. Return
        an appropriate TarFile class.

        mode:
        'r' or 'r:*' open for reading with transparent compression
        'r:'         open for reading exclusively uncompressed
        'r:gz'       open for reading with gzip compression
        'r:bz2'      open for reading with bzip2 compression
        'r:xz'       open for reading with lzma compression
        'a' or 'a:'  open for appending, creating the file if necessary
        'w' or 'w:'  open for writing without compression
        'w:gz'       open for writing with gzip compression
        'w:bz2'      open for writing with bzip2 compression
        'w:xz'       open for writing with lzma compression

        'r|*'        open a stream of tar blocks with transparent compression
        'r|'         open an uncompressed stream of tar blocks for reading
        'r|gz'       open a gzip compressed stream of tar blocks
        'r|bz2'      open a bzip2 compressed stream of tar blocks
        'r|xz'       open an lzma compressed stream of tar blocks
        'w|'         open an uncompressed stream for writing
        'w|gz'       open a gzip compressed stream for writing
        'w|bz2'      open a bzip2 compressed stream for writing
        'w|xz'       open an lzma compressed stream for writing

        'r#gz'       open a stream of gzip compressed tar blocks for reading
        'w#gz'       open a stream of gzip compressed tar blocks for writing
                     ('#' selects the concat-object mode; `encryption' and
                     `tolerance' apply only here)
        """
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file:
            # try each registered opener in turn, rewinding between tries.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    # (saved_pos is only bound when fileobj is not None,
                    # which is exactly when it is used here)
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)

            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel

            # Multivolume + whole-file compression only compresses the
            # first volume; warn the caller up front.
            if 'max_volume_size' in kwargs:
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    import warnings
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')

            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            # Stream mode: wrap the file object in a non-seekable _Stream.
            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
                    **kwargs)
            t._extfileobj = False
            return t

        elif "#" in mode:
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"

            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)

            # Concat-object mode: per-member compression/encryption objects
            # inside one archive; the _Stream handles object boundaries.
            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerance=tolerance)
            kwargs ["concat"] = True
            try:
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                stream.close()
                raise # XXX raise what?
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode %r" % mode)
7584f5c9 2243
d39d4cbf
PG
2244
2245 @classmethod
2246 def open_at_offset(cls, offset, *a, **kwa):
2247 """
2248 Same as ``.open()``, but start reading at the given offset. Assumes a
2249 seekable file object.
2250 """
2251 fileobj = kwa.get ("fileobj")
2252 if fileobj is not None:
2253 fileobj.seek (offset)
2254 return cls.open (*a, **kwa)
2255
2256
7584f5c9
ERE
2257 @classmethod
2258 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2259 """Open uncompressed tar archive name for reading or writing.
2260 """
2261 if len(mode) > 1 or mode not in "raw":
2262 raise ValueError("mode must be 'r', 'a' or 'w'")
2263 return cls(name, mode, fileobj, **kwargs)
2264
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.

        A caller-supplied `fileobj' survives failures (extfileobj True);
        a GzipFile opened here is closed on error.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        extfileobj = fileobj is not None
        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            # Clean up a GzipFile we created; if even creating it failed
            # (fileobj still None) propagate the original OSError.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            if fileobj is None:
                raise
            raise ReadError("not a gzip file")
        except:
            # Any other failure: close only what we opened ourselves.
            if not extfileobj and fileobj is not None:
                fileobj.close()
            raise
        t._extfileobj = extfileobj
        return t
2295
2296 @classmethod
2297 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2298 """Open bzip2 compressed tar archive name for reading or writing.
2299 Appending is not allowed.
2300 """
2301 if len(mode) > 1 or mode not in "rw":
2302 raise ValueError("mode must be 'r' or 'w'.")
2303
2304 try:
2305 import bz2
2306 except ImportError:
2307 raise CompressionError("bz2 module is not available")
2308
be60ffd0
ERE
2309 fileobj = bz2.BZ2File(fileobj or name, mode,
2310 compresslevel=compresslevel)
7584f5c9
ERE
2311
2312 try:
2313 t = cls.taropen(name, mode, fileobj, **kwargs)
be60ffd0
ERE
2314 except (OSError, EOFError):
2315 fileobj.close()
7584f5c9
ERE
2316 raise ReadError("not a bzip2 file")
2317 t._extfileobj = False
2318 return t
2319
be60ffd0
ERE
2320 @classmethod
2321 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2322 """Open lzma compressed tar archive name for reading or writing.
2323 Appending is not allowed.
2324 """
2325 if mode not in ("r", "w"):
2326 raise ValueError("mode must be 'r' or 'w'")
2327
2328 try:
2329 import lzma
2330 except ImportError:
2331 raise CompressionError("lzma module is not available")
2332
2333 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2334
2335 try:
2336 t = cls.taropen(name, mode, fileobj, **kwargs)
2337 except (lzma.LZMAError, EOFError):
2338 fileobj.close()
2339 raise ReadError("not an lzma file")
2340 t._extfileobj = False
2341 return t
2342
7584f5c9
ERE
    # All *open() methods are registered here.  `open()` dispatches on the
    # compression suffix of the mode string to one of these classmethods.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz": "gzopen",     # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz": "xzopen"      # lzma compressed tar
    }
2350
2351 #--------------------------------------------------------------------------
2352 # The public methods which TarFile provides:
2353
2354 def close(self):
2355 """Close the TarFile. In write-mode, two finishing zero blocks are
fd2f01f2
PG
2356 appended to the archive. A special case are empty archives which are
2357 initialized accordingly so the two mandatory blocks of zeros are
2358 written abiding by the requested encryption and compression settings.
7584f5c9
ERE
2359 """
2360 if self.closed:
2361 return
2362
2363 if self.mode in "aw":
fd2f01f2
PG
2364 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2365 self.fileobj.next ("")
7584f5c9
ERE
2366 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2367 self.offset += (BLOCKSIZE * 2)
2368 # fill up the end with zero-blocks
2369 # (like option -b20 for tar does)
2370 blocks, remainder = divmod(self.offset, RECORDSIZE)
2371 if remainder > 0:
2372 self.fileobj.write(NUL * (RECORDSIZE - remainder))
7584f5c9
ERE
2373 if not self._extfileobj:
2374 self.fileobj.close()
2375 self.closed = True
2376
2377 def getmember(self, name):
2378 """Return a TarInfo object for member `name'. If `name' can not be
2379 found in the archive, KeyError is raised. If a member occurs more
2380 than once in the archive, its last occurrence is assumed to be the
2381 most up-to-date version.
2382 """
2383 tarinfo = self._getmember(name)
2384 if tarinfo is None:
2385 raise KeyError("filename %r not found" % name)
2386 return tarinfo
2387
2388 def getmembers(self):
2389 """Return the members of the archive as a list of TarInfo objects. The
2390 list has the same order as the members in the archive.
2391 """
2392 self._check()
2393 if not self._loaded: # if we want to obtain a list of
2394 self._load() # all members, we first have to
2395 # scan the whole archive.
2396 return self.members
2397
ad4402e8
ERE
2398 def get_last_member_offset(self):
2399 """Return the last member offset. Usually this is self.fileobj.tell(),
2400 but when there's encryption or concat compression going on it's more
2401 complicated than that.
2402 """
b8fc2f5d 2403 return self.last_block_offset
ad4402e8 2404
7584f5c9
ERE
2405 def getnames(self):
2406 """Return the members of the archive as a list of their names. It has
2407 the same order as the list returned by getmembers().
2408 """
2409 return [tarinfo.name for tarinfo in self.getmembers()]
2410
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.  Returns None for unsupported file types
        (e.g. sockets).
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        # NOTE: `type' shadows the builtin within this method (kept for
        # compatibility with the original code).
        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            if tarinfo.uid in self.cache_uid2user:
                tarinfo.uname = self.cache_uid2user[tarinfo.uid]
            else:
                try:
                    tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                    self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                except KeyError:
                    # remember user does not exist:
                    # same default value as in tarinfo class
                    # (tarinfo.uname itself keeps its default here; the
                    # cached "" is used on subsequent lookups)
                    self.cache_uid2user[tarinfo.uid] = ""
        if grp:
            if tarinfo.gid in self.cache_gid2group:
                tarinfo.gname = self.cache_gid2group[tarinfo.gid]
            else:
                try:
                    tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                    self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                except KeyError:
                    # remember group does not exist:
                    # same default value as in tarinfo class
                    self.cache_gid2group[tarinfo.gid] = ""

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
2520
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                # mode, owner/group, size (or devmajor,devminor), mtime
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            # Directories get a trailing slash, as `ls -l' would show.
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded (deprecated; use
        `filter'). `filter' is a function that expects a TarInfo object
        argument and returns the changed TarInfo object, if it returns None
        the TarInfo object will be excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        # gettarinfo() returns None for unsupported file types.
        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Recurse with the same exclude/filter settings.
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)
2610
defc9a22 2611 def _size_left_file(self):
be60ffd0 2612 """Calculates size left in a volume with a maximum volume size.
ba5a449e 2613
be60ffd0 2614 Assumes self.max_volume_size is set.
ba5a449e 2615 If using compression through a _Stream, use _size_left_stream instead
be60ffd0 2616 """
ba5a449e 2617 # left-over size = max_size - offset - 2 zero-blocks written in close
ae48acc8
ERE
2618 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2619 # limit size left to a discrete number of blocks, because we won't
be60ffd0 2620 # write only half a block when writting the end of a volume
ae48acc8 2621 # and filling with zeros
defc9a22
CH
2622 return BLOCKSIZE * (size_left // BLOCKSIZE)
2623
2624 def _size_left_stream(self):
ba5a449e
CH
2625 """ Calculates size left in a volume if using comression/encryption
2626
2627 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2628 (otherwise use _size_left_file)
2629 """
2630 # left-over size = max_size - bytes written - 2 zero-blocks (close)
defc9a22
CH
2631 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2632 - 2*BLOCKSIZE
2633 return BLOCKSIZE * (size_left // BLOCKSIZE)
ae48acc8 2634
7584f5c9
ERE
2635 def addfile(self, tarinfo, fileobj=None):
2636 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2637 given, tarinfo.size bytes are read from it and added to the archive.
2638 You can create TarInfo objects using gettarinfo().
2639 On Windows platforms, `fileobj' should always be opened with mode
2640 'rb' to avoid irritation about the file size.
2641 """
2642 self._check("aw")
2643
2644 tarinfo = copy.copy(tarinfo)
cbf55ffb 2645
d1c38f40
PG
2646 if self.arcmode & ARCMODE_CONCAT:
2647 self.last_block_offset = self.fileobj.next (tarinfo.name)
11684b1d
ERE
2648 else:
2649 self.last_block_offset = self.fileobj.tell()
7584f5c9
ERE
2650
2651 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2652 self.fileobj.write(buf)
2653 self.offset += len(buf)
2654
ae9c8de2
CH
2655 if self.max_volume_size:
2656 if isinstance(self.fileobj, _Stream):
2657 _size_left = self._size_left_stream
2658 else:
2659 _size_left = self._size_left_file
2660 else:
2661 _size_left = lambda: tarinfo.size
68ddf955 2662
29c354ac
PG
2663 # If there's no data to follow, finish
2664 if not fileobj:
29c354ac
PG
2665 if self.save_to_members:
2666 self.members.append(tarinfo)
2667 return
2668
2669 target_size_left = _size_left()
2670 source_size_left = tarinfo.size
2671 assert tarinfo.volume_offset == 0
2672
2673 # we only split volumes in the middle of a file, that means we have
2674 # to write at least one block
2675 if target_size_left < BLOCKSIZE:
2676 target_size_left = BLOCKSIZE
2677
ae9c8de2
CH
2678 # loop over multiple volumes
2679 while source_size_left > 0:
ae48acc8 2680
ae9c8de2
CH
2681 # Write as much data as possble from source into target.
2682 # When compressing data, we cannot easily predict how much data we
2683 # can write until target_size_left == 0 --> need to iterate
2684 size_can_write = min(target_size_left, source_size_left)
c04e0751 2685
ae9c8de2
CH
2686 while size_can_write > 0:
2687 copyfileobj(fileobj, self.fileobj, size_can_write)
2688 self.offset += size_can_write
2689 source_size_left -= size_can_write
2690 target_size_left = _size_left()
2691 size_can_write = min(target_size_left, source_size_left)
68ddf955 2692
ae9c8de2
CH
2693 # now target_size_left == 0 or source_size_left == 0
2694
2695 # if there is data left to write, we need to create a new volume
2696 if source_size_left > 0:
5f38bff6
PG
2697 # Only finalize the crypto entry here if we’re continuing with
2698 # another one; otherwise, the encryption must include the block
2699 # padding below.
2f854e77 2700 tarinfo.type = GNUTYPE_MULTIVOL
68ddf955
ERE
2701
2702 if not self.new_volume_handler or\
2703 not callable(self.new_volume_handler):
c04e0751 2704 raise Exception("We need to create a new volume and you "
ae9c8de2 2705 "didn't supply a new_volume_handler")
68ddf955 2706
54128a00 2707
68ddf955
ERE
2708 # the new volume handler should do everything needed to
2709 # start working in a new volume. usually, the handler calls
2710 # to self.open_volume
2f854e77 2711 self.volume_number += 1
0eb5048f 2712
ae9c8de2 2713 # set to be used by open_volume, because in the case of a PAX
0eb5048f
ERE
2714 # tar it needs to write information about the volume and offset
2715 # in the global header
ae9c8de2 2716 tarinfo.volume_offset = tarinfo.size - source_size_left
0eb5048f 2717 self.volume_tarinfo = tarinfo
ae9c8de2 2718
a0873dcc
PG
2719 # the “new_volume_handler” is supposed to call .close() on the
2720 # “fileobj” _Stream
2f854e77
ERE
2721 self.new_volume_handler(self, self.base_name, self.volume_number)
2722
0eb5048f
ERE
2723 self.volume_tarinfo = None
2724
d1c38f40
PG
2725 if self.arcmode & ARCMODE_CONCAT:
2726 self.fileobj.next_volume (tarinfo.name)
5f38bff6 2727
2f854e77
ERE
2728 # write new volume header
2729 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2f854e77 2730 self.fileobj.write(buf)
ae9c8de2
CH
2731 self.offset += len(buf)
2732
2733 # adjust variables; open_volume should have reset self.offset
2734 # --> _size_left should be big again
2735 target_size_left = _size_left()
2736 size_can_write = min(target_size_left, source_size_left)
e0da4709 2737 self._dbg(3, 'new volume')
ae9c8de2
CH
2738
2739 # now, all data has been written. We may have to fill up the rest of
2740 # the block in target with 0s
2741 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2742 if remainder > 0:
2743 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2744 self.offset += BLOCKSIZE - remainder
7584f5c9 2745
ea625b04
ERE
2746 if self.save_to_members:
2747 self.members.append(tarinfo)
7584f5c9 2748
170c6c52 2749 def open_volume(self, name="", fileobj=None, encryption=None):
68ddf955 2750 '''
0eb5048f 2751 Called by the user to change this tar file to point to a new volume.
68ddf955 2752 '''
27ee4dd4 2753
68ddf955
ERE
2754 # open the file using either fileobj or name
2755 if not fileobj:
2756 if self.mode == "a" and not os.path.exists(name):
2757 # Create nonexistent files in append mode.
2758 self.mode = "w"
2759 self._mode = "wb"
68ddf955 2760 self._extfileobj = False
26fa5ad5
ERE
2761
2762 if isinstance(self.fileobj, _Stream):
e0da4709 2763 self._dbg(3, 'open_volume: create a _Stream')
26fa5ad5
ERE
2764 fileobj = _Stream(name=name,
2765 mode=self.fileobj.mode,
2766 comptype=self.fileobj.comptype,
2767 fileobj=None,
2768 bufsize=self.fileobj.bufsize,
cea130ec 2769 encryption=encryption or self.fileobj.encryption,
27ee4dd4
PG
2770 concat=self.fileobj.arcmode & ARCMODE_CONCAT,
2771 tolerance=self.fileobj.tolerance)
26fa5ad5 2772 else:
7a2b9329 2773 # here, we lose information about compression/encryption!
e0da4709 2774 self._dbg(3, 'open_volume: builtin open')
26fa5ad5 2775 fileobj = bltn_open(name, self._mode)
68ddf955
ERE
2776 else:
2777 if name is None and hasattr(fileobj, "name"):
2778 name = fileobj.name
2779 if hasattr(fileobj, "mode"):
2780 self._mode = fileobj.mode
2781 self._extfileobj = True
1027433a 2782 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
68ddf955
ERE
2783 self.name = os.path.abspath(name) if name else None
2784 self.fileobj = fileobj
2785
2786 # init data structures
2787 self.closed = False
2788 self.members = [] # list of members as TarInfo objects
2789 self._loaded = False # flag if all members have been read
2790 self.offset = self.fileobj.tell()
2791 # current position in the archive file
2792 self.inodes = {} # dictionary caching the inodes of
2793 # archive members already added
2794
2795 try:
2796 if self.mode == "r":
2797 self.firstmember = None
2798 self.firstmember = self.next()
2799
2800 if self.mode == "a":
2801 # Move to the end of the archive,
2802 # before the first empty block.
2803 while True:
2804 self.fileobj.seek(self.offset)
2805 try:
2806 tarinfo = self.tarinfo.fromtarfile(self)
2807 self.members.append(tarinfo)
2808 except EOFHeaderError:
2809 self.fileobj.seek(self.offset)
2810 break
be60ffd0 2811 except HeaderError as e:
68ddf955
ERE
2812 raise ReadError(str(e))
2813
2814 if self.mode in "aw":
2815 self._loaded = True
2816
c04e0751
ERE
2817 if self.format == PAX_FORMAT:
2818 volume_info = {
be60ffd0
ERE
2819 "GNU.volume.filename": str(self.volume_tarinfo.name),
2820 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2821 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
c04e0751 2822 }
0eb5048f 2823
c04e0751
ERE
2824 self.pax_headers.update(volume_info)
2825
a0873dcc
PG
2826 if isinstance(self.fileobj, _Stream):
2827 self.fileobj._init_write_gz ()
c04e0751
ERE
2828 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2829 self.fileobj.write(buf)
2830 self.offset += len(buf)
54128a00 2831 except Exception as exn:
68ddf955
ERE
2832 if not self._extfileobj:
2833 self.fileobj.close()
2834 self.closed = True
2835 raise
2836
e5f5681b 2837 def extractall(self, path=".", members=None, filter=None):
7584f5c9
ERE
2838 """Extract all members from the archive to the current working
2839 directory and set owner, modification time and permissions on
2840 directories afterwards. `path' specifies a different directory
2841 to extract to. `members' is optional and must be a subset of the
2842 list returned by getmembers().
2843 """
2844 directories = []
2845
2846 if members is None:
2847 members = self
2848
2849 for tarinfo in members:
c474439c
ERE
2850 if self.volume_number > 0 and tarinfo.ismultivol():
2851 continue
2852
974408b5 2853 if filter and not filter(tarinfo):
e5f5681b
ERE
2854 continue
2855
7584f5c9
ERE
2856 if tarinfo.isdir():
2857 # Extract directories with a safe mode.
2858 directories.append(tarinfo)
2859 tarinfo = copy.copy(tarinfo)
be60ffd0
ERE
2860 tarinfo.mode = 0o0700
2861 # Do not set_attrs directories, as we will do that further down
2862 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
7584f5c9
ERE
2863
2864 # Reverse sort directories.
be60ffd0 2865 directories.sort(key=lambda a: a.name)
7584f5c9
ERE
2866 directories.reverse()
2867
2868 # Set correct owner, mtime and filemode on directories.
2869 for tarinfo in directories:
2870 dirpath = os.path.join(path, tarinfo.name)
2871 try:
2872 self.chown(tarinfo, dirpath)
2873 self.utime(tarinfo, dirpath)
2874 self.chmod(tarinfo, dirpath)
be60ffd0 2875 except ExtractError as e:
7584f5c9
ERE
2876 if self.errorlevel > 1:
2877 raise
2878 else:
2879 self._dbg(1, "tarfile: %s" % e)
2880
786addd6 2881 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
7584f5c9
ERE
2882 """Extract a member from the archive to the current working directory,
2883 using its full name. Its file information is extracted as accurately
2884 as possible. `member' may be a filename or a TarInfo object. You can
be60ffd0
ERE
2885 specify a different directory using `path'. File attributes (owner,
2886 mtime, mode) are set unless `set_attrs' is False.
786addd6
PG
2887 ``symlink_cb`` is a hook accepting a function that is passed the
2888 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2889 ``member`` indicates a symlink in which case only the callback
9b13f5c4
PG
2890 passed will be applied, skipping the actual extraction. In case the
2891 callback is invoked, its return value is passed on to the caller.
7584f5c9
ERE
2892 """
2893 self._check("r")
2894
be60ffd0 2895 if isinstance(member, str):
7584f5c9
ERE
2896 tarinfo = self.getmember(member)
2897 else:
2898 tarinfo = member
2899
2900 # Prepare the link target for makelink().
2901 if tarinfo.islnk():
2902 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2903
9b13f5c4 2904 if symlink_cb is not None and tarinfo.issym():
83f5fd71 2905 return symlink_cb(member, path, set_attrs)
786addd6 2906
7584f5c9 2907 try:
be60ffd0
ERE
2908 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2909 set_attrs=set_attrs)
2910 except EnvironmentError as e:
7584f5c9
ERE
2911 if self.errorlevel > 0:
2912 raise
2913 else:
2914 if e.filename is None:
2915 self._dbg(1, "tarfile: %s" % e.strerror)
2916 else:
2917 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
be60ffd0 2918 except ExtractError as e:
7584f5c9
ERE
2919 if self.errorlevel > 1:
2920 raise
2921 else:
2922 self._dbg(1, "tarfile: %s" % e)
2923
2924 def extractfile(self, member):
2925 """Extract a member from the archive as a file object. `member' may be
be60ffd0
ERE
2926 a filename or a TarInfo object. If `member' is a regular file or a
2927 link, an io.BufferedReader object is returned. Otherwise, None is
2928 returned.
7584f5c9
ERE
2929 """
2930 self._check("r")
2931
be60ffd0 2932 if isinstance(member, str):
7584f5c9
ERE
2933 tarinfo = self.getmember(member)
2934 else:
2935 tarinfo = member
2936
be60ffd0
ERE
2937 if tarinfo.isreg() or tarinfo.ismultivol() or\
2938 tarinfo.type not in SUPPORTED_TYPES:
7584f5c9
ERE
2939 # If a member's type is unknown, it is treated as a
2940 # regular file.
2941 return self.fileobject(self, tarinfo)
2942
2943 elif tarinfo.islnk() or tarinfo.issym():
2944 if isinstance(self.fileobj, _Stream):
2945 # A small but ugly workaround for the case that someone tries
2946 # to extract a (sym)link as a file-object from a non-seekable
2947 # stream of tar blocks.
2948 raise StreamError("cannot extract (sym)link as file object")
2949 else:
2950 # A (sym)link's file object is its target's file object.
2951 return self.extractfile(self._find_link_target(tarinfo))
2952 else:
2953 # If there's no data associated with the member (directory, chrdev,
2954 # blkdev, etc.), return None instead of a file object.
2955 return None
2956
be60ffd0 2957 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
7584f5c9
ERE
2958 """Extract the TarInfo object tarinfo to a physical
2959 file called targetpath.
2960 """
2961 # Fetch the TarInfo object for the given name
2962 # and build the destination pathname, replacing
2963 # forward slashes to platform specific separators.
2964 targetpath = targetpath.rstrip("/")
2965 targetpath = targetpath.replace("/", os.sep)
2966
2967 # Create all upper directories.
2968 upperdirs = os.path.dirname(targetpath)
2969 if upperdirs and not os.path.exists(upperdirs):
2970 # Create directories that are not part of the archive with
2971 # default permissions.
2972 os.makedirs(upperdirs)
2973
2974 if tarinfo.islnk() or tarinfo.issym():
2975 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2976 else:
2977 self._dbg(1, tarinfo.name)
2978
2979 if tarinfo.isreg():
2980 self.makefile(tarinfo, targetpath)
2981 elif tarinfo.isdir():
2982 self.makedir(tarinfo, targetpath)
2983 elif tarinfo.isfifo():
2984 self.makefifo(tarinfo, targetpath)
2985 elif tarinfo.ischr() or tarinfo.isblk():
2986 self.makedev(tarinfo, targetpath)
2987 elif tarinfo.islnk() or tarinfo.issym():
2988 self.makelink(tarinfo, targetpath)
2989 elif tarinfo.type not in SUPPORTED_TYPES:
2990 self.makeunknown(tarinfo, targetpath)
2991 else:
2992 self.makefile(tarinfo, targetpath)
2993
be60ffd0
ERE
2994 if set_attrs:
2995 self.chown(tarinfo, targetpath)
2996 if not tarinfo.issym():
2997 self.chmod(tarinfo, targetpath)
2998 self.utime(tarinfo, targetpath)
7584f5c9
ERE
2999
3000 #--------------------------------------------------------------------------
3001 # Below are the different file methods. They are called via
3002 # _extract_member() when extract() is called. They can be replaced in a
3003 # subclass to implement other functionality.
3004
3005 def makedir(self, tarinfo, targetpath):
3006 """Make a directory called targetpath.
3007 """
3008 try:
3009 # Use a safe mode for the directory, the real mode is set
3010 # later in _extract_member().
be60ffd0
ERE
3011 os.mkdir(targetpath, 0o0700)
3012 except FileExistsError:
3013 pass
7584f5c9
ERE
3014
3015 def makefile(self, tarinfo, targetpath):
3016 """Make a file called targetpath.
3017 """
be60ffd0
ERE
3018 source = self.fileobj
3019 source.seek(tarinfo.offset_data)
c7c736b6 3020 decrypt = False
c474439c
ERE
3021 iterate = True
3022 target = bltn_open(targetpath, "wb")
3023
be60ffd0
ERE
3024 if tarinfo.sparse is not None:
3025 try:
3026 for offset, size in tarinfo.sparse:
3027 target.seek(offset)
3028 copyfileobj(source, target, size)
3029 target.seek(tarinfo.size)
3030 target.truncate()
3031 finally:
3032 target.close()
3033 return
3034
c474439c
ERE
3035 while iterate:
3036 iterate = False
3037 try:
3038 copyfileobj(source, target, tarinfo.size)
aa828cd1 3039 except OSError:
c474439c
ERE
3040 source.close()
3041 # only if we are extracting a multivolume this can be treated
3042 if not self.new_volume_handler:
3043 target.close()
3044 raise Exception("We need to read a new volume and you"
3045 " didn't supply a new_volume_handler")
3046
3047 # the new volume handler should do everything needed to
3048 # start working in a new volume. usually, the handler calls
3049 # to self.open_volume
3050 self.volume_number += 1
3051 self.new_volume_handler(self, self.base_name, self.volume_number)
be60ffd0
ERE
3052 tarinfo = self.firstmember
3053 source = self.fileobj
c474439c 3054 iterate = True
c474439c
ERE
3055 target.close()
3056
7584f5c9
ERE
3057
3058 def makeunknown(self, tarinfo, targetpath):
3059 """Make a file from a TarInfo object with an unknown type
3060 at targetpath.
3061 """
3062 self.makefile(tarinfo, targetpath)
3063 self._dbg(1, "tarfile: Unknown file type %r, " \
3064 "extracted as regular file." % tarinfo.type)
3065
3066 def makefifo(self, tarinfo, targetpath):
3067 """Make a fifo called targetpath.
3068 """
3069 if hasattr(os, "mkfifo"):
3070 os.mkfifo(targetpath)
3071 else:
3072 raise ExtractError("fifo not supported by system")
3073
3074 def makedev(self, tarinfo, targetpath):
3075 """Make a character or block device called targetpath.
3076 """
3077 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3078 raise ExtractError("special devices not supported by system")
3079
3080 mode = tarinfo.mode
3081 if tarinfo.isblk():
3082 mode |= stat.S_IFBLK
3083 else:
3084 mode |= stat.S_IFCHR
3085
3086 os.mknod(targetpath, mode,
3087 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3088
3089 def makelink(self, tarinfo, targetpath):
3090 """Make a (symbolic) link called targetpath. If it cannot be created
3091 (platform limitation), we try to make a copy of the referenced file
3092 instead of a link.
3093 """
be60ffd0 3094 try:
7584f5c9
ERE
3095 # For systems that support symbolic and hard links.
3096 if tarinfo.issym():
7584f5c9
ERE
3097 os.symlink(tarinfo.linkname, targetpath)
3098 else:
3099 # See extract().
3100 if os.path.exists(tarinfo._link_target):
7584f5c9
ERE
3101 os.link(tarinfo._link_target, targetpath)
3102 else:
be60ffd0
ERE
3103 self._extract_member(self._find_link_target(tarinfo),
3104 targetpath)
3105 except symlink_exception:
7584f5c9 3106 try:
be60ffd0
ERE
3107 self._extract_member(self._find_link_target(tarinfo),
3108 targetpath)
7584f5c9
ERE
3109 except KeyError:
3110 raise ExtractError("unable to resolve link inside archive")
3111
3112 def chown(self, tarinfo, targetpath):
3113 """Set owner of targetpath according to tarinfo.
3114 """
3115 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3116 # We have to be root to do so.
3117 try:
3118 g = grp.getgrnam(tarinfo.gname)[2]
3119 except KeyError:
3120 g = tarinfo.gid
3121 try:
3122 u = pwd.getpwnam(tarinfo.uname)[2]
3123 except KeyError:
3124 u = tarinfo.uid
3125 try:
3126 if tarinfo.issym() and hasattr(os, "lchown"):
3127 os.lchown(targetpath, u, g)
3128 else:
be60ffd0
ERE
3129 os.chown(targetpath, u, g)
3130 except OSError as e:
7584f5c9
ERE
3131 raise ExtractError("could not change owner")
3132
3133 def chmod(self, tarinfo, targetpath):
3134 """Set file permissions of targetpath according to tarinfo.
3135 """
3136 if hasattr(os, 'chmod'):
3137 try:
3138 os.chmod(targetpath, tarinfo.mode)
be60ffd0 3139 except OSError as e:
7584f5c9
ERE
3140 raise ExtractError("could not change mode")
3141
3142 def utime(self, tarinfo, targetpath):
3143 """Set modification time of targetpath according to tarinfo.
3144 """
3145 if not hasattr(os, 'utime'):
3146 return
3147 try:
3148 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
be60ffd0 3149 except OSError as e:
7584f5c9
ERE
3150 raise ExtractError("could not change modification time")
3151
3152 #--------------------------------------------------------------------------
3153 def next(self):
3154 """Return the next member of the archive as a TarInfo object, when
3155 TarFile is opened for reading. Return None if there is no more
3156 available.
3157 """
3158 self._check("ra")
3159 if self.firstmember is not None:
3160 m = self.firstmember
3161 self.firstmember = None
3162 return m
3163
be60ffd0
ERE
3164 # Read the next block.
3165 self.fileobj.seek(self.offset)
7584f5c9
ERE
3166 tarinfo = None
3167 while True:
3168 try:
3169 tarinfo = self.tarinfo.fromtarfile(self)
be60ffd0 3170 except EOFHeaderError as e:
7584f5c9
ERE
3171 if self.ignore_zeros:
3172 self._dbg(2, "0x%X: %s" % (self.offset, e))
3173 self.offset += BLOCKSIZE
3174 continue
be60ffd0 3175 except InvalidHeaderError as e:
7584f5c9
ERE
3176 if self.ignore_zeros:
3177 self._dbg(2, "0x%X: %s" % (self.offset, e))
3178 self.offset += BLOCKSIZE
3179 continue
3180 elif self.offset == 0:
3181 raise ReadError(str(e))
3182 except EmptyHeaderError:
3183 if self.offset == 0:
3184 raise ReadError("empty file")
be60ffd0 3185 except TruncatedHeaderError as e:
7584f5c9
ERE
3186 if self.offset == 0:
3187 raise ReadError(str(e))
be60ffd0 3188 except SubsequentHeaderError as e:
7584f5c9
ERE
3189 raise ReadError(str(e))
3190 break
3191
3192 if tarinfo is not None:
ea625b04
ERE
3193 if self.save_to_members:
3194 self.members.append(tarinfo)
7584f5c9
ERE
3195 else:
3196 self._loaded = True
3197
3198 return tarinfo
3199
3200 #--------------------------------------------------------------------------
3201 # Little helper methods:
3202
3203 def _getmember(self, name, tarinfo=None, normalize=False):
3204 """Find an archive member by name from bottom to top.
3205 If tarinfo is given, it is used as the starting point.
3206 """
3207 # Ensure that all members have been loaded.
3208 members = self.getmembers()
3209
3210 # Limit the member search list up to tarinfo.
3211 if tarinfo is not None:
3212 members = members[:members.index(tarinfo)]
3213
3214 if normalize:
3215 name = os.path.normpath(name)
3216
3217 for member in reversed(members):
3218 if normalize:
3219 member_name = os.path.normpath(member.name)
3220 else:
3221 member_name = member.name
3222
3223 if name == member_name:
3224 return member
3225
3226 def _load(self):
3227 """Read through the entire archive file and look for readable
3228 members.
3229 """
3230 while True:
3231 tarinfo = self.next()
3232 if tarinfo is None:
3233 break
3234 self._loaded = True
3235
3236 def _check(self, mode=None):
3237 """Check if TarFile is still open, and if the operation's mode
3238 corresponds to TarFile's mode.
3239 """
3240 if self.closed:
be60ffd0 3241 raise OSError("%s is closed" % self.__class__.__name__)
7584f5c9 3242 if mode is not None and self.mode not in mode:
be60ffd0 3243 raise OSError("bad operation for mode %r" % self.mode)
7584f5c9
ERE
3244
3245 def _find_link_target(self, tarinfo):
3246 """Find the target member of a symlink or hardlink member in the
3247 archive.
3248 """
3249 if tarinfo.issym():
3250 # Always search the entire archive.
3251 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3252 limit = None
3253 else:
3254 # Search the archive before the link, because a hard link is
3255 # just a reference to an already archived file.
3256 linkname = tarinfo.linkname
3257 limit = tarinfo
3258
3259 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3260 if member is None:
3261 raise KeyError("linkname %r not found" % linkname)
3262 return member
3263
3264 def __iter__(self):
3265 """Provide an iterator object.
3266 """
3267 if self._loaded:
3268 return iter(self.members)
3269 else:
3270 return TarIter(self)
3271
1027433a 3272 def _dbg(self, level, msg, *args):
7584f5c9
ERE
3273 """Write debugging output to sys.stderr.
3274 """
3275 if level <= self.debug:
1027433a 3276 print(msg.format(*args), file=sys.stderr)
7584f5c9
ERE
3277
3278 def __enter__(self):
3279 self._check()
3280 return self
3281
3282 def __exit__(self, type, value, traceback):
3283 if type is None:
3284 self.close()
3285 else:
3286 # An exception occurred. We must not call close() because
3287 # it would try to write end-of-archive blocks and padding.
3288 if not self._extfileobj:
3289 self.fileobj.close()
3290 self.closed = True
3291# class TarFile
3292
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
        When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        members = self.tarfile.members
        if self.index == 0 and self.tarfile.firstmember is not None:
            entry = self.tarfile.next()
        elif self.index < len(members):
            # Serve members that were already cached.
            entry = members[self.index]
        elif self.tarfile._loaded:
            raise StopIteration
        else:
            # Stream the next member from the archive.
            entry = self.tarfile.next()
            if not entry:
                self.tarfile._loaded = True
                raise StopIteration
        self.index += 1
        return entry
3331
6690f5e0
PG
3332#---------------------------------------------------------
3333# support functionality for rescue mode
3334#---------------------------------------------------------
3335
8fc6040c
PG
# Raw layout of a 512-byte GNU tar header block, cf. tar(5) and tar.h.
# Field widths must add up to exactly BLOCKSIZE (512).
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100];          /* 100 */
    "8s"   # ← char mode[8];            /* 108 */
    "8s"   # ← char uid[8];             /* 116 */
    "8s"   # ← char gid[8];             /* 124 */
    "12s"  # ← char size[12];           /* 136 */
    "12s"  # ← char mtime[12];          /* 148 */
    "8s"   # ← char checksum[8];        /* 156 */
    "B"    # ← char typeflag[1];        /* 157 */
    "100s" # ← char linkname[100];      /* 257 */
    "6s"   # ← char magic[6];           /* 263 */
    "2s"   # ← char version[2];         /* 265 */
    "32s"  # ← char uname[32];          /* 297 */
    "32s"  # ← char gname[32];          /* 329 */
    "8s"   # ← char devmajor[8];        /* 337 */
    "8s"   # ← char devminor[8];        /* 345 */
    "12s"  # ← char atime[12];          /* 357 */
    "12s"  # ← char ctime[12];          /* 369 */
    "12s"  # ← char offset[12];         /* 381 */
    "4s"   # ← char longnames[4];       /* 385 */
    "B"    # ← char unused[1];          /* 386 */
    ""     #   struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     #   } sparse[4];             /* 482 */
    "B"    # ← char isextended[1];      /* 483 */
    "12s"  # ← char realsize[12];       /* 495 */
    "17s"  # ← char pad[17];            /* 512 */
)

# The “magic” and “version” fields are special:
#
# tar(5)
#    magic   The magic field holds the five characters “ustar” followed by a
#            space. Note that POSIX ustar archives have a trailing null.
#
# however, “tar.h”:
#
#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
#      Found in an archive, it indicates an old GNU header format, which will be
#      hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
#      valid, though the header is not truly POSIX conforming. */
#
TAR_FMT_OLDGNU_MAGIC = b"ustar "
3388
def read_gnu_tar_hdr (data):
    """
    Parse *data* as an old-GNU tar header block.

    :returns: *None* when *data* is not exactly one block long, cannot be
              unpacked, or lacks the old-GNU magic; otherwise a dict of all
              header fields except the “unused” and “pad” filler bytes.
    """
    if len (data) != BLOCKSIZE: # header requires one complete block
        return None

    # Field names in TAR_FMT_HDR order; “unused” and “pad” are parsed but
    # dropped from the result below.
    fields = ( "name", "mode", "uid", "gid", "size", "mtime", "checksum"
             , "typeflag", "linkname", "magic", "version", "uname", "gname"
             , "devmajor", "devminor", "atime", "ctime", "offset"
             , "longnames", "unused"
             , "offset1", "numbytes1", "offset2", "numbytes2"
             , "offset3", "numbytes3", "offset4", "numbytes4"
             , "isextended", "realsize", "pad" )

    try:
        values = struct.unpack (TAR_FMT_HDR, data)
    except struct.error:
        return None

    hdr = dict (zip (fields, values))

    if hdr ["magic"] != TAR_FMT_OLDGNU_MAGIC:
        return None

    # return all except “unused” and “pad”
    del hdr ["unused"]
    del hdr ["pad"]
    return hdr
3446
3447
def readable_tar_objects_offsets (ifd):
    """
    Traverse blocks in file, trying to extract tar headers.

    :returns: List of byte offsets at which a valid-looking old-GNU tar
              header begins.
    """
    offsets = []
    pos = 0

    while True:
        blk = os.read (ifd, BLOCKSIZE)
        if len (blk) != BLOCKSIZE: # trailing partial block → stop
            break
        if read_gnu_tar_hdr (blk) is not None:
            offsets.append (pos)
        pos += BLOCKSIZE

    return offsets
65b35c42
PG
3465
3466
dfd7865e
PG
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    cands = []

    # bug fix: the mapping used to be left open, leaking the view until
    # garbage collection; release it deterministically instead
    mm = mmap.mmap (fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    try:
        pos = 0
        while True:
            pos = mm.find (GZ_MAGIC_BYTES, pos)
            if pos == -1:
                break
            cands.append (pos)
            pos += len (GZ_MAGIC_BYTES)
    finally:
        mm.close ()

    return cands
3490
3491
# Verdicts on a gzip header candidate, in increasing order of badness.
HDR_CAND_GOOD  = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable
3495
3496
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed, EOF was hit before the terminating
              NUL, or the maximum number of bytes has been exceeded; a
              Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if c == NUL:
            break
        if len (c) == 0:
            # bug fix: at EOF os.read() keeps returning b"" which never
            # equals NUL, so the original looped forever when max < 0;
            # an unterminated string is a parse failure
            return None
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
3522
3523
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname = None
    flags = 0x00
    dflags = 0x00
    mtime = 0x00000000
    oscode = 0x00
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        raw_xlen = os.read (fd, 2)
        if len (raw_xlen) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        # bug fix: struct.unpack() returns a tuple; the original passed
        # that tuple on to os.read() which raises TypeError
        xlen, = struct.unpack ("<H", raw_xlen)
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # bug fix: the comment used to be read into *fname*, clobbering the
        # file name parsed above; it is only validated here, then discarded
        fcomment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                 encoding="iso-8859-1")
        if fcomment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }
3606
3607
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    decmp = zlib.decompressobj (-zlib.MAX_WBITS) # raw deflate, no wrapper
    pos = off
    dlen = 0 # accumulated size of decompressed data

    os.lseek (ifd, pos, os.SEEK_SET)
    while True:
        cnk = os.read (ifd, BUFSIZE)
        pos += len (cnk)
        try:
            dlen += len (decmp.decompress (cnk))
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break # fishy
        if decmp.eof:
            break
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return dlen, pos - off
3638
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :returns: The subset of *cands* whose headers parse and whose payload
              yields at least one byte of decompressed data.
    """
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            continue # ignore unreadable ones
        if vdt in (HDR_CAND_GOOD, HDR_CAND_FISHY):
            off0 = cand + hdr ["hlen"] # payload begins after the header
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:
                good.append (cand)

    return good
3658
3659
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as
    compressed data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_gz_objects_offsets (fd, locate_gz_hdr_candidates (fd))
    finally:
        os.close (fd)
3673
3674
65b35c42
PG
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets (“candidates”).
    Then check each of those locations whether they can be processed as tar
    data.
    """
    fd = os.open (fname, os.O_RDONLY)
    try:
        return readable_tar_objects_offsets (fd)
    finally:
        # always release the descriptor, even if the scan blows up
        os.close (fd)
3687
3688
d39d4cbf
PG
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
    """
    Open the archive *fileobj* at byte *offset* in rescue tolerance mode
    and return the first member found there.

    :param fileobj: seekable binary file object of the volume
    :param offset:  byte position at which a tar object is expected
    :param mode:    tarfile mode string; a ``#`` in it selects concat mode
    :param secret:  optional pair *(kind, value)* where kind is one of the
                    ``crypto.PDTCRYPT_SECRET_*`` constants; used to build
                    the decryption context
    :returns: the next *TarInfo* read at *offset* (as yielded by
              ``TarFile.next()``)
    :raises RuntimeError: if *secret* carries an unrecognized secret kind
    """
    decr = None

    if secret is not None:
        ks = secret [0]

        if ks == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1])
        elif ks == crypto.PDTCRYPT_SECRET_KEY:
            # key is passed hex-encoded
            key = binascii.unhexlify (secret [1])
            decr = crypto.Decrypt (key=key)
        else:
            # was a bare ``raise RuntimeError`` — made diagnosable
            raise RuntimeError ("read_tarobj_at_offset: invalid secret kind %r"
                                % (ks, ))

    tarobj = \
        TarFile.open_at_offset (offset,
                                mode=mode,
                                fileobj=fileobj,
                                format=GNU_FORMAT,
                                concat='#' in mode,
                                encryption=decr,
                                save_to_members=False,
                                tolerance=TOLERANCE_RESCUE)

    return tarobj.next ()
3714
3715
2d50b7f7
PG
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.
    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        , "gid"    : 0
        }

    """
    entry = {}
    entry ["inode"]  = 0                      # ignored when reading the index
    entry ["uid"]    = tarinfo.uid
    entry ["gid"]    = tarinfo.gid
    entry ["path"]   = tarinfo.name           # keeping URI scheme
    entry ["offset"] = 0                      # to be added by the caller
    entry ["volume"] = tarinfo.volume_offset
    entry ["mode"]   = tarinfo.mode
    entry ["ctime"]  = tarinfo.mtime          # tar headers carry no ctime
    entry ["mtime"]  = tarinfo.mtime
    entry ["size"]   = tarinfo.size
    entry ["type"]   = tarinfo.type
    return entry
3751
3752
27ee4dd4
PG
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Walk the volumes of a damaged backup and build a pseudo index from
    whatever members can still be located and parsed.

    :param gen_volume_name: callable mapping a volume number (int) to the
                            path of that volume file
    :param mode:    "#" for plain concatenated tar, "#gz" for gzipped; when
                    a secret is supplied the encrypted scanner is used
                    regardless
    :param maxvol:  if given, keep scanning up to this volume number even
                    when individual volumes are missing; otherwise stop at
                    the first missing volume
    :param password: optional password for encrypted backups
    :param key:      optional raw key for encrypted backups
    :returns: list of index entry dicts (see *idxent_of_tarinfo*), each
              annotated with its offset and volume number
    :raises TarError: for modes without rescue handling
    """
    infos  = []
    secret = crypto.make_secret (password=password, key=key)
    nvol   = 0

    def aux (o, vol, ti):
        # Turn a recovered tar member into an index entry located at
        # (volume *vol*, byte offset *o*).
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = vol
        return ie

    while True:
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError:
            # volume does not exist
            if maxvol is not None and nvol < maxvol:
                # explicit volume number specified, ignore missing ones;
                # BUGFIX: original tested undefined name ``i`` and never
                # advanced ``nvol``, which raised NameError (and would
                # otherwise have looped forever on the same gap)
                nvol += 1
                continue
            break

        fileobj = bltn_open (vpath, "rb")
        try:
            infos += [ (off, nvol, read_tarobj_at_offset (fileobj, off, mode,
                                                          secret=secret))
                       for off in offsets ]
        finally:
            fileobj.close () # BUGFIX: was leaked, one fd per volume
        nvol += 1

    return [ aux (o, vol, ti) for o, vol, ti in infos ]
7584f5c9
ERE
3794
3795#--------------------
3796# exported functions
3797#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
    are able to handle, else return False.
    """
    try:
        # ``open`` here is TarFile.open (rebound at module level)
        archive = open(name)
        archive.close()
    except TarError:
        return False
    else:
        return True
3808
# Save the builtin open() before shadowing the module-level name with
# TarFile.open; internal code that needs real filesystem access uses
# bltn_open.
bltn_open = open
open = TarFile.open