bpo-32713: Fix tarfile.itn for large/negative float values. (GH-5434)
[python-delta-tar] / deltatar / tarfile.py
#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
"""Read from and write to tar format archives.
"""

__version__ = "$Revision: 85213 $"
# $Source$

version = "0.9.0"
__author__ = "Lars Gustäbel (lars@gustaebel.de)"
__date__ = "$Date$"
__cvsid__ = "$Id$"
__credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."

#---------
# Imports
#---------
import binascii
import copy
import errno
import functools
import io
import mmap
import operator
import os
import re
import shutil
import stat
import struct
import sys
import time

import traceback # XXX

from . import crypto

try:
    import grp, pwd
except ImportError:
    grp = pwd = None

# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

from builtins import open as _open # Since 'open' is TarFile.open

#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

GZ_FMT_HEADER = b"<BBBBLBB"
GZ_HEADER_SIZE = 10             # not including the name
GZ_MAGIC = (0x1f, 0x8b)         # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08        # 0o10
GZ_FLAG_FTEXT = 1 << 0          # ASCII payload
GZ_FLAG_FHCRC = 1 << 1          # CRC16
GZ_FLAG_FEXTRA = 1 << 2         # extra field
GZ_FLAG_FNAME = 1 << 3          # set by default in gzip
GZ_FLAG_FCOMMENT = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03               # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                GZ_METHOD_DEFLATE)

TOLERANCE_STRICT = 0
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE = 2 # deduce metadata from archive contents

BUFSIZE = 16 * 1024

#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2

def arcmode_fmt (m):
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    first = True
    ret = "["
    def chkappend (b, s):
        nonlocal m
        nonlocal ret
        nonlocal first
        if m & b:
            if first is True: first = False
            else: ret += " |"
            ret += " " + s
    chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
    chkappend (ARCMODE_COMPRESS, "COMPRESS")
    chkappend (ARCMODE_CONCAT, "CONCAT")
    return ret + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    ret = init
    if bool (concat) is True:
        ret |= ARCMODE_CONCAT
    if encryption is not None:
        ret |= ARCMODE_ENCRYPT
    if comptype == "gz":
        ret |= ARCMODE_COMPRESS
    return ret

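# Illustration (added here; not part of the original module) of how the
# arcmode bit field composes and pretty-prints:
#
#     >>> arcmode_set (concat=True, comptype="gz")
#     6
#     >>> arcmode_fmt (ARCMODE_COMPRESS | ARCMODE_CONCAT)
#     '[ COMPRESS | CONCAT ]'
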
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------

if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()

#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------

def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL

def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)

def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
    of specific size.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL

def nti(s):
    """Convert a number field to a python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    if s[0] in (0o200, 0o377):
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n

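# A quick round-trip sketch of these conversion helpers (added example):
#
#     >>> stn("foo", 8, "utf-8", "strict")
#     b'foo\x00\x00\x00\x00\x00'
#     >>> nts(b'foo\x00\x00\x00\x00\x00', "utf-8", "strict")
#     'foo'
#     >>> nti(b"0000644\x00")    # octal encoding; see itn() below
#     420
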
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s

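# Examples (added for illustration) of the two encodings, including the
# float and negative inputs that the int(n) coercion above now handles:
#
#     >>> itn(0o644)                      # POSIX octal digits plus NUL
#     b'0000644\x00'
#     >>> itn(-1, 8, GNU_FORMAT)          # base-256; 0o377 marks a negative
#     bytearray(b'\xff\xff\xff\xff\xff\xff\xff\xff')
#     >>> nti(itn(123.9, 12, GNU_FORMAT)) # floats are truncated, not rejected
#     123
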
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum

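# For instance (added note), an all-zero block contributes nothing besides
# the eight spaces substituted for the chksum field (8 * 0x20 = 256):
#
#     >>> calc_chksums(bytes(BLOCKSIZE))
#     (256, 256)
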
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        dst.write(buf)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
    if remainder != 0:
        buf = src.read(remainder)
        dst.write(buf)
        if len(buf) < remainder:
            raise OSError("end of file reached")

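# Usage sketch (added example; any pair of file-like objects works):
#
#     >>> import io
#     >>> src, dst = io.BytesIO(b"abcdef"), io.BytesIO()
#     >>> copyfileobj(src, dst, 4)
#     >>> dst.getvalue()
#     b'abcd'
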

def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)

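# E.g. (added note): stat.filemode(0o100644) == '-rw-r--r--', the same
# rendering ls(1) uses.
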
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for errors during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for errors during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end of file condition that is not an error."""
    pass

#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        _mode = {
            "r": os.O_RDONLY,
            "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)

    def tell(self):
        return self.offset

    def seek_set (self, pos):
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos


def gz_header (name=None):
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return hdr + name

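# The fixed part built above is GZ_HEADER_SIZE bytes: magic, method, flags,
# a little-endian timestamp, deflate flags and OS code; an ISO-8859-1 FNAME
# string follows only if a name was given. Added examples:
#
#     >>> len (gz_header ()) == GZ_HEADER_SIZE
#     True
#     >>> gz_header ("backup.tar.gz").startswith (GZ_MAGIC_DEFLATE)
#     True
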

class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object. The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise. Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally but is
       nevertheless used externally by Deltatar.

       When encrypting, the ``enccounter`` will be used for
       initializing the first cryptographic context. When
       decrypting, its value will be compared to the decrypted
       object. Decryption fails if the value does not match.
       In effect, this means that a ``_Stream`` whose ctor was
       passed ``enccounter`` can only be used to encrypt or
       decrypt a single object.
    """

    remainder = -1 # track size in encrypted entries
    tolerance = TOLERANCE_STRICT

    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()
        if comptype == '':
            comptype = "tar"

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.cmp = None
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.concat_pos = 0
        self.closed = False
        self.flags = 0
        self.last_block_offset = 0
        self.dbuf = b"" # buffer for decompressed data
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        # crypto parameters
        self.encryption = encryption
        self.lasthdr = None

        if encryption is not None:
            encryption.reset_last_iv ()

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self.exception = zlib.error
                    self._init_read_gz()
                elif mode == "w":
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass


    def next (self, name):
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset


    def next_volume (self, name):
        # with non-concat modes, this is taken care of by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()


    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr


    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1


    def _finalize_write_gz (self):
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
            self.buf = b""


    def _init_write_gz (self, set_last_block_offset=False):
        '''
        Add a new gzip block, closing last one
        '''
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))


    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)

    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsize
        self.__enc_write (self.buf)
        self.buf = b""

    def __write(self, s):
        """Write (and encode) string s to the stream blockwise;
        encoding and writing wait until a full block has accumulated.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]


    def __write_to_file(self, s, pos=None):
        '''
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        '''
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)


    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)


    def estim_file_size(self):
        """ estimates size of file if closing it now

        The result may differ greatly from the amount of data sent to write()
        due to compression, encryption and buffering.

        In tests the result (before calling close()) was up to 12k smaller than
        the final file size if compression is being used because zlib/bz2
        compressors do not allow inspection of their buffered data :-(

        Still, we add what close() would add: 8 bytes for the gz checksum, one
        encryption block size if encryption is used, and the size of our own
        buffer.
        """
        if self.closed:
            return self.bytes_written

        result = self.bytes_written
        if self.buf:
            result += len(self.buf)
        if self.comptype == 'gz':
            result += 8 # 2 longs = 8 bytes (no extra info written for bzip2)
        return result

    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """

        if self.closed:
            return

        if close_fileobj is True:

            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")
        self.closed = True


    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
                             "gzip header at pos %d" % self.fileobj.tell())
        if ord (read1) != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            self.__read(2)

    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context."""
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                    from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                    from exn

        return True


    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        try:
            return self.encryption.process (buf)
        except RuntimeError as exn:
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)
            raise


    def _finalize_read_encrypt (self):
        """
        Finalize decryption.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
                and self.lasthdr is not None:
            assert self.remainder >= 0
            if self.remainder > 0:
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn)
            return data


    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos == self.pos:
            pass # nothing to do
        elif pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            if self.encryption is not None:
                # IV succession is only preserved between successive objects.
                self.encryption.reset_last_iv ()
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def readline(self):
        """Read just one line, newline character included.
        """
        # if \n is in dbuf, no read needs to be done
        if b'\n' in self.dbuf:
            pos = self.dbuf.index(b'\n') + 1
            ret = self.dbuf[:pos]
            self.dbuf = self.dbuf[pos:]
            return ret

        buf = []
        while True:
            chunk = self._read(self.bufsize)

            # nothing more to read, so return the buffer
            if not chunk:
                return b''.join(buf)

            buf.append(chunk)

            # if \n found, return the new line
            if b'\n' in chunk:
                dbuf = b''.join(buf)
                pos = dbuf.index(b'\n') + 1
                self.dbuf = dbuf[pos:] + self.dbuf
                return dbuf[:pos]

    def _read(self, size):
        """Return size bytes from the stream.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            if len (buf) > 0:
                                t.append (buf)
                            if len (t) > 0:
                                break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]


    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail
                        # if the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                self.encryption.drop ()
                if self.tolerance == TOLERANCE_RECOVER:
                    if good_crypto == 0:
                        raise
                    # this may occur at any of the three crypto operations
                    # above. some objects did validate; discard all data after
                    # it; next call will start with the bad object and error
                    # out immediately
                    self.buf = b"".join (t [good_crypto:])
                    return b"".join (t [:good_crypto])
                elif self.tolerance == TOLERANCE_RESCUE:
                    # keep what we have so far despite the finalization issue
                    t.append (buf)
                    c += len (buf)
                    break
                else:
                    raise RuntimeError("internal error: bad tolerance level")

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]

        return t[:size]


class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy

#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
#class _FileInFile

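# Added example: expose five bytes starting at offset 2 of an underlying
# stream as an individual file object (the default map is one data block):
#
#     >>> import io
#     >>> f = _FileInFile(io.BytesIO(b"0123456789"), offset=2, size=5)
#     >>> f.read()
#     b'23456'
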

class ExFileObject(io.BufferedReader):

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject

#------------------
# Exported Classes
#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.name, id(self))

    def get_info(self, encoding=None, errors=None):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor,
            "offset_data": self.offset_data,
            "volume_offset": self.volume_offset
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
              errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info(encoding, errors)

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding, errors)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        if self.ismultivol():
            prefix = [
                itn(info.get("atime", 0), 12, GNU_FORMAT),
                itn(info.get("ctime", 0), 12, GNU_FORMAT),
                itn(self.volume_offset, 12, GNU_FORMAT),
                itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
            ]
            info['prefix'] = b"".join(prefix)
            info['size'] = info['size'] - self.volume_offset

        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"],
                                                GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                                encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that cannot be represented exactly, e.g. floats.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field (8 spaces)
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

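    # Sanity-check sketch (added; assumes the usual TarInfo helpers such as
    # ismultivol() defined later in this class): a short name fits into a
    # single block, and frombuf() below parses the result back:
    #
    #     >>> len(TarInfo("foo").tobuf())
    #     512
    #     >>> TarInfo.frombuf(TarInfo("foo").tobuf(), ENCODING, "strict").name
    #     'foo'
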
    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
               cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
               cls._create_payload(records)

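    # The length prefix of a pax record counts the entire record, itself
    # included, hence the fixed-point loop above. Worked example (added):
    # for keyword "path" and value "foo", l = 4 + 3 + 3 = 10, the loop
    # settles on p = 12, and the record becomes b"12 path=foo\n" (12 bytes).
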
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            obj.offset_data = nti(buf[369:381])
        return obj

1657 @classmethod
1658 def fromtarfile(cls, tarfile):
1659 """Return the next TarInfo object from TarFile object
1660 tarfile.
1661 """
1662 buf = tarfile.fileobj.read(BLOCKSIZE)
1663 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1664 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1665 return obj._proc_member(tarfile)
1666
1667 #--------------------------------------------------------------------------
1668 # The following are methods that are called depending on the type of a
1669 # member. The entry point is _proc_member() which can be overridden in a
1670 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1671 # implement the following
1672 # operations:
1673 # 1. Set self.offset_data to the position where the data blocks begin,
1674 # if there is data that follows.
1675 # 2. Set tarfile.offset to the position where the next member's header will
1676 # begin.
1677 # 3. Return self or another valid TarInfo object.
1678 def _proc_member(self, tarfile):
1679 """Choose the right processing method depending on
1680 the type and call it.
1681 """
1682 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1683 return self._proc_gnulong(tarfile)
1684 elif self.type == GNUTYPE_SPARSE:
1685 return self._proc_sparse(tarfile)
1686 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1687 return self._proc_pax(tarfile)
1688 else:
1689 return self._proc_builtin(tarfile)
1690
1691 def _proc_builtin(self, tarfile):
1692 """Process a builtin type or an unknown type which
1693 will be treated as a regular file.
1694 """
1695 self.offset_data = tarfile.fileobj.tell()
1696 offset = self.offset_data
1697 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1698 # Skip the following data blocks.
1699 offset += self._block(self.size)
1700 tarfile.offset = offset
1701
1702 # Patch the TarInfo object with saved global
1703 # header information.
1704 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1705
1706 return self
1707
1708 def _proc_gnulong(self, tarfile):
1709 """Process the blocks that hold a GNU longname
1710 or longlink member.
1711 """
1712 buf = tarfile.fileobj.read(self._block(self.size))
1713
1714 # Fetch the next header and process it.
1715 try:
1716 next = self.fromtarfile(tarfile)
1717 except HeaderError:
1718 raise SubsequentHeaderError("missing or bad subsequent header")
1719
1720 # Patch the TarInfo object from the next header with
1721 # the longname information.
1722 next.offset = self.offset
1723 if self.type == GNUTYPE_LONGNAME:
1724 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1725 elif self.type == GNUTYPE_LONGLINK:
1726 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1727
1728 return next
1729
1730 def _proc_sparse(self, tarfile):
1731 """Process a GNU sparse header plus extra headers.
1732 """
1733 # We already collected some sparse structures in frombuf().
1734 structs, isextended, origsize = self._sparse_structs
1735 del self._sparse_structs
1736
1737 # Collect sparse structures from extended header blocks.
1738 while isextended:
1739 buf = tarfile.fileobj.read(BLOCKSIZE)
1740 pos = 0
1741 for i in range(21):
1742 try:
1743 offset = nti(buf[pos:pos + 12])
1744 numbytes = nti(buf[pos + 12:pos + 24])
1745 except ValueError:
1746 break
1747 if offset and numbytes:
1748 structs.append((offset, numbytes))
1749 pos += 24
1750 isextended = bool(buf[504])
1751 self.sparse = structs
1752
1753 self.offset_data = tarfile.fileobj.tell()
1754 tarfile.offset = self.offset_data + self._block(self.size)
1755 self.size = origsize
1756 return self
1757
1758 def _proc_pax(self, tarfile):
1759 """Process an extended or global header as described in
1760 POSIX.1-2008.
1761 """
1762 # Read the header information.
1763 buf = tarfile.fileobj.read(self._block(self.size))
1764
1765 # A pax header stores supplemental information for either
1766 # the following file (extended) or all following files
1767 # (global).
1768 if self.type == XGLTYPE:
1769 pax_headers = tarfile.pax_headers
1770 else:
1771 pax_headers = tarfile.pax_headers.copy()
1772
1773 # Check if the pax header contains a hdrcharset field. This tells us
1774 # the encoding of the path, linkpath, uname and gname fields. Normally,
1775 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1776 # implementations are allowed to store them as raw binary strings if
1777 # the translation to UTF-8 fails.
1778 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1779 if match is not None:
1780 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1781
1782 # For the time being, we don't care about anything other than "BINARY".
1783 # The only other value that is currently allowed by the standard is
1784 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1785 hdrcharset = pax_headers.get("hdrcharset")
1786 if hdrcharset == "BINARY":
1787 encoding = tarfile.encoding
1788 else:
1789 encoding = "utf-8"
1790
1791 # Parse pax header information. A record looks like that:
1792 # "%d %s=%s\n" % (length, keyword, value). length is the size
1793 # of the complete record including the length field itself and
1794 # the newline. keyword and value are both UTF-8 encoded strings.
1795 regex = re.compile(br"(\d+) ([^=]+)=")
1796 pos = 0
1797 while True:
1798 match = regex.match(buf, pos)
1799 if not match:
1800 break
1801
1802 length, keyword = match.groups()
1803 length = int(length)
1804 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1805
1806 # Normally, we could just use "utf-8" as the encoding and "strict"
1807 # as the error handler, but we better not take the risk. For
1808 # example, GNU tar <= 1.23 is known to store filenames it cannot
1809 # translate to UTF-8 as raw strings (unfortunately without a
1810 # hdrcharset=BINARY header).
1811 # We first try the strict standard encoding, and if that fails we
1812 # fall back on the user's encoding and error handler.
1813 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1814 tarfile.errors)
1815 if keyword in PAX_NAME_FIELDS:
1816 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1817 tarfile.errors)
1818 else:
1819 value = self._decode_pax_field(value, "utf-8", "utf-8",
1820 tarfile.errors)
1821
1822 pax_headers[keyword] = value
1823 pos += length
1824
1826 # Fetch the next header.
1827 try:
1828 next = self.fromtarfile(tarfile)
1829 except HeaderError:
1830 raise SubsequentHeaderError("missing or bad subsequent header")
1831
1832 # Process GNU sparse information.
1833 if "GNU.sparse.map" in pax_headers:
1834 # GNU extended sparse format version 0.1.
1835 self._proc_gnusparse_01(next, pax_headers)
1836
1837 elif "GNU.sparse.size" in pax_headers:
1838 # GNU extended sparse format version 0.0.
1839 self._proc_gnusparse_00(next, pax_headers, buf)
1840
1841 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1842 # GNU extended sparse format version 1.0.
1843 self._proc_gnusparse_10(next, pax_headers, tarfile)
1844
1845 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1846 # Patch the TarInfo object with the extended header info.
1847 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1848 next.offset = self.offset
1849
1850 if "size" in pax_headers:
1851 # If the extended header replaces the size field,
1852 # we need to recalculate the offset where the next
1853 # header starts.
1854 offset = next.offset_data
1855 if next.isreg() or next.type not in SUPPORTED_TYPES:
1856 offset += next._block(next.size)
1857 tarfile.offset = offset
1858
1859 if next is not None:
1860 if "GNU.volume.filename" in pax_headers:
1861 if pax_headers["GNU.volume.filename"] == next.name:
1862 if "GNU.volume.size" in pax_headers:
1863 next.size = int(pax_headers["GNU.volume.size"])
1864 if "GNU.volume.offset" in pax_headers:
1865 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1866
# iterate over a snapshot of the keys: for a global header,
# pax_headers is tarfile.pax_headers itself and must not change
# size while being iterated
1867 for key in list(pax_headers.keys()):
1868 if key.startswith("GNU.volume"):
1869 tarfile.pax_headers.pop(key, None)
1870
1871 return next
1872
1873 def _proc_gnusparse_00(self, next, pax_headers, buf):
1874 """Process a GNU tar extended sparse header, version 0.0.
1875 """
1876 offsets = []
1877 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1878 offsets.append(int(match.group(1)))
1879 numbytes = []
1880 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1881 numbytes.append(int(match.group(1)))
1882 next.sparse = list(zip(offsets, numbytes))
1883
1884 def _proc_gnusparse_01(self, next, pax_headers):
1885 """Process a GNU tar extended sparse header, version 0.1.
1886 """
1887 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1888 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1889
1890 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1891 """Process a GNU tar extended sparse header, version 1.0.
1892 """
1894 sparse = []
1895 buf = tarfile.fileobj.read(BLOCKSIZE)
1896 fields, buf = buf.split(b"\n", 1)
1897 fields = int(fields)
1898 while len(sparse) < fields * 2:
1899 if b"\n" not in buf:
1900 buf += tarfile.fileobj.read(BLOCKSIZE)
1901 number, buf = buf.split(b"\n", 1)
1902 sparse.append(int(number))
1903 next.offset_data = tarfile.fileobj.tell()
1904 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1905
1906 def _apply_pax_info(self, pax_headers, encoding, errors):
1907 """Replace fields with supplemental information from a previous
1908 pax extended or global header.
1909 """
1910 for keyword, value in pax_headers.items():
1911 if keyword == "GNU.sparse.name":
1912 setattr(self, "path", value)
1913 elif keyword == "GNU.sparse.size":
1914 setattr(self, "size", int(value))
1915 elif keyword == "GNU.sparse.realsize":
1916 setattr(self, "size", int(value))
1917 elif keyword in PAX_FIELDS:
1918 if keyword in PAX_NUMBER_FIELDS:
1919 try:
1920 value = PAX_NUMBER_FIELDS[keyword](value)
1921 except ValueError:
1922 value = 0
1923 if keyword == "path":
1924 value = value.rstrip("/") # pylint: disable=no-member
1925 setattr(self, keyword, value)
1926
1927 self.pax_headers = pax_headers.copy()
1928
1929 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1930 """Decode a single field from a pax record.
1931 """
1932 try:
1933 return value.decode(encoding, "strict")
1934 except UnicodeDecodeError:
1935 return value.decode(fallback_encoding, fallback_errors)
1936
1937 def _block(self, count):
1938 """Round up a byte count by BLOCKSIZE and return it,
1939 e.g. _block(834) => 1024.
1940 """
1941 blocks, remainder = divmod(count, BLOCKSIZE)
1942 if remainder:
1943 blocks += 1
1944 return blocks * BLOCKSIZE
1945
1946 def isreg(self):
1947 return self.type in REGULAR_TYPES
1948 def isfile(self):
1949 return self.isreg()
1950 def isdir(self):
1951 return self.type == DIRTYPE
1952 def issym(self):
1953 return self.type == SYMTYPE
1954 def islnk(self):
1955 return self.type == LNKTYPE
1956 def ischr(self):
1957 return self.type == CHRTYPE
1958 def isblk(self):
1959 return self.type == BLKTYPE
1960 def isfifo(self):
1961 return self.type == FIFOTYPE
1962 def issparse(self):
1963 return self.sparse is not None
1964 def isdev(self):
1965 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1966 def ismultivol(self):
1967 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1968 "GNU.volume.offset" in self.pax_headers
1969# class TarInfo
1970
1971class TarFile(object):
1972 """The TarFile Class provides an interface to tar archives.
1973 """
1974
1975 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1976
1977 dereference = False # If true, add content of linked file to the
1978 # tar file, else the link.
1979
1980 ignore_zeros = False # If true, skips empty or invalid blocks and
1981 # continues processing.
1982
1983 max_volume_size = None # If different from None, establishes maximum
1984 # size of tar volumes
1985
1986 new_volume_handler = None # function handler to be executed when
1987 # a new volume is needed
1988
1989 volume_number = 0 # current volume number, used for multi volume
1990 # support
1991
1992 errorlevel = 1 # If 0, fatal errors only appear in debug
1993 # messages (if debug >= 0). If > 0, errors
1994 # are passed to the caller as exceptions.
1995
1996 format = DEFAULT_FORMAT # The format to use when creating an archive.
1997
1998 encoding = ENCODING # Encoding for 8-bit character strings.
1999
2000 errors = None # Error handler for unicode conversion.
2001
2002 tarinfo = TarInfo # The default TarInfo class to use.
2003
2004 fileobject = ExFileObject # The file-object for extractfile().
2005
2006 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
2007 # compression)
2008
2009 save_to_members = True # Whether new members are stored in
2010 # self.members. Disable this when archiving
2011 # lots of files to keep memory usage low.
2012
2013 cache_uid2user = {} # cache to avoid repeated getpwuid() calls, which parse /etc/passwd each time.
2014 cache_gid2group = {} # same cache for groups
2015
2016 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2017 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2018 errors="surrogateescape", pax_headers=None, debug=None,
2019 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2020 concat=False, nacl=None,
2021 save_to_members=True):
2022 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2023 read from an existing archive, 'a' to append data to an existing
2024 file or 'w' to create a new file overwriting an existing one. `mode'
2025 defaults to 'r'.
2026 If `fileobj' is given, it is used for reading or writing data. If it
2027 can be determined, `mode' is overridden by `fileobj's mode.
2028 `fileobj' is not closed, when TarFile is closed.
2029 """
2030 if len(mode) > 1 or mode not in "raw":
2031 raise ValueError("mode must be 'r', 'a' or 'w'")
2032 self.mode = mode
2033 self.arcmode = arcmode_set (concat)
2034 self.nacl = nacl
2035 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2036
2037 if not fileobj:
2038 if self.mode == "a" and not os.path.exists(name):
2039 # Create nonexistent files in append mode.
2040 self.mode = "w"
2041 self._mode = "wb"
2042 fileobj = bltn_open(name, self._mode)
2043 self._extfileobj = False
2044 else:
2045 if name is None and hasattr(fileobj, "name"):
2046 name = fileobj.name
2047 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2048 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2049 self._mode = fileobj.mode
2050 self._extfileobj = True
2052 self.base_name = self.name = os.path.abspath(name) if name else None
2053 self.fileobj = fileobj
2054
2055 # Init attributes.
2056 if format is not None:
2057 self.format = format
2058 if tarinfo is not None:
2059 self.tarinfo = tarinfo
2060 if dereference is not None:
2061 self.dereference = dereference
2062 if ignore_zeros is not None:
2063 self.ignore_zeros = ignore_zeros
2064 if encoding is not None:
2065 self.encoding = encoding
2066
2067 self.errors = errors
2068
2069 if pax_headers is not None and self.format == PAX_FORMAT:
2070 self.pax_headers = pax_headers
2071 else:
2072 self.pax_headers = {}
2073
2074 if debug is not None:
2075 self.debug = debug
2076 if errorlevel is not None:
2077 self.errorlevel = errorlevel
2078
2079 # Init datastructures.
2080 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2081 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2082 if max_volume_size and not callable(new_volume_handler):
2083 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2084 if max_volume_size:
2085 self.max_volume_size = int(max_volume_size)
2086 else:
2087 self.max_volume_size = None
2088
2089 self.save_to_members = save_to_members
2090 self.new_volume_handler = new_volume_handler
2091 self.closed = False
2092 self.members = [] # list of members as TarInfo objects
2093 self._loaded = False # flag if all members have been read
2094 self.offset = self.fileobj.tell()
2095 # current position in the archive file
2096 self.inodes = {} # dictionary caching the inodes of
2097 # archive members already added
2098
2099 try:
2100 if self.mode == "r":
2101 self.firstmember = None
2102 self.firstmember = self.next()
2103
2104 if self.mode == "a":
2105 # Move to the end of the archive,
2106 # before the first empty block.
2107 while True:
2108 self.fileobj.seek(self.offset)
2109 try:
2110 tarinfo = self.tarinfo.fromtarfile(self)
2111 self.members.append(tarinfo)
2112 except EOFHeaderError:
2113 self.fileobj.seek(self.offset)
2114 break
2115 except HeaderError as e:
2116 raise ReadError(str(e))
2117
2118 if self.mode in "aw":
2119 self._loaded = True
2120
2121 if self.pax_headers:
2122 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2123 self.fileobj.write(buf)
2124 self.offset += len(buf)
2125 except:
2126 if not self._extfileobj:
2127 self.fileobj.close()
2128 self.closed = True
2129 raise
2130
2131 #--------------------------------------------------------------------------
2132 # Below are the classmethods which act as alternate constructors to the
2133 # TarFile class. The open() method is the only one that is needed for
2134 # public use; it is the "super"-constructor and is able to select an
2135 # adequate "sub"-constructor for a particular compression using the mapping
2136 # from OPEN_METH.
2137 #
2138 # This concept allows one to subclass TarFile without losing the comfort of
2139 # the super-constructor. A sub-constructor is registered and made available
2140 # by adding it to the mapping in OPEN_METH.
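# A hypothetical sketch of such a registration (the "lz4" compression
# handler shown here is illustrative only, not part of this module):
#
#     class Lz4TarFile(TarFile):
#         @classmethod
#         def lz4open(cls, name, mode="r", fileobj=None, **kwargs):
#             ...  # wrap fileobj in an lz4 stream, then call cls.taropen()
#
#         OPEN_METH = dict(TarFile.OPEN_METH, lz4="lz4open")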
2141
2142 @classmethod
2143 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2144 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2145 **kwargs):
2146 """Open a tar archive for reading, writing or appending. Return
2147 an appropriate TarFile class.
2148
2149 mode:
2150 'r' or 'r:*' open for reading with transparent compression
2151 'r:' open for reading exclusively uncompressed
2152 'r:gz' open for reading with gzip compression
2153 'r:bz2' open for reading with bzip2 compression
2154 'r:xz' open for reading with lzma compression
2155 'a' or 'a:' open for appending, creating the file if necessary
2156 'w' or 'w:' open for writing without compression
2157 'w:gz' open for writing with gzip compression
2158 'w:bz2' open for writing with bzip2 compression
2159 'w:xz' open for writing with lzma compression
2160
2161 'r|*' open a stream of tar blocks with transparent compression
2162 'r|' open an uncompressed stream of tar blocks for reading
2163 'r|gz' open a gzip compressed stream of tar blocks
2164 'r|bz2' open a bzip2 compressed stream of tar blocks
2165 'r|xz' open an lzma compressed stream of tar blocks
2166 'w|' open an uncompressed stream for writing
2167 'w|gz' open a gzip compressed stream for writing
2168 'w|bz2' open a bzip2 compressed stream for writing
2169 'w|xz' open an lzma compressed stream for writing
2170
2171 'r#gz' open a stream of gzip compressed tar blocks for reading
2172 'w#gz' open a stream of gzip compressed tar blocks for writing
2173 """
2174 if not name and not fileobj:
2175 raise ValueError("nothing to open")
2176
2177 if mode in ("r", "r:*"):
2178 # Find out which *open() is appropriate for opening the file.
2179 for comptype in cls.OPEN_METH:
2180 func = getattr(cls, cls.OPEN_METH[comptype])
2181 if fileobj is not None:
2182 saved_pos = fileobj.tell()
2183 try:
2184 return func(name, "r", fileobj, **kwargs)
2185 except (ReadError, CompressionError) as e:
2186 # not necessarily fatal: restore the position and try the next comptype
2187 if fileobj is not None:
2188 fileobj.seek(saved_pos)
2189 continue
2190 raise ReadError("file could not be opened successfully")
2191
2192 elif ":" in mode:
2193 filemode, comptype = mode.split(":", 1)
2194 filemode = filemode or "r"
2195 comptype = comptype or "tar"
2196
2197 # Select the *open() function according to
2198 # given compression.
2199 if comptype in cls.OPEN_METH:
2200 func = getattr(cls, cls.OPEN_METH[comptype])
2201 else:
2202 raise CompressionError("unknown compression type %r" % comptype)
2203
2204 # Pass on compression level for gzip / bzip2.
2205 if comptype == 'gz' or comptype == 'bz2':
2206 kwargs['compresslevel'] = compresslevel
2207
2208 if 'max_volume_size' in kwargs:
2209 if comptype != 'tar' and filemode in 'wa' \
2210 and kwargs['max_volume_size']:
2211 import warnings
2212 warnings.warn('Only the first volume will be compressed '
2213 'for modes with "w:"!')
2214
2215 return func(name, filemode, fileobj, **kwargs)
2216
2217 elif "|" in mode:
2218 filemode, comptype = mode.split("|", 1)
2219 filemode = filemode or "r"
2220 comptype = comptype or "tar"
2221
2222 if filemode not in "rw":
2223 raise ValueError("mode must be 'r' or 'w'")
2224
2225 t = cls(name, filemode,
2226 _Stream(name, filemode, comptype, fileobj, bufsize,
2227 compresslevel=compresslevel),
2228 **kwargs)
2229 t._extfileobj = False
2230 return t
2231
2232 elif "#" in mode:
2233 filemode, comptype = mode.split("#", 1)
2234 filemode = filemode or "r"
2235
2236 if filemode not in "rw":
2237 raise ValueError ("mode %s not compatible with concat "
2238 "archive; must be 'r' or 'w'" % mode)
2239
2240 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2241 concat=True, encryption=encryption,
2242 compresslevel=compresslevel, tolerance=tolerance)
2243 kwargs ["concat"] = True
2244 try:
2245 t = cls(name, filemode, stream, **kwargs)
2246 except: # XXX except what?
2247 stream.close()
2248 raise # XXX raise what?
2249 t._extfileobj = False
2250 return t
2251
2252 elif mode in "aw":
2253 return cls.taropen(name, mode, fileobj, **kwargs)
2254
2255 raise ValueError("undiscernible mode %r" % mode)
2256
2257
2258 @classmethod
2259 def open_at_offset(cls, offset, *a, **kwa):
2260 """
2261 Same as ``.open()``, but start reading at the given offset. Assumes a
2262 seekable file object. Returns *None* if opening failed due to a read
2263 problem.
2264 """
2265 fileobj = kwa.get ("fileobj")
2266 if fileobj is not None:
2267 fileobj.seek (offset)
2268
2269 return cls.open (*a, **kwa)
2270
2271
2272 @classmethod
2273 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2274 """Open uncompressed tar archive name for reading or writing.
2275 """
2276 if len(mode) > 1 or mode not in "raw":
2277 raise ValueError("mode must be 'r', 'a' or 'w'")
2278 return cls(name, mode, fileobj, **kwargs)
2279
2280 @classmethod
2281 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2282 """Open gzip compressed tar archive name for reading or writing.
2283 Appending is not allowed.
2284 """
2285 if len(mode) > 1 or mode not in "rw":
2286 raise ValueError("mode must be 'r' or 'w'")
2287
2288 try:
2289 import gzip
2290 gzip.GzipFile
2291 except (ImportError, AttributeError):
2292 raise CompressionError("gzip module is not available")
2293
2294 extfileobj = fileobj is not None
2295 try:
2296 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2297 t = cls.taropen(name, mode, fileobj, **kwargs)
2298 except OSError:
2299 if not extfileobj and fileobj is not None:
2300 fileobj.close()
2301 if fileobj is None:
2302 raise
2303 raise ReadError("not a gzip file")
2304 except:
2305 if not extfileobj and fileobj is not None:
2306 fileobj.close()
2307 raise
2308 t._extfileobj = extfileobj
2309 return t
2310
2311 @classmethod
2312 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2313 """Open bzip2 compressed tar archive name for reading or writing.
2314 Appending is not allowed.
2315 """
2316 if len(mode) > 1 or mode not in "rw":
2317 raise ValueError("mode must be 'r' or 'w'.")
2318
2319 try:
2320 import bz2
2321 except ImportError:
2322 raise CompressionError("bz2 module is not available")
2323
2324 fileobj = bz2.BZ2File(fileobj or name, mode,
2325 compresslevel=compresslevel)
2326
2327 try:
2328 t = cls.taropen(name, mode, fileobj, **kwargs)
2329 except (OSError, EOFError):
2330 fileobj.close()
2331 raise ReadError("not a bzip2 file")
2332 t._extfileobj = False
2333 return t
2334
2335 @classmethod
2336 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2337 """Open lzma compressed tar archive name for reading or writing.
2338 Appending is not allowed.
2339 """
2340 if mode not in ("r", "w"):
2341 raise ValueError("mode must be 'r' or 'w'")
2342
2343 try:
2344 import lzma
2345 except ImportError:
2346 raise CompressionError("lzma module is not available")
2347
2348 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2349
2350 try:
2351 t = cls.taropen(name, mode, fileobj, **kwargs)
2352 except (lzma.LZMAError, EOFError):
2353 fileobj.close()
2354 raise ReadError("not an lzma file")
2355 t._extfileobj = False
2356 return t
2357
2358 # All *open() methods are registered here.
2359 OPEN_METH = {
2360 "tar": "taropen", # uncompressed tar
2361 "gz": "gzopen", # gzip compressed tar
2362 "bz2": "bz2open", # bzip2 compressed tar
2363 "xz": "xzopen" # lzma compressed tar
2364 }
2365
2366 #--------------------------------------------------------------------------
2367 # The public methods which TarFile provides:
2368
2369 def close(self):
2370 """Close the TarFile. In write-mode, two finishing zero blocks are
2371 appended to the archive. A special case are empty archives which are
2372 initialized accordingly so the two mandatory blocks of zeros are
2373 written abiding by the requested encryption and compression settings.
2374 """
2375 if self.closed:
2376 return
2377
2378 if self.mode in "aw":
2379 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2380 self.fileobj.next ("")
2381 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2382 self.offset += (BLOCKSIZE * 2)
2383 # fill up the end with zero-blocks
2384 # (like option -b20 for tar does)
2385 blocks, remainder = divmod(self.offset, RECORDSIZE)
2386 if remainder > 0:
2387 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2388 if not self._extfileobj:
2389 self.fileobj.close()
2390 self.closed = True
2391
2392 def getmember(self, name):
2393 """Return a TarInfo object for member `name'. If `name' can not be
2394 found in the archive, KeyError is raised. If a member occurs more
2395 than once in the archive, its last occurrence is assumed to be the
2396 most up-to-date version.
2397 """
2398 tarinfo = self._getmember(name)
2399 if tarinfo is None:
2400 raise KeyError("filename %r not found" % name)
2401 return tarinfo
2402
2403 def getmembers(self):
2404 """Return the members of the archive as a list of TarInfo objects. The
2405 list has the same order as the members in the archive.
2406 """
2407 self._check()
2408 if not self._loaded: # if we want to obtain a list of
2409 self._load() # all members, we first have to
2410 # scan the whole archive.
2411 return self.members
2412
2413 def get_last_member_offset(self):
2414 """Return the last member offset. Usually this is self.fileobj.tell(),
2415 but when there's encryption or concat compression going on it's more
2416 complicated than that.
2417 """
2418 return self.last_block_offset
2419
2420 def getnames(self):
2421 """Return the members of the archive as a list of their names. It has
2422 the same order as the list returned by getmembers().
2423 """
2424 return [tarinfo.name for tarinfo in self.getmembers()]
2425
2426 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2427 """Create a TarInfo object for either the file `name' or the file
2428 object `fileobj' (using os.fstat on its file descriptor). You can
2429 modify some of the TarInfo's attributes before you add it using
2430 addfile(). If given, `arcname' specifies an alternative name for the
2431 file in the archive.
2432 """
2433 self._check("aw")
2434
2435 # When fileobj is given, replace name by
2436 # fileobj's real name.
2437 if fileobj is not None:
2438 name = fileobj.name
2439
2440 # Build the name of the member in the archive.
2441 # Backslashes are converted to forward slashes;
2442 # absolute paths are made relative.
2443 if arcname is None:
2444 arcname = name
2445 drv, arcname = os.path.splitdrive(arcname)
2446 arcname = arcname.replace(os.sep, "/")
2447 arcname = arcname.lstrip("/")
2448
2449 # Now, fill the TarInfo object with
2450 # information specific for the file.
2451 tarinfo = self.tarinfo()
2452 tarinfo.tarfile = self
2453
2454 # Use os.stat or os.lstat, depending on platform
2455 # and if symlinks shall be resolved.
2456 if fileobj is None:
2457 if hasattr(os, "lstat") and not self.dereference:
2458 statres = os.lstat(name)
2459 else:
2460 statres = os.stat(name)
2461 else:
2462 statres = os.fstat(fileobj.fileno())
2463 linkname = ""
2464
2465 stmd = statres.st_mode
2466 if stat.S_ISREG(stmd):
2467 inode = (statres.st_ino, statres.st_dev)
2468 if not self.dereference and statres.st_nlink > 1 and \
2469 inode in self.inodes and arcname != self.inodes[inode]:
2470 # Is it a hardlink to an already
2471 # archived file?
2472 type = LNKTYPE
2473 linkname = self.inodes[inode]
2474 else:
2475 # The inode is added only if it's valid.
2476 # For win32 it is always 0.
2477 type = REGTYPE
2478 if inode[0] and self.save_to_members:
2479 self.inodes[inode] = arcname
2480 elif stat.S_ISDIR(stmd):
2481 type = DIRTYPE
2482 elif stat.S_ISFIFO(stmd):
2483 type = FIFOTYPE
2484 elif stat.S_ISLNK(stmd):
2485 type = SYMTYPE
2486 linkname = os.readlink(name)
2487 elif stat.S_ISCHR(stmd):
2488 type = CHRTYPE
2489 elif stat.S_ISBLK(stmd):
2490 type = BLKTYPE
2491 else:
2492 return None
2493
2494 # Fill the TarInfo object with all
2495 # information we can get.
2496 tarinfo.name = arcname
2497 tarinfo.mode = stmd
2498 tarinfo.uid = statres.st_uid
2499 tarinfo.gid = statres.st_gid
2500 if type == REGTYPE:
2501 tarinfo.size = statres.st_size
2502 else:
2503 tarinfo.size = 0
2504 tarinfo.mtime = statres.st_mtime
2505 tarinfo.type = type
2506 tarinfo.linkname = linkname
2507 if pwd:
2508 if tarinfo.uid in self.cache_uid2user:
2509 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2510 else:
2511 try:
2512 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2513 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2514 except KeyError:
2515 # remember user does not exist:
2516 # same default value as in tarinfo class
2517 self.cache_uid2user[tarinfo.uid] = ""
2518 if grp:
2519 if tarinfo.gid in self.cache_gid2group:
2520 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2521 else:
2522 try:
2523 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2524 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2525 except KeyError:
2526 # remember group does not exist:
2527 # same default value as in tarinfo class
2528 self.cache_gid2group[tarinfo.gid] = ""
2529
2530 if type in (CHRTYPE, BLKTYPE):
2531 if hasattr(os, "major") and hasattr(os, "minor"):
2532 tarinfo.devmajor = os.major(statres.st_rdev)
2533 tarinfo.devminor = os.minor(statres.st_rdev)
2534 return tarinfo
2535
2536 def list(self, verbose=True):
2537 """Print a table of contents to sys.stdout. If `verbose' is False, only
2538 the names of the members are printed. If it is True, an `ls -l'-like
2539 output is produced.
2540 """
2541 self._check()
2542
2543 for tarinfo in self:
2544 if verbose:
2545 print(stat.filemode(tarinfo.mode), end=' ')
2546 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2547 tarinfo.gname or tarinfo.gid), end=' ')
2548 if tarinfo.ischr() or tarinfo.isblk():
2549 print("%10s" % ("%d,%d" \
2550 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2551 else:
2552 print("%10d" % tarinfo.size, end=' ')
2553 print("%d-%02d-%02d %02d:%02d:%02d" \
2554 % time.localtime(tarinfo.mtime)[:6], end=' ')
2555
2556 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2557
2558 if verbose:
2559 if tarinfo.issym():
2560 print("->", tarinfo.linkname, end=' ')
2561 if tarinfo.islnk():
2562 print("link to", tarinfo.linkname, end=' ')
2563 print()
2564
2565 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2566 """Add the file `name' to the archive. `name' may be any type of file
2567 (directory, fifo, symbolic link, etc.). If given, `arcname'
2568 specifies an alternative name for the file in the archive.
2569 Directories are added recursively by default. This can be avoided by
2570 setting `recursive' to False. `exclude' is a function that should
2571 return True for each filename to be excluded. `filter' is a function
2572 that expects a TarInfo object argument and returns the changed
2573 TarInfo object; if it returns None, the TarInfo object will be
2574 excluded from the archive.
2575 """
2576 self._check("aw")
2577
2578 if arcname is None:
2579 arcname = name
2580
2581 # Exclude pathnames.
2582 if exclude is not None:
2583 import warnings
2584 warnings.warn("use the filter argument instead",
2585 DeprecationWarning, 2)
2586 if exclude(name):
2587 self._dbg(2, "tarfile: Excluded %r" % name)
2588 return
2589
2590 # Skip if somebody tries to archive the archive...
2591 if self.name is not None and os.path.abspath(name) == self.name:
2592 self._dbg(2, "tarfile: Skipped %r" % name)
2593 return
2594
2595 self._dbg(1, name)
2596
2597 # Create a TarInfo object from the file.
2598 tarinfo = self.gettarinfo(name, arcname)
2599
2600 if tarinfo is None:
2601 self._dbg(1, "tarfile: Unsupported type %r" % name)
2602 return
2603
2604 # Change or exclude the TarInfo object.
2605 if filter is not None:
2606 tarinfo = filter(tarinfo)
2607 if tarinfo is None:
2608 self._dbg(2, "tarfile: Excluded %r" % name)
2609 return
2610
2611 # Append the tar header and data to the archive.
2612 if tarinfo.isreg():
2613 with bltn_open(name, "rb") as f:
2614 self.addfile(tarinfo, f)
2615
2616 elif tarinfo.isdir():
2617 self.addfile(tarinfo)
2618 if recursive:
2619 for f in os.listdir(name):
2620 self.add(os.path.join(name, f), os.path.join(arcname, f),
2621 recursive, exclude, filter=filter)
2622
2623 else:
2624 self.addfile(tarinfo)
2625
2626 def _size_left_file(self):
2627 """Calculates size left in a volume with a maximum volume size.
2628
2629 Assumes self.max_volume_size is set.
2630 If using compression through a _Stream, use _size_left_stream instead
2631 """
2632 # left-over size = max_size - offset - 2 zero-blocks written in close
2633 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2634 # limit size left to a whole number of blocks, because we won't
2635 # write only half a block when writing the end of a volume
2636 # and filling with zeros
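# e.g. with max_volume_size = 10240 and offset = 3100:
# 10240 - 2*512 - 3100 = 6116 -> 11 whole blocks -> 5632 bytes usable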
2637 return BLOCKSIZE * (size_left // BLOCKSIZE)
2638
2639 def _size_left_stream(self):
2640 """ Calculates size left in a volume if using comression/encryption
2641
2642 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2643 (otherwise use _size_left_file)
2644 """
2645 # left-over size = max_size - bytes written - 2 zero-blocks (close)
2646 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2647 - 2*BLOCKSIZE
2648 return BLOCKSIZE * (size_left // BLOCKSIZE)
2649
2650 def addfile(self, tarinfo, fileobj=None):
2651 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2652 given, tarinfo.size bytes are read from it and added to the archive.
2653 You can create TarInfo objects using gettarinfo().
2654 On Windows platforms, `fileobj' should always be opened with mode
2655 'rb' to avoid irritation about the file size.
2656 """
2657 self._check("aw")
2658
2659 tarinfo = copy.copy(tarinfo)
2660
2661 if self.arcmode & ARCMODE_CONCAT:
2662 self.last_block_offset = self.fileobj.next (tarinfo.name)
2663 else:
2664 self.last_block_offset = self.fileobj.tell()
2665
2666 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2667 self.fileobj.write(buf)
2668 self.offset += len(buf)
2669
2670 if self.max_volume_size:
2671 if isinstance(self.fileobj, _Stream):
2672 _size_left = self._size_left_stream
2673 else:
2674 _size_left = self._size_left_file
2675 else:
2676 _size_left = lambda: tarinfo.size
2677
2678 # If there's no data to follow, finish
2679 if not fileobj:
2680 if self.save_to_members:
2681 self.members.append(tarinfo)
2682 return
2683
2684 target_size_left = _size_left()
2685 source_size_left = tarinfo.size
2686 assert tarinfo.volume_offset == 0
2687
2688 # we only split volumes in the middle of a file; that means we have
2689 # to write at least one block
2690 if target_size_left < BLOCKSIZE:
2691 target_size_left = BLOCKSIZE
2692
2693 # loop over multiple volumes
2694 while source_size_left > 0:
2695
2696 # Write as much data as possible from source into target.
2697 # When compressing data, we cannot easily predict how much data we
2698 # can write until target_size_left == 0 --> need to iterate
2699 size_can_write = min(target_size_left, source_size_left)
2700
2701 while size_can_write > 0:
2702 copyfileobj(fileobj, self.fileobj, size_can_write)
2703 self.offset += size_can_write
2704 source_size_left -= size_can_write
2705 target_size_left = _size_left()
2706 size_can_write = min(target_size_left, source_size_left)
2707
2708 # now target_size_left == 0 or source_size_left == 0
2709
2710 # if there is data left to write, we need to create a new volume
2711 if source_size_left > 0:
2712 # Only finalize the crypto entry here if we’re continuing with
2713 # another one; otherwise, the encryption must include the block
2714 # padding below.
2715 tarinfo.type = GNUTYPE_MULTIVOL
2716
2717 if not self.new_volume_handler or\
2718 not callable(self.new_volume_handler):
2719 raise Exception("We need to create a new volume and you "
2720 "didn't supply a new_volume_handler")
2721
2723 # the new volume handler should do everything needed to
2724 # start working in a new volume. Usually, the handler calls
2725 # self.open_volume
2726 self.volume_number += 1
2727
2728 # set to be used by open_volume, because in the case of a PAX
2729 # tar it needs to write information about the volume and offset
2730 # in the global header
2731 tarinfo.volume_offset = tarinfo.size - source_size_left
2732 self.volume_tarinfo = tarinfo
2733
2734 # the “new_volume_handler” is supposed to call .close() on the
2735 # “fileobj” _Stream
2736 self.new_volume_handler(self, self.base_name, self.volume_number)
2737
2738 self.volume_tarinfo = None
2739
2740 if self.arcmode & ARCMODE_CONCAT:
2741 self.fileobj.next_volume (tarinfo.name)
2742
2743 # write new volume header
2744 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2745 self.fileobj.write(buf)
2746 self.offset += len(buf)
2747
2748 # adjust variables; open_volume should have reset self.offset
2749 # --> _size_left should be big again
2750 target_size_left = _size_left()
2751 size_can_write = min(target_size_left, source_size_left)
2752 self._dbg(3, 'new volume')
2753
2754 # now, all data has been written. We may have to fill up the rest of
2755 # the block in target with 0s
2756 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2757 if remainder > 0:
2758 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2759 self.offset += BLOCKSIZE - remainder
2760
2761 if self.save_to_members:
2762 self.members.append(tarinfo)
2763
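# A minimal sketch of a new-volume handler as invoked by addfile() above;
# the "<base_name>.<n>" naming scheme is illustrative only:
#
#     def handler(tarobj, base_name, volume_number):
#         tarobj.open_volume("%s.%d" % (base_name, volume_number))
#
#     tar = TarFile.open("archive.tar", mode="w",
#                        max_volume_size=10 * 1024 * 1024,
#                        new_volume_handler=handler)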
2764 def open_volume(self, name="", fileobj=None, encryption=None):
2765 '''
2766 Called by the user to change this tar file to point to a new volume.
2767 '''
2768
2769 # open the file using either fileobj or name
2770 if not fileobj:
2771 if self.mode == "a" and not os.path.exists(name):
2772 # Create nonexistent files in append mode.
2773 self.mode = "w"
2774 self._mode = "wb"
2775 self._extfileobj = False
2776
2777 if isinstance(self.fileobj, _Stream):
2778 self._dbg(3, 'open_volume: create a _Stream')
2779 fileobj = _Stream(name=name,
2780 mode=self.fileobj.mode,
2781 comptype=self.fileobj.comptype,
2782 fileobj=None,
2783 bufsize=self.fileobj.bufsize,
2784 encryption=encryption or self.fileobj.encryption,
2785 concat=self.fileobj.arcmode & ARCMODE_CONCAT,
2786 tolerance=self.fileobj.tolerance)
2787 else:
2788 # here, we lose information about compression/encryption!
2789 self._dbg(3, 'open_volume: builtin open')
2790 fileobj = bltn_open(name, self._mode)
2791 else:
2792 if name is None and hasattr(fileobj, "name"):
2793 name = fileobj.name
2794 if hasattr(fileobj, "mode"):
2795 self._mode = fileobj.mode
2796 self._extfileobj = True
2797 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2798 self.name = os.path.abspath(name) if name else None
2799 self.fileobj.close()
2800 self.fileobj = fileobj
2801
2802 # init data structures
2803 self.closed = False
2804 self.members = [] # list of members as TarInfo objects
2805 self._loaded = False # flag if all members have been read
2806 self.offset = self.fileobj.tell()
2807 # current position in the archive file
2808 self.inodes = {} # dictionary caching the inodes of
2809 # archive members already added
2810
2811 try:
2812 if self.mode == "r":
2813 self.firstmember = None
2814 self.firstmember = self.next()
2815
2816 if self.mode == "a":
2817 # Move to the end of the archive,
2818 # before the first empty block.
2819 while True:
2820 self.fileobj.seek(self.offset)
2821 try:
2822 tarinfo = self.tarinfo.fromtarfile(self)
2823 self.members.append(tarinfo)
2824 except EOFHeaderError:
2825 self.fileobj.seek(self.offset)
2826 break
2827 except HeaderError as e:
2828 raise ReadError(str(e))
2829
2830 if self.mode in "aw":
2831 self._loaded = True
2832
2833 if self.format == PAX_FORMAT:
2834 volume_info = {
2835 "GNU.volume.filename": str(self.volume_tarinfo.name),
2836 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2837 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2838 }
2839
2840 self.pax_headers.update(volume_info)
2841
2842 if isinstance(self.fileobj, _Stream):
2843 self.fileobj._init_write_gz ()
2844 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2845 self.fileobj.write(buf)
2846 self.offset += len(buf)
2847 except Exception:
2848 if not self._extfileobj:
2849 self.fileobj.close()
2850 self.closed = True
2851 raise
2852
2853 def extractall(self, path=".", members=None, filter=None, unlink=False):
2854 """Extract all members from the archive to the current working
2855 directory and set owner, modification time and permissions on
2856 directories afterwards. `path' specifies a different directory
2857 to extract to. `members' is optional and must be a subset of the
2858 list returned by getmembers().
2859 """
2860 directories = []
2861
2862 if members is None:
2863 members = self
2864
2865 for tarinfo in members:
2866 if self.volume_number > 0 and tarinfo.ismultivol():
2867 continue
2868
2869 if filter and not filter(tarinfo):
2870 continue
2871
2872 if tarinfo.isdir():
2873 # Extract directories with a safe mode.
2874 directories.append(tarinfo)
2875 tarinfo = copy.copy(tarinfo)
2876 tarinfo.mode = 0o0700
2877 # Do not set_attrs directories, as we will do that further down
2878 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), unlink=unlink)
2879
2880 # Reverse sort directories.
2881 directories.sort(key=lambda a: a.name)
2882 directories.reverse()
2883
2884 # Set correct owner, mtime and filemode on directories.
2885 for tarinfo in directories:
2886 dirpath = os.path.join(path, tarinfo.name)
2887 try:
2888 self.chown(tarinfo, dirpath)
2889 self.utime(tarinfo, dirpath)
2890 self.chmod(tarinfo, dirpath)
2891 except ExtractError as e:
2892 if self.errorlevel > 1:
2893 raise
2894 else:
2895 self._dbg(1, "tarfile: %s" % e)
2896
2897 def extract(self, member, path="", set_attrs=True, symlink_cb=None,
2898 unlink=False):
2899 """Extract a member from the archive to the current working directory,
2900 using its full name. Its file information is extracted as accurately
2901 as possible. `member' may be a filename or a TarInfo object. You can
2902 specify a different directory using `path'. File attributes (owner,
2903 mtime, mode) are set unless `set_attrs' is False.
2904 ``symlink_cb`` is a hook accepting a function that is passed the
2905 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2906 ``member`` indicates a symlink. In that case only the callback is
2907 applied and the actual extraction is skipped. When the callback is
2908 invoked, its return value is passed on to the caller.
2909 """
2910 self._check("r")
2911
2912 if isinstance(member, str):
2913 tarinfo = self.getmember(member)
2914 else:
2915 tarinfo = member
2916
2917 # Prepare the link target for makelink().
2918 if tarinfo.islnk():
2919 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2920
2921 if symlink_cb is not None and tarinfo.issym():
2922 return symlink_cb(member, path, set_attrs)
2923
2924 try:
2925 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2926 set_attrs=set_attrs, unlink=unlink)
2927 except EnvironmentError as e:
2928 if self.errorlevel > 0:
2929 raise
2930 else:
2931 if e.filename is None:
2932 self._dbg(1, "tarfile: %s" % e.strerror)
2933 else:
2934 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2935 except ExtractError as e:
2936 if self.errorlevel > 1:
2937 raise
2938 else:
2939 self._dbg(1, "tarfile: %s" % e)
2940
2941 def extractfile(self, member):
2942 """Extract a member from the archive as a file object. `member' may be
2943 a filename or a TarInfo object. If `member' is a regular file or a
2944 link, an io.BufferedReader object is returned. Otherwise, None is
2945 returned.
2946 """
2947 self._check("r")
2948
2949 if isinstance(member, str):
2950 tarinfo = self.getmember(member)
2951 else:
2952 tarinfo = member
2953
2954 if tarinfo.isreg() or tarinfo.ismultivol() or\
2955 tarinfo.type not in SUPPORTED_TYPES:
2956 # If a member's type is unknown, it is treated as a
2957 # regular file.
2958 return self.fileobject(self, tarinfo)
2959
2960 elif tarinfo.islnk() or tarinfo.issym():
2961 if isinstance(self.fileobj, _Stream):
2962 # A small but ugly workaround for the case that someone tries
2963 # to extract a (sym)link as a file-object from a non-seekable
2964 # stream of tar blocks.
2965 raise StreamError("cannot extract (sym)link as file object")
2966 else:
2967 # A (sym)link's file object is its target's file object.
2968 return self.extractfile(self._find_link_target(tarinfo))
2969 else:
2970 # If there's no data associated with the member (directory, chrdev,
2971 # blkdev, etc.), return None instead of a file object.
2972 return None
2973
2974 def _extract_member(self, tarinfo, targetpath, set_attrs=True, unlink=False):
2975 """Extract the TarInfo object tarinfo to a physical
2976 file called targetpath.
2977 """
2978 # Fetch the TarInfo object for the given name
2979 # and build the destination pathname, replacing
2980 # forward slashes to platform specific separators.
2981 targetpath = targetpath.rstrip("/")
2982 targetpath = targetpath.replace("/", os.sep)
2983
2984 # Create all upper directories.
2985 upperdirs = os.path.dirname(targetpath)
2986 if upperdirs and not os.path.exists(upperdirs):
2987 # Create directories that are not part of the archive with
2988 # default permissions.
2989 os.makedirs(upperdirs)
2990
2991 if tarinfo.islnk() or tarinfo.issym():
2992 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2993 else:
2994 self._dbg(1, tarinfo.name)
2995
2996 if unlink is True:
2997 _unlinkfirst(targetpath)
2998
2999 if tarinfo.isreg():
3000 self.makefile(tarinfo, targetpath)
3001 elif tarinfo.isdir():
3002 self.makedir(tarinfo, targetpath)
3003 elif tarinfo.isfifo():
3004 self.makefifo(tarinfo, targetpath)
3005 elif tarinfo.ischr() or tarinfo.isblk():
3006 self.makedev(tarinfo, targetpath)
3007 elif tarinfo.islnk() or tarinfo.issym():
3008 self.makelink(tarinfo, targetpath)
3009 elif tarinfo.type not in SUPPORTED_TYPES:
3010 self.makeunknown(tarinfo, targetpath)
3011 else:
3012 self.makefile(tarinfo, targetpath)
3013
3014 if set_attrs:
3015 self.chown(tarinfo, targetpath)
3016 if not tarinfo.issym():
3017 self.chmod(tarinfo, targetpath)
3018 self.utime(tarinfo, targetpath)
3019
3020 #--------------------------------------------------------------------------
3021 # Below are the different file methods. They are called via
3022 # _extract_member() when extract() is called. They can be replaced in a
3023 # subclass to implement other functionality.
3024
3025 def makedir(self, tarinfo, targetpath):
3026 """Make a directory called targetpath.
3027 """
3028 try:
3029 # Use a safe mode for the directory, the real mode is set
3030 # later in _extract_member().
3031 os.mkdir(targetpath, 0o0700)
3032 except FileExistsError:
3033 pass
3034
3035 def makefile(self, tarinfo, targetpath):
3036 """Make a file called targetpath.
3037 """
3038 source = self.fileobj
3039 source.seek(tarinfo.offset_data)
3041 iterate = True
3042 target = bltn_open(targetpath, "wb")
3043
3044 if tarinfo.sparse is not None:
3045 try:
3046 for offset, size in tarinfo.sparse:
3047 target.seek(offset)
3048 copyfileobj(source, target, size)
3049 target.seek(tarinfo.size)
3050 target.truncate()
3051 finally:
3052 target.close()
3053 return
3054
3055 while iterate:
3056 iterate = False
3057 try:
3058 copyfileobj(source, target, tarinfo.size)
3059 except OSError:
3060 source.close()
3061 # only if we are extracting a multivolume this can be treated
3062 if not self.new_volume_handler:
3063 raise Exception("We need to read a new volume and you"
3064 " didn't supply a new_volume_handler")
3065
3066 # the new volume handler should do everything needed to
3067 # start working in a new volume. Usually, the handler calls
3068 # self.open_volume
3069 self.volume_number += 1
3070 self.new_volume_handler(self, self.base_name, self.volume_number)
3071 tarinfo = self.firstmember
3072 source = self.fileobj
3073 iterate = True
3074 finally:
3075 if iterate is False: target.close()
3076
3077
3078 def makeunknown(self, tarinfo, targetpath):
3079 """Make a file from a TarInfo object with an unknown type
3080 at targetpath.
3081 """
3082 self.makefile(tarinfo, targetpath)
3083 self._dbg(1, "tarfile: Unknown file type %r, " \
3084 "extracted as regular file." % tarinfo.type)
3085
3086 def makefifo(self, tarinfo, targetpath):
3087 """Make a fifo called targetpath.
3088 """
3089 if hasattr(os, "mkfifo"):
3090 os.mkfifo(targetpath)
3091 else:
3092 raise ExtractError("fifo not supported by system")
3093
3094 def makedev(self, tarinfo, targetpath):
3095 """Make a character or block device called targetpath.
3096 """
3097 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3098 raise ExtractError("special devices not supported by system")
3099
3100 mode = tarinfo.mode
3101 if tarinfo.isblk():
3102 mode |= stat.S_IFBLK
3103 else:
3104 mode |= stat.S_IFCHR
3105
3106 os.mknod(targetpath, mode,
3107 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3108
3109 def makelink(self, tarinfo, targetpath):
3110 """Make a (symbolic) link called targetpath. If it cannot be created
3111 (platform limitation), we try to make a copy of the referenced file
3112 instead of a link.
3113 """
3114 try:
3115 # For systems that support symbolic and hard links.
3116 if tarinfo.issym():
3117 os.symlink(tarinfo.linkname, targetpath)
3118 else:
3119 # See extract().
3120 if os.path.exists(tarinfo._link_target):
3121 os.link(tarinfo._link_target, targetpath)
3122 else:
3123 self._extract_member(self._find_link_target(tarinfo),
3124 targetpath)
3125 except symlink_exception:
3126 try:
3127 self._extract_member(self._find_link_target(tarinfo),
3128 targetpath)
3129 except KeyError:
3130 raise ExtractError("unable to resolve link inside archive")
3131
3132 def chown(self, tarinfo, targetpath):
3133 """Set owner of targetpath according to tarinfo.
3134 """
3135 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3136 # We have to be root to do so.
3137 try:
3138 g = grp.getgrnam(tarinfo.gname)[2]
3139 except KeyError:
3140 g = tarinfo.gid
3141 try:
3142 u = pwd.getpwnam(tarinfo.uname)[2]
3143 except KeyError:
3144 u = tarinfo.uid
3145 try:
3146 if tarinfo.issym() and hasattr(os, "lchown"):
3147 os.lchown(targetpath, u, g)
3148 else:
3149 os.chown(targetpath, u, g)
3150 except OSError as e:
3151 raise ExtractError("could not change owner")
3152
3153 def chmod(self, tarinfo, targetpath):
3154 """Set file permissions of targetpath according to tarinfo.
3155 """
3156 if hasattr(os, 'chmod'):
3157 try:
3158 os.chmod(targetpath, tarinfo.mode)
3159 except OSError as e:
3160 raise ExtractError("could not change mode")
3161
3162 def utime(self, tarinfo, targetpath):
3163 """Set modification time of targetpath according to tarinfo.
3164 """
3165 if not hasattr(os, 'utime'):
3166 return
3167 try:
3168 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3169 except OSError as e:
3170 raise ExtractError("could not change modification time")
3171
3172 #--------------------------------------------------------------------------
3173 def next(self):
3174 """Return the next member of the archive as a TarInfo object, when
3175 TarFile is opened for reading. Return None if there is no more
3176 available.
3177 """
3178 self._check("ra")
3179 if self.firstmember is not None:
3180 m = self.firstmember
3181 self.firstmember = None
3182 return m
3183
3184 # Read the next block.
3185 self.fileobj.seek(self.offset)
3186 tarinfo = None
3187 while True:
3188 try:
3189 tarinfo = self.tarinfo.fromtarfile(self)
3190 except EOFHeaderError as e:
3191 if self.ignore_zeros:
3192 self._dbg(2, "0x%X: %s" % (self.offset, e))
3193 self.offset += BLOCKSIZE
3194 continue
3195 except InvalidHeaderError as e:
3196 if self.ignore_zeros:
3197 self._dbg(2, "0x%X: %s" % (self.offset, e))
3198 self.offset += BLOCKSIZE
3199 continue
3200 elif self.offset == 0:
3201 raise ReadError(str(e))
3202 except EmptyHeaderError:
3203 if self.offset == 0:
3204 raise ReadError("empty file")
3205 except TruncatedHeaderError as e:
3206 if self.offset == 0:
3207 raise ReadError(str(e))
3208 except SubsequentHeaderError as e:
3209 raise ReadError(str(e))
3210 break
3211
3212 if tarinfo is not None:
3213 if self.save_to_members:
3214 self.members.append(tarinfo)
3215 else:
3216 self._loaded = True
3217
3218 return tarinfo
3219
3220 #--------------------------------------------------------------------------
3221 # Little helper methods:
3222
3223 def _getmember(self, name, tarinfo=None, normalize=False):
3224 """Find an archive member by name from bottom to top.
3225 If tarinfo is given, it is used as the starting point.
3226 """
3227 # Ensure that all members have been loaded.
3228 members = self.getmembers()
3229
3230 # Limit the member search list up to tarinfo.
3231 if tarinfo is not None:
3232 members = members[:members.index(tarinfo)]
3233
3234 if normalize:
3235 name = os.path.normpath(name)
3236
3237 for member in reversed(members):
3238 if normalize:
3239 member_name = os.path.normpath(member.name)
3240 else:
3241 member_name = member.name
3242
3243 if name == member_name:
3244 return member
3245
3246 def _load(self):
3247 """Read through the entire archive file and look for readable
3248 members.
3249 """
3250 while True:
3251 tarinfo = self.next()
3252 if tarinfo is None:
3253 break
3254 self._loaded = True
3255
3256 def _check(self, mode=None):
3257 """Check if TarFile is still open, and if the operation's mode
3258 corresponds to TarFile's mode.
3259 """
3260 if self.closed:
3261 raise OSError("%s is closed" % self.__class__.__name__)
3262 if mode is not None and self.mode not in mode:
3263 raise OSError("bad operation for mode %r" % self.mode)
3264
3265 def _find_link_target(self, tarinfo):
3266 """Find the target member of a symlink or hardlink member in the
3267 archive.
3268 """
3269 if tarinfo.issym():
3270 # Always search the entire archive.
3271 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3272 limit = None
3273 else:
3274 # Search the archive before the link, because a hard link is
3275 # just a reference to an already archived file.
3276 linkname = tarinfo.linkname
3277 limit = tarinfo
3278
3279 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3280 if member is None:
3281 raise KeyError("linkname %r not found" % linkname)
3282 return member
3283
3284 def __iter__(self):
3285 """Provide an iterator object.
3286 """
3287 if self._loaded:
3288 return iter(self.members)
3289 else:
3290 return TarIter(self)
3291
3292 def _dbg(self, level, msg, *args):
3293 """Write debugging output to sys.stderr.
3294 """
3295 if level <= self.debug:
3296 print(msg.format(*args), file=sys.stderr)
3297
3298 def __enter__(self):
3299 self._check()
3300 return self
3301
3302 def __exit__(self, type, value, traceback):
3303 if type is None:
3304 self.close()
3305 else:
3306 # An exception occurred. We must not call close() because
3307 # it would try to write end-of-archive blocks and padding.
3308 if not self._extfileobj:
3309 self.fileobj.close()
3310 self.closed = True
3311
3312def _unlinkfirst(targetpath):
3313 try:
3314 os.unlink(targetpath)
3315 except OSError as e:
# swallow only "file not found" / "is a directory";
# anything else is a real error and must propagate
3316 if e.errno not in (errno.ENOENT, errno.EISDIR):
3317 raise
3318
3319
3320# class TarFile
3321
3322class TarIter:
3323 """Iterator Class.
3324
3325 for tarinfo in TarFile(...):
3326 suite...
3327 """
3328
3329 def __init__(self, tarfile):
3330 """Construct a TarIter object.
3331 """
3332 self.tarfile = tarfile
3333 self.index = 0
3334 def __iter__(self):
3335 """Return iterator object.
3336 """
3337 return self
3338 def __next__(self):
3339 """Return the next item using TarFile's next() method.
3340 When all members have been read, set TarFile as _loaded.
3341 """
3342 # Fix for SF #1100429: Under rare circumstances it can
3343 # happen that getmembers() is called during iteration,
3344 # which will cause TarIter to stop prematurely.
3345
3346 if self.index == 0 and self.tarfile.firstmember is not None:
3347 tarinfo = self.tarfile.next()
3348 elif self.index < len(self.tarfile.members):
3349 tarinfo = self.tarfile.members[self.index]
3350 elif not self.tarfile._loaded:
3351 tarinfo = self.tarfile.next()
3352 if not tarinfo:
3353 self.tarfile._loaded = True
3354 raise StopIteration
3355 else:
3356 raise StopIteration
3357 self.index += 1
3358
3359 return tarinfo
3360
3361#---------------------------------------------------------
3362# support functionality for rescue mode
3363#---------------------------------------------------------
3364
3365TAR_FMT_HDR = (# See tar(5):
3366 "<"
3367 "100s" # ← char name[100]; /* 100 */
3368 "8s" # ← char mode[8]; /* 108 */
3369 "8s" # ← char uid[8]; /* 116 */
3370 "8s" # ← char gid[8]; /* 124 */
3371 "12s" # ← char size[12]; /* 136 */
3372 "12s" # ← char mtime[12]; /* 148 */
3373 "8s" # ← char checksum[8]; /* 156 */
3374 "B" # ← char typeflag[1]; /* 157 */
3375 "100s" # ← char linkname[100]; /* 257 */
3376 "6s" # ← char magic[6]; /* 263 */
3377 "2s" # ← char version[2]; /* 265 */
3378 "32s" # ← char uname[32]; /* 297 */
3379 "32s" # ← char gname[32]; /* 329 */
3380 "8s" # ← char devmajor[8]; /* 337 */
3381 "8s" # ← char devminor[8]; /* 345 */
3382 "12s" # ← char atime[12]; /* 357 */
3383 "12s" # ← char ctime[12]; /* 369 */
3384 "12s" # ← char offset[12]; /* 381 */
3385 "4s" # ← char longnames[4]; /* 385 */
3386 "B" # ← char unused[1]; /* 386 */
3387 "" # struct {
3388 "12s" # ← char offset[12];
3389 "12s" # ← char numbytes[12];
3390 "12s" # ← char offset[12];
3391 "12s" # ← char numbytes[12];
3392 "12s" # ← char offset[12];
3393 "12s" # ← char numbytes[12];
3394 "12s" # ← char offset[12];
3395 "12s" # ← char numbytes[12];
3396 "" # } sparse[4]; /* 482 */
3397 "B" # ← char isextended[1]; /* 483 */
3398 "12s" # ← char realsize[12]; /* 495 */
3399 "17s" # ← char pad[17]; /* 512 */
3400)
3401
3402# The “magic” and “version” fields are special:
3403#
3404# tar(5)
3405# magic The magic field holds the five characters “ustar” followed by a
3406# space. Note that POSIX ustar archives have a trailing null.
3407#
3408# however, “tar.h”:
3409#
3410# /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3411# Found in an archive, it indicates an old GNU header format, which will be
3412# hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3413# valid, though the header is not truly POSIX conforming. */
3414#
3415#
3416TAR_HDR_OFF_MAGIC = 257
3417TAR_FMT_OLDGNU_MAGIC = b"ustar "
3418
3419def read_gnu_tar_hdr (data):
3420 if len (data) != BLOCKSIZE: # header requires one complete block
3421 return None
3422
3423 try:
3424 name, mode, \
3425 uid, gid, \
3426 size, mtime, \
3427 checksum, \
3428 typeflag, \
3429 linkname, \
3430 magic, \
3431 version, \
3432 uname, \
3433 gname, \
3434 devmajor, \
3435 devminor, \
3436 atime, \
3437 ctime, \
3438 offset, \
3439 longnames, \
3440 unused, \
3441 offset1, numbytes1, \
3442 offset2, numbytes2, \
3443 offset3, numbytes3, \
3444 offset4, numbytes4, \
3445 isextended, \
3446 realsize, \
3447 pad = struct.unpack (TAR_FMT_HDR, data)
3448 except struct.error:
3449 return None
3450
3451 if magic != TAR_FMT_OLDGNU_MAGIC:
3452 return None
3453
3454 # return all except “unused” and “pad”
3455 return \
3456 { "name" : name, "mode" : mode
3457 , "uid" : uid , "gid" : gid
3458 , "size" : size, "mtime" : mtime
3459 , "checksum" : checksum
3460 , "typeflag" : typeflag
3461 , "linkname" : linkname
3462 , "magic" : magic
3463 , "version" : version
3464 , "uname" : uname, "gname" : gname
3465 , "devmajor" : devmajor, "devminor" : devminor
3466 , "atime" : atime, "ctime" : ctime
3467 , "offset" : offset
3468 , "longnames" : longnames
3469 , "offset1" : offset1, "numbytes1" : numbytes1
3470 , "offset2" : offset2, "numbytes2" : numbytes2
3471 , "offset3" : offset3, "numbytes3" : numbytes3
3472 , "offset4" : offset4, "numbytes4" : numbytes4
3473 , "isextended" : isextended
3474 , "realsize" : realsize
3475 }
3476
3477
3478def tar_hdr_check_chksum (data):
3479 hdr = read_gnu_tar_hdr (data)
3480 if hdr is None:
3481 return False
3482 s = calc_chksums (data)
3483 return nti (hdr ["checksum"]) in s
3484
3485
3486def readable_tar_objects_offsets (ifd):
3487 """
3488 Traverse blocks in file, trying to extract tar headers.
3489 """
3491 offsets = []
3492
3493 mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3494 pos = TAR_HDR_OFF_MAGIC
3495
3496 while True:
3497 pos = mm.find (TAR_FMT_OLDGNU_MAGIC, pos)
3498 if pos == -1:
3499 break
3500 off = pos - TAR_HDR_OFF_MAGIC
3501 mm.seek (off)
3502 blk = mm.read (BLOCKSIZE)
3503 if tar_hdr_check_chksum (blk) is True:
3504 offsets.append (off)
3505 pos += 1
3506
3507 return offsets
3508
3509
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first instance found is not zero, the
    file begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot
    of false positives inside binary data.

    :return: The list of offsets in the file.
    """
    pos   = 0
    cands = []

    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    try:
        while True:
            pos = mm.find (GZ_MAGIC_BYTES, pos)
            if pos == -1:
                break
            cands.append (pos)
            pos += len (GZ_MAGIC_BYTES)
    finally:
        mm.close ()

    return cands


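# Demonstration sketch (ours): two concatenated gzip streams yield at least
# two candidates; additional hits, if any, are the false positives mentioned
# in the docstring above.
def _example_scan_gz_candidates ():
    import gzip, tempfile
    with tempfile.TemporaryFile () as tmp:
        tmp.write (gzip.compress (b"first object"))
        tmp.write (gzip.compress (b"second object"))
        tmp.flush ()
        return locate_gz_hdr_candidates (tmp.fileno ())

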
HDR_CAND_GOOD  = 0 # header marks the beginning of a valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable


def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max*
    is non-negative, reading terminates after the specified number of bytes.

    Optionally, an *encoding* may be specified in which to decode the data.

    :returns: *None* if parsing failed or the maximum number of bytes has
              been exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if c == NUL:
            break
        if len (c) == 0: # EOF before the terminating NUL: parsing failed
            return None
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1
    if encoding is not None:
        buf = buf.decode (encoding)

    return buf


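# Usage sketch (names made up): read_cstring () consumes bytes up to and
# including the terminating NUL, leaving the file offset right behind it —
# which is what the gzip FNAME/FCOMMENT parsing below relies on.
def _example_read_cstring ():
    r, w = os.pipe ()
    try:
        os.write (w, b"payload.tar\0trailing data")
        return read_cstring (r, max=LENGTH_NAME, encoding="iso-8859-1")
        # ⇒ "payload.tar"; b"trailing data" stays in the pipe
    finally:
        os.close (r)
        os.close (w)

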
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented in RFC 1952.

    Returns a verdict about the quality of that header plus the parsed
    header when readable. Problematic sizes such as fields running past the
    EOF are treated as garbage. Headers that merely don’t conform to the
    spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    encoded.
    """
    fname   = None
    flags   = 0x00
    dflags  = 0x00
    mtime   = 0x00000000
    oscode  = 0x00
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()):
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        xlen = struct.unpack ("<H", os.read (fd, 2))[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        # allowed by tar(5)
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # read and discard; the comment only matters for validation and must
        # not clobber the file name read above
        comment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                encoding="iso-8859-1")
        if comment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    return verdict, \
        { "fname"  : fname
        , "flags"  : flags
        , "dflags" : dflags
        , "mtime"  : mtime
        , "oscode" : oscode
        , "hlen"   : hlen
        }


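# Reporting sketch (names ours): inspect_gz_hdr () only ever yields the three
# HDR_CAND_* grades defined above, so a verdict can be rendered readable with
# a simple lookup.
_HDR_CAND_NAMES = \
    { HDR_CAND_GOOD  : "good"
    , HDR_CAND_FISHY : "fishy"
    , HDR_CAND_JUNK  : "junk"
    }

def _example_describe_gz_hdr (fd, off):
    vdt, hdr = inspect_gz_hdr (fd, off)
    fname = "?" if hdr is None else hdr ["fname"]
    return "%s gz header at offset %d (fname=%s)" \
           % (_HDR_CAND_NAMES [vdt], off, fname)

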
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the length of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    import zlib
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)
    pos   = off
    dlen  = 0 # size of decompressed data

    os.lseek (ifd, pos, os.SEEK_SET)
    while True:
        cnk = os.read (ifd, BUFSIZE)
        pos += len (cnk)
        try:
            data = decmp.decompress (cnk)
        except zlib.error: # corrupt or truncated input; terminate softly
            break # fishy
        dlen += len (data)
        if decmp.eof:
            break
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
            break # fishy

    return dlen, pos - off

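# A minimal sketch of the zlib detail relied on above: the payload of a gzip
# member is a raw DEFLATE stream, hence the negative window size, which tells
# zlib to expect no zlib/gzip framing. Everything here is standard library;
# only the function name is ours.
def _example_inflate_raw ():
    import gzip, zlib
    blob = gzip.compress (b"hello")
    payload = blob [10:] # fixed 10 byte gzip header, no optional fields
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)
    return decmp.decompress (payload) # ⇒ b"hello"; trailer ends up unused

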
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect the header candidates *cands* in *ifd* for parseable gzipped
    objects.
    """
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            pass # ignore unreadable ones
        elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
            off0 = cand + hdr ["hlen"]
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:
                good.append (cand)

    return good


def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets (“candidates”).
    Then check whether each of those locations can be processed as
    compressed data.
    """
    ifd = os.open (fname, os.O_RDONLY)

    try:
        cands = locate_gz_hdr_candidates (ifd)
        return readable_gz_objects_offsets (ifd, cands)
    finally:
        os.close (ifd)


def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets
    (“candidates”). Then check whether each of those locations can be
    processed as tar data.
    """
    ifd = os.open (fname, os.O_RDONLY)

    try:
        return readable_tar_objects_offsets (ifd)
    finally:
        os.close (ifd)


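# Convenience sketch (ours): choose the reconstruction strategy from the mode
# string, mirroring the dispatch gen_rescue_index () performs further below
# for the unencrypted cases.
def _example_reconstruct_offsets (fname, mode):
    if mode == "#gz":
        return reconstruct_offsets_gz (fname)
    elif mode == "#":
        return reconstruct_offsets_tar (fname)
    raise TarError ("no rescue handling for mode “%s”" % mode)

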
def read_tarobj_at_offset (fileobj, offset, mode, secret=None,
                           strict_validation=True):
    """
    Open *fileobj* as a tar archive at *offset* and read a single member,
    decrypting with *secret* if one is given. Returns *None* if no valid
    object can be read at that position.

    :type  strict_validation: bool
    :param strict_validation: Enable strict IV checking in the crypto
                              layer. Should be disabled when dealing with
                              potentially corrupted data.
    """
    decr = None

    if secret is not None:
        ks = secret [0]

        if ks == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1],
                                   strict_ivs=strict_validation)
        elif ks == crypto.PDTCRYPT_SECRET_KEY:
            key = binascii.unhexlify (secret [1])
            decr = crypto.Decrypt (key=key,
                                   strict_ivs=strict_validation)
        else:
            raise RuntimeError ("invalid secret kind %r" % (ks, ))

    try:
        tarobj = \
            TarFile.open_at_offset (offset,
                                    mode=mode,
                                    fileobj=fileobj,
                                    format=GNU_FORMAT,
                                    concat='#' in mode,
                                    encryption=decr,
                                    save_to_members=False,
                                    tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None

    return tarobj.next ()


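# End-to-end sketch (path hypothetical): pair the offset scan with
# read_tarobj_at_offset () to recover the members of a damaged, plain tar
# volume — the same pattern gen_rescue_index () below applies per volume.
def _example_recover_members (path):
    members = []
    offsets = reconstruct_offsets_tar (path)
    with bltn_open (path, "rb") as f:
        for off in offsets:
            ti = read_tarobj_at_offset (f, off, "#",
                                        strict_validation=False)
            if ti is not None:
                members.append (ti)
    return members

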
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.
    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        , "gid"    : 0
        }

    """

    return \
        { "inode"  : 0                     # ignored when reading the index
        , "uid"    : tarinfo.uid
        , "gid"    : tarinfo.gid
        , "path"   : tarinfo.name          # keeping URI scheme
        , "offset" : 0                     # to be added by the caller
        , "volume" : tarinfo.volume_offset
        , "mode"   : tarinfo.mode
        , "ctime"  : tarinfo.mtime         # no ctime in TarInfo; reuse mtime
        , "mtime"  : tarinfo.mtime
        , "size"   : tarinfo.size
        , "type"   : tarinfo.type
        }


def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Scan the volumes named by *gen_volume_name* (a callable mapping a volume
    number to a path) for readable objects and assemble a pseudo index of
    the tar members found. Decrypting volumes requires a *password* or a
    *key*; plain “#gz” and “#” modes are handled without a secret. If
    *maxvol* is given, missing volumes below that number are skipped instead
    of terminating the scan.
    """
    infos   = []
    psidx   = [] # pseudo index, return value
    offsets = None
    secret  = crypto.make_secret (password=password, key=key)

    nvol = 0

    while True:
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError:
            # volume does not exist
            if maxvol is not None and nvol < maxvol:
                nvol += 1 # explicit volume count given; skip missing volumes
                continue
            break

        fileobj = bltn_open (vpath, "rb")

        def aux (acc, off):
            obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret,
                                         strict_validation=False)
            if obj is not None:
                acc.append ((off, nvol, obj))
            return acc
        infos += functools.reduce (aux, offsets, [])

        fileobj.close()

        nvol += 1

    def aux (o, nvol, ti):
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = nvol
        return ie

    psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]

    return psidx

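# Usage sketch: the volume naming scheme and password are made up. The result
# is a list of pseudo index entries shaped like the example in the docstring
# of idxent_of_tarinfo () above.
def _example_rescue_index ():
    return gen_rescue_index (lambda n: "backup.tar.gz.%d" % n,
                             "#gz",
                             password="test1234")

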
#--------------------
# exported functions
#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    try:
        t = open(name)
        t.close()
        return True
    except TarError:
        return False

bltn_open = open
open = TarFile.open