#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
import sys
import os
import io
import shutil
import stat
import struct
import time
import re

import traceback # XXX

# `crypto` is the PDT encryption module that ships with deltatar; the
# encryption code paths below depend on it.
from . import crypto
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)

# OSError (winerror=1314) will be raised if the caller does not hold the
# SeCreateSymbolicLinkPrivilege privilege
symlink_exception += (OSError,)
# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

from builtins import open as bltn_open # Since 'open' is TarFile.open
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
GZ_FMT_HEADER     = b"<BBBBLBB"
GZ_HEADER_SIZE    = 10      # not including the name
GZ_MAGIC          = (0x1f, 0x8b)    # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08    # 0o10
GZ_FLAG_FTEXT     = 1 << 0  # ASCII payload
GZ_FLAG_FHCRC     = 1 << 1  # CRC16
GZ_FLAG_FEXTRA    = 1 << 2  # extra field
GZ_FLAG_FNAME     = 1 << 3  # set by default in gzip
GZ_FLAG_FCOMMENT  = 1 << 4  # NUL-terminated comment
GZ_FLAG_RESERVED  = 7 << 5  # unassigned
GZ_DEFLATE_FLAGS  = 0x00    # 0o00, never read (deflate.c)
GZ_OS_CODE        = 0x03    # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES    = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE  = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                 GZ_METHOD_DEFLATE)

TOLERANCE_STRICT  = 0 # fail on the first error encountered
TOLERANCE_RECOVER = 1 # rely on offsets in index
TOLERANCE_RESCUE  = 2 # deduce metadata from archive contents

#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------

ARCMODE_PLAIN    = 0
ARCMODE_ENCRYPT  = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT   = 1 << 2

def arcmode_fmt (m):
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    first = True
    ret = "["
    def chkappend (b, s):
        nonlocal ret, first
        if m & b:
            if first is True: first = False
            else: ret += " |"
            ret += " " + s
    chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
    chkappend (ARCMODE_COMPRESS, "COMPRESS")
    chkappend (ARCMODE_CONCAT, "CONCAT")
    return ret + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    ret = init
    if bool (concat) is True:
        ret |= ARCMODE_CONCAT
    if encryption is not None:
        ret |= ARCMODE_ENCRYPT
    if comptype == "gz":
        ret |= ARCMODE_COMPRESS
    return ret
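# A minimal illustration (the values follow from the flag definitions above):
#
#     arcmode_set ()                           == ARCMODE_PLAIN
#     arcmode_set (concat=True, comptype="gz") == ARCMODE_CONCAT | ARCMODE_COMPRESS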
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
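# The result is always exactly `length` bytes, truncated or NUL-padded as
# needed, e.g. stn("foo", 5, "utf-8", "strict") == b"foo\x00\x00".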
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)

def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL

def nti(s):
    """Convert a number field to a python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    if s[0] in (0o200, 0o377):
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
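# (The constant 256 accounts for the eight-byte chksum field itself, which
# is counted as if it held eight ASCII spaces: 8 * 0x20 == 256.)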
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        dst.write(buf)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
    if remainder != 0:
        buf = src.read(remainder)
        dst.write(buf)
        if len(buf) < remainder:
            raise OSError("end of file reached")
336 """Deprecated in this location; use stat.filemode."""
338 warnings.warn("deprecated in favor of stat.filemode",
339 DeprecationWarning, 2)
340 return stat.filemode(mode)
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
    pass
class DecryptionError(TarError):
    """Exception for errors during decryption."""
    pass
class EncryptionError(TarError):
    """Exception for errors during encryption."""
    pass
class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
    pass
#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        _mode = {"r": os.O_RDONLY,
                 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
                }[mode]
        if hasattr(os, "O_BINARY"):
            _mode |= os.O_BINARY # pylint: disable=no-member
        self.fd = os.open(name, _mode, 0o666)
        self.offset = 0

    def close(self):
        os.close(self.fd)

    def read(self, size):
        ret = os.read(self.fd, size)
        self.offset += len(ret)
        return ret

    def write(self, s, pos=None):
        if pos is not None:
            p0 = self.offset
            os.lseek (self.fd, pos, os.SEEK_SET)
        n = os.write(self.fd, s)
        if pos is None:
            self.offset += len(s)
        else:
            append = pos + n - p0
            if append > 0:
                self.offset += append
            os.lseek (self.fd, p0, os.SEEK_SET)
        return n

    def tell(self):
        return self.offset

    def seek_set (self, pos):
        os.lseek (self.fd, pos, os.SEEK_SET)
        self.offset = pos


def gz_header (name=None):
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)

    return hdr + name
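# The fixed portion is always GZ_HEADER_SIZE (10) bytes as defined by
# RFC 1952, followed by the optional NUL-terminated FNAME, e.g.:
#
#     gz_header (b"backup.gz")[:4] == b"\x1f\x8b\x08\x08"
#     (magic, magic, deflate method, FNAME flag set)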
463 """Class that serves as an adapter between TarFile and
464 a stream-like object. The stream-like object only
465 needs to have a read() or write() method and is accessed
466 blockwise. Use of gzip or bzip2 compression is possible.
467 A stream-like object could be for example: sys.stdin,
468 sys.stdout, a socket, a tape device etc.
470 _Stream is intended to be used only internally but is
471 nevertherless used externally by Deltatar.
473 When encrypting, the ``enccounter`` will be used for
474 initializing the first cryptographic context. When
475 decrypting, its value will be compared to the decrypted
476 object. Decryption fails if the value does not match.
477 In effect, this means that a ``_Stream`` whose ctor was
478 passed ``enccounter`` can only be used to encrypt or
479 decrypt a single object.
482 remainder = -1 # track size in encrypted entries
483 tolerance = TOLERANCE_STRICT
    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.
        """
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance

        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter

        self.name      = name or ""
        self.mode      = mode
        self.comptype  = comptype
        self.cmp       = None
        self.fileobj   = fileobj
        self.bufsize   = bufsize
        self.buf       = b""
        self.pos       = 0
        self.concat_pos = 0
        self.closed    = False
        self.flags     = 0
        self.last_block_offset = 0
        self.dbuf      = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        self.encryption = encryption
        self.lasthdr    = None

        if encryption is not None:
            encryption.reset_last_iv ()

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                elif mode == "w":
                    if not (self.arcmode & ARCMODE_CONCAT):
                        if self.arcmode & ARCMODE_ENCRYPT:
                            self._init_write_encrypt (name)
                        self._init_write_gz ()
                self.crc = zlib.crc32(b"") & 0xFFFFffff

            elif comptype == "bz2":
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == 'xz':
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype == "tar":
                if not (self.arcmode & ARCMODE_CONCAT) \
                        and mode == "w" \
                        and self.arcmode & ARCMODE_ENCRYPT:
                    self._init_write_encrypt (name)

            else:
                if self.arcmode & ARCMODE_ENCRYPT:
                    raise InvalidEncryptionError("encryption not available for "
                                                 "compression “%s”" % comptype)
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        if hasattr(self, "closed") and not self.closed:
            try:
                self.close()
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
                pass


    def next (self, name):
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        self.__sync()
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
    def next_volume (self, name):
        # with non-concat modes, this is taken care of by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        if dummyhdr is None:
            raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr


    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            self.lasthdr = -1
    def _finalize_write_gz (self):
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if chunk:
                if self.comptype == "gz":
                    # The native zlib crc is an unsigned 32-bit integer, but
                    # the Python wrapper implicitly casts that to a signed C
                    # long. So, on a 32-bit box self.crc may "look negative",
                    # while the same crc on a 64-bit box may "look positive".
                    # To avoid irksome warnings from the `struct` module, force
                    # it to look positive on all boxes.
                    chunk += struct.pack("<L", self.crc & 0xffffffff)
                    chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
                self.__enc_write (chunk)
                self.buf = b""
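    # (The two little-endian 32-bit words appended above form the standard
    # gzip trailer per RFC 1952: the CRC32 of the uncompressed data, then
    # its size modulo 2**32; here, the size of the current concat member.)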
    def _init_write_gz (self, set_last_block_offset=False):
        """
        Add a new gzip block, closing last one.
        """
        self.concat_pos = 0
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)

        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()

        self.__write(gz_header (self.name if first is True else None))

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.pos += len(s)
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)
        self.__write(s)

    def __sync(self):
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)
        self.buf = b""

    def __write(self, s):
        """Writes (and encodes) string s to the stream blockwise;
        will wait with encoding/writing until block is complete.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]


    def __write_to_file(self, s, pos=None):
        """
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        """
        self.fileobj.write(s, pos)
        if pos is None:
            self.bytes_written += len(s)


    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        if len (s) == 0:
            return
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = s
            while len (buf) > 0:
                n, ct = self.encryption.process(buf)
                self.__write_to_file(ct)
                buf = buf [n:]
                if len (buf) > 0:
                    # The entire plaintext was not consumed: The size limit
                    # for encrypted objects was reached. Transparently create
                    # a new encrypted object and continue processing the input.
                    self._finalize_write_encrypt ()
                    self._init_write_encrypt ()
        else:
            self.__write_to_file(s)
    def estim_file_size(self):
        """ estimates size of file if closing it now

        The result may differ greatly from the amount of data sent to write()
        due to compression, encryption and buffering.

        In tests the result (before calling close()) was up to 12k smaller than
        the final file size if compression is being used because zlib/bz2
        compressors do not allow inspection of their buffered data :-(

        Still, we add what close() would add: 8 bytes for the gz checksum, one
        encryption block size if encryption is used, and the size of our own
        buffer.
        """
        if self.closed:
            return self.bytes_written

        result = self.bytes_written
        if self.buf:
            result += len(self.buf)
        if self.comptype == 'gz':
            result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
        return result

    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        if close_fileobj is True:
            if self.mode == "w":
                if self.arcmode & ARCMODE_COMPRESS:
                    self._finalize_write_gz ()
                # end of Tar archive marker (two empty blocks) was written
                # finalize encryption last; no writes may be performed after
                # this point
                self.__sync ()
                if self.arcmode & ARCMODE_ENCRYPT:
                    self._finalize_write_encrypt ()

            if not self._extfileobj:
                self.fileobj.close()
        else:
            # read the zlib crc and length and check them
            if self.mode == "r" and self.comptype == "gz":
                read_crc = self.__read(4)
                read_length = self.__read(4)
                calculated_crc = self.crc
                if struct.unpack("<L", read_crc)[0] != calculated_crc:
                    raise CompressionError("bad gzip crc")

        self.closed = True

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)

        read2 = self.__read(2)
        if read2 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                             "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")

        read1 = self.__read(1)
        if read1 == b"":
            raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
                             "gzip header at pos %d" % self.fileobj.tell())
        if ord (read1) != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")

        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code

        if flag & GZ_FLAG_FEXTRA:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & GZ_FLAG_FNAME:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FCOMMENT:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & GZ_FLAG_FHCRC:
            self.__read(2)

    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context."""
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            try:
                hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
                return False
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr   = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header

            try:
                self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
                      from exn

        return True


    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        try:
            return self.encryption.process (buf)
        except RuntimeError as exn:
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)
            raise
    def _finalize_read_encrypt (self):
        """
        Finalize decryption of the current object; raises DecryptionError
        if the GCM tag does not match.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
                and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                self.remainder = 0
            try:
                data = self.encryption.done ()
            except crypto.InvalidGCMTag as exn:
                raise DecryptionError ("decryption failed: %s" % exn) \
                      from exn
            return data
936 """Return the stream's file pointer position.
    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos == self.pos:
            pass # nothing to do
        elif pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            if self.encryption is not None:
                # IV succession is only preserved between successive objects.
                self.encryption.reset_last_iv ()
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos
    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def readline(self):
        """Reads just one line, new line character included
        """
        # if \n in dbuf, no read needs to be done
        if b'\n' in self.dbuf:
            pos = self.dbuf.index(b'\n') + 1
            ret = self.dbuf[:pos]
            self.dbuf = self.dbuf[pos:]
            return ret

        buf = []
        while True:
            chunk = self._read(self.bufsize)

            # nothing more to read, so return the buffer
            if not chunk:
                return b''.join(buf)

            buf.append(chunk)

            # if \n found, return the new line
            if b'\n' in chunk:
                dbuf = b''.join(buf)
                pos = dbuf.index(b'\n') + 1
                self.dbuf = dbuf[pos:] + self.dbuf
                return dbuf[:pos]
    def _read(self, size):
        """Return size bytes from the stream.
        """
        c = len(self.dbuf)
        t = [self.dbuf]

        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break

            if self.cmp is not None:
                try:
                    buf = self.cmp.decompress(buf)
                except self.exception as exn:
                    raise ReadError("invalid compressed data (%r)" % exn)
                except Exception as e:
                    # happens at the end of the file
                    # _init_read_gz failed in the previous iteration so
                    # self.cmp.decompress fails here
                    if self.arcmode & ARCMODE_CONCAT:
                        pass
                    else:
                        raise ReadError("invalid compressed data")
                if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                    self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
                if self.arcmode & ARCMODE_CONCAT \
                        and len(self.cmp.unused_data) != 0:
                    self.buf = self.cmp.unused_data + self.buf
                    self.close(close_fileobj=False)
                    try:
                        self._init_read_gz()
                    except DecryptionError:
                        if self.tolerance != TOLERANCE_STRICT:
                            # return whatever data was processed successfully
                            if len(buf) > 0:
                                t.append(buf)
                            break
                        raise
                    except ReadError: # gzip troubles
                        if self.tolerance == TOLERANCE_RESCUE:
                            if len(buf) > 0:
                                t.append(buf)
                            break
                        raise
                    except EndOfFile:
                        # happens at the end of the file
                        pass
                    self.crc = self.zlib.crc32(b"") & 0xFFFFffff
                    self.closed = False
            t.append(buf)
            c += len(buf)

        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]
    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        c = len(self.buf)
        t = [self.buf] if c > 0 else []
        good_crypto = len (t)

        while c < size:
            todo = size
            try:
                if self.arcmode & ARCMODE_ENCRYPT:
                    if self.remainder <= 0:
                        # prepare next object
                        if self._init_read_encrypt () is False: # EOF
                            buf = None
                            break # while

                    # only read up to the end of the encrypted object
                    todo = min (size, self.remainder)
                buf = self.fileobj.read(todo)
                if self.arcmode & ARCMODE_ENCRYPT:
                    # decrypt the thing
                    buf = self._read_encrypt (buf)
                    if todo == self.remainder:
                        # at the end of a crypto object; finalization will fail if
                        # the GCM tag does not match
                        trailing = self._finalize_read_encrypt ()
                        good_crypto = len (t) + 1
                        if len (trailing) > 0:
                            buf += trailing
                        self.remainder = 0
                    else:
                        self.remainder -= todo
            except DecryptionError:
                if self.tolerance == TOLERANCE_STRICT:
                    raise
                self.encryption.drop ()
                if self.tolerance == TOLERANCE_RECOVER:
                    if good_crypto == 0:
                        raise
                    # this may occur at any of the three crypto operations above.
                    # some objects did validate; discard all data after it; next
                    # call will start with the bad object and error out immediately
                    self.buf = b"".join (t [good_crypto:])
                    return b"".join (t [:good_crypto])
                elif self.tolerance == TOLERANCE_RESCUE:
                    # keep what we have so far despite the finalization issue
                    self.remainder = 0
                    continue
                else:
                    raise RuntimeError("internal error: bad tolerance level")

            if not buf: ## XXX stream terminated prematurely; this should be an error
                break

            t.append(buf)
            c += len(buf)

        self.buf = b"".join (t) [size:]
        return b"".join (t) [:size]
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)
class ExFileObject(io.BufferedReader):

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)

#------------------
# Exported Classes
#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name            # member name
        self.mode = 0o644           # file permissions
        self.uid = 0                # user id
        self.gid = 0                # group id
        self.size = 0               # file size
        self.mtime = 0              # modification time
        self.chksum = 0             # header checksum
        self.type = REGTYPE         # member type
        self.linkname = ""          # link name
        self.uname = ""             # user name
        self.gname = ""             # group name
        self.devmajor = 0           # device major number
        self.devminor = 0           # device minor number

        self.offset = 0             # the tar header starts here
        self.offset_data = 0        # the file's data starts here
        self.volume_offset = 0      # the file's data corresponds with the data
                                    # starting at this position

        self.sparse = None          # sparse member information
        self.pax_headers = {}       # pax header information

    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.name, id(self))

    def get_info(self, encoding=None, errors=None):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor,
            "offset_data": self.offset_data,
            "volume_offset": self.volume_offset
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
              errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info(encoding, errors)

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding, errors)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)
    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        if self.ismultivol():
            prefix = [
                itn(info.get("atime", 0), 12, GNU_FORMAT),
                itn(info.get("ctime", 0), 12, GNU_FORMAT),
                itn(self.volume_offset, 12, GNU_FORMAT),
                itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
            ]
            info['prefix'] = b"".join(prefix)
            info['size'] = info['size'] - self.volume_offset

        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"],
                                                GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                                encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            info['size'] = info['size'] - self.volume_offset

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that have to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")

        return prefix, name
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
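    # (In _create_header above, 512 - 364 == 148 and 512 - 357 == 155: the
    # computed checksum is patched into bytes 148..154 of the header while
    # the original space byte at offset 155 is kept, so the field reads
    # "six octal digits, NUL, space", the layout GNU tar produces.)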
    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
               cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
               cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        else:
            obj.offset_data = nti(buf[369:381])
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
    #
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding,
                             tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields.
        # Normally, these fields are UTF-8 encoded, but POSIX.1-2008 allows
        # implementations to store them as raw binary strings if the
        # translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8", in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like this:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
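        # For instance, an mtime record is encoded as
        # b"30 mtime=1350486034.544478675\n" (the length, 30, counts every
        # byte of the record, including the length digits themselves).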
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                                             tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                                               tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                                               tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and \
                pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        if next is not None:
            if "GNU.volume.filename" in pax_headers:
                if pax_headers["GNU.volume.filename"] == next.name:
                    if "GNU.volume.size" in pax_headers:
                        next.size = int(pax_headers["GNU.volume.size"])
                    if "GNU.volume.offset" in pax_headers:
                        next.volume_offset = int(pax_headers["GNU.volume.offset"])

                for key in list(pax_headers.keys()):
                    if key.startswith("GNU.volume"):
                        del tarfile.pax_headers[key]

        return next
    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
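    # E.g. a GNU.sparse.map value of "0,512,4096,512" pairs up as
    # [(0, 512), (4096, 512)]: two 512-byte data extents at offsets 0
    # and 4096.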
    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
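    # In version 1.0 the sparse map precedes the member's data as decimal
    # numbers, one per line: first the entry count, then alternating
    # offset/size values, e.g. b"2\n0\n512\n4096\n512\n" padded up to the
    # next 512-byte block boundary.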
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/") # pylint: disable=no-member
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
    def ismultivol(self):
        return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or \
               "GNU.volume.offset" in self.pax_headers
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2016 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2017 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2018 errors="surrogateescape", pax_headers=None, debug=None,
2019 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2020 concat=False, nacl=None,
2021 save_to_members=True):
2022 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2023 read from an existing archive, 'a' to append data to an existing
2024 file or 'w' to create a new file overwriting an existing one. `mode'
2026 If `fileobj' is given, it is used for reading or writing data. If it
2027 can be determined, `mode' is overridden by `fileobj's mode.
2028 `fileobj' is not closed, when TarFile is closed.
2030 if len(mode) > 1 or mode not in "raw":
2031 raise ValueError("mode must be 'r', 'a' or 'w'")
2033 self.arcmode = arcmode_set (concat)
2035 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2038 if self.mode == "a" and not os.path.exists(name):
2039 # Create nonexistent files in append mode.
2042 fileobj = bltn_open(name, self._mode)
2043 self._extfileobj = False
2045 if name is None and hasattr(fileobj, "name"):
2047 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2048 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2049 self._mode = fileobj.mode
2050 self._extfileobj = True
2051 self.name = os.path.abspath(name) if name else None
2052 self.base_name = self.name = os.path.abspath(name) if name else None
2053 self.fileobj = fileobj
2056 if format is not None:
2057 self.format = format
2058 if tarinfo is not None:
2059 self.tarinfo = tarinfo
2060 if dereference is not None:
2061 self.dereference = dereference
2062 if ignore_zeros is not None:
2063 self.ignore_zeros = ignore_zeros
2064 if encoding is not None:
2065 self.encoding = encoding
2067 self.errors = errors
2069 if pax_headers is not None and self.format == PAX_FORMAT:
2070 self.pax_headers = pax_headers
2072 self.pax_headers = {}
2074 if debug is not None:
2076 if errorlevel is not None:
2077 self.errorlevel = errorlevel
2079 # Init datastructures.
2080 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2081 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2082 if max_volume_size and not callable(new_volume_handler):
2083 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2085 self.max_volume_size = int(max_volume_size)
2087 self.max_volume_size = None
2089 self.save_to_members = save_to_members
2090 self.new_volume_handler = new_volume_handler
2092 self.members = [] # list of members as TarInfo objects
2093 self._loaded = False # flag if all members have been read
2094 self.offset = self.fileobj.tell()
2095 # current position in the archive file
2096 self.inodes = {} # dictionary caching the inodes of
2097 # archive members already added
2100 if self.mode == "r":
2101 self.firstmember = None
2102 self.firstmember = self.next()
2104 if self.mode == "a":
2105 # Move to the end of the archive,
2106 # before the first empty block.
2107 while True:
2108 self.fileobj.seek(self.offset)
2109 try:
2110 tarinfo = self.tarinfo.fromtarfile(self)
2111 self.members.append(tarinfo)
2112 except EOFHeaderError:
2113 self.fileobj.seek(self.offset)
2114 break
2115 except HeaderError as e:
2116 raise ReadError(str(e))
2118 if self.mode in "aw":
2119 self._loaded = True
2121 if self.pax_headers:
2122 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2123 self.fileobj.write(buf)
2124 self.offset += len(buf)
2126 if not self._extfileobj:
2127 self.fileobj.close()
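#
# Illustrative sketch (not part of the original module): a typical way to
# drive this constructor for multivolume output. The file names and the
# 1 MiB size limit are hypothetical.
#
#     def new_volume (tarobj, base_name, volume_number):
#         # the handler usually just reopens the next volume file
#         tarobj.open_volume ("%s.%d" % (base_name, volume_number))
#
#     tar = TarFile.open ("backup.tar", mode="w",
#                         max_volume_size=1024 * 1024,
#                         new_volume_handler=new_volume)
#     tar.add ("data")
#     tar.close ()
#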
2131 #--------------------------------------------------------------------------
2132 # Below are the classmethods which act as alternate constructors to the
2133 # TarFile class. The open() method is the only one that is needed for
2134 # public use; it is the "super"-constructor and is able to select an
2135 # adequate "sub"-constructor for a particular compression using the mapping
2136 # in OPEN_METH.
2138 # This concept allows one to subclass TarFile without losing the comfort of
2139 # the super-constructor. A sub-constructor is registered and made available
2140 # by adding it to the mapping in OPEN_METH.
2142 @classmethod
2143 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2144 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2145 **kwargs):
2146 """Open a tar archive for reading, writing or appending. Return
2147 an appropriate TarFile class.
2150 'r' or 'r:*' open for reading with transparent compression
2151 'r:' open for reading exclusively uncompressed
2152 'r:gz' open for reading with gzip compression
2153 'r:bz2' open for reading with bzip2 compression
2154 'r:xz' open for reading with lzma compression
2155 'a' or 'a:' open for appending, creating the file if necessary
2156 'w' or 'w:' open for writing without compression
2157 'w:gz' open for writing with gzip compression
2158 'w:bz2' open for writing with bzip2 compression
2159 'w:xz' open for writing with lzma compression
2161 'r|*' open a stream of tar blocks with transparent compression
2162 'r|' open an uncompressed stream of tar blocks for reading
2163 'r|gz' open a gzip compressed stream of tar blocks
2164 'r|bz2' open a bzip2 compressed stream of tar blocks
2165 'r|xz' open an lzma compressed stream of tar blocks
2166 'w|' open an uncompressed stream for writing
2167 'w|gz' open a gzip compressed stream for writing
2168 'w|bz2' open a bzip2 compressed stream for writing
2169 'w|xz' open an lzma compressed stream for writing
2171 'r#gz' open a stream of gzip compressed tar blocks for reading
2172 'w#gz' open a stream of gzip compressed tar blocks for writing
2174 if not name and not fileobj:
2175 raise ValueError("nothing to open")
2177 if mode in ("r", "r:*"):
2178 # Find out which *open() is appropriate for opening the file.
2179 for comptype in cls.OPEN_METH:
2180 func = getattr(cls, cls.OPEN_METH[comptype])
2181 if fileobj is not None:
2182 saved_pos = fileobj.tell()
2183 try:
2184 return func(name, "r", fileobj, **kwargs)
2185 except (ReadError, CompressionError) as e:
2186 # usually nothing exceptional but sometimes is
2187 if fileobj is not None:
2188 fileobj.seek(saved_pos)
2189 continue
2190 raise ReadError("file could not be opened successfully")
2192 elif ":" in mode:
2193 filemode, comptype = mode.split(":", 1)
2194 filemode = filemode or "r"
2195 comptype = comptype or "tar"
2197 # Select the *open() function according to
2198 # given compression.
2199 if comptype in cls.OPEN_METH:
2200 func = getattr(cls, cls.OPEN_METH[comptype])
2201 else:
2202 raise CompressionError("unknown compression type %r" % comptype)
2204 # Pass on compression level for gzip / bzip2.
2205 if comptype == 'gz' or comptype == 'bz2':
2206 kwargs['compresslevel'] = compresslevel
2208 if 'max_volume_size' in kwargs:
2209 if comptype != 'tar' and filemode in 'wa' \
2210 and kwargs['max_volume_size']:
2212 warnings.warn('Only the first volume will be compressed '
2213 'for modes with "w:"!')
2215 return func(name, filemode, fileobj, **kwargs)
2217 elif "|" in mode:
2218 filemode, comptype = mode.split("|", 1)
2219 filemode = filemode or "r"
2220 comptype = comptype or "tar"
2222 if filemode not in "rw":
2223 raise ValueError("mode must be 'r' or 'w'")
2225 t = cls(name, filemode,
2226 _Stream(name, filemode, comptype, fileobj, bufsize,
2227 compresslevel=compresslevel),
2228 **kwargs)
2229 t._extfileobj = False
2230 return t
2232 elif "#" in mode:
2233 filemode, comptype = mode.split("#", 1)
2234 filemode = filemode or "r"
2236 if filemode not in "rw":
2237 raise ValueError ("mode %s not compatible with concat "
2238 "archive; must be 'r' or 'w'" % mode)
2240 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2241 concat=True, encryption=encryption,
2242 compresslevel=compresslevel, tolerance=tolerance)
2243 kwargs ["concat"] = True
2244 try:
2245 t = cls(name, filemode, stream, **kwargs)
2246 except: # XXX except what?
2247 stream.close()
2248 raise # XXX raise what?
2249 t._extfileobj = False
2250 return t
2252 elif mode in "aw":
2253 return cls.taropen(name, mode, fileobj, **kwargs)
2254 else:
2255 raise ValueError("undiscernible mode %r" % mode)
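#
# Illustrative sketch (not part of the original module): the mode strings
# documented above in action; "example.tar.gz" is a hypothetical file.
#
#     with TarFile.open ("example.tar.gz", "r:gz") as tar:  # explicit gzip
#         tar.list ()
#     tar = TarFile.open ("example.tar.gz", "r")            # autodetected
#     tar.close ()
#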
2258 @classmethod
2259 def open_at_offset(cls, offset, *a, **kwa):
2260 """
2261 Same as ``.open()``, but start reading at the given offset. Assumes a
2262 seekable file object. Returns *None* if opening failed due to a read
2263 problem.
2264 """
2265 fileobj = kwa.get ("fileobj")
2266 if fileobj is not None:
2267 fileobj.seek (offset)
2269 return cls.open (*a, **kwa)
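#
# Illustrative sketch (not part of the original module): resuming a read at
# a known member offset; the offset value and file name are hypothetical.
#
#     with bltn_open ("example.tar", "rb") as f:
#         tar = TarFile.open_at_offset (1536, mode="r", fileobj=f)
#         member = tar.next ()
#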
2272 @classmethod
2273 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2274 """Open uncompressed tar archive name for reading or writing.
2275 """
2276 if len(mode) > 1 or mode not in "raw":
2277 raise ValueError("mode must be 'r', 'a' or 'w'")
2278 return cls(name, mode, fileobj, **kwargs)
2280 @classmethod
2281 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2282 """Open gzip compressed tar archive name for reading or writing.
2283 Appending is not allowed.
2284 """
2285 if len(mode) > 1 or mode not in "rw":
2286 raise ValueError("mode must be 'r' or 'w'")
2287 try:
2288 import gzip
2289 gzip.GzipFile
2291 except (ImportError, AttributeError):
2292 raise CompressionError("gzip module is not available")
2294 extfileobj = fileobj is not None
2295 try:
2296 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2297 t = cls.taropen(name, mode, fileobj, **kwargs)
2298 except OSError:
2299 if not extfileobj and fileobj is not None:
2300 fileobj.close()
2301 if fileobj is None:
2302 raise
2303 raise ReadError("not a gzip file")
2304 except:
2305 if not extfileobj and fileobj is not None:
2306 fileobj.close()
2307 raise
2308 t._extfileobj = extfileobj
2309 return t
2311 @classmethod
2312 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2313 """Open bzip2 compressed tar archive name for reading or writing.
2314 Appending is not allowed.
2315 """
2316 if len(mode) > 1 or mode not in "rw":
2317 raise ValueError("mode must be 'r' or 'w'.")
2319 try:
2320 import bz2
2321 except ImportError:
2322 raise CompressionError("bz2 module is not available")
2324 fileobj = bz2.BZ2File(fileobj or name, mode,
2325 compresslevel=compresslevel)
2327 try:
2328 t = cls.taropen(name, mode, fileobj, **kwargs)
2329 except (OSError, EOFError):
2330 fileobj.close()
2331 raise ReadError("not a bzip2 file")
2332 t._extfileobj = False
2333 return t
2335 @classmethod
2336 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2337 """Open lzma compressed tar archive name for reading or writing.
2338 Appending is not allowed.
2339 """
2340 if mode not in ("r", "w"):
2341 raise ValueError("mode must be 'r' or 'w'")
2343 try:
2344 import lzma
2345 except ImportError:
2346 raise CompressionError("lzma module is not available")
2348 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2350 try:
2351 t = cls.taropen(name, mode, fileobj, **kwargs)
2352 except (lzma.LZMAError, EOFError):
2353 fileobj.close()
2354 raise ReadError("not an lzma file")
2355 t._extfileobj = False
2356 return t
2358 # All *open() methods are registered here.
2359 OPEN_METH = {
2360 "tar": "taropen", # uncompressed tar
2361 "gz": "gzopen", # gzip compressed tar
2362 "bz2": "bz2open", # bzip2 compressed tar
2363 "xz": "xzopen" # lzma compressed tar
2364 }
2366 #--------------------------------------------------------------------------
2367 # The public methods which TarFile provides:
2370 """Close the TarFile. In write-mode, two finishing zero blocks are
2371 appended to the archive. A special case are empty archives which are
2372 initialized accordingly so the two mandatory blocks of zeros are
2373 written abiding by the requested encryption and compression settings.
2378 if self.mode in "aw":
2379 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2380 self.fileobj.next ("")
2381 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2382 self.offset += (BLOCKSIZE * 2)
2383 # fill up the end with zero-blocks
2384 # (like option -b20 for tar does)
2385 blocks, remainder = divmod(self.offset, RECORDSIZE)
2387 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2388 if not self._extfileobj:
2389 self.fileobj.close()
2392 def getmember(self, name):
2393 """Return a TarInfo object for member `name'. If `name' cannot be
2394 found in the archive, KeyError is raised. If a member occurs more
2395 than once in the archive, its last occurrence is assumed to be the
2396 most up-to-date version.
2397 """
2398 tarinfo = self._getmember(name)
2399 if tarinfo is None:
2400 raise KeyError("filename %r not found" % name)
2401 return tarinfo
2403 def getmembers(self):
2404 """Return the members of the archive as a list of TarInfo objects. The
2405 list has the same order as the members in the archive.
2406 """
2407 self._check()
2408 if not self._loaded: # if we want to obtain a list of
2409 self._load() # all members, we first have to
2410 # scan the whole archive.
2411 return self.members
2413 def get_last_member_offset(self):
2414 """Return the last member offset. Usually this is self.fileobj.tell(),
2415 but with encryption or concat compression the underlying file position
2416 differs from the logical archive offset, so it is tracked separately.
2417 """
2418 return self.last_block_offset
2420 def getnames(self):
2421 """Return the members of the archive as a list of their names. It has
2422 the same order as the list returned by getmembers().
2423 """
2424 return [tarinfo.name for tarinfo in self.getmembers()]
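#
# Illustrative sketch (not part of the original module): inspecting the
# table of contents of a hypothetical archive.
#
#     with TarFile.open ("example.tar") as tar:
#         for info in tar.getmembers ():
#             print (info.name, info.size)
#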
2426 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2427 """Create a TarInfo object for either the file `name' or the file
2428 object `fileobj' (using os.fstat on its file descriptor). You can
2429 modify some of the TarInfo's attributes before you add it using
2430 addfile(). If given, `arcname' specifies an alternative name for the
2431 file in the archive.
2432 """
2433 self._check("aw")
2435 # When fileobj is given, replace name by
2436 # fileobj's real name.
2437 if fileobj is not None:
2438 name = fileobj.name
2440 # Building the name of the member in the archive.
2441 # Backward slashes are converted to forward slashes,
2442 # absolute paths are turned into relative paths.
2443 if arcname is None:
2444 arcname = name
2445 drv, arcname = os.path.splitdrive(arcname)
2446 arcname = arcname.replace(os.sep, "/")
2447 arcname = arcname.lstrip("/")
2449 # Now, fill the TarInfo object with
2450 # information specific for the file.
2451 tarinfo = self.tarinfo()
2452 tarinfo.tarfile = self
2454 # Use os.stat or os.lstat, depending on platform
2455 # and if symlinks shall be resolved.
2456 if fileobj is None:
2457 if hasattr(os, "lstat") and not self.dereference:
2458 statres = os.lstat(name)
2459 else:
2460 statres = os.stat(name)
2461 else:
2462 statres = os.fstat(fileobj.fileno())
2463 linkname = ""
2465 stmd = statres.st_mode
2466 if stat.S_ISREG(stmd):
2467 inode = (statres.st_ino, statres.st_dev)
2468 if not self.dereference and statres.st_nlink > 1 and \
2469 inode in self.inodes and arcname != self.inodes[inode]:
2470 # Is it a hardlink to an already
2471 # archived file?
2472 type = LNKTYPE
2473 linkname = self.inodes[inode]
2474 else:
2475 # The inode is added only if it's valid.
2476 # For win32 it is always 0.
2477 type = REGTYPE
2478 if inode[0] and self.save_to_members:
2479 self.inodes[inode] = arcname
2480 elif stat.S_ISDIR(stmd):
2481 type = DIRTYPE
2482 elif stat.S_ISFIFO(stmd):
2483 type = FIFOTYPE
2484 elif stat.S_ISLNK(stmd):
2485 type = SYMTYPE
2486 linkname = os.readlink(name)
2487 elif stat.S_ISCHR(stmd):
2488 type = CHRTYPE
2489 elif stat.S_ISBLK(stmd):
2490 type = BLKTYPE
2491 else:
2492 return None
2494 # Fill the TarInfo object with all
2495 # information we can get.
2496 tarinfo.name = arcname
2497 tarinfo.mode = stmd
2498 tarinfo.uid = statres.st_uid
2499 tarinfo.gid = statres.st_gid
2500 if type == REGTYPE:
2501 tarinfo.size = statres.st_size
2502 else:
2503 tarinfo.size = 0
2504 tarinfo.mtime = statres.st_mtime
2505 tarinfo.type = type
2506 tarinfo.linkname = linkname
2508 if tarinfo.uid in self.cache_uid2user:
2509 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2510 else:
2511 try:
2512 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2513 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2514 except KeyError:
2515 # remember user does not exist:
2516 # same default value as in tarinfo class
2517 self.cache_uid2user[tarinfo.uid] = ""
2519 if tarinfo.gid in self.cache_gid2group:
2520 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2521 else:
2522 try:
2523 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2524 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2525 except KeyError:
2526 # remember group does not exist:
2527 # same default value as in tarinfo class
2528 self.cache_gid2group[tarinfo.gid] = ""
2530 if type in (CHRTYPE, BLKTYPE):
2531 if hasattr(os, "major") and hasattr(os, "minor"):
2532 tarinfo.devmajor = os.major(statres.st_rdev)
2533 tarinfo.devminor = os.minor(statres.st_rdev)
2534 return tarinfo
2536 def list(self, verbose=True):
2537 """Print a table of contents to sys.stdout. If `verbose' is False, only
2538 the names of the members are printed. If it is True, an `ls -l'-like
2539 output is produced.
2540 """
2541 self._check()
2543 for tarinfo in self:
2544 if verbose:
2545 print(stat.filemode(tarinfo.mode), end=' ')
2546 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2547 tarinfo.gname or tarinfo.gid), end=' ')
2548 if tarinfo.ischr() or tarinfo.isblk():
2549 print("%10s" % ("%d,%d" \
2550 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2551 else:
2552 print("%10d" % tarinfo.size, end=' ')
2553 print("%d-%02d-%02d %02d:%02d:%02d" \
2554 % time.localtime(tarinfo.mtime)[:6], end=' ')
2556 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2558 if verbose:
2559 if tarinfo.issym():
2560 print("->", tarinfo.linkname, end=' ')
2561 if tarinfo.islnk():
2562 print("link to", tarinfo.linkname, end=' ')
2563 print()
2565 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2566 """Add the file `name' to the archive. `name' may be any type of file
2567 (directory, fifo, symbolic link, etc.). If given, `arcname'
2568 specifies an alternative name for the file in the archive.
2569 Directories are added recursively by default. This can be avoided by
2570 setting `recursive' to False. `exclude' is a function that should
2571 return True for each filename to be excluded. `filter' is a function
2572 that expects a TarInfo object argument and returns the changed
2573 TarInfo object; if it returns None, the TarInfo object will be
2574 excluded from the archive.
2575 """
2581 # Exclude pathnames.
2582 if exclude is not None:
2583 import warnings
2584 warnings.warn("use the filter argument instead",
2585 DeprecationWarning, 2)
2586 if exclude(name):
2587 self._dbg(2, "tarfile: Excluded %r" % name)
2588 return
2590 # Skip if somebody tries to archive the archive...
2591 if self.name is not None and os.path.abspath(name) == self.name:
2592 self._dbg(2, "tarfile: Skipped %r" % name)
2593 return
2597 # Create a TarInfo object from the file.
2598 tarinfo = self.gettarinfo(name, arcname)
2600 if tarinfo is None:
2601 self._dbg(1, "tarfile: Unsupported type %r" % name)
2602 return
2604 # Change or exclude the TarInfo object.
2605 if filter is not None:
2606 tarinfo = filter(tarinfo)
2607 if tarinfo is None:
2608 self._dbg(2, "tarfile: Excluded %r" % name)
2609 return
2611 # Append the tar header and data to the archive.
2612 if tarinfo.isreg():
2613 with bltn_open(name, "rb") as f:
2614 self.addfile(tarinfo, f)
2616 elif tarinfo.isdir():
2617 self.addfile(tarinfo)
2618 if recursive:
2619 for f in os.listdir(name):
2620 self.add(os.path.join(name, f), os.path.join(arcname, f),
2621 recursive, exclude, filter=filter)
2623 else:
2624 self.addfile(tarinfo)
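#
# Illustrative sketch (not part of the original module): a filter that
# drops editor backups and normalizes ownership; names are hypothetical.
#
#     def normalize (tarinfo):
#         if tarinfo.name.endswith ("~"):
#             return None              # exclude the member
#         tarinfo.uid = tarinfo.gid = 0
#         return tarinfo
#
#     tar.add ("project", filter=normalize)
#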
2626 def _size_left_file(self):
2627 """Calculates the size left in a volume with a maximum volume size.
2629 Assumes self.max_volume_size is set.
2630 If using compression through a _Stream, use _size_left_stream instead.
2631 """
2632 # left-over size = max_size - offset - 2 zero-blocks written in close
2633 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2634 # limit size left to a discrete number of blocks, because we won't
2635 # write only half a block when writing the end of a volume
2636 # and filling with zeros
2637 return BLOCKSIZE * (size_left // BLOCKSIZE)
2639 def _size_left_stream(self):
2640 """Calculates the size left in a volume if using compression/encryption.
2642 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2643 (otherwise use _size_left_file).
2644 """
2645 # left-over size = max_size - bytes written - 2 zero-blocks (close)
2646 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2647 - 2*BLOCKSIZE
2648 return BLOCKSIZE * (size_left // BLOCKSIZE)
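#
# Worked example (added for illustration): with max_volume_size = 2 MiB and
# offset = 1 MiB + 100 bytes, _size_left_file() computes
#
#     size_left = 2*1024*1024 - 2*512 - (1024*1024 + 100) = 1047452
#     usable    = BLOCKSIZE * (1047452 // 512) = 512 * 2045 = 1047040
#
# i.e. 2045 whole blocks remain before the volume must be switched.
#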
2650 def addfile(self, tarinfo, fileobj=None):
2651 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2652 given, tarinfo.size bytes are read from it and added to the archive.
2653 You can create TarInfo objects using gettarinfo().
2654 On Windows platforms, `fileobj' should always be opened with mode
2655 'rb' to avoid irritation about the file size.
2656 """
2657 self._check("aw")
2659 tarinfo = copy.copy(tarinfo)
2661 if self.arcmode & ARCMODE_CONCAT:
2662 self.last_block_offset = self.fileobj.next (tarinfo.name)
2663 else:
2664 self.last_block_offset = self.fileobj.tell()
2666 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2667 self.fileobj.write(buf)
2668 self.offset += len(buf)
2670 if self.max_volume_size:
2671 if isinstance(self.fileobj, _Stream):
2672 _size_left = self._size_left_stream
2673 else:
2674 _size_left = self._size_left_file
2675 else:
2676 _size_left = lambda: tarinfo.size
2678 # If there's no data to follow, finish
2679 if fileobj is None:
2680 if self.save_to_members:
2681 self.members.append(tarinfo)
2682 return
2684 target_size_left = _size_left()
2685 source_size_left = tarinfo.size
2686 assert tarinfo.volume_offset == 0
2688 # we only split volumes in the middle of a file, that means we have
2689 # to write at least one block
2690 if target_size_left < BLOCKSIZE:
2691 target_size_left = BLOCKSIZE
2693 # loop over multiple volumes
2694 while source_size_left > 0:
2696 # Write as much data as possible from source into target.
2697 # When compressing data, we cannot easily predict how much data we
2698 # can write until target_size_left == 0 --> need to iterate
2699 size_can_write = min(target_size_left, source_size_left)
2701 while size_can_write > 0:
2702 copyfileobj(fileobj, self.fileobj, size_can_write)
2703 self.offset += size_can_write
2704 source_size_left -= size_can_write
2705 target_size_left = _size_left()
2706 size_can_write = min(target_size_left, source_size_left)
2708 # now target_size_left == 0 or source_size_left == 0
2710 # if there is data left to write, we need to create a new volume
2711 if source_size_left > 0:
2712 # Only finalize the crypto entry here if we’re continuing with
2713 # another one; otherwise, the encryption must include the block
2714 # padding that closes the volume.
2715 tarinfo.type = GNUTYPE_MULTIVOL
2717 if not self.new_volume_handler or\
2718 not callable(self.new_volume_handler):
2719 raise Exception("We need to create a new volume and you "
2720 "didn't supply a new_volume_handler")
2723 # the new volume handler should do everything needed to
2724 # start working in a new volume. usually, the handler calls
2725 # to self.open_volume
2726 self.volume_number += 1
2728 # set to be used by open_volume, because in the case of a PAX
2729 # tar it needs to write information about the volume and offset
2730 # in the global header
2731 tarinfo.volume_offset = tarinfo.size - source_size_left
2732 self.volume_tarinfo = tarinfo
2734 # the “new_volume_handler” is supposed to call .close() on the
2735 # fileobj of the current volume before switching over
2736 self.new_volume_handler(self, self.base_name, self.volume_number)
2738 self.volume_tarinfo = None
2740 if self.arcmode & ARCMODE_CONCAT:
2741 self.fileobj.next_volume (tarinfo.name)
2743 # write new volume header
2744 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2745 self.fileobj.write(buf)
2746 self.offset += len(buf)
2748 # adjust variables; open_volume should have reset self.offset
2749 # --> _size_left should be big again
2750 target_size_left = _size_left()
2751 size_can_write = min(target_size_left, source_size_left)
2752 self._dbg(3, 'new volume')
2754 # now, all data has been written. We may have to fill up the rest of
2755 # the block in target with 0s
2756 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2757 if remainder > 0:
2758 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2759 self.offset += BLOCKSIZE - remainder
2761 if self.save_to_members:
2762 self.members.append(tarinfo)
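#
# Illustrative sketch (not part of the original module): pairing
# gettarinfo() with addfile(); "notes.txt" is a hypothetical file.
#
#     with TarFile.open ("out.tar", "w") as tar:
#         ti = tar.gettarinfo ("notes.txt", arcname="docs/notes.txt")
#         with bltn_open ("notes.txt", "rb") as f:
#             tar.addfile (ti, f)
#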
2764 def open_volume(self, name="", fileobj=None, encryption=None):
2765 """
2766 Called by the user to change this tar file to point to a new volume.
2767 """
2769 # open the file using either fileobj or name
2770 if fileobj is None:
2771 if self.mode == "a" and not os.path.exists(name):
2772 # Create nonexistent files in append mode.
2773 self.mode = "w"
2774 self._mode = "wb"
2775 self._extfileobj = False
2777 if isinstance(self.fileobj, _Stream):
2778 self._dbg(3, 'open_volume: create a _Stream')
2779 fileobj = _Stream(name=name,
2780 mode=self.fileobj.mode,
2781 comptype=self.fileobj.comptype,
2783 bufsize=self.fileobj.bufsize,
2784 encryption=encryption or self.fileobj.encryption,
2785 concat=self.fileobj.arcmode & ARCMODE_CONCAT,
2786 tolerance=self.fileobj.tolerance)
2787 else:
2788 # here, we lose information about compression/encryption!
2789 self._dbg(3, 'open_volume: builtin open')
2790 fileobj = bltn_open(name, self._mode)
2791 else:
2792 if name is None and hasattr(fileobj, "name"):
2793 name = fileobj.name
2794 if hasattr(fileobj, "mode"):
2795 self._mode = fileobj.mode
2796 self._extfileobj = True
2797 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2798 self.name = os.path.abspath(name) if name else None
2799 self.fileobj.close()
2800 self.fileobj = fileobj
2802 # init data structures
2804 self.members = [] # list of members as TarInfo objects
2805 self._loaded = False # flag if all members have been read
2806 self.offset = self.fileobj.tell()
2807 # current position in the archive file
2808 self.inodes = {} # dictionary caching the inodes of
2809 # archive members already added
2811 try:
2812 if self.mode == "r":
2813 self.firstmember = None
2814 self.firstmember = self.next()
2816 if self.mode == "a":
2817 # Move to the end of the archive,
2818 # before the first empty block.
2819 while True:
2820 self.fileobj.seek(self.offset)
2821 try:
2822 tarinfo = self.tarinfo.fromtarfile(self)
2823 self.members.append(tarinfo)
2824 except EOFHeaderError:
2825 self.fileobj.seek(self.offset)
2826 break
2827 except HeaderError as e:
2828 raise ReadError(str(e))
2830 if self.mode in "aw":
2831 self._loaded = True
2833 if self.format == PAX_FORMAT:
2834 volume_info = {
2835 "GNU.volume.filename": str(self.volume_tarinfo.name),
2836 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2837 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2838 }
2840 self.pax_headers.update(volume_info)
2842 if isinstance(self.fileobj, _Stream):
2843 self.fileobj._init_write_gz ()
2844 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2845 self.fileobj.write(buf)
2846 self.offset += len(buf)
2847 except Exception as exn:
2848 if not self._extfileobj:
2849 self.fileobj.close()
2850 self.closed = True
2851 raise
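#
# Illustrative sketch (not part of the original module): reading a
# multivolume archive; the handler reopens the next (hypothetical) volume
# via open_volume() whenever a member continues elsewhere.
#
#     def new_volume (tarobj, base_name, volume_number):
#         tarobj.open_volume ("%s.%d" % (base_name, volume_number))
#
#     tar = TarFile.open ("backup.tar", mode="r",
#                         new_volume_handler=new_volume)
#     tar.extractall ()
#     tar.close ()
#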
2853 def extractall(self, path=".", members=None, filter=None, unlink=False):
2854 """Extract all members from the archive to the current working
2855 directory and set owner, modification time and permissions on
2856 directories afterwards. `path' specifies a different directory
2857 to extract to. `members' is optional and must be a subset of the
2858 list returned by getmembers().
2859 """
2860 directories = []
2862 if members is None:
2863 members = self
2865 for tarinfo in members:
2866 if self.volume_number > 0 and tarinfo.ismultivol():
2867 continue
2869 if filter and not filter(tarinfo):
2870 continue
2872 if tarinfo.isdir():
2873 # Extract directories with a safe mode.
2874 directories.append(tarinfo)
2875 tarinfo = copy.copy(tarinfo)
2876 tarinfo.mode = 0o0700
2877 # Do not set_attrs directories, as we will do that further down
2878 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), unlink=unlink)
2880 # Reverse sort directories.
2881 directories.sort(key=lambda a: a.name)
2882 directories.reverse()
2884 # Set correct owner, mtime and filemode on directories.
2885 for tarinfo in directories:
2886 dirpath = os.path.join(path, tarinfo.name)
2887 try:
2888 self.chown(tarinfo, dirpath)
2889 self.utime(tarinfo, dirpath)
2890 self.chmod(tarinfo, dirpath)
2891 except ExtractError as e:
2892 if self.errorlevel > 1:
2893 raise
2894 else:
2895 self._dbg(1, "tarfile: %s" % e)
2897 def extract(self, member, path="", set_attrs=True, symlink_cb=None,
2898 unlink=False):
2899 """Extract a member from the archive to the current working directory,
2900 using its full name. Its file information is extracted as accurately
2901 as possible. `member' may be a filename or a TarInfo object. You can
2902 specify a different directory using `path'. File attributes (owner,
2903 mtime, mode) are set unless `set_attrs' is False.
2904 ``symlink_cb`` is a hook accepting a function that is passed the
2905 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2906 ``member`` indicates a symlink. In that case only the callback
2907 is applied and the actual extraction is skipped; its return value
2908 is passed on to the caller.
2909 """
2910 self._check("r")
2912 if isinstance(member, str):
2913 tarinfo = self.getmember(member)
2914 else:
2915 tarinfo = member
2917 # Prepare the link target for makelink().
2918 if tarinfo.islnk():
2919 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2921 if symlink_cb is not None and tarinfo.issym():
2922 return symlink_cb(member, path, set_attrs)
2924 try:
2925 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2926 set_attrs=set_attrs, unlink=unlink)
2927 except EnvironmentError as e:
2928 if self.errorlevel > 0:
2929 raise
2930 else:
2931 if e.filename is None:
2932 self._dbg(1, "tarfile: %s" % e.strerror)
2933 else:
2934 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2935 except ExtractError as e:
2936 if self.errorlevel > 1:
2937 raise
2938 else:
2939 self._dbg(1, "tarfile: %s" % e)
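#
# Illustrative sketch (not part of the original module): extracting only
# regular files; the target directory is hypothetical.
#
#     with TarFile.open ("example.tar") as tar:
#         tar.extractall (path="/tmp/out",
#                         filter=lambda ti: ti.isreg ())
#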
2941 def extractfile(self, member):
2942 """Extract a member from the archive as a file object. `member' may be
2943 a filename or a TarInfo object. If `member' is a regular file or a
2944 link, an io.BufferedReader object is returned. Otherwise, None is
2945 returned.
2946 """
2947 self._check("r")
2949 if isinstance(member, str):
2950 tarinfo = self.getmember(member)
2951 else:
2952 tarinfo = member
2954 if tarinfo.isreg() or tarinfo.ismultivol() or\
2955 tarinfo.type not in SUPPORTED_TYPES:
2956 # If a member's type is unknown, it is treated as a
2957 # regular file.
2958 return self.fileobject(self, tarinfo)
2960 elif tarinfo.islnk() or tarinfo.issym():
2961 if isinstance(self.fileobj, _Stream):
2962 # A small but ugly workaround for the case that someone tries
2963 # to extract a (sym)link as a file-object from a non-seekable
2964 # stream of tar blocks.
2965 raise StreamError("cannot extract (sym)link as file object")
2967 # A (sym)link's file object is its target's file object.
2968 return self.extractfile(self._find_link_target(tarinfo))
2969 else:
2970 # If there's no data associated with the member (directory, chrdev,
2971 # blkdev, etc.), return None instead of a file object.
2972 return None
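#
# Illustrative sketch (not part of the original module): reading a member's
# payload without extracting it to disk; the member name is hypothetical.
#
#     with TarFile.open ("example.tar") as tar:
#         f = tar.extractfile ("docs/notes.txt")
#         if f is not None:
#             data = f.read ()
#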
2974 def _extract_member(self, tarinfo, targetpath, set_attrs=True, unlink=False):
2975 """Extract the TarInfo object tarinfo to a physical
2976 file called targetpath.
2977 """
2978 # Fetch the TarInfo object for the given name
2979 # and build the destination pathname, replacing
2980 # forward slashes with platform-specific separators.
2981 targetpath = targetpath.rstrip("/")
2982 targetpath = targetpath.replace("/", os.sep)
2984 # Create all upper directories.
2985 upperdirs = os.path.dirname(targetpath)
2986 if upperdirs and not os.path.exists(upperdirs):
2987 # Create directories that are not part of the archive with
2988 # default permissions.
2989 os.makedirs(upperdirs)
2991 if tarinfo.islnk() or tarinfo.issym():
2992 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2993 else:
2994 self._dbg(1, tarinfo.name)
2996 if unlink is True:
2997 _unlinkfirst(targetpath)
2999 if tarinfo.isreg():
3000 self.makefile(tarinfo, targetpath)
3001 elif tarinfo.isdir():
3002 self.makedir(tarinfo, targetpath)
3003 elif tarinfo.isfifo():
3004 self.makefifo(tarinfo, targetpath)
3005 elif tarinfo.ischr() or tarinfo.isblk():
3006 self.makedev(tarinfo, targetpath)
3007 elif tarinfo.islnk() or tarinfo.issym():
3008 self.makelink(tarinfo, targetpath)
3009 elif tarinfo.type not in SUPPORTED_TYPES:
3010 self.makeunknown(tarinfo, targetpath)
3011 else:
3012 self.makefile(tarinfo, targetpath)
3014 if set_attrs:
3015 self.chown(tarinfo, targetpath)
3016 if not tarinfo.issym():
3017 self.chmod(tarinfo, targetpath)
3018 self.utime(tarinfo, targetpath)
3020 #--------------------------------------------------------------------------
3021 # Below are the different file methods. They are called via
3022 # _extract_member() when extract() is called. They can be replaced in a
3023 # subclass to implement other functionality.
3025 def makedir(self, tarinfo, targetpath):
3026 """Make a directory called targetpath.
3027 """
3028 try:
3029 # Use a safe mode for the directory, the real mode is set
3030 # later in _extract_member().
3031 os.mkdir(targetpath, 0o0700)
3032 except FileExistsError:
3033 pass
3035 def makefile(self, tarinfo, targetpath):
3036 """Make a file called targetpath.
3037 """
3038 source = self.fileobj
3039 source.seek(tarinfo.offset_data)
3042 target = bltn_open(targetpath, "wb")
3044 if tarinfo.sparse is not None:
3046 for offset, size in tarinfo.sparse:
3048 copyfileobj(source, target, size)
3049 target.seek(tarinfo.size)
3058 copyfileobj(source, target, tarinfo.size)
3061 # this can only happen while extracting a multivolume archive
3062 if not self.new_volume_handler:
3063 raise Exception("We need to read a new volume and you"
3064 " didn't supply a new_volume_handler")
3066 # the new volume handler should do everything needed to
3067 # start working in a new volume. usually, the handler calls
3068 # to self.open_volume
3069 self.volume_number += 1
3070 self.new_volume_handler(self, self.base_name, self.volume_number)
3071 tarinfo = self.firstmember
3072 source = self.fileobj
3075 if iterate is False: target.close()
3078 def makeunknown(self, tarinfo, targetpath):
3079 """Make a file from a TarInfo object with an unknown type
3080 at targetpath.
3081 """
3082 self.makefile(tarinfo, targetpath)
3083 self._dbg(1, "tarfile: Unknown file type %r, " \
3084 "extracted as regular file." % tarinfo.type)
3086 def makefifo(self, tarinfo, targetpath):
3087 """Make a fifo called targetpath.
3088 """
3089 if hasattr(os, "mkfifo"):
3090 os.mkfifo(targetpath)
3091 else:
3092 raise ExtractError("fifo not supported by system")
3094 def makedev(self, tarinfo, targetpath):
3095 """Make a character or block device called targetpath.
3096 """
3097 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3098 raise ExtractError("special devices not supported by system")
3100 mode = tarinfo.mode
3101 if tarinfo.isblk():
3102 mode |= stat.S_IFBLK
3103 else:
3104 mode |= stat.S_IFCHR
3106 os.mknod(targetpath, mode,
3107 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3109 def makelink(self, tarinfo, targetpath):
3110 """Make a (symbolic) link called targetpath. If it cannot be created
3111 (platform limitation), we try to make a copy of the referenced file
3112 instead of a link.
3113 """
3114 try:
3115 # For systems that support symbolic and hard links.
3116 if tarinfo.issym():
3117 os.symlink(tarinfo.linkname, targetpath)
3118 else:
3119 # See extract().
3120 if os.path.exists(tarinfo._link_target):
3121 os.link(tarinfo._link_target, targetpath)
3122 else:
3123 self._extract_member(self._find_link_target(tarinfo),
3124 targetpath)
3125 except symlink_exception:
3126 try:
3127 self._extract_member(self._find_link_target(tarinfo),
3128 targetpath)
3129 except KeyError:
3130 raise ExtractError("unable to resolve link inside archive")
3132 def chown(self, tarinfo, targetpath):
3133 """Set owner of targetpath according to tarinfo.
3134 """
3135 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3136 # We have to be root to do so.
3137 try:
3138 g = grp.getgrnam(tarinfo.gname)[2]
3139 except KeyError:
3140 g = tarinfo.gid
3141 try:
3142 u = pwd.getpwnam(tarinfo.uname)[2]
3143 except KeyError:
3144 u = tarinfo.uid
3145 try:
3146 if tarinfo.issym() and hasattr(os, "lchown"):
3147 os.lchown(targetpath, u, g)
3148 else:
3149 os.chown(targetpath, u, g)
3150 except OSError as e:
3151 raise ExtractError("could not change owner")
3153 def chmod(self, tarinfo, targetpath):
3154 """Set file permissions of targetpath according to tarinfo.
3155 """
3156 if hasattr(os, 'chmod'):
3157 try:
3158 os.chmod(targetpath, tarinfo.mode)
3159 except OSError as e:
3160 raise ExtractError("could not change mode")
3162 def utime(self, tarinfo, targetpath):
3163 """Set modification time of targetpath according to tarinfo.
3164 """
3165 if not hasattr(os, 'utime'):
3166 return
3167 try:
3168 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3169 except OSError as e:
3170 raise ExtractError("could not change modification time")
3172 #--------------------------------------------------------------------------
3174 """Return the next member of the archive as a TarInfo object, when
3175 TarFile is opened for reading. Return None if there is no more
3179 if self.firstmember is not None:
3180 m = self.firstmember
3181 self.firstmember = None
3184 # Read the next block.
3185 self.fileobj.seek(self.offset)
3189 tarinfo = self.tarinfo.fromtarfile(self)
3190 except EOFHeaderError as e:
3191 if self.ignore_zeros:
3192 self._dbg(2, "0x%X: %s" % (self.offset, e))
3193 self.offset += BLOCKSIZE
3195 except InvalidHeaderError as e:
3196 if self.ignore_zeros:
3197 self._dbg(2, "0x%X: %s" % (self.offset, e))
3198 self.offset += BLOCKSIZE
3200 elif self.offset == 0:
3201 raise ReadError(str(e))
3202 except EmptyHeaderError:
3203 if self.offset == 0:
3204 raise ReadError("empty file")
3205 except TruncatedHeaderError as e:
3206 if self.offset == 0:
3207 raise ReadError(str(e))
3208 except SubsequentHeaderError as e:
3209 raise ReadError(str(e))
3212 if tarinfo is not None:
3213 if self.save_to_members:
3214 self.members.append(tarinfo)
3220 #--------------------------------------------------------------------------
3221 # Little helper methods:
3223 def _getmember(self, name, tarinfo=None, normalize=False):
3224 """Find an archive member by name from bottom to top.
3225 If tarinfo is given, it is used as the starting point.
3226 """
3227 # Ensure that all members have been loaded.
3228 members = self.getmembers()
3230 # Limit the member search list up to tarinfo.
3231 if tarinfo is not None:
3232 members = members[:members.index(tarinfo)]
3234 if normalize:
3235 name = os.path.normpath(name)
3237 for member in reversed(members):
3238 if normalize:
3239 member_name = os.path.normpath(member.name)
3240 else:
3241 member_name = member.name
3243 if name == member_name:
3244 return member
3247 """Read through the entire archive file and look for readable
3251 tarinfo = self.next()
3256 def _check(self, mode=None):
3257 """Check if TarFile is still open, and if the operation's mode
3258 corresponds to TarFile's mode.
3261 raise OSError("%s is closed" % self.__class__.__name__)
3262 if mode is not None and self.mode not in mode:
3263 raise OSError("bad operation for mode %r" % self.mode)
3265 def _find_link_target(self, tarinfo):
3266 """Find the target member of a symlink or hardlink member in the
3270 # Always search the entire archive.
3271 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3274 # Search the archive before the link, because a hard link is
3275 # just a reference to an already archived file.
3276 linkname = tarinfo.linkname
3279 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3281 raise KeyError("linkname %r not found" % linkname)
3285 """Provide an iterator object.
3288 return iter(self.members)
3290 return TarIter(self)
3292 def _dbg(self, level, msg, *args):
3293 """Write debugging output to sys.stderr.
3295 if level <= self.debug:
3296 print(msg.format(*args), file=sys.stderr)
3298 def __enter__(self):
3299 self._check()
3300 return self
3302 def __exit__(self, type, value, traceback):
3303 if type is None:
3304 self.close()
3305 else:
3306 # An exception occurred. We must not call close() because
3307 # it would try to write end-of-archive blocks and padding.
3308 if not self._extfileobj:
3309 self.fileobj.close()
3310 self.closed = True
3312 def _unlinkfirst(targetpath):
3313 try:
3314 os.unlink(targetpath)
3315 except OSError as e:
3316 if e.errno == errno.ENOENT or e.errno == errno.EISDIR:
3317 pass
3320 class TarIter:
3321 """Iterator Class.
3325 for tarinfo in TarFile(...):
3326 suite...
3327 """
3329 def __init__(self, tarfile):
3330 """Construct a TarIter object.
3331 """
3332 self.tarfile = tarfile
3333 self.index = 0
3334 def __iter__(self):
3335 """Return iterator object.
3336 """
3337 return self
3338 def __next__(self):
3339 """Return the next item using TarFile's next() method.
3340 When all members have been read, set TarFile as _loaded.
3341 """
3342 # Fix for SF #1100429: Under rare circumstances it can
3343 # happen that getmembers() is called during iteration,
3344 # which will cause TarIter to stop prematurely.
3346 if self.index == 0 and self.tarfile.firstmember is not None:
3347 tarinfo = self.tarfile.next()
3348 elif self.index < len(self.tarfile.members):
3349 tarinfo = self.tarfile.members[self.index]
3350 elif not self.tarfile._loaded:
3351 tarinfo = self.tarfile.next()
3352 if not tarinfo:
3353 self.tarfile._loaded = True
3354 raise StopIteration
3355 else:
3356 raise StopIteration
3357 self.index += 1
3359 return tarinfo
3361 #---------------------------------------------------------
3362 # support functionality for rescue mode
3363 #---------------------------------------------------------
3365 TAR_FMT_HDR = (# See tar(5):
3367 "100s" # ← char name[100]; /* 100 */
3368 "8s" # ← char mode[8]; /* 108 */
3369 "8s" # ← char uid[8]; /* 116 */
3370 "8s" # ← char gid[8]; /* 124 */
3371 "12s" # ← char size[12]; /* 136 */
3372 "12s" # ← char mtime[12]; /* 148 */
3373 "8s" # ← char checksum[8]; /* 156 */
3374 "B" # ← char typeflag[1]; /* 157 */
3375 "100s" # ← char linkname[100]; /* 257 */
3376 "6s" # ← char magic[6]; /* 263 */
3377 "2s" # ← char version[2]; /* 265 */
3378 "32s" # ← char uname[32]; /* 297 */
3379 "32s" # ← char gname[32]; /* 329 */
3380 "8s" # ← char devmajor[8]; /* 337 */
3381 "8s" # ← char devminor[8]; /* 345 */
3382 "12s" # ← char atime[12]; /* 357 */
3383 "12s" # ← char ctime[12]; /* 369 */
3384 "12s" # ← char offset[12]; /* 381 */
3385 "4s" # ← char longnames[4]; /* 385 */
3386 "B" # ← char unused[1]; /* 386 */
3388 "12s" # ← char offset[12];
3389 "12s" # ← char numbytes[12];
3390 "12s" # ← char offset[12];
3391 "12s" # ← char numbytes[12];
3392 "12s" # ← char offset[12];
3393 "12s" # ← char numbytes[12];
3394 "12s" # ← char offset[12];
3395 "12s" # ← char numbytes[12];
3396 "" # } sparse[4]; /* 482 */
3397 "B" # ← char isextended[1]; /* 483 */
3398 "12s" # ← char realsize[12]; /* 495 */
3399 "17s" # ← char pad[17]; /* 512 */
3402 # The “magic” and “version” fields are special:
3405 # magic The magic field holds the five characters “ustar” followed by a
3406 # space. Note that POSIX ustar archives have a trailing null.
3410 # /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3411 # Found in an archive, it indicates an old GNU header format, which will
3412 # hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3413 # valid, though the header is not truly POSIX conforming. */
3416 TAR_HDR_OFF_MAGIC = 257
3417 TAR_FMT_OLDGNU_MAGIC = b"ustar "
3419 def read_gnu_tar_hdr (data):
3420 if len (data) != BLOCKSIZE: # header requires one complete block
3421 return None
3423 try:
3424 name, mode, \
3425 uid, gid, \
3426 size, mtime, \
3427 checksum, typeflag, \
3428 linkname, magic, \
3429 version, uname, \
3430 gname, devmajor, \
3431 devminor, atime, \
3432 ctime, offset, \
3433 longnames, unused, \
3441 offset1, numbytes1, \
3442 offset2, numbytes2, \
3443 offset3, numbytes3, \
3444 offset4, numbytes4, \
3445 isextended, realsize, \
3447 pad = struct.unpack (TAR_FMT_HDR, data)
3448 except struct.error:
3449 return None
3451 if magic != TAR_FMT_OLDGNU_MAGIC:
3452 return None
3454 # return all except “unused” and “pad”
3455 return \
3456 { "name" : name, "mode" : mode
3457 , "uid" : uid , "gid" : gid
3458 , "size" : size, "mtime" : mtime
3459 , "checksum" : checksum
3460 , "typeflag" : typeflag
3461 , "linkname" : linkname
3462 , "magic" : magic
3463 , "version" : version
3464 , "uname" : uname, "gname" : gname
3465 , "devmajor" : devmajor, "devminor" : devminor
3466 , "atime" : atime, "ctime" : ctime
3467 , "offset" : offset
3468 , "longnames" : longnames
3469 , "offset1" : offset1, "numbytes1" : numbytes1
3470 , "offset2" : offset2, "numbytes2" : numbytes2
3471 , "offset3" : offset3, "numbytes3" : numbytes3
3472 , "offset4" : offset4, "numbytes4" : numbytes4
3473 , "isextended" : isextended
3474 , "realsize" : realsize
3475 }
3478 def tar_hdr_check_chksum (data):
3479 hdr = read_gnu_tar_hdr (data)
3480 if hdr is None:
3481 return False
3482 s = calc_chksums (data)
3483 return nti (hdr ["checksum"]) in s
3486 def readable_tar_objects_offsets (ifd):
3487 """
3488 Traverse blocks in file, trying to extract tar headers.
3489 """
3490 offsets = []
3493 mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3494 pos = TAR_HDR_OFF_MAGIC
3496 while True:
3497 pos = mm.find (TAR_FMT_OLDGNU_MAGIC, pos)
3498 if pos == -1:
3499 break
3500 off = pos - TAR_HDR_OFF_MAGIC
3501 mm.seek (off)
3502 blk = mm.read (BLOCKSIZE)
3503 if tar_hdr_check_chksum (blk) is True:
3504 offsets.append (off)
3505 pos += 1
3507 return offsets
3510 def locate_gz_hdr_candidates (fd):
3511 """
3512 Walk over instances of the GZ magic in the payload, collecting their
3513 positions. If the offset of the first found instance is not zero, the file
3514 begins with leading garbage.
3516 Note that since the GZ magic consists of only two bytes, we expect a lot of
3517 false positives inside binary data.
3519 :return: The list of offsets in the file.
3520 """
3521 cands = []
3523 mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3524 pos = 0
3525 while True:
3526 pos = mm.find (GZ_MAGIC_BYTES, pos)
3527 if pos == -1:
3528 break
3529 cands.append (pos)
3530 pos += len (GZ_MAGIC_BYTES)
3532 return cands
3535 HDR_CAND_GOOD = 0 # header marks begin of valid object
3536 HDR_CAND_FISHY = 1 # inconclusive
3537 HDR_CAND_JUNK = 2 # not a header / object unreadable
3540 def read_cstring (fd, max=-1, encoding=None):
3541 """
3542 Read one NUL-terminated string from *fd* into a Python string. If *max* is
3543 non-negative, reading will terminate after the specified number of bytes.
3545 Optionally, an *encoding* may be specified with which to interpret the data.
3547 :returns: *None* if parsing failed or the maximum number of bytes has been
3548 exceeded; a Python string with the data otherwise.
3549 """
3550 buf = b""
3551 l = 0
3553 while True:
3554 c = os.read (fd, 1)
3555 if c == NUL:
3556 break
3557 if max >= 0 and l > max:
3558 return None
3559 buf += c
3560 l += 1
3561 if encoding is not None:
3562 buf = buf.decode (encoding)
3564 return buf
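#
# Illustrative sketch (not part of the original module): reading a
# NUL-terminated gzip FNAME field of at most LENGTH_NAME + LENGTH_PREFIX
# bytes, as done by inspect_gz_hdr() below.
#
#     name = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
#                          encoding="iso-8859-1")
#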
3567 def inspect_gz_hdr (fd, off):
3568 """
3569 Attempt to parse a Gzip header in *fd* at position *off*. The format is
3570 documented as RFC1952.
3572 Returns a verdict about the quality of that header plus the parsed header
3573 when readable. Problematic sizes such as fields running past the EOF are
3574 treated as garbage. Properties in which the header merely doesn’t conform
3575 to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
3576 validation is possible on embedded strings because they are single-byte
3577 encoded.
3578 """
3582 fname = None
3584 verdict = HDR_CAND_GOOD
3586 os.lseek (fd, off, os.SEEK_SET)
3587 if os.lseek (fd, 0, os.SEEK_CUR) != off:
3588 return HDR_CAND_JUNK, None
3590 raw = os.read (fd, GZ_HEADER_SIZE)
3591 if len (raw) != GZ_HEADER_SIZE:
3592 return HDR_CAND_JUNK, None
3595 try:
3596 _m1, _m2, meth, flags, mtime, dflags, oscode = \
3597 struct.unpack (GZ_FMT_HEADER, raw)
3598 if meth != GZ_METHOD_DEFLATE: # only deflate is supported
3599 return HDR_CAND_JUNK, None
3600 except struct.error as exn:
3601 return HDR_CAND_JUNK, None
3603 if mtime > int (time.time ()):
3604 verdict = HDR_CAND_FISHY
3606 if dflags != GZ_DEFLATE_FLAGS:
3607 verdict = HDR_CAND_FISHY
3609 if oscode != GZ_OS_CODE:
3610 verdict = HDR_CAND_FISHY
3612 if flags & GZ_FLAG_FTEXT: # created by some contrarian
3613 verdict = HDR_CAND_FISHY
3614 if flags & GZ_FLAG_FEXTRA:
3615 xlen = struct.unpack ("<H", os.read (fd, 2))[0]
3616 xtra = os.read (fd, xlen)
3617 if len (xtra) != xlen: # eof inside header
3618 return HDR_CAND_JUNK, None
3619 if flags & GZ_FLAG_FNAME:
3620 # read up to the next NUL byte, not exceeding the maximum path length
3622 fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3623 encoding="iso-8859-1")
3625 return HDR_CAND_JUNK, None
3626 if flags & GZ_FLAG_FCOMMENT:
3627 fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3628 encoding="iso-8859-1")
3630 return HDR_CAND_JUNK, None
3631 if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
3632 crc16 = os.read (fd, 2)
3633 if len (crc16) != 2: # eof inside header
3634 return HDR_CAND_JUNK, None
3635 if flags & GZ_FLAG_RESERVED:
3636 # according to the RFC, these must not be set
3637 verdict = HDR_CAND_FISHY
3639 hlen = os.lseek (fd, 0, os.SEEK_CUR) - off
3641 return verdict, \
3642 { "fname" : fname
3643 , "flags" : flags
3644 , "mtime" : mtime
3645 , "oscode" : oscode
3646 , "hlen" : hlen
3647 }
3651 def try_decompress (ifd, off, hdr):
3652 """
3653 Attempt to process the object starting at *off* with gzip.
3655 :returns: A pair containing the values of the decompressed data and
3656 the length of the input consumed. Note that the latter value
3657 may exceed the length of the compressed data because the
3658 *zlib* module does not provide a means to query how much
3659 of the input it processed before the end of an object.
3660 """
3662 decmp = zlib.decompressobj (-zlib.MAX_WBITS)
3663 pos = off
3664 dlen = 0 # size of decompressed data
3665 while True:
3666 os.lseek (ifd, pos, os.SEEK_SET)
3668 cnk = os.read (ifd, BUFSIZE)
3669 pos += len (cnk)
3670 try:
3671 data = decmp.decompress (cnk)
3672 except zlib.error as exn: # probably CRC32 mismatch; terminate softly
3673 break
3674 dlen += len (data)
3675 if decmp.eof is True:
3676 break
3677 if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
3678 break
3680 return dlen, pos - off
3682 def readable_gz_objects_offsets (ifd, cands):
3683 """
3684 Inspect header candidates for parseable *ifd* gzipped objects.
3685 """
3686 offsets = []
3688 for cand in cands:
3691 vdt, hdr = inspect_gz_hdr (ifd, cand)
3692 if vdt == HDR_CAND_JUNK:
3693 pass # ignore unreadable ones
3694 elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
3695 off0 = cand + hdr ["hlen"]
3696 dlen, clen = try_decompress (ifd, off0, hdr)
3697 if dlen > 0 and clen > 0:
3698 offsets.append (cand)
3700 return offsets
3703 def reconstruct_offsets_gz (fname):
3704 """
3705 From the given file, retrieve all GZ header-like offsets (“candidates”).
3706 Then check each of those locations whether they can be processed as
3707 gzipped objects.
3708 """
3709 ifd = os.open (fname, os.O_RDONLY)
3710 try:
3712 cands = locate_gz_hdr_candidates (ifd)
3713 return readable_gz_objects_offsets (ifd, cands)
3714 finally:
3715 os.close (ifd)
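#
# Illustrative sketch (not part of the original module): recovering object
# offsets from a damaged, gzip-concatenated volume; the path is
# hypothetical.
#
#     offsets = reconstruct_offsets_gz ("/var/backup/backup.tar.gz.0")
#     # each offset marks the start of a readable gzipped object
#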
3718 def reconstruct_offsets_tar (fname):
3719 """
3720 From the given file, retrieve all tar header-like offsets (“candidates”).
3721 Then check each of those locations whether they can be processed as tar
3722 headers.
3723 """
3724 ifd = os.open (fname, os.O_RDONLY)
3725 try:
3727 return readable_tar_objects_offsets (ifd)
3728 finally:
3729 os.close (ifd)
3732 def read_tarobj_at_offset (fileobj, offset, mode, secret=None,
3733 strict_validation=True):
3734 """
3735 :type strict_validation: bool
3736 :param strict_validation: Enable strict IV checking in the crypto
3737 layer. Should be disabled when dealing with
3738 potentially corrupted data.
3739 """
3740 decr = None
3742 if secret is not None:
3743 ks = secret [0]
3745 if ks == crypto.PDTCRYPT_SECRET_PW:
3746 decr = crypto.Decrypt (password=secret [1],
3747 strict_ivs=strict_validation)
3748 elif ks == crypto.PDTCRYPT_SECRET_KEY:
3749 key = binascii.unhexlify (secret [1])
3750 decr = crypto.Decrypt (key=key,
3751 strict_ivs=strict_validation)
3755 try:
3756 tarobj = \
3757 TarFile.open_at_offset (offset,
3758 mode=mode,
3759 fileobj=fileobj,
3760 ignore_zeros=True,
3761 encryption=decr,
3763 save_to_members=False,
3764 tolerance=TOLERANCE_RESCUE)
3765 except (ReadError, EndOfFile):
3766 return None
3768 return tarobj.next ()
3771 def idxent_of_tarinfo (tarinfo):
3773 Scrape the information relevant for the index from a *TarInfo* object.
3774 Keys like the inode number that lack a corresponding field in a TarInfo
3775 will be set to some neutral value.
3777 Example output (abridged):
3779 { "inode" : 0
3780 , "path" : "snapshot://annotations.db"
3784 , "ctime" : 1502798115
3785 , "mtime" : 1502196423
3791 , ... }
3792 """
3793 return \
3794 { "inode" : 0 # ignored when reading the index
3795 , "uid" : tarinfo.uid
3796 , "gid" : tarinfo.gid
3797 , "path" : tarinfo.name # keeping URI scheme
3798 , "offset" : 0 # to be added by the caller
3799 , "volume" : tarinfo.volume_offset
3800 , "mode" : tarinfo.mode
3801 , "ctime" : tarinfo.mtime
3802 , "mtime" : tarinfo.mtime
3803 , "size" : tarinfo.size
3804 , "type" : tarinfo.type
3808 def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
3809 infos = []
3810 psidx = [] # pseudo index, return value
3811 offsets = None
3812 secret = crypto.make_secret (password=password, key=key)
3814 nvol = 0
3816 while True:
3817 vpath = gen_volume_name (nvol)
3818 try:
3819 if secret is not None:
3820 offsets = crypto.reconstruct_offsets (vpath, secret)
3821 elif mode == "#gz":
3822 offsets = reconstruct_offsets_gz (vpath)
3823 elif mode == "#tar":
3824 offsets = reconstruct_offsets_tar (vpath)
3825 else:
3826 raise TarError ("no rescue handling for mode “%s”" % mode)
3827 except FileNotFoundError as exn:
3828 # volume does not exist
3829 if maxvol is not None and nvol < maxvol:
3830 nvol += 1; continue # explicit volume number specified, ignore missing ones
3831 else:
3832 break
3834 fileobj = bltn_open (vpath, "rb")
3835 def aux (acc, off):
3837 obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret,
3838 strict_validation=False)
3839 if obj is not None:
3840 acc.append ((off, nvol, obj))
3841 return acc
3842 infos += functools.reduce (aux, offsets, [])
3844 nvol += 1
3848 def aux (o, nvol, ti):
3849 ie = idxent_of_tarinfo (ti)
3850 ie ["offset"] = o
3851 ie ["volume"] = nvol
3852 return ie
3854 psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]
3856 return psidx
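#
# Illustrative sketch (not part of the original module): building a pseudo
# index from damaged volumes; the naming scheme and the "#gz" mode string
# are hypothetical.
#
#     def volume_name (n):
#         return "/var/backup/backup.tar.gz.%d" % n
#
#     psidx = gen_rescue_index (volume_name, mode="#gz")
#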
3858 #--------------------
3859 # exported functions
3860 #--------------------
3861 def is_tarfile(name):
3862 """Return True if name points to a tar archive that we
3863 are able to handle, else return False.