2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
57 import traceback # XXX
66 # os.symlink on Windows prior to 6.0 raises NotImplementedError
67 symlink_exception = (AttributeError, NotImplementedError)
69 # OSError (winerror=1314) will be raised if the caller does not hold the
70 # SeCreateSymbolicLinkPrivilege privilege
71 symlink_exception += (OSError,)
75 # from tarfile import *
76 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
78 from builtins import open as _open # Since 'open' is TarFile.open
80 #---------------------------------------------------------
82 #---------------------------------------------------------
83 NUL = b"\0" # the null character
84 BLOCKSIZE = 512 # length of processing blocks
85 RECORDSIZE = BLOCKSIZE * 20 # length of records
86 GNU_MAGIC = b"ustar \0" # magic gnu tar string
87 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
89 LENGTH_NAME = 100 # maximum length of a filename
90 LENGTH_LINK = 100 # maximum length of a linkname
91 LENGTH_PREFIX = 155 # maximum length of the prefix field
93 REGTYPE = b"0" # regular file
94 AREGTYPE = b"\0" # regular file
95 LNKTYPE = b"1" # link (inside tarfile)
96 SYMTYPE = b"2" # symbolic link
97 CHRTYPE = b"3" # character special device
98 BLKTYPE = b"4" # block special device
99 DIRTYPE = b"5" # directory
100 FIFOTYPE = b"6" # fifo special device
101 CONTTYPE = b"7" # contiguous file
103 GNUTYPE_LONGNAME = b"L" # GNU tar longname
104 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
105 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
106 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
109 XHDTYPE = b"x" # POSIX.1-2001 extended header
110 XGLTYPE = b"g" # POSIX.1-2001 global header
111 SOLARIS_XHDTYPE = b"X" # Solaris extended header
113 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
114 GNU_FORMAT = 1 # GNU tar format
115 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
116 DEFAULT_FORMAT = GNU_FORMAT
118 GZ_FMT_HEADER = b"<BBBBLBB"
119 GZ_HEADER_SIZE = 10 # not including the name
120 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
121 GZ_METHOD_DEFLATE = 0x08 # 0o10
122 GZ_FLAG_ORIG_NAME = 0x08 # 0o10, default in gzip
123 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
124 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
125 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
126 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
129 #---------------------------------------------------------
130 # archive handling mode
131 #---------------------------------------------------------
134 ARCMODE_ENCRYPT = 1 << 0
135 ARCMODE_COMPRESS = 1 << 1
136 ARCMODE_CONCAT = 1 << 2
139 if m == ARCMODE_PLAIN:
143 def chkappend (b, s):
148 if first is True: first = False
151 chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
152 chkappend (ARCMODE_COMPRESS, "COMPRESS")
153 chkappend (ARCMODE_CONCAT, "CONCAT")
157 def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
159 if bool (concat) is True:
160 ret |= ARCMODE_CONCAT
161 if encryption is not None:
162 ret |= ARCMODE_ENCRYPT
164 ret |= ARCMODE_COMPRESS
167 #---------------------------------------------------------
169 #---------------------------------------------------------
170 # File types that tarfile supports:
171 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
172 SYMTYPE, DIRTYPE, FIFOTYPE,
173 CONTTYPE, CHRTYPE, BLKTYPE,
174 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
175 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
177 # File types that will be treated as a regular file.
178 REGULAR_TYPES = (REGTYPE, AREGTYPE,
179 CONTTYPE, GNUTYPE_SPARSE)
181 # File types that are part of the GNU tar format.
182 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
183 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
185 # Fields from a pax header that override a TarInfo attribute.
186 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
187 "uid", "gid", "uname", "gname")
189 # Fields from a pax header that are affected by hdrcharset.
190 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
192 # Fields in a pax header that are numbers, all other fields
193 # are treated as strings.
194 PAX_NUMBER_FIELDS = {
203 #---------------------------------------------------------
205 #---------------------------------------------------------
207 if os.name in ("nt", "ce"):
210 ENCODING = sys.getfilesystemencoding()
212 #---------------------------------------------------------
213 # Some useful functions
214 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Encode string *s* and fit it into a NUL-padded field of *length* bytes."""
    encoded = s.encode(encoding, errors)
    # Truncate if too long, pad with NUL bytes if too short.
    return encoded[:length] + (length - len(encoded)) * NUL
222 def nts(s, encoding, errors):
223 """Convert a null-terminated bytes object to a string.
228 return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert str or bytes *s* to a NUL-padded bytes field of *length* bytes."""
    # Only str needs encoding; bytes pass through untouched.
    data = s.encode(encoding, errors) if isinstance(s, str) else s
    return data[:length] + (length - len(data)) * NUL
239 """Convert a number field to a python number.
241 # There are two possible encodings for a number field, see
243 if s[0] in (0o200, 0o377):
245 for i in range(len(s) - 1):
249 n = -(256 ** (len(s) - 1) - n)
252 n = int(nts(s, "ascii", "strict") or "0", 8)
254 raise InvalidHeaderError("invalid header")
257 def itn(n, digits=8, format=DEFAULT_FORMAT):
258 """Convert a python number to a number field.
260 # POSIX 1003.1-1988 requires numbers to be encoded as a string of
261 # octal digits followed by a null-byte, this allows values up to
262 # (8**(digits-1))-1. GNU tar allows storing numbers greater than
263 # that if necessary. A leading 0o200 or 0o377 byte indicate this
264 # particular encoding, the following digits-1 bytes are a big-endian
265 # base-256 representation. This allows values up to (256**(digits-1))-1.
266 # A 0o200 byte indicates a positive number, a 0o377 byte a negative
268 if 0 <= n < 8 ** (digits - 1):
269 s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
270 elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
272 s = bytearray([0o200])
274 s = bytearray([0o377])
275 n = 256 ** digits + n
277 for i in range(digits - 1):
278 s.insert(1, n & 0o377)
281 raise ValueError("overflow in number field")
def calc_chksums(buf):
    """Return the (unsigned, signed) checksums of a 512-byte header block.

    The 8-byte chksum field at offset 148 is treated as if filled with
    spaces, which contributes the constant 256 (8 * 0x20) up front.
    Some historic tars (Sun, NeXT) summed *signed* chars, so both the
    unsigned and the signed variant are computed and returned.
    """
    unsigned_sum = sum(struct.unpack_from("148B8x356B", buf)) + 256
    signed_sum = sum(struct.unpack_from("148b8x356b", buf)) + 256
    return unsigned_sum, signed_sum
298 def copyfileobj(src, dst, length=None):
299 """Copy length bytes from fileobj src to fileobj dst.
300 If length is None, copy the entire content.
305 shutil.copyfileobj(src, dst)
309 blocks, remainder = divmod(length, BUFSIZE)
310 for b in range(blocks):
311 buf = src.read(BUFSIZE)
313 if len(buf) < BUFSIZE:
314 raise OSError("end of file reached")
316 buf = src.read(remainder)
318 if len(buf) < remainder:
319 raise OSError("end of file reached")
323 """Deprecated in this location; use stat.filemode."""
325 warnings.warn("deprecated in favor of stat.filemode",
326 DeprecationWarning, 2)
327 return stat.filemode(mode)
class TarError(Exception):
    """Base class of all tarfile exceptions."""

class ExtractError(TarError):
    """Raised for errors encountered while extracting an archive member."""

class ReadError(TarError):
    """Raised when a tar archive cannot be read."""

class CompressionError(TarError):
    """Raised when a compression method is unavailable."""

class StreamError(TarError):
    """Raised for operations unsupported on stream-like TarFile objects."""

class HeaderError(TarError):
    """Base class for member-header errors."""

class EmptyHeaderError(HeaderError):
    """Raised when a header block is empty."""

class TruncatedHeaderError(HeaderError):
    """Raised when a header block is truncated."""

class EOFHeaderError(HeaderError):
    """Raised for an end-of-file header block."""

class InvalidHeaderError(HeaderError):
    """Raised when a header block is invalid."""

class SubsequentHeaderError(HeaderError):
    """Raised when an extended header is missing or invalid."""

class InvalidEncryptionError(TarError):
    """Raised for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Raised when an error occurs during decryption."""

class EncryptionError(TarError):
    """Raised when an error occurs during encryption."""

class EndOfFile(Exception):
    """Signals an end-of-file condition that is not an error."""
374 #---------------------------
375 # internal stream interface
376 #---------------------------
378 """Low-level file object. Supports reading and writing.
379 It is used instead of a regular file object for streaming
383 def __init__(self, name, mode):
386 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
388 if hasattr(os, "O_BINARY"):
389 _mode |= os.O_BINARY # pylint: disable=no-member
390 self.fd = os.open(name, _mode, 0o666)
396 def read(self, size):
397 ret = os.read(self.fd, size)
398 self.offset += len(ret)
401 def write(self, s, pos=None):
404 os.lseek (self.fd, pos, os.SEEK_SET)
405 n = os.write(self.fd, s)
407 self.offset += len(s)
409 append = pos + n - p0
411 self.offset += append
412 os.lseek (self.fd, p0, os.SEEK_SET)
    def seek_set (self, pos):
        """Position the underlying file descriptor at absolute offset *pos*."""
        os.lseek (self.fd, pos, os.SEEK_SET)
422 def gz_header (name=None):
423 timestamp = int(time.time())
429 flags |= GZ_FLAG_ORIG_NAME
430 if type(name) is str:
431 name = name.encode("iso-8859-1", "replace")
432 if name.endswith(b".pdtcrypt"):
434 if name.endswith(b".gz"):
436 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
439 hdr = struct.pack (GZ_FMT_HEADER,
440 GZ_MAGIC [0], GZ_MAGIC [1],
441 GZ_METHOD_DEFLATE, flags,
443 GZ_DEFLATE_FLAGS, GZ_OS_CODE)
449 """Class that serves as an adapter between TarFile and
450 a stream-like object. The stream-like object only
451 needs to have a read() or write() method and is accessed
452 blockwise. Use of gzip or bzip2 compression is possible.
453 A stream-like object could be for example: sys.stdin,
454 sys.stdout, a socket, a tape device etc.
456 _Stream is intended to be used only internally but is
457 nevertheless used externally by Deltatar.
459 When encrypting, the ``enccounter`` will be used for
460 initializing the first cryptographic context. When
461 decrypting, its value will be compared to the decrypted
462 object. Decryption fails if the value does not match.
463 In effect, this means that a ``_Stream`` whose ctor was
464 passed ``enccounter`` can only be used to encrypt or
465 decrypt a single object.
468 remainder = -1 # track size in encrypted entries
471 def __init__(self, name, mode, comptype, fileobj, bufsize,
472 concat=False, encryption=None, enccounter=None,
473 compresslevel=9, tolerant=False):
474 """Construct a _Stream object.
476 self.arcmode = arcmode_set (concat, encryption, comptype)
477 self.tolerant = tolerant
479 self._extfileobj = True
481 fileobj = _LowLevelFile(name, mode)
482 self._extfileobj = False
485 # Enable transparent compression detection for the
487 fileobj = _StreamProxy(fileobj)
488 comptype = fileobj.getcomptype()
492 self.enccounter = None
493 if self.arcmode & ARCMODE_ENCRYPT:
494 self.enccounter = enccounter
496 self.name = name or ""
498 self.comptype = comptype
500 self.fileobj = fileobj
501 self.bufsize = bufsize
507 self.last_block_offset = 0
508 self.dbuf = b"" # ???
509 self.exception = None # communicate decompression failure
510 self.compresslevel = compresslevel
511 self.bytes_written = 0
513 self.encryption = encryption
521 raise CompressionError("zlib module is not available")
524 self.exception = zlib.error
527 if not (self.arcmode & ARCMODE_CONCAT):
528 if self.arcmode & ARCMODE_ENCRYPT:
529 self._init_write_encrypt (name)
530 self._init_write_gz ()
531 self.crc = zlib.crc32(b"") & 0xFFFFffff
533 elif comptype == "bz2":
534 if self.arcmode & ARCMODE_ENCRYPT:
535 raise InvalidEncryptionError("encryption not available for "
536 "compression “%s”" % comptype)
540 raise CompressionError("bz2 module is not available")
543 self.cmp = bz2.BZ2Decompressor()
544 self.exception = OSError
546 self.cmp = bz2.BZ2Compressor()
548 elif comptype == 'xz':
549 if self.arcmode & ARCMODE_ENCRYPT:
550 raise InvalidEncryptionError("encryption not available for "
551 "compression “%s”" % comptype)
555 raise CompressionError("lzma module is not available")
558 self.cmp = lzma.LZMADecompressor()
559 self.exception = lzma.LZMAError
561 self.cmp = lzma.LZMACompressor()
563 elif comptype == "tar":
564 if not (self.arcmode & ARCMODE_CONCAT) \
566 and self.arcmode & ARCMODE_ENCRYPT:
567 self._init_write_encrypt (name)
570 if self.arcmode & ARCMODE_ENCRYPT:
571 raise InvalidEncryptionError("encryption not available for "
572 "compression “%s”" % comptype)
573 raise CompressionError("unknown compression type %r" % comptype)
576 if not self._extfileobj:
582 if hasattr(self, "closed") and not self.closed:
585 except crypto.InternalError:
586 # context already finalized due to abort but close() tried
591 def next (self, name):
592 if self.arcmode & ARCMODE_COMPRESS:
593 if getattr (self, "cmp", None) is not None:
594 self._finalize_write_gz ()
596 if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
597 self.last_block_offset = self.fileobj.tell()
598 if self.arcmode & ARCMODE_ENCRYPT:
599 self._finalize_write_encrypt ()
600 self._init_write_encrypt (name, set_last_block_offset=True)
601 if self.arcmode & ARCMODE_COMPRESS:
602 self._init_write_gz (set_last_block_offset =
603 not (self.arcmode & ARCMODE_ENCRYPT))
604 return self.last_block_offset
607 def next_volume (self, name):
608 # with non-concat modes, this is taken care by the _Stream
609 # ctor as invoked by the newvol handler
610 if self.arcmode & ARCMODE_COMPRESS:
611 if getattr (self, "cmp", None) is not None:
612 # e. g. compressed PAX header written
613 self._finalize_write_gz ()
614 if self.arcmode & ARCMODE_ENCRYPT:
615 self._init_write_encrypt (name)
616 if self.arcmode & ARCMODE_COMPRESS:
617 self._init_write_gz ()
620 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
622 Save position for delayed write of header; fill the header location
625 # first thing, proclaim new object to the encryption context
626 # secondly, assemble the header with the updated parameters
627 # and commit it directly to the underlying stream, bypassing the
628 # encryption layer in .__write().
629 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
631 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
632 self.lasthdr = self.fileobj.tell()
633 self.__write_to_file(dummyhdr)
634 if set_last_block_offset is True:
635 self.last_block_offset = self.lasthdr
638 def _finalize_write_encrypt (self):
640 Seek back to header position, read dummy bytes, finalize crypto
641 obtaining the actual header, write header, seek back to current
644 Returns the list of IV fixed parts as used during encryption.
646 if self.lasthdr is not None:
647 pos0 = self.fileobj.tell ()
648 self.fileobj.seek_set (self.lasthdr)
649 dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
650 pos1 = self.fileobj.tell ()
651 dpos = pos1 - self.lasthdr
652 assert dpos == crypto.PDTCRYPT_HDR_SIZE
653 self.fileobj.seek_set (pos0)
654 data, hdr, _ = self.encryption.done (dummy)
655 self.__write_to_file(hdr, pos=self.lasthdr)
656 self.__write_to_file(data) # append remainder of data
660 def _finalize_write_gz (self):
661 if self.cmp is not None:
662 chunk = self.buf + self.cmp.flush()
664 if self.comptype == "gz":
665 # The native zlib crc is an unsigned 32-bit integer, but
666 # the Python wrapper implicitly casts that to a signed C
667 # long. So, on a 32-bit box self.crc may "look negative",
668 # while the same crc on a 64-bit box may "look positive".
669 # To avoid irksome warnings from the `struct` module, force
670 # it to look positive on all boxes.
671 chunk += struct.pack("<L", self.crc & 0xffffffff)
672 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
673 self.__enc_write (chunk)
677 def _init_write_gz (self, set_last_block_offset=False):
679 Add a new gzip block, closing last one
682 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
683 first = self.cmp is None
684 self.cmp = self.zlib.compressobj(self.compresslevel,
686 -self.zlib.MAX_WBITS,
687 self.zlib.DEF_MEM_LEVEL,
690 # if aes, we encrypt after compression
691 if set_last_block_offset is True:
692 self.last_block_offset = self.fileobj.tell()
694 self.__write(gz_header (self.name if first is True else None))
698 """Write string s to the stream.
700 if self.comptype == "gz":
701 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
703 self.concat_pos += len(s)
704 if self.cmp is not None:
705 s = self.cmp.compress(s)
709 """Write what’s left in the buffer to the stream."""
710 self.__write (b"") # → len (buf) <= bufsiz
711 self.__enc_write (self.buf)
714 def __write(self, s):
715 """Writes (and encodes) string s to the stream blockwise
717 will wait with encoding/writing until block is complete
720 while len(self.buf) > self.bufsize:
721 self.__enc_write(self.buf[:self.bufsize])
722 self.buf = self.buf[self.bufsize:]
725 def __write_to_file(self, s, pos=None):
727 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
728 given, the stream will seek to that position first and back afterwards,
729 and the total of bytes written is not updated.
731 self.fileobj.write(s, pos)
733 self.bytes_written += len(s)
736 def __enc_write(self, s):
738 If encryption is active, the string s is encrypted before being written
743 if self.arcmode & ARCMODE_ENCRYPT:
746 n, ct = self.encryption.process(buf)
747 self.__write_to_file(ct)
750 # The entire plaintext was not consumed: The size limit
751 # for encrypted objects was reached. Transparently create
752 # a new encrypted object and continue processing the input.
753 self._finalize_write_encrypt ()
754 self._init_write_encrypt ()
756 self.__write_to_file(s)
759 def estim_file_size(self):
760 """ estimates size of file if closing it now
762 The result may differ greatly from the amount of data sent to write()
763 due to compression, encryption and buffering.
765 In tests the result (before calling close()) was up to 12k smaller than
766 the final file size if compression is being used because zlib/bz2
767 compressors do not allow inspection of their buffered data :-(
769 Still, we add what close() would add: 8 bytes for gz checksum, one
770 encryption block size if encryption is used and the size of our own
774 return self.bytes_written
776 result = self.bytes_written
778 result += len(self.buf)
779 if self.comptype == 'gz':
780 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
783 def close(self, close_fileobj=True):
784 """Close the _Stream object. No operation should be
785 done on it afterwards.
791 if close_fileobj is True:
794 if self.arcmode & ARCMODE_COMPRESS:
795 self._finalize_write_gz ()
796 # end of Tar archive marker (two empty blocks) was written
797 # finalize encryption last; no writes may be performed after
800 if self.arcmode & ARCMODE_ENCRYPT:
801 self._finalize_write_encrypt ()
803 if not self._extfileobj:
806 # read the zlib crc and length and check them
807 if self.mode == "r" and self.comptype == "gz":
808 read_crc = self.__read(4)
809 read_length = self.__read(4)
810 calculated_crc = self.crc
811 if struct.unpack("<L", read_crc)[0] != calculated_crc:
812 raise CompressionError("bad gzip crc")
816 def _init_read_gz(self):
817 """Initialize for reading a gzip compressed fileobj.
819 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
821 read2 = self.__read(2)
823 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
824 "%d" % self.fileobj.tell())
825 # taken from gzip.GzipFile with some alterations
826 if read2 != GZ_MAGIC_BYTES:
827 raise ReadError("not a gzip file")
829 read1 = self.__read(1)
831 raise CompressionError("unsupported compression method")
833 self.flags = flag = ord(self.__read(1))
837 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
842 if not s or s == NUL:
847 if not s or s == NUL:
852 def _init_read_encrypt (self):
853 """Initialize encryption for next entry in archive. Read a header and
854 notify the crypto context."""
855 if self.arcmode & ARCMODE_ENCRYPT:
856 lasthdr = self.fileobj.tell ()
858 hdr = crypto.hdr_read_stream (self.fileobj)
859 except crypto.EndOfFile:
861 except crypto.InvalidHeader as exn:
862 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
863 "processing %r at pos %d"
864 % (exn, self.fileobj, lasthdr)) \
866 if self.enccounter is not None:
867 # enforce that the iv counter in the header matches an
868 # explicitly requested one
869 iv = crypto.hdr_iv_counter (hdr)
870 if iv != self.enccounter:
871 raise DecryptionError ("expected IV counter %d, got %d"
872 % (self.enccounter, iv))
873 self.lasthdr = lasthdr
874 self.remainder = hdr ["ctsize"] # distance to next header
876 self.encryption.next (hdr)
877 except crypto.InvalidParameter as exn:
878 raise DecryptionError ("Crypto.next(): error “%s” "
879 "processing %r at pos %d"
880 % (exn, self.fileobj, lasthdr)) \
886 def _read_encrypt (self, buf):
888 Demote a program error to a decryption error in tolerant mode. This
889 allows recovery from corrupted headers and invalid data.
892 return self.encryption.process (buf)
893 except RuntimeError as exn:
894 if self.tolerant is True:
895 raise DecryptionError (exn)
899 def _finalize_read_encrypt (self):
903 if self.arcmode & ARCMODE_ENCRYPT \
904 and self.lasthdr is not None :
905 assert self.remainder >= 0
906 if self.remainder > 0:
909 data = self.encryption.done ()
910 except crypto.InvalidGCMTag as exn:
911 raise DecryptionError ("decryption failed: %s" % exn)
916 """Return the stream's file pointer position.
920 def seek(self, pos=0):
921 """Set the stream's file pointer to pos. Negative seeking
924 if pos - self.pos >= 0:
925 blocks, remainder = divmod(pos - self.pos, self.bufsize)
926 for i in range(blocks):
927 self.read(self.bufsize)
930 raise StreamError("seeking backwards is not allowed")
933 def read(self, size=None):
934 """Return the next size number of bytes from the stream.
935 If size is not defined, return all bytes of the stream
941 buf = self._read(self.bufsize)
947 buf = self._read(size)
952 """Reads just one line, new line character included
954 # if \n in dbuf, no read needs to be done
955 if b'\n' in self.dbuf:
956 pos = self.dbuf.index(b'\n') + 1
957 ret = self.dbuf[:pos]
958 self.dbuf = self.dbuf[pos:]
963 chunk = self._read(self.bufsize)
965 # nothing more to read, so return the buffer
971 # if \n found, return the new line
974 pos = dbuf.index(b'\n') + 1
975 self.dbuf = dbuf[pos:] + self.dbuf
978 def _read(self, size):
979 """Return size bytes from the stream.
985 buf = self.__read(self.bufsize)
989 if self.cmp is not None:
991 buf = self.cmp.decompress(buf)
992 except self.exception as exn:
993 raise ReadError("invalid compressed data (%r)" % exn)
994 except Exception as e:
995 # happens at the end of the file
996 # _init_read_gz failed in the previous iteration so
997 # self.cmp.decompress fails here
998 if self.arcmode & ARCMODE_CONCAT:
1001 raise ReadError("invalid compressed data")
1002 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1003 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1004 if self.arcmode & ARCMODE_CONCAT \
1005 and len(self.cmp.unused_data) != 0:
1006 self.buf = self.cmp.unused_data + self.buf
1007 self.close(close_fileobj=False)
1009 self._init_read_gz()
1010 except DecryptionError:
1011 if self.tolerant is True:
1012 # return whatever data was processed successfully
1019 # happens at the end of the file
1021 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1026 self.dbuf = t[size:]
1030 def __read(self, size):
1032 Return size bytes from stream. If internal buffer is empty, read
1033 another block from the stream.
1035 The function returns up to size bytes of data. When an error occurs
1036 during decryption, everything until the end of the last successfully
1037 finalized object is returned.
1040 t = [self.buf] if c > 0 else []
1041 good_crypto = len (t)
1046 if self.arcmode & ARCMODE_ENCRYPT:
1047 if self.remainder <= 0:
1048 # prepare next object
1049 if self._init_read_encrypt () is False: # EOF
1053 # only read up to the end of the encrypted object
1054 todo = min (size, self.remainder)
1055 buf = self.fileobj.read(todo)
1056 if self.arcmode & ARCMODE_ENCRYPT:
1058 buf = self._read_encrypt (buf)
1059 if todo == self.remainder:
1060 # at the end of a crypto object; finalization will fail if
1061 # the GCM tag does not match
1062 trailing = self._finalize_read_encrypt ()
1063 good_crypto = len (t) + 1
1064 if len (trailing) > 0:
1068 self.remainder -= todo
1069 except DecryptionError:
1070 if self.tolerant is False:
1072 self.encryption.drop ()
1073 if good_crypto == 0:
1075 # this may occur at any of the three crypto operations above.
1076 # some objects did validate; discard all data after it; next
1077 # call will start with the bad object and error out immediately
1078 self.buf = b"".join (t [good_crypto:])
1079 return b"".join (t [:good_crypto])
1081 if not buf: ## XXX stream terminated prematurely; this should be an error
1092 class _StreamProxy(object):
1093 """Small proxy class that enables transparent compression
1094 detection for the Stream interface (mode 'r|*').
1097 def __init__(self, fileobj):
1098 self.fileobj = fileobj
1099 self.buf = self.fileobj.read(BLOCKSIZE)
1101 def read(self, size): # pylint: disable=method-hidden
1102 self.read = self.fileobj.read
1105 def getcomptype(self):
1106 if self.buf.startswith(GZ_MAGIC_DEFLATE):
1108 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
1110 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
1116 self.fileobj.close()
1119 #------------------------
1120 # Extraction file object
1121 #------------------------
1122 class _FileInFile(object):
1123 """A thin wrapper around an existing file object that
1124 provides a part of its data as an individual file
1128 def __init__(self, fileobj, offset, size, blockinfo=None):
1129 self.fileobj = fileobj
1130 self.offset = offset
1133 self.name = getattr(fileobj, "name", None)
1136 if blockinfo is None:
1137 blockinfo = [(0, size)]
1139 # Construct a map with data and zero blocks.
1143 realpos = self.offset
1144 for offset, size in blockinfo:
1145 if offset > lastpos:
1146 self.map.append((False, lastpos, offset, None))
1147 self.map.append((True, offset, offset + size, realpos))
1149 lastpos = offset + size
1150 if lastpos < self.size:
1151 self.map.append((False, lastpos, self.size, None))
1163 return self.fileobj.seekable()
1166 """Return the current file position.
1168 return self.position
1170 def seek(self, position, whence=io.SEEK_SET):
1171 """Seek to a position in the file.
1173 if whence == io.SEEK_SET:
1174 self.position = min(max(position, 0), self.size)
1175 elif whence == io.SEEK_CUR:
1177 self.position = max(self.position + position, 0)
1179 self.position = min(self.position + position, self.size)
1180 elif whence == io.SEEK_END:
1181 self.position = max(min(self.size + position, self.size), 0)
1183 raise ValueError("Invalid argument")
1184 return self.position
1186 def read(self, size=None):
1187 """Read data from the file.
1190 size = self.size - self.position
1192 size = min(size, self.size - self.position)
1197 data, start, stop, offset = self.map[self.map_index]
1198 if start <= self.position < stop:
1202 if self.map_index == len(self.map):
1204 length = min(size, stop - self.position)
1206 self.fileobj.seek(offset + (self.position - start))
1207 buf += self.fileobj.read(length)
1211 self.position += length
1214 def readinto(self, b):
1215 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member's data."""

    def __init__(self, tarfile, tarinfo):
        # Present the member's data region (honoring any sparse map) as a
        # raw file, then layer standard BufferedReader semantics on top.
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
1235 class TarInfo(object):
1236 """Informational class which holds the details about an
1237 archive member given by a tar header block.
1238 TarInfo objects are returned by TarFile.getmember(),
1239 TarFile.getmembers() and TarFile.gettarinfo() and are
1240 usually created internally.
1243 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1244 "chksum", "type", "linkname", "uname", "gname",
1245 "devmajor", "devminor", "volume_offset",
1246 "offset", "offset_data", "pax_headers", "sparse",
1247 "tarfile", "_sparse_structs", "_link_target")
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        # ustar header fields
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        # bookkeeping maintained while reading/writing the archive
        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1275 # In pax headers the "name" and "linkname" field are called
1276 # "path" and "linkpath".
1279 def _setpath(self, name):
1281 path = property(_getpath, _setpath)
1283 def _getlinkpath(self):
1284 return self.linkname
1285 def _setlinkpath(self, linkname):
1286 self.linkname = linkname
1287 linkpath = property(_getlinkpath, _setlinkpath)
1290 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1292 def get_info(self, encoding=None, errors=None):
1293 """Return the TarInfo's attributes as a dictionary.
1297 "mode": self.mode & 0o7777,
1301 "mtime": self.mtime,
1302 "chksum": self.chksum,
1304 "linkname": self.linkname,
1305 "uname": self.uname,
1306 "gname": self.gname,
1307 "devmajor": self.devmajor,
1308 "devminor": self.devminor,
1309 "offset_data": self.offset_data,
1310 "volume_offset": self.volume_offset
1313 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1318 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1319 errors="surrogateescape"):
1320 """Return a tar header as a string of 512 byte blocks.
1322 info = self.get_info(encoding, errors)
1324 if format == USTAR_FORMAT:
1325 return self.create_ustar_header(info, encoding, errors)
1326 elif format == GNU_FORMAT:
1327 return self.create_gnu_header(info, encoding, errors)
1328 elif format == PAX_FORMAT:
1329 return self.create_pax_header(info, encoding, errors)
1331 raise ValueError("invalid format")
    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.

        Raises ValueError if the linkname exceeds the 100-byte ustar
        field; an over-long name is split into a prefix/name pair.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)
1346 def create_gnu_header(self, info, encoding, errors):
1347 """Return the object as a GNU header block sequence.
1349 info["magic"] = GNU_MAGIC
1351 if self.ismultivol():
1353 itn(info.get("atime", 0), 12, GNU_FORMAT),
1354 itn(info.get("ctime", 0), 12, GNU_FORMAT),
1355 itn(self.volume_offset, 12, GNU_FORMAT),
1356 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1358 info['prefix'] = b"".join(prefix)
1359 info['size'] = info['size'] - self.volume_offset
1362 if len(info["linkname"]) > LENGTH_LINK:
1363 buf += self._create_gnu_long_header(info["linkname"],
1364 GNUTYPE_LONGLINK, encoding, errors)
1366 if len(info["name"]) > LENGTH_NAME:
1367 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1370 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1372 def create_pax_header(self, info, encoding, errors):
1373 """Return the object as a ustar header block. If it cannot be
1374 represented this way, prepend a pax extended header sequence
1375 with supplement information.
1377 info["magic"] = POSIX_MAGIC
1378 pax_headers = self.pax_headers.copy()
1379 if self.ismultivol():
1380 info['size'] = info['size'] - self.volume_offset
1382 # Test string fields for values that exceed the field length or cannot
1383 # be represented in ASCII encoding.
1384 for name, hname, length in (
1385 ("name", "path", LENGTH_NAME),
1386 ("linkname", "linkpath", LENGTH_LINK),
1387 ("uname", "uname", 32),
1388 ("gname", "gname", 32)):
1390 if hname in pax_headers:
1391 # The pax header has priority.
1394 # Try to encode the string as ASCII.
1396 info[name].encode("ascii", "strict")
1397 except UnicodeEncodeError:
1398 pax_headers[hname] = info[name]
1401 if len(info[name]) > length:
1402 pax_headers[hname] = info[name]
1404 # Test number fields for values that exceed the field limit or values
1405 # that like to be stored as float.
1406 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1407 if name in pax_headers:
1408 # The pax header has priority. Avoid overflow.
1413 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1414 pax_headers[name] = str(val)
1417 # Create a pax extended header if necessary.
1419 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1423 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1431 def _posix_split_name(self, name):
1432 """Split a name longer than 100 chars into a prefix
1435 prefix = name[:LENGTH_PREFIX + 1]
1436 while prefix and prefix[-1] != "/":
1437 prefix = prefix[:-1]
1439 name = name[len(prefix):]
1440 prefix = prefix[:-1]
1442 if not prefix or len(name) > LENGTH_NAME:
1443 raise ValueError("name is too long")
1447 def _create_header(info, format, encoding, errors):
1448 """Return a header block. info is a dictionary with file
1449 information, format must be one of the *_FORMAT constants.
1452 stn(info.get("name", ""), 100, encoding, errors),
1453 itn(info.get("mode", 0) & 0o7777, 8, format),
1454 itn(info.get("uid", 0), 8, format),
1455 itn(info.get("gid", 0), 8, format),
1456 itn(info.get("size", 0), 12, format),
1457 itn(info.get("mtime", 0), 12, format),
1458 b" ", # checksum field
1459 info.get("type", REGTYPE),
1460 stn(info.get("linkname", ""), 100, encoding, errors),
1461 info.get("magic", POSIX_MAGIC),
1462 stn(info.get("uname", ""), 32, encoding, errors),
1463 stn(info.get("gname", ""), 32, encoding, errors),
1464 itn(info.get("devmajor", 0), 8, format),
1465 itn(info.get("devminor", 0), 8, format),
1466 sbtn(info.get("prefix", ""), 155, encoding, errors)
1469 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1470 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1471 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1475 def _create_payload(payload):
1476 """Return the string payload filled with zero bytes
1477 up to the next 512 byte border.
1479 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1481 payload += (BLOCKSIZE - remainder) * NUL
1485 def _create_gnu_long_header(cls, name, type, encoding, errors):
1486 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1489 name = name.encode(encoding, errors) + NUL
1492 info["name"] = "././@LongLink"
1494 info["size"] = len(name)
1495 info["magic"] = GNU_MAGIC
1497 # create extended header + name blocks.
1498 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1499 cls._create_payload(name)
1502 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1503 """Return a POSIX.1-2008 extended or global header sequence
1504 that contains a list of keyword, value pairs. The values
1507 # Check if one of the fields contains surrogate characters and thereby
1508 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1510 for keyword, value in pax_headers.items():
1512 value.encode("utf-8", "strict")
1513 except UnicodeEncodeError:
1519 # Put the hdrcharset field at the beginning of the header.
1520 records += b"21 hdrcharset=BINARY\n"
1522 for keyword, value in pax_headers.items():
1523 keyword = keyword.encode("utf-8")
1525 # Try to restore the original byte representation of `value'.
1526 # Needless to say, that the encoding must match the string.
1527 value = value.encode(encoding, "surrogateescape")
1529 value = value.encode("utf-8")
1531 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1538 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1540 # We use a hardcoded "././@PaxHeader" name like star does
1541 # instead of the one that POSIX recommends.
1543 info["name"] = "././@PaxHeader"
1545 info["size"] = len(records)
1546 info["magic"] = POSIX_MAGIC
1548 # Create pax header + record blocks.
1549 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1550 cls._create_payload(records)
1553 def frombuf(cls, buf, encoding, errors):
1554 """Construct a TarInfo object from a 512 byte bytes object.
1557 raise EmptyHeaderError("empty header")
1558 if len(buf) != BLOCKSIZE:
1559 raise TruncatedHeaderError("truncated header")
1560 if buf.count(NUL) == BLOCKSIZE:
1561 raise EOFHeaderError("end of file header")
1563 chksum = nti(buf[148:156])
1564 if chksum not in calc_chksums(buf):
1565 raise InvalidHeaderError("bad checksum")
1568 obj.name = nts(buf[0:100], encoding, errors)
1569 obj.mode = nti(buf[100:108])
1570 obj.uid = nti(buf[108:116])
1571 obj.gid = nti(buf[116:124])
1572 obj.size = nti(buf[124:136])
1573 obj.mtime = nti(buf[136:148])
1575 obj.type = buf[156:157]
1576 obj.linkname = nts(buf[157:257], encoding, errors)
1577 obj.uname = nts(buf[265:297], encoding, errors)
1578 obj.gname = nts(buf[297:329], encoding, errors)
1579 obj.devmajor = nti(buf[329:337])
1580 obj.devminor = nti(buf[337:345])
1581 prefix = nts(buf[345:500], encoding, errors)
1583 # The old GNU sparse format occupies some of the unused
1584 # space in the buffer for up to 4 sparse structures.
1585 # Save the them for later processing in _proc_sparse().
1586 if obj.type == GNUTYPE_SPARSE:
1591 offset = nti(buf[pos:pos + 12])
1592 numbytes = nti(buf[pos + 12:pos + 24])
1595 structs.append((offset, numbytes))
1597 isextended = bool(buf[482])
1598 origsize = nti(buf[483:495])
1599 obj._sparse_structs = (structs, isextended, origsize)
1601 # Old V7 tar format represents a directory as a regular
1602 # file with a trailing slash.
1603 if obj.type == AREGTYPE and obj.name.endswith("/"):
1606 # Remove redundant slashes from directories.
1608 obj.name = obj.name.rstrip("/")
1610 # Reconstruct a ustar longname.
1611 if prefix and obj.type not in GNU_TYPES:
1612 obj.name = prefix + "/" + obj.name
1614 obj.offset_data = nti(buf[369:381])
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)
1627 #--------------------------------------------------------------------------
1628 # The following are methods that are called depending on the type of a
1629 # member. The entry point is _proc_member() which can be overridden in a
1630 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1631 # implement the following
1633 # 1. Set self.offset_data to the position where the data blocks begin,
1634 # if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will begin.
1637 # 3. Return self or another valid TarInfo object.
1638 def _proc_member(self, tarfile):
1639 """Choose the right processing method depending on
1640 the type and call it.
1642 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1643 return self._proc_gnulong(tarfile)
1644 elif self.type == GNUTYPE_SPARSE:
1645 return self._proc_sparse(tarfile)
1646 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1647 return self._proc_pax(tarfile)
1649 return self._proc_builtin(tarfile)
1651 def _proc_builtin(self, tarfile):
1652 """Process a builtin type or an unknown type which
1653 will be treated as a regular file.
1655 self.offset_data = tarfile.fileobj.tell()
1656 offset = self.offset_data
1657 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1658 # Skip the following data blocks.
1659 offset += self._block(self.size)
1660 tarfile.offset = offset
1662 # Patch the TarInfo object with saved global
1663 # header information.
1664 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1668 def _proc_gnulong(self, tarfile):
1669 """Process the blocks that hold a GNU longname
1672 buf = tarfile.fileobj.read(self._block(self.size))
1674 # Fetch the next header and process it.
1676 next = self.fromtarfile(tarfile)
1678 raise SubsequentHeaderError("missing or bad subsequent header")
1680 # Patch the TarInfo object from the next header with
1681 # the longname information.
1682 next.offset = self.offset
1683 if self.type == GNUTYPE_LONGNAME:
1684 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1685 elif self.type == GNUTYPE_LONGLINK:
1686 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1690 def _proc_sparse(self, tarfile):
1691 """Process a GNU sparse header plus extra headers.
1693 # We already collected some sparse structures in frombuf().
1694 structs, isextended, origsize = self._sparse_structs
1695 del self._sparse_structs
1697 # Collect sparse structures from extended header blocks.
1699 buf = tarfile.fileobj.read(BLOCKSIZE)
1703 offset = nti(buf[pos:pos + 12])
1704 numbytes = nti(buf[pos + 12:pos + 24])
1707 if offset and numbytes:
1708 structs.append((offset, numbytes))
1710 isextended = bool(buf[504])
1711 self.sparse = structs
1713 self.offset_data = tarfile.fileobj.tell()
1714 tarfile.offset = self.offset_data + self._block(self.size)
1715 self.size = origsize
1718 def _proc_pax(self, tarfile):
1719 """Process an extended or global header as described in
1722 # Read the header information.
1723 buf = tarfile.fileobj.read(self._block(self.size))
1725 # A pax header stores supplemental information for either
1726 # the following file (extended) or all following files
1728 if self.type == XGLTYPE:
1729 pax_headers = tarfile.pax_headers
1731 pax_headers = tarfile.pax_headers.copy()
1733 # Check if the pax header contains a hdrcharset field. This tells us
1734 # the encoding of the path, linkpath, uname and gname fields. Normally,
1735 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1736 # implementations are allowed to store them as raw binary strings if
1737 # the translation to UTF-8 fails.
1738 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1739 if match is not None:
1740 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1742 # For the time being, we don't care about anything other than "BINARY".
1743 # The only other value that is currently allowed by the standard is
1744 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1745 hdrcharset = pax_headers.get("hdrcharset")
1746 if hdrcharset == "BINARY":
1747 encoding = tarfile.encoding
1751 # Parse pax header information. A record looks like that:
1752 # "%d %s=%s\n" % (length, keyword, value). length is the size
1753 # of the complete record including the length field itself and
1754 # the newline. keyword and value are both UTF-8 encoded strings.
1755 regex = re.compile(br"(\d+) ([^=]+)=")
1758 match = regex.match(buf, pos)
1762 length, keyword = match.groups()
1763 length = int(length)
1764 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1766 # Normally, we could just use "utf-8" as the encoding and "strict"
1767 # as the error handler, but we better not take the risk. For
1768 # example, GNU tar <= 1.23 is known to store filenames it cannot
1769 # translate to UTF-8 as raw strings (unfortunately without a
1770 # hdrcharset=BINARY header).
1771 # We first try the strict standard encoding, and if that fails we
1772 # fall back on the user's encoding and error handler.
1773 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1775 if keyword in PAX_NAME_FIELDS:
1776 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1779 value = self._decode_pax_field(value, "utf-8", "utf-8",
1782 pax_headers[keyword] = value
1786 # Fetch the next header.
1788 next = self.fromtarfile(tarfile)
1790 raise SubsequentHeaderError("missing or bad subsequent header")
1792 # Process GNU sparse information.
1793 if "GNU.sparse.map" in pax_headers:
1794 # GNU extended sparse format version 0.1.
1795 self._proc_gnusparse_01(next, pax_headers)
1797 elif "GNU.sparse.size" in pax_headers:
1798 # GNU extended sparse format version 0.0.
1799 self._proc_gnusparse_00(next, pax_headers, buf)
1801 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1802 # GNU extended sparse format version 1.0.
1803 self._proc_gnusparse_10(next, pax_headers, tarfile)
1805 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1806 # Patch the TarInfo object with the extended header info.
1807 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1808 next.offset = self.offset
1810 if "size" in pax_headers:
1811 # If the extended header replaces the size field,
1812 # we need to recalculate the offset where the next
1814 offset = next.offset_data
1815 if next.isreg() or next.type not in SUPPORTED_TYPES:
1816 offset += next._block(next.size)
1817 tarfile.offset = offset
1819 if next is not None:
1820 if "GNU.volume.filename" in pax_headers:
1821 if pax_headers["GNU.volume.filename"] == next.name:
1822 if "GNU.volume.size" in pax_headers:
1823 next.size = int(pax_headers["GNU.volume.size"])
1824 if "GNU.volume.offset" in pax_headers:
1825 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1827 for key in pax_headers.keys():
1828 if key.startswith("GNU.volume"):
1829 del tarfile.pax_headers[key]
1833 def _proc_gnusparse_00(self, next, pax_headers, buf):
1834 """Process a GNU tar extended sparse header, version 0.0.
1837 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1838 offsets.append(int(match.group(1)))
1840 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1841 numbytes.append(int(match.group(1)))
1842 next.sparse = list(zip(offsets, numbytes))
    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
        """
        # "GNU.sparse.map" is a comma-separated list of alternating
        # offset/numbytes decimal values; pair them up.
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
1850 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1851 """Process a GNU tar extended sparse header, version 1.0.
1855 buf = tarfile.fileobj.read(BLOCKSIZE)
1856 fields, buf = buf.split(b"\n", 1)
1857 fields = int(fields)
1858 while len(sparse) < fields * 2:
1859 if b"\n" not in buf:
1860 buf += tarfile.fileobj.read(BLOCKSIZE)
1861 number, buf = buf.split(b"\n", 1)
1862 sparse.append(int(number))
1863 next.offset_data = tarfile.fileobj.tell()
1864 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1866 def _apply_pax_info(self, pax_headers, encoding, errors):
1867 """Replace fields with supplemental information from a previous
1868 pax extended or global header.
1870 for keyword, value in pax_headers.items():
1871 if keyword == "GNU.sparse.name":
1872 setattr(self, "path", value)
1873 elif keyword == "GNU.sparse.size":
1874 setattr(self, "size", int(value))
1875 elif keyword == "GNU.sparse.realsize":
1876 setattr(self, "size", int(value))
1877 elif keyword in PAX_FIELDS:
1878 if keyword in PAX_NUMBER_FIELDS:
1880 value = PAX_NUMBER_FIELDS[keyword](value)
1883 if keyword == "path":
1884 value = value.rstrip("/") # pylint: disable=no-member
1885 setattr(self, keyword, value)
1887 self.pax_headers = pax_headers.copy()
1889 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1890 """Decode a single field from a pax record.
1893 return value.decode(encoding, "strict")
1894 except UnicodeDecodeError:
1895 return value.decode(fallback_encoding, fallback_errors)
1897 def _block(self, count):
1898 """Round up a byte count by BLOCKSIZE and return it,
1899 e.g. _block(834) => 1024.
1901 blocks, remainder = divmod(count, BLOCKSIZE)
1904 return blocks * BLOCKSIZE
1907 return self.type in REGULAR_TYPES
1911 return self.type == DIRTYPE
1913 return self.type == SYMTYPE
1915 return self.type == LNKTYPE
1917 return self.type == CHRTYPE
1919 return self.type == BLKTYPE
1921 return self.type == FIFOTYPE
1923 return self.sparse is not None
1925 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1926 def ismultivol(self):
1927 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1928 "GNU.volume.offset" in self.pax_headers
1931 class TarFile(object):
1932 """The TarFile Class provides an interface to tar archives.
1935 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1937 dereference = False # If true, add content of linked file to the
1938 # tar file, else the link.
1940 ignore_zeros = False # If true, skips empty or invalid blocks and
1941 # continues processing.
1943 max_volume_size = None # If different from None, establishes maximum
1944 # size of tar volumes
1946 new_volume_handler = None # function handler to be executed before when
1947 # a new volume is needed
1949 volume_number = 0 # current volume number, used for multi volume
1952 errorlevel = 1 # If 0, fatal errors only appear in debug
1953 # messages (if debug >= 0). If > 0, errors
1954 # are passed to the caller as exceptions.
1956 format = DEFAULT_FORMAT # The format to use when creating an archive.
1958 encoding = ENCODING # Encoding for 8-bit character strings.
1960 errors = None # Error handler for unicode conversion.
1962 tarinfo = TarInfo # The default TarInfo class to use.
1964 fileobject = ExFileObject # The file-object for extractfile().
1966 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
1969 save_to_members = True # If new members are saved. This can be disabled
1970 # if you manage lots of files and don't want
1971 # to have high memory usage
1973 cache_uid2user = {} # cache to avoid getpwuid calls. It always parses /etc/passwd.
1974 cache_gid2group = {} # same cache for groups
1976 def __init__(self, name=None, mode="r", fileobj=None, format=None,
1977 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1978 errors="surrogateescape", pax_headers=None, debug=None,
1979 errorlevel=None, max_volume_size=None, new_volume_handler=None,
1980 concat=False, nacl=None,
1981 save_to_members=True):
1982 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1983 read from an existing archive, 'a' to append data to an existing
1984 file or 'w' to create a new file overwriting an existing one. `mode'
1986 If `fileobj' is given, it is used for reading or writing data. If it
1987 can be determined, `mode' is overridden by `fileobj's mode.
1988 `fileobj' is not closed, when TarFile is closed.
1990 if len(mode) > 1 or mode not in "raw":
1991 raise ValueError("mode must be 'r', 'a' or 'w'")
1993 self.arcmode = arcmode_set (concat)
1995 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
1998 if self.mode == "a" and not os.path.exists(name):
1999 # Create nonexistent files in append mode.
2002 fileobj = bltn_open(name, self._mode)
2003 self._extfileobj = False
2005 if name is None and hasattr(fileobj, "name"):
2007 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2008 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2009 self._mode = fileobj.mode
2010 self._extfileobj = True
2011 self.name = os.path.abspath(name) if name else None
2012 self.base_name = self.name = os.path.abspath(name) if name else None
2013 self.fileobj = fileobj
2016 if format is not None:
2017 self.format = format
2018 if tarinfo is not None:
2019 self.tarinfo = tarinfo
2020 if dereference is not None:
2021 self.dereference = dereference
2022 if ignore_zeros is not None:
2023 self.ignore_zeros = ignore_zeros
2024 if encoding is not None:
2025 self.encoding = encoding
2027 self.errors = errors
2029 if pax_headers is not None and self.format == PAX_FORMAT:
2030 self.pax_headers = pax_headers
2032 self.pax_headers = {}
2034 if debug is not None:
2036 if errorlevel is not None:
2037 self.errorlevel = errorlevel
2039 # Init datastructures.
2040 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2041 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2042 if max_volume_size and not callable(new_volume_handler):
2043 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2045 self.max_volume_size = int(max_volume_size)
2047 self.max_volume_size = None
2049 self.save_to_members = save_to_members
2050 self.new_volume_handler = new_volume_handler
2052 self.members = [] # list of members as TarInfo objects
2053 self._loaded = False # flag if all members have been read
2054 self.offset = self.fileobj.tell()
2055 # current position in the archive file
2056 self.inodes = {} # dictionary caching the inodes of
2057 # archive members already added
2060 if self.mode == "r":
2061 self.firstmember = None
2062 self.firstmember = self.next()
2064 if self.mode == "a":
2065 # Move to the end of the archive,
2066 # before the first empty block.
2068 self.fileobj.seek(self.offset)
2070 tarinfo = self.tarinfo.fromtarfile(self)
2071 self.members.append(tarinfo)
2072 except EOFHeaderError:
2073 self.fileobj.seek(self.offset)
2075 except HeaderError as e:
2076 raise ReadError(str(e))
2078 if self.mode in "aw":
2081 if self.pax_headers:
2082 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2083 self.fileobj.write(buf)
2084 self.offset += len(buf)
2086 if not self._extfileobj:
2087 self.fileobj.close()
2091 #--------------------------------------------------------------------------
2092 # Below are the classmethods which act as alternate constructors to the
2093 # TarFile class. The open() method is the only one that is needed for
2094 # public use; it is the "super"-constructor and is able to select an
2095 # adequate "sub"-constructor for a particular compression using the mapping
2098 # This concept allows one to subclass TarFile without losing the comfort of
2099 # the super-constructor. A sub-constructor is registered and made available
2100 # by adding it to the mapping in OPEN_METH.
2103 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2104 encryption=None, compresslevel=9, tolerant=False, **kwargs):
2105 """Open a tar archive for reading, writing or appending. Return
2106 an appropriate TarFile class.
2109 'r' or 'r:*' open for reading with transparent compression
2110 'r:' open for reading exclusively uncompressed
2111 'r:gz' open for reading with gzip compression
2112 'r:bz2' open for reading with bzip2 compression
2113 'r:xz' open for reading with lzma compression
2114 'a' or 'a:' open for appending, creating the file if necessary
2115 'w' or 'w:' open for writing without compression
2116 'w:gz' open for writing with gzip compression
2117 'w:bz2' open for writing with bzip2 compression
2118 'w:xz' open for writing with lzma compression
2120 'r|*' open a stream of tar blocks with transparent compression
2121 'r|' open an uncompressed stream of tar blocks for reading
2122 'r|gz' open a gzip compressed stream of tar blocks
2123 'r|bz2' open a bzip2 compressed stream of tar blocks
2124 'r|xz' open an lzma compressed stream of tar blocks
2125 'w|' open an uncompressed stream for writing
2126 'w|gz' open a gzip compressed stream for writing
2127 'w|bz2' open a bzip2 compressed stream for writing
2128 'w|xz' open an lzma compressed stream for writing
2130 'r#gz' open a stream of gzip compressed tar blocks for reading
2131 'w#gz' open a stream of gzip compressed tar blocks for writing
2133 if not name and not fileobj:
2134 raise ValueError("nothing to open")
2136 if mode in ("r", "r:*"):
2137 # Find out which *open() is appropriate for opening the file.
2138 for comptype in cls.OPEN_METH:
2139 func = getattr(cls, cls.OPEN_METH[comptype])
2140 if fileobj is not None:
2141 saved_pos = fileobj.tell()
2143 return func(name, "r", fileobj, **kwargs)
2144 except (ReadError, CompressionError) as e:
2145 # usually nothing exceptional but sometimes is
2146 if fileobj is not None:
2147 fileobj.seek(saved_pos)
2149 raise ReadError("file could not be opened successfully")
2152 filemode, comptype = mode.split(":", 1)
2153 filemode = filemode or "r"
2154 comptype = comptype or "tar"
2156 # Select the *open() function according to
2157 # given compression.
2158 if comptype in cls.OPEN_METH:
2159 func = getattr(cls, cls.OPEN_METH[comptype])
2161 raise CompressionError("unknown compression type %r" % comptype)
2163 # Pass on compression level for gzip / bzip2.
2164 if comptype == 'gz' or comptype == 'bz2':
2165 kwargs['compresslevel'] = compresslevel
2167 if 'max_volume_size' in kwargs:
2168 if comptype != 'tar' and filemode in 'wa' \
2169 and kwargs['max_volume_size']:
2171 warnings.warn('Only the first volume will be compressed '
2172 'for modes with "w:"!')
2174 return func(name, filemode, fileobj, **kwargs)
2177 filemode, comptype = mode.split("|", 1)
2178 filemode = filemode or "r"
2179 comptype = comptype or "tar"
2181 if filemode not in "rw":
2182 raise ValueError("mode must be 'r' or 'w'")
2184 t = cls(name, filemode,
2185 _Stream(name, filemode, comptype, fileobj, bufsize,
2186 compresslevel=compresslevel),
2188 t._extfileobj = False
2192 filemode, comptype = mode.split("#", 1)
2193 filemode = filemode or "r"
2195 if filemode not in "rw":
2196 raise ValueError ("mode %s not compatible with concat "
2197 "archive; must be 'r' or 'w'" % mode)
2199 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2200 concat=True, encryption=encryption,
2201 compresslevel=compresslevel, tolerant=tolerant)
2202 kwargs ["concat"] = True
2204 t = cls(name, filemode, stream, **kwargs)
2205 except: # XXX except what?
2207 raise # XXX raise what?
2208 t._extfileobj = False
2212 return cls.taropen(name, mode, fileobj, **kwargs)
2214 raise ValueError("undiscernible mode %r" % mode)
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        return cls(name, mode, fileobj, **kwargs)
2225 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2226 """Open gzip compressed tar archive name for reading or writing.
2227 Appending is not allowed.
2229 if len(mode) > 1 or mode not in "rw":
2230 raise ValueError("mode must be 'r' or 'w'")
2235 except (ImportError, AttributeError):
2236 raise CompressionError("gzip module is not available")
2238 extfileobj = fileobj is not None
2240 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2241 t = cls.taropen(name, mode, fileobj, **kwargs)
2243 if not extfileobj and fileobj is not None:
2247 raise ReadError("not a gzip file")
2249 if not extfileobj and fileobj is not None:
2252 t._extfileobj = extfileobj
2256 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2257 """Open bzip2 compressed tar archive name for reading or writing.
2258 Appending is not allowed.
2260 if len(mode) > 1 or mode not in "rw":
2261 raise ValueError("mode must be 'r' or 'w'.")
2266 raise CompressionError("bz2 module is not available")
2268 fileobj = bz2.BZ2File(fileobj or name, mode,
2269 compresslevel=compresslevel)
2272 t = cls.taropen(name, mode, fileobj, **kwargs)
2273 except (OSError, EOFError):
2275 raise ReadError("not a bzip2 file")
2276 t._extfileobj = False
2280 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2281 """Open lzma compressed tar archive name for reading or writing.
2282 Appending is not allowed.
2284 if mode not in ("r", "w"):
2285 raise ValueError("mode must be 'r' or 'w'")
2290 raise CompressionError("lzma module is not available")
2292 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2295 t = cls.taropen(name, mode, fileobj, **kwargs)
2296 except (lzma.LZMAError, EOFError):
2298 raise ReadError("not an lzma file")
2299 t._extfileobj = False
2302 # All *open() methods are registered here.
2304 "tar": "taropen", # uncompressed tar
2305 "gz": "gzopen", # gzip compressed tar
2306 "bz2": "bz2open", # bzip2 compressed tar
2307 "xz": "xzopen" # lzma compressed tar
2310 #--------------------------------------------------------------------------
2311 # The public methods which TarFile provides:
2314 """Close the TarFile. In write-mode, two finishing zero blocks are
2315 appended to the archive. A special case are empty archives which are
2316 initialized accordingly so the two mandatory blocks of zeros are
2317 written abiding by the requested encryption and compression settings.
2322 if self.mode in "aw":
2323 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2324 self.fileobj.next ("")
2325 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2326 self.offset += (BLOCKSIZE * 2)
2327 # fill up the end with zero-blocks
2328 # (like option -b20 for tar does)
2329 blocks, remainder = divmod(self.offset, RECORDSIZE)
2331 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2332 if not self._extfileobj:
2333 self.fileobj.close()
# NOTE(review): lossy excerpt of four member-lookup accessors; closing
# docstring quotes and some return/raise lines are missing (numbers skip).
2336 def getmember(self, name):
2337 """Return a TarInfo object for member `name'. If `name' can not be
2338 found in the archive, KeyError is raised. If a member occurs more
2339 than once in the archive, its last occurrence is assumed to be the
2340 most up-to-date version.
2342 tarinfo = self._getmember(name)
# Raised when _getmember found nothing (the `if tarinfo is None:` guard is
# presumably on the missing line above — TODO confirm).
2344 raise KeyError("filename %r not found" % name)
2347 def getmembers(self):
2348 """Return the members of the archive as a list of TarInfo objects. The
2349 list has the same order as the members in the archive.
2352 if not self._loaded: # if we want to obtain a list of
2353 self._load() # all members, we first have to
2354 # scan the whole archive.
2357 def get_last_member_offset(self):
2358 """Return the last member offset. Usually this is self.fileobj.tell(),
2359 but when there's encryption or concat compression going on it's more
2360 complicated than that.
2362 return self.last_block_offset
# NOTE(review): the `def getnames(self):` header for the lines below is
# missing from this excerpt.
2365 """Return the members of the archive as a list of their names. It has
2366 the same order as the list returned by getmembers().
2368 return [tarinfo.name for tarinfo in self.getmembers()]
# NOTE(review): lossy excerpt of gettarinfo(); branch partners (`else:`,
# `try:`/`except`) and type assignments sit on missing lines — hedged below.
2370 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2371 """Create a TarInfo object for either the file `name' or the file
2372 object `fileobj' (using os.fstat on its file descriptor). You can
2373 modify some of the TarInfo's attributes before you add it using
2374 addfile(). If given, `arcname' specifies an alternative name for the
2375 file in the archive.
2379 # When fileobj is given, replace name by
2380 # fileobj's real name.
2381 if fileobj is not None:
2384 # Building the name of the member in the archive.
2385 # Backward slashes are converted to forward slashes,
2386 # Absolute paths are turned to relative paths.
2389 drv, arcname = os.path.splitdrive(arcname)
2390 arcname = arcname.replace(os.sep, "/")
2391 arcname = arcname.lstrip("/")
2393 # Now, fill the TarInfo object with
2394 # information specific for the file.
2395 tarinfo = self.tarinfo()
2396 tarinfo.tarfile = self
2398 # Use os.stat or os.lstat, depending on platform
2399 # and if symlinks shall be resolved.
# lstat keeps symlinks as symlinks; stat (on the missing else-branch) follows
# them when dereference is requested.
2401 if hasattr(os, "lstat") and not self.dereference:
2402 statres = os.lstat(name)
2404 statres = os.stat(name)
# fileobj path: stat the open descriptor instead of a path.
2406 statres = os.fstat(fileobj.fileno())
2409 stmd = statres.st_mode
2410 if stat.S_ISREG(stmd):
2411 inode = (statres.st_ino, statres.st_dev)
# A second-or-later sighting of a multi-linked inode is stored as a hardlink
# to the first archived name rather than duplicating the data.
2412 if not self.dereference and statres.st_nlink > 1 and \
2413 inode in self.inodes and arcname != self.inodes[inode]:
2414 # Is it a hardlink to an already
2417 linkname = self.inodes[inode]
2419 # The inode is added only if its valid.
2420 # For win32 it is always 0.
2422 if inode[0] and self.save_to_members:
2423 self.inodes[inode] = arcname
# NOTE(review): the `type = ...` assignments for each S_IS* branch are on
# missing lines — TODO confirm against the full source.
2424 elif stat.S_ISDIR(stmd):
2426 elif stat.S_ISFIFO(stmd):
2428 elif stat.S_ISLNK(stmd):
2430 linkname = os.readlink(name)
2431 elif stat.S_ISCHR(stmd):
2433 elif stat.S_ISBLK(stmd):
2438 # Fill the TarInfo object with all
2439 # information we can get.
2440 tarinfo.name = arcname
2442 tarinfo.uid = statres.st_uid
2443 tarinfo.gid = statres.st_gid
2445 tarinfo.size = statres.st_size
2448 tarinfo.mtime = statres.st_mtime
2450 tarinfo.linkname = linkname
# uid->user-name lookups are memoized in cache_uid2user to avoid repeated
# pwd database hits; a failed lookup is remembered as "" (tarinfo default).
2452 if tarinfo.uid in self.cache_uid2user:
2453 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2456 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2457 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2459 # remember user does not exist:
2460 # same default value as in tarinfo class
2461 self.cache_uid2user[tarinfo.uid] = ""
# Same memoization scheme for gid -> group name via grp.
2463 if tarinfo.gid in self.cache_gid2group:
2464 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2467 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2468 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2470 # remember group does not exist:
2471 # same default value as in tarinfo class
2472 self.cache_gid2group[tarinfo.gid] = ""
# Device nodes additionally record major/minor numbers when os supports it.
2474 if type in (CHRTYPE, BLKTYPE):
2475 if hasattr(os, "major") and hasattr(os, "minor"):
2476 tarinfo.devmajor = os.major(statres.st_rdev)
2477 tarinfo.devminor = os.minor(statres.st_rdev)
# NOTE(review): lossy excerpt of list(); the `if verbose:` guards and some
# branch lines are missing (line numbers skip).
2480 def list(self, verbose=True):
2481 """Print a table of contents to sys.stdout. If `verbose' is False, only
2482 the names of the members are printed. If it is True, an `ls -l'-like
2487 for tarinfo in self:
# Verbose columns: mode string, owner/group (names fall back to numeric ids),
# size (or "major,minor" for device nodes), timestamp, then the name.
2489 print(stat.filemode(tarinfo.mode), end=' ')
2490 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2491 tarinfo.gname or tarinfo.gid), end=' ')
2492 if tarinfo.ischr() or tarinfo.isblk():
2493 print("%10s" % ("%d,%d" \
2494 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2496 print("%10d" % tarinfo.size, end=' ')
2497 print("%d-%02d-%02d %02d:%02d:%02d" \
2498 % time.localtime(tarinfo.mtime)[:6], end=' ')
# Directories are printed with a trailing slash; links show their target.
2500 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2504 print("->", tarinfo.linkname, end=' ')
2506 print("link to", tarinfo.linkname, end=' ')
# NOTE(review): lossy excerpt of add(); several `return` lines and branch
# partners are missing (line numbers skip).
2509 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2510 """Add the file `name' to the archive. `name' may be any type of file
2511 (directory, fifo, symbolic link, etc.). If given, `arcname'
2512 specifies an alternative name for the file in the archive.
2513 Directories are added recursively by default. This can be avoided by
2514 setting `recursive' to False. `exclude' is a function that should
2515 return True for each filename to be excluded. `filter' is a function
2516 that expects a TarInfo object argument and returns the changed
2517 TarInfo object, if it returns None the TarInfo object will be
2518 excluded from the archive.
# `exclude` is deprecated in favor of `filter` — a DeprecationWarning is
# emitted when it is supplied.
2525 # Exclude pathnames.
2526 if exclude is not None:
2528 warnings.warn("use the filter argument instead",
2529 DeprecationWarning, 2)
2531 self._dbg(2, "tarfile: Excluded %r" % name)
2534 # Skip if somebody tries to archive the archive...
2535 if self.name is not None and os.path.abspath(name) == self.name:
2536 self._dbg(2, "tarfile: Skipped %r" % name)
2541 # Create a TarInfo object from the file.
2542 tarinfo = self.gettarinfo(name, arcname)
# Presumably guarded by `if tarinfo is None:` on a missing line — TODO confirm.
2545 self._dbg(1, "tarfile: Unsupported type %r" % name)
2548 # Change or exclude the TarInfo object.
2549 if filter is not None:
2550 tarinfo = filter(tarinfo)
2552 self._dbg(2, "tarfile: Excluded %r" % name)
2555 # Append the tar header and data to the archive.
# Regular files stream their data from an opened file object; directories
# recurse over their entries when `recursive` is true; everything else is
# added header-only.
2557 with bltn_open(name, "rb") as f:
2558 self.addfile(tarinfo, f)
2560 elif tarinfo.isdir():
2561 self.addfile(tarinfo)
2563 for f in os.listdir(name):
2564 self.add(os.path.join(name, f), os.path.join(arcname, f),
2565 recursive, exclude, filter=filter)
2568 self.addfile(tarinfo)
# NOTE(review): lossy excerpt — closing docstring quotes and part of the
# arithmetic in _size_left_stream are on missing lines.
2570 def _size_left_file(self):
2571 """Calculates size left in a volume with a maximum volume size.
2573 Assumes self.max_volume_size is set.
2574 If using compression through a _Stream, use _size_left_stream instead
2576 # left-over size = max_size - offset - 2 zero-blocks written in close
2577 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2578 # limit size left to a discrete number of blocks, because we won't
2579 # write only half a block when writting the end of a volume
2580 # and filling with zeros
# Round down to a whole number of BLOCKSIZE blocks.
2581 return BLOCKSIZE * (size_left // BLOCKSIZE)
2583 def _size_left_stream(self):
2584 """ Calculates size left in a volume if using comression/encryption
2586 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2587 (otherwise use _size_left_file)
2589 # left-over size = max_size - bytes written - 2 zero-blocks (close)
# NOTE(review): the continuation of this expression (presumably subtracting
# the two closing zero-blocks) is on a missing line — TODO confirm.
2590 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2592 return BLOCKSIZE * (size_left // BLOCKSIZE)
# NOTE(review): lossy excerpt of addfile(), the multivolume write core; guard
# lines (`if fileobj is None:` etc.) and branch partners are missing.
2594 def addfile(self, tarinfo, fileobj=None):
2595 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2596 given, tarinfo.size bytes are read from it and added to the archive.
2597 You can create TarInfo objects using gettarinfo().
2598 On Windows platforms, `fileobj' should always be opened with mode
2599 'rb' to avoid irritation about the file size.
# Work on a copy so the caller's TarInfo is never mutated.
2603 tarinfo = copy.copy(tarinfo)
# Concat mode starts a fresh compressed object per member and records where
# it begins; otherwise the current stream position is the member offset.
2605 if self.arcmode & ARCMODE_CONCAT:
2606 self.last_block_offset = self.fileobj.next (tarinfo.name)
2608 self.last_block_offset = self.fileobj.tell()
# Emit the member header.
2610 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2611 self.fileobj.write(buf)
2612 self.offset += len(buf)
# Pick the size-left estimator: stream-aware when compressing/encrypting,
# plain file arithmetic otherwise, and "no limit" when max_volume_size unset.
2614 if self.max_volume_size:
2615 if isinstance(self.fileobj, _Stream):
2616 _size_left = self._size_left_stream
2618 _size_left = self._size_left_file
2620 _size_left = lambda: tarinfo.size
2622 # If there's no data to follow, finish
2624 if self.save_to_members:
2625 self.members.append(tarinfo)
2628 target_size_left = _size_left()
2629 source_size_left = tarinfo.size
2630 assert tarinfo.volume_offset == 0
2632 # we only split volumes in the middle of a file, that means we have
2633 # to write at least one block
2634 if target_size_left < BLOCKSIZE:
2635 target_size_left = BLOCKSIZE
2637 # loop over multiple volumes
2638 while source_size_left > 0:
2640 # Write as much data as possble from source into target.
2641 # When compressing data, we cannot easily predict how much data we
2642 # can write until target_size_left == 0 --> need to iterate
2643 size_can_write = min(target_size_left, source_size_left)
2645 while size_can_write > 0:
2646 copyfileobj(fileobj, self.fileobj, size_can_write)
2647 self.offset += size_can_write
2648 source_size_left -= size_can_write
2649 target_size_left = _size_left()
2650 size_can_write = min(target_size_left, source_size_left)
2652 # now target_size_left == 0 or source_size_left == 0
2654 # if there is data left to write, we need to create a new volume
2655 if source_size_left > 0:
2656 # Only finalize the crypto entry here if we’re continuing with
2657 # another one; otherwise, the encryption must include the block
# Mark the member as continued-in-next-volume.
2659 tarinfo.type = GNUTYPE_MULTIVOL
2661 if not self.new_volume_handler or\
2662 not callable(self.new_volume_handler):
2663 raise Exception("We need to create a new volume and you "
2664 "didn't supply a new_volume_handler")
2667 # the new volume handler should do everything needed to
2668 # start working in a new volume. usually, the handler calls
2669 # to self.open_volume
2670 self.volume_number += 1
2672 # set to be used by open_volume, because in the case of a PAX
2673 # tar it needs to write information about the volume and offset
2674 # in the global header
2675 tarinfo.volume_offset = tarinfo.size - source_size_left
2676 self.volume_tarinfo = tarinfo
2678 # the “new_volume_handler” is supposed to call .close() on the
2680 self.new_volume_handler(self, self.base_name, self.volume_number)
2682 self.volume_tarinfo = None
2684 if self.arcmode & ARCMODE_CONCAT:
2685 self.fileobj.next_volume (tarinfo.name)
2687 # write new volume header
2688 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2689 self.fileobj.write(buf)
2690 self.offset += len(buf)
2692 # adjust variables; open_volume should have reset self.offset
2693 # --> _size_left should be big again
2694 target_size_left = _size_left()
2695 size_can_write = min(target_size_left, source_size_left)
2696 self._dbg(3, 'new volume')
2698 # now, all data has been written. We may have to fill up the rest of
2699 # the block in target with 0s
# NOTE(review): presumably guarded by `if remainder > 0:` on a missing line
# — TODO confirm against the full source.
2700 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2702 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2703 self.offset += BLOCKSIZE - remainder
2705 if self.save_to_members:
2706 self.members.append(tarinfo)
# NOTE(review): lossy excerpt of open_volume(); `try:` headers, `else:`
# branches and a trailing `raise` are on missing lines.
2708 def open_volume(self, name="", fileobj=None, encryption=None):
2710 Called by the user to change this tar file to point to a new volume.
2712 # open the file using either fileobj or name
# NOTE(review): presumably only taken when no external fileobj was passed —
# the `if fileobj is None:` guard is on a missing line; TODO confirm.
2714 if self.mode == "a" and not os.path.exists(name):
2715 # Create nonexistent files in append mode.
2718 self._extfileobj = False
# Re-create the same kind of wrapper the previous volume used: a _Stream
# carrying over mode/comptype/bufsize/encryption/concat settings, or a plain
# built-in open (which loses compression/encryption state).
2720 if isinstance(self.fileobj, _Stream):
2721 self._dbg(3, 'open_volume: create a _Stream')
2722 fileobj = _Stream(name=name,
2723 mode=self.fileobj.mode,
2724 comptype=self.fileobj.comptype,
2726 bufsize=self.fileobj.bufsize,
2727 encryption=encryption or self.fileobj.encryption,
2728 concat=self.fileobj.arcmode & ARCMODE_CONCAT)
2730 # here, we lose information about compression/encryption!
2731 self._dbg(3, 'open_volume: builtin open')
2732 fileobj = bltn_open(name, self._mode)
# External fileobj path: adopt its name/mode and mark it as caller-owned.
2734 if name is None and hasattr(fileobj, "name"):
2736 if hasattr(fileobj, "mode"):
2737 self._mode = fileobj.mode
2738 self._extfileobj = True
2739 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2740 self.name = os.path.abspath(name) if name else None
2741 self.fileobj = fileobj
2743 # init data structures
2745 self.members = [] # list of members as TarInfo objects
2746 self._loaded = False # flag if all members have been read
2747 self.offset = self.fileobj.tell()
2748 # current position in the archive file
2749 self.inodes = {} # dictionary caching the inodes of
2750 # archive members already added
2753 if self.mode == "r":
2754 self.firstmember = None
2755 self.firstmember = self.next()
2757 if self.mode == "a":
2758 # Move to the end of the archive,
2759 # before the first empty block.
2761 self.fileobj.seek(self.offset)
# Read headers until EOF to position after the last member (loop/try lines
# are missing from this excerpt).
2763 tarinfo = self.tarinfo.fromtarfile(self)
2764 self.members.append(tarinfo)
2765 except EOFHeaderError:
2766 self.fileobj.seek(self.offset)
2768 except HeaderError as e:
2769 raise ReadError(str(e))
2771 if self.mode in "aw":
# For PAX multivolume continuation, announce the split member via
# GNU.volume.* records in a global header on the new volume.
2774 if self.format == PAX_FORMAT:
2776 "GNU.volume.filename": str(self.volume_tarinfo.name),
2777 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2778 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2781 self.pax_headers.update(volume_info)
2783 if isinstance(self.fileobj, _Stream):
2784 self.fileobj._init_write_gz ()
2785 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2786 self.fileobj.write(buf)
2787 self.offset += len(buf)
# On any failure, close a file object we own before propagating.
2788 except Exception as exn:
2789 if not self._extfileobj:
2790 self.fileobj.close()
# NOTE(review): lossy excerpt of extractall(); the members-default handling
# and some `continue`/`try` lines are missing.
2794 def extractall(self, path=".", members=None, filter=None):
2795 """Extract all members from the archive to the current working
2796 directory and set owner, modification time and permissions on
2797 directories afterwards. `path' specifies a different directory
2798 to extract to. `members' is optional and must be a subset of the
2799 list returned by getmembers().
2806 for tarinfo in members:
# Skip continuation fragments of multivolume members on later volumes —
# presumably paired with a `continue` on a missing line; TODO confirm.
2807 if self.volume_number > 0 and tarinfo.ismultivol():
2810 if filter and not filter(tarinfo):
2814 # Extract directories with a safe mode.
2815 directories.append(tarinfo)
2816 tarinfo = copy.copy(tarinfo)
2817 tarinfo.mode = 0o0700
2818 # Do not set_attrs directories, as we will do that further down
2819 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
2821 # Reverse sort directories.
# Deepest-first ordering so child attrs are set before parents lock down.
2822 directories.sort(key=lambda a: a.name)
2823 directories.reverse()
2825 # Set correct owner, mtime and filemode on directories.
2826 for tarinfo in directories:
2827 dirpath = os.path.join(path, tarinfo.name)
2829 self.chown(tarinfo, dirpath)
2830 self.utime(tarinfo, dirpath)
2831 self.chmod(tarinfo, dirpath)
# errorlevel > 1 escalates ExtractError (re-raise on a missing line);
# otherwise it is only logged.
2832 except ExtractError as e:
2833 if self.errorlevel > 1:
2836 self._dbg(1, "tarfile: %s" % e)
# NOTE(review): lossy excerpt of extract(); the else-branch assigning tarinfo
# from a TarInfo argument and some re-raise lines are missing.
2838 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
2839 """Extract a member from the archive to the current working directory,
2840 using its full name. Its file information is extracted as accurately
2841 as possible. `member' may be a filename or a TarInfo object. You can
2842 specify a different directory using `path'. File attributes (owner,
2843 mtime, mode) are set unless `set_attrs' is False.
2844 ``symlink_cb`` is a hook accepting a function that is passed the
2845 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2846 ``member`` indicates a symlink in which case only the callback
2847 passed will be applied, skipping the actual extraction. In case the
2848 callback is invoked, its return value is passed on to the caller.
2852 if isinstance(member, str):
2853 tarinfo = self.getmember(member)
2857 # Prepare the link target for makelink().
2859 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
# Symlink hook short-circuits actual extraction.
2861 if symlink_cb is not None and tarinfo.issym():
2862 return symlink_cb(member, path, set_attrs)
2865 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2866 set_attrs=set_attrs)
# OS errors: errorlevel > 0 re-raises (on a missing line); otherwise log,
# with or without a filename depending on what the exception carries.
2867 except EnvironmentError as e:
2868 if self.errorlevel > 0:
2871 if e.filename is None:
2872 self._dbg(1, "tarfile: %s" % e.strerror)
2874 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2875 except ExtractError as e:
2876 if self.errorlevel > 1:
2879 self._dbg(1, "tarfile: %s" % e)
# NOTE(review): lossy excerpt of extractfile(); the TarInfo-argument branch
# and the final `return None` line are missing.
2881 def extractfile(self, member):
2882 """Extract a member from the archive as a file object. `member' may be
2883 a filename or a TarInfo object. If `member' is a regular file or a
2884 link, an io.BufferedReader object is returned. Otherwise, None is
2889 if isinstance(member, str):
2890 tarinfo = self.getmember(member)
# Regular, multivolume, and unknown-typed members are all served through the
# configured fileobject wrapper.
2894 if tarinfo.isreg() or tarinfo.ismultivol() or\
2895 tarinfo.type not in SUPPORTED_TYPES:
2896 # If a member's type is unknown, it is treated as a
2898 return self.fileobject(self, tarinfo)
2900 elif tarinfo.islnk() or tarinfo.issym():
2901 if isinstance(self.fileobj, _Stream):
2902 # A small but ugly workaround for the case that someone tries
2903 # to extract a (sym)link as a file-object from a non-seekable
2904 # stream of tar blocks.
2905 raise StreamError("cannot extract (sym)link as file object")
2907 # A (sym)link's file object is its target's file object.
2908 return self.extractfile(self._find_link_target(tarinfo))
2910 # If there's no data associated with the member (directory, chrdev,
2911 # blkdev, etc.), return None instead of a file object.
# NOTE(review): lossy excerpt of _extract_member(); the dispatch's leading
# `if tarinfo.isreg():` and the `if set_attrs:` guard are on missing lines.
2914 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2915 """Extract the TarInfo object tarinfo to a physical
2916 file called targetpath.
2918 # Fetch the TarInfo object for the given name
2919 # and build the destination pathname, replacing
2920 # forward slashes to platform specific separators.
2921 targetpath = targetpath.rstrip("/")
2922 targetpath = targetpath.replace("/", os.sep)
2924 # Create all upper directories.
2925 upperdirs = os.path.dirname(targetpath)
2926 if upperdirs and not os.path.exists(upperdirs):
2927 # Create directories that are not part of the archive with
2928 # default permissions.
2929 os.makedirs(upperdirs)
# Debug trace: links print "name -> target", everything else just the name.
2931 if tarinfo.islnk() or tarinfo.issym():
2932 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2934 self._dbg(1, tarinfo.name)
# Type dispatch to the overridable make*() factory methods; unknown types
# fall back to regular-file extraction.
2937 self.makefile(tarinfo, targetpath)
2938 elif tarinfo.isdir():
2939 self.makedir(tarinfo, targetpath)
2940 elif tarinfo.isfifo():
2941 self.makefifo(tarinfo, targetpath)
2942 elif tarinfo.ischr() or tarinfo.isblk():
2943 self.makedev(tarinfo, targetpath)
2944 elif tarinfo.islnk() or tarinfo.issym():
2945 self.makelink(tarinfo, targetpath)
2946 elif tarinfo.type not in SUPPORTED_TYPES:
2947 self.makeunknown(tarinfo, targetpath)
2949 self.makefile(tarinfo, targetpath)
# Attribute restoration (chmod skipped for symlinks).
2952 self.chown(tarinfo, targetpath)
2953 if not tarinfo.issym():
2954 self.chmod(tarinfo, targetpath)
2955 self.utime(tarinfo, targetpath)
# NOTE(review): lossy excerpt of the overridable make*() extraction helpers;
# `try:` headers, `else:` branches and some assignments are on missing lines.
2957 #--------------------------------------------------------------------------
2958 # Below are the different file methods. They are called via
2959 # _extract_member() when extract() is called. They can be replaced in a
2960 # subclass to implement other functionality.
2962 def makedir(self, tarinfo, targetpath):
2963 """Make a directory called targetpath.
2966 # Use a safe mode for the directory, the real mode is set
2967 # later in _extract_member().
2968 os.mkdir(targetpath, 0o0700)
# Pre-existing directory is not an error.
2969 except FileExistsError:
2972 def makefile(self, tarinfo, targetpath):
2973 """Make a file called targetpath.
2975 source = self.fileobj
2976 source.seek(tarinfo.offset_data)
2979 target = bltn_open(targetpath, "wb")
# Sparse members copy only the data regions, then seek to the logical size.
2981 if tarinfo.sparse is not None:
2983 for offset, size in tarinfo.sparse:
2985 copyfileobj(source, target, size)
2986 target.seek(tarinfo.size)
2995 copyfileobj(source, target, tarinfo.size)
2998 # only if we are extracting a multivolume this can be treated
2999 if not self.new_volume_handler:
3001 raise Exception("We need to read a new volume and you"
3002 " didn't supply a new_volume_handler")
3004 # the new volume handler should do everything needed to
3005 # start working in a new volume. usually, the handler calls
3006 # to self.open_volume
3007 self.volume_number += 1
3008 self.new_volume_handler(self, self.base_name, self.volume_number)
# Continue copying from the continuation member on the new volume.
3009 tarinfo = self.firstmember
3010 source = self.fileobj
3015 def makeunknown(self, tarinfo, targetpath):
3016 """Make a file from a TarInfo object with an unknown type
3019 self.makefile(tarinfo, targetpath)
3020 self._dbg(1, "tarfile: Unknown file type %r, " \
3021 "extracted as regular file." % tarinfo.type)
3023 def makefifo(self, tarinfo, targetpath):
3024 """Make a fifo called targetpath.
3026 if hasattr(os, "mkfifo"):
3027 os.mkfifo(targetpath)
3029 raise ExtractError("fifo not supported by system")
3031 def makedev(self, tarinfo, targetpath):
3032 """Make a character or block device called targetpath.
3034 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3035 raise ExtractError("special devices not supported by system")
# `mode` is presumably initialized from tarinfo.mode on a missing line —
# TODO confirm; S_IFBLK/S_IFCHR is OR-ed in per member type.
3039 mode |= stat.S_IFBLK
3041 mode |= stat.S_IFCHR
3043 os.mknod(targetpath, mode,
3044 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3046 def makelink(self, tarinfo, targetpath):
3047 """Make a (symbolic) link called targetpath. If it cannot be created
3048 (platform limitation), we try to make a copy of the referenced file
3052 # For systems that support symbolic and hard links.
3054 os.symlink(tarinfo.linkname, targetpath)
3057 if os.path.exists(tarinfo._link_target):
3058 os.link(tarinfo._link_target, targetpath)
3060 self._extract_member(self._find_link_target(tarinfo),
# Fallback when symlink creation is unsupported: extract the link target's
# content in place of the link.
3062 except symlink_exception:
3064 self._extract_member(self._find_link_target(tarinfo),
3067 raise ExtractError("unable to resolve link inside archive")
# NOTE(review): lossy excerpt of the attribute-restoration helpers; `try:`
# headers and KeyError fallbacks to numeric ids are on missing lines.
3069 def chown(self, tarinfo, targetpath):
3070 """Set owner of targetpath according to tarinfo.
# Only attempted when running as root; names are resolved via grp/pwd
# (numeric-id fallbacks presumably live on the missing lines — TODO confirm).
3072 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3073 # We have to be root to do so.
3075 g = grp.getgrnam(tarinfo.gname)[2]
3079 u = pwd.getpwnam(tarinfo.uname)[2]
# lchown for symlinks where available, so the link itself is chowned.
3083 if tarinfo.issym() and hasattr(os, "lchown"):
3084 os.lchown(targetpath, u, g)
3086 os.chown(targetpath, u, g)
3087 except OSError as e:
3088 raise ExtractError("could not change owner")
3090 def chmod(self, tarinfo, targetpath):
3091 """Set file permissions of targetpath according to tarinfo.
3093 if hasattr(os, 'chmod'):
3095 os.chmod(targetpath, tarinfo.mode)
3096 except OSError as e:
3097 raise ExtractError("could not change mode")
3099 def utime(self, tarinfo, targetpath):
3100 """Set modification time of targetpath according to tarinfo.
3102 if not hasattr(os, 'utime'):
3105 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3106 except OSError as e:
3107 raise ExtractError("could not change modification time")
# NOTE(review): lossy excerpt of TarFile.next(); the `def next(self):` header,
# the read loop structure, and the final `return` lines are missing.
3109 #--------------------------------------------------------------------------
3111 """Return the next member of the archive as a TarInfo object, when
3112 TarFile is opened for reading. Return None if there is no more
# A member pre-read by open_volume is handed out first.
3116 if self.firstmember is not None:
3117 m = self.firstmember
3118 self.firstmember = None
3121 # Read the next block.
3122 self.fileobj.seek(self.offset)
3126 tarinfo = self.tarinfo.fromtarfile(self)
# Header-error handling: with ignore_zeros, bad/empty blocks are logged and
# skipped one BLOCKSIZE at a time; otherwise errors at offset 0 become
# ReadError, and subsequent-header errors always do.
3127 except EOFHeaderError as e:
3128 if self.ignore_zeros:
3129 self._dbg(2, "0x%X: %s" % (self.offset, e))
3130 self.offset += BLOCKSIZE
3132 except InvalidHeaderError as e:
3133 if self.ignore_zeros:
3134 self._dbg(2, "0x%X: %s" % (self.offset, e))
3135 self.offset += BLOCKSIZE
3137 elif self.offset == 0:
3138 raise ReadError(str(e))
3139 except EmptyHeaderError:
3140 if self.offset == 0:
3141 raise ReadError("empty file")
3142 except TruncatedHeaderError as e:
3143 if self.offset == 0:
3144 raise ReadError(str(e))
3145 except SubsequentHeaderError as e:
3146 raise ReadError(str(e))
3149 if tarinfo is not None:
3150 if self.save_to_members:
3151 self.members.append(tarinfo)
# NOTE(review): lossy excerpt of the little helper methods; several def
# headers (_load, __iter__), `else:` branches and `return`s are missing.
3157 #--------------------------------------------------------------------------
3158 # Little helper methods:
3160 def _getmember(self, name, tarinfo=None, normalize=False):
3161 """Find an archive member by name from bottom to top.
3162 If tarinfo is given, it is used as the starting point.
3164 # Ensure that all members have been loaded.
3165 members = self.getmembers()
3167 # Limit the member search list up to tarinfo.
3168 if tarinfo is not None:
3169 members = members[:members.index(tarinfo)]
3172 name = os.path.normpath(name)
# Search newest-first so the last occurrence of a name wins.
3174 for member in reversed(members):
3176 member_name = os.path.normpath(member.name)
3178 member_name = member.name
3180 if name == member_name:
# NOTE(review): `def _load(self):` header is missing before the lines below.
3184 """Read through the entire archive file and look for readable
3188 tarinfo = self.next()
3193 def _check(self, mode=None):
3194 """Check if TarFile is still open, and if the operation's mode
3195 corresponds to TarFile's mode.
3198 raise OSError("%s is closed" % self.__class__.__name__)
3199 if mode is not None and self.mode not in mode:
3200 raise OSError("bad operation for mode %r" % self.mode)
3202 def _find_link_target(self, tarinfo):
3203 """Find the target member of a symlink or hardlink member in the
# Symlinks search the whole archive with a path relative to the link's
# directory; hardlinks only search members archived before the link.
3207 # Always search the entire archive.
3208 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3211 # Search the archive before the link, because a hard link is
3212 # just a reference to an already archived file.
3213 linkname = tarinfo.linkname
3216 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3218 raise KeyError("linkname %r not found" % linkname)
# NOTE(review): `def __iter__(self):` header is missing before the lines
# below; loaded archives iterate the cached members list, otherwise a
# streaming TarIter is returned.
3222 """Provide an iterator object.
3225 return iter(self.members)
3227 return TarIter(self)
# NOTE(review): lossy excerpt; `return self` (for __enter__) and the
# success-path close() call (for __exit__) are on missing lines.
3229 def _dbg(self, level, msg, *args):
3230 """Write debugging output to sys.stderr.
# Uses str.format-style args, unlike the %-formatting used by callers that
# pre-format their message.
3232 if level <= self.debug:
3233 print(msg.format(*args), file=sys.stderr)
3235 def __enter__(self):
3239 def __exit__(self, type, value, traceback):
3243 # An exception occurred. We must not call close() because
3244 # it would try to write end-of-archive blocks and padding.
3245 if not self._extfileobj:
3246 self.fileobj.close()
# NOTE(review): lossy excerpt of the TarIter iterator class; the `class
# TarIter:` header, `__iter__`/`__next__` def lines, index bookkeeping and
# the StopIteration raise are on missing lines.
3253 for tarinfo in TarFile(...):
3257 def __init__(self, tarfile):
3258 """Construct a TarIter object.
3260 self.tarfile = tarfile
3263 """Return iterator object.
3267 """Return the next item using TarFile's next() method.
3268 When all members have been read, set TarFile as _loaded.
3270 # Fix for SF #1100429: Under rare circumstances it can
3271 # happen that getmembers() is called during iteration,
3272 # which will cause TarIter to stop prematurely.
# Serve cached members by index when available; otherwise pull the next one
# from the TarFile, marking it fully loaded at end-of-archive.
3274 if self.index == 0 and self.tarfile.firstmember is not None:
3275 tarinfo = self.tarfile.next()
3276 elif self.index < len(self.tarfile.members):
3277 tarinfo = self.tarfile.members[self.index]
3278 elif not self.tarfile._loaded:
3279 tarinfo = self.tarfile.next()
3281 self.tarfile._loaded = True
3290 #--------------------
3291 # exported functions
3292 #--------------------
3293 def is_tarfile(name):
3294 """Return True if name points to a tar archive that we
3295 are able to handle, else return False.