2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
57 import traceback # XXX
66 # os.symlink on Windows prior to 6.0 raises NotImplementedError
67 symlink_exception = (AttributeError, NotImplementedError)
69 # OSError (winerror=1314) will be raised if the caller does not hold the
70 # SeCreateSymbolicLinkPrivilege privilege
71 symlink_exception += (OSError,)
75 # from tarfile import *
76 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
78 from builtins import open as _open # Since 'open' is TarFile.open
80 #---------------------------------------------------------
82 #---------------------------------------------------------
83 NUL = b"\0" # the null character
84 BLOCKSIZE = 512 # length of processing blocks
85 RECORDSIZE = BLOCKSIZE * 20 # length of records
86 GNU_MAGIC = b"ustar \0" # magic gnu tar string
87 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
89 LENGTH_NAME = 100 # maximum length of a filename
90 LENGTH_LINK = 100 # maximum length of a linkname
91 LENGTH_PREFIX = 155 # maximum length of the prefix field
93 REGTYPE = b"0" # regular file
94 AREGTYPE = b"\0" # regular file
95 LNKTYPE = b"1" # link (inside tarfile)
96 SYMTYPE = b"2" # symbolic link
97 CHRTYPE = b"3" # character special device
98 BLKTYPE = b"4" # block special device
99 DIRTYPE = b"5" # directory
100 FIFOTYPE = b"6" # fifo special device
101 CONTTYPE = b"7" # contiguous file
103 GNUTYPE_LONGNAME = b"L" # GNU tar longname
104 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
105 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
106 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
109 XHDTYPE = b"x" # POSIX.1-2001 extended header
110 XGLTYPE = b"g" # POSIX.1-2001 global header
111 SOLARIS_XHDTYPE = b"X" # Solaris extended header
113 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
114 GNU_FORMAT = 1 # GNU tar format
115 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
116 DEFAULT_FORMAT = GNU_FORMAT
118 GZ_FMT_HEADER = b"<BBBBLBB"
119 GZ_HEADER_SIZE = 10 # not including the name
120 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
121 GZ_METHOD_DEFLATE = 0x08 # 0o10
122 GZ_FLAG_ORIG_NAME = 0x08 # 0o10, default in gzip
123 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
124 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
125 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
126 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
129 #---------------------------------------------------------
130 # archive handling mode
131 #---------------------------------------------------------
134 ARCMODE_ENCRYPT = 1 << 0
135 ARCMODE_COMPRESS = 1 << 1
136 ARCMODE_CONCAT = 1 << 2
139 if m == ARCMODE_PLAIN:
143 def chkappend (b, s):
148 if first is True: first = False
151 chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
152 chkappend (ARCMODE_COMPRESS, "COMPRESS")
153 chkappend (ARCMODE_CONCAT, "CONCAT")
157 def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
159 if bool (concat) is True:
160 ret |= ARCMODE_CONCAT
161 if encryption is not None:
162 ret |= ARCMODE_ENCRYPT
164 ret |= ARCMODE_COMPRESS
167 #---------------------------------------------------------
169 #---------------------------------------------------------
170 # File types that tarfile supports:
171 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
172 SYMTYPE, DIRTYPE, FIFOTYPE,
173 CONTTYPE, CHRTYPE, BLKTYPE,
174 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
175 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
177 # File types that will be treated as a regular file.
178 REGULAR_TYPES = (REGTYPE, AREGTYPE,
179 CONTTYPE, GNUTYPE_SPARSE)
181 # File types that are part of the GNU tar format.
182 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
183 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
185 # Fields from a pax header that override a TarInfo attribute.
186 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
187 "uid", "gid", "uname", "gname")
189 # Fields from a pax header that are affected by hdrcharset.
190 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
192 # Fields in a pax header that are numbers, all other fields
193 # are treated as strings.
194 PAX_NUMBER_FIELDS = {
203 #---------------------------------------------------------
205 #---------------------------------------------------------
207 if os.name in ("nt", "ce"):
210 ENCODING = sys.getfilesystemencoding()
212 #---------------------------------------------------------
213 # Some useful functions
214 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Encode string *s* into a NUL-padded field of exactly *length* bytes.

    The encoded value is truncated if longer than *length*, otherwise
    padded at the end with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    padding = b"\0" * max(0, length - len(encoded))
    return encoded[:length] + padding
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte onward is discarded before
    decoding with *encoding*/*errors*.
    """
    # The listing was missing the truncation lines; restored from the
    # canonical tarfile implementation.
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Coerce *s* (str or bytes) into a NUL-padded field of *length* bytes.

    A str is encoded first; bytes are used as-is.  The result is
    truncated or NUL-padded to exactly *length* bytes.
    """
    raw = s.encode(encoding, errors) if isinstance(s, str) else s
    return raw[:length].ljust(length, b"\0")
def nti(s):
    """Convert a tar number field (bytes) to a Python integer.

    Two encodings are possible (see itn() for the inverse): plain
    NUL-terminated octal ASCII, or GNU base-256 marked by a leading
    0o200 (positive) or 0o377 (negative) byte.

    Raises InvalidHeaderError if the octal field is malformed.
    """
    # NOTE(review): def line and several branches were missing from the
    # listing; restored from the canonical tarfile implementation.
    if s[0] in (0o200, 0o377):
        # GNU base-256: big-endian value in the remaining bytes.
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            # 0o377 marks a negative (two's-complement style) value.
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert the integer *n* to a tar number field of *digits* bytes.

    POSIX 1003.1-1988 requires numbers to be encoded as a string of
    octal digits followed by a null-byte, which allows values up to
    (8**(digits-1))-1.  GNU tar allows storing larger numbers: a
    leading 0o200 byte indicates a positive, 0o377 a negative value,
    followed by digits-1 bytes of big-endian base-256 data, allowing
    values up to (256**(digits-1))-1.

    Raises ValueError if *n* does not fit the chosen encoding.
    """
    if 0 <= n < 8 ** (digits - 1):
        # Plain octal representation fits.
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n
        # Emit base-256 digits least-significant first, inserting after
        # the marker byte so the result ends up big-endian.
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")
    return s
def calc_chksums(buf):
    """Calculate a member header's checksum two ways.

    All 512 header bytes are summed except the 8-byte chksum field
    (offset 148), which is treated as if filled with spaces — the
    leading 256 is exactly 8 * 0x20.  According to the GNU tar sources,
    some tars (Sun and NeXT) sum signed chars, which differs when bytes
    have the high bit set, so both an unsigned and a signed checksum
    are returned.
    """
    unsigned_sum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_sum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_sum, signed_sum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
    If length is None, copy the entire content.

    Raises OSError if src is exhausted before *length* bytes were read.
    """
    # NOTE(review): several lines were missing from the listing; the
    # guard, write calls and remainder handling are restored from the
    # canonical tarfile implementation.
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    # NOTE(review): the def line was missing from the listing;
    # reconstructed to match the visible body.
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
class TarError(Exception):
    """Base exception for all errors raised by this module."""
class ExtractError(TarError):
    """General exception for extract errors."""
class ReadError(TarError):
    """Exception for unreadable tar archives."""
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
class HeaderError(TarError):
    """Base exception for header errors."""
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
class EOFHeaderError(HeaderError):
    """Exception for end-of-file headers."""
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
class SubsequentHeaderError(HeaderError):
    """Exception for missing or invalid extended headers."""
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
class DecryptionError(TarError):
    """Exception for errors during decryption."""
class EncryptionError(TarError):
    """Exception for errors during encryption."""
class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
374 #---------------------------
375 # internal stream interface
376 #---------------------------
378 """Low-level file object. Supports reading and writing.
379 It is used instead of a regular file object for streaming
383 def __init__(self, name, mode):
386 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
388 if hasattr(os, "O_BINARY"):
389 _mode |= os.O_BINARY # pylint: disable=no-member
390 self.fd = os.open(name, _mode, 0o666)
396 def read(self, size):
397 ret = os.read(self.fd, size)
398 self.offset += len(ret)
401 def write(self, s, pos=None):
404 os.lseek (self.fd, pos, os.SEEK_SET)
405 n = os.write(self.fd, s)
407 self.offset += len(s)
409 append = pos + n - p0
411 self.offset += append
412 os.lseek (self.fd, p0, os.SEEK_SET)
417 def seek_set (self, pos):
418 os.lseek (self.fd, pos, os.SEEK_SET)
422 def gz_header (name=None):
423 timestamp = int(time.time())
429 flags |= GZ_FLAG_ORIG_NAME
430 if type(name) is str:
431 name = name.encode("iso-8859-1", "replace")
432 if name.endswith(b".pdtcrypt"):
434 if name.endswith(b".gz"):
436 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
439 hdr = struct.pack (GZ_FMT_HEADER,
440 GZ_MAGIC [0], GZ_MAGIC [1],
441 GZ_METHOD_DEFLATE, flags,
443 GZ_DEFLATE_FLAGS, GZ_OS_CODE)
449 """Class that serves as an adapter between TarFile and
450 a stream-like object. The stream-like object only
451 needs to have a read() or write() method and is accessed
452 blockwise. Use of gzip or bzip2 compression is possible.
453 A stream-like object could be for example: sys.stdin,
454 sys.stdout, a socket, a tape device etc.
456 _Stream is intended to be used only internally but is
457 nevertherless used externally by Deltatar.
459 When encrypting, the ``enccounter`` will be used for
460 initializing the first cryptographic context. When
461 decrypting, its value will be compared to the decrypted
462 object. Decryption fails if the value does not match.
463 In effect, this means that a ``_Stream`` whose ctor was
464 passed ``enccounter`` can only be used to encrypt or
465 decrypt a single object.
468 remainder = -1 # track size in encrypted entries
471 def __init__(self, name, mode, comptype, fileobj, bufsize,
472 concat=False, encryption=None, enccounter=None,
473 compresslevel=9, tolerant=False):
474 """Construct a _Stream object.
476 self.arcmode = arcmode_set (concat, encryption, comptype)
477 self.tolerant = tolerant
479 self._extfileobj = True
481 fileobj = _LowLevelFile(name, mode)
482 self._extfileobj = False
485 # Enable transparent compression detection for the
487 fileobj = _StreamProxy(fileobj)
488 comptype = fileobj.getcomptype()
492 self.enccounter = None
493 if self.arcmode & ARCMODE_ENCRYPT:
494 self.enccounter = enccounter
496 self.name = name or ""
498 self.comptype = comptype
500 self.fileobj = fileobj
501 self.bufsize = bufsize
507 self.last_block_offset = 0
508 self.dbuf = b"" # ???
509 self.exception = None # communicate decompression failure
510 self.compresslevel = compresslevel
511 self.bytes_written = 0
513 self.encryption = encryption
521 raise CompressionError("zlib module is not available")
524 self.exception = zlib.error
527 if not (self.arcmode & ARCMODE_CONCAT):
528 if self.arcmode & ARCMODE_ENCRYPT:
529 self._init_write_encrypt (name)
530 self._init_write_gz ()
531 self.crc = zlib.crc32(b"") & 0xFFFFffff
533 elif comptype == "bz2":
534 if self.arcmode & ARCMODE_ENCRYPT:
535 raise InvalidEncryptionError("encryption not available for "
536 "compression “%s”" % comptype)
540 raise CompressionError("bz2 module is not available")
543 self.cmp = bz2.BZ2Decompressor()
544 self.exception = OSError
546 self.cmp = bz2.BZ2Compressor()
548 elif comptype == 'xz':
549 if self.arcmode & ARCMODE_ENCRYPT:
550 raise InvalidEncryptionError("encryption not available for "
551 "compression “%s”" % comptype)
555 raise CompressionError("lzma module is not available")
558 self.cmp = lzma.LZMADecompressor()
559 self.exception = lzma.LZMAError
561 self.cmp = lzma.LZMACompressor()
563 elif comptype == "tar":
564 if not (self.arcmode & ARCMODE_CONCAT) \
566 and self.arcmode & ARCMODE_ENCRYPT:
567 self._init_write_encrypt (name)
570 if self.arcmode & ARCMODE_ENCRYPT:
571 raise InvalidEncryptionError("encryption not available for "
572 "compression “%s”" % comptype)
573 raise CompressionError("unknown compression type %r" % comptype)
576 if not self._extfileobj:
582 if hasattr(self, "closed") and not self.closed:
585 except crypto.InternalError:
586 # context already finalized due to abort but close() tried
591 def next (self, name):
592 if self.arcmode & ARCMODE_COMPRESS:
593 if getattr (self, "cmp", None) is not None:
594 self._finalize_write_gz ()
596 if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
597 self.last_block_offset = self.fileobj.tell()
598 if self.arcmode & ARCMODE_ENCRYPT:
599 self._finalize_write_encrypt ()
600 self._init_write_encrypt (name, set_last_block_offset=True)
601 if self.arcmode & ARCMODE_COMPRESS:
602 self._init_write_gz (set_last_block_offset =
603 not (self.arcmode & ARCMODE_ENCRYPT))
604 return self.last_block_offset
607 def next_volume (self, name):
608 # with non-concat modes, this is taken care by the _Stream
609 # ctor as invoked by the newvol handler
610 if self.arcmode & ARCMODE_COMPRESS:
611 if getattr (self, "cmp", None) is not None:
612 # e. g. compressed PAX header written
613 self._finalize_write_gz ()
614 if self.arcmode & ARCMODE_ENCRYPT:
615 self._init_write_encrypt (name)
616 if self.arcmode & ARCMODE_COMPRESS:
617 self._init_write_gz ()
620 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
622 Save position for delayed write of header; fill the header location
625 # first thing, proclaim new object to the encryption context
626 # secondly, assemble the header with the updated parameters
627 # and commit it directly to the underlying stream, bypassing the
628 # encryption layer in .__write().
629 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
631 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
632 self.lasthdr = self.fileobj.tell()
633 self.__write_to_file(dummyhdr)
634 if set_last_block_offset is True:
635 self.last_block_offset = self.lasthdr
638 def _finalize_write_encrypt (self):
640 Seek back to header position, read dummy bytes, finalize crypto
641 obtaining the actual header, write header, seek back to current
644 Returns the list of IV fixed parts as used during encryption.
646 if self.lasthdr is not None:
647 pos0 = self.fileobj.tell ()
648 self.fileobj.seek_set (self.lasthdr)
649 dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
650 pos1 = self.fileobj.tell ()
651 dpos = pos1 - self.lasthdr
652 assert dpos == crypto.PDTCRYPT_HDR_SIZE
653 self.fileobj.seek_set (pos0)
654 data, hdr, _ = self.encryption.done (dummy)
655 self.__write_to_file(hdr, pos=self.lasthdr)
656 self.__write_to_file(data) # append remainder of data
660 def _finalize_write_gz (self):
661 if self.cmp is not None:
662 chunk = self.buf + self.cmp.flush()
664 if self.comptype == "gz":
665 # The native zlib crc is an unsigned 32-bit integer, but
666 # the Python wrapper implicitly casts that to a signed C
667 # long. So, on a 32-bit box self.crc may "look negative",
668 # while the same crc on a 64-bit box may "look positive".
669 # To avoid irksome warnings from the `struct` module, force
670 # it to look positive on all boxes.
671 chunk += struct.pack("<L", self.crc & 0xffffffff)
672 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
673 self.__enc_write (chunk)
677 def _init_write_gz (self, set_last_block_offset=False):
679 Add a new gzip block, closing last one
682 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
683 first = self.cmp is None
684 self.cmp = self.zlib.compressobj(self.compresslevel,
686 -self.zlib.MAX_WBITS,
687 self.zlib.DEF_MEM_LEVEL,
690 # if aes, we encrypt after compression
691 if set_last_block_offset is True:
692 self.last_block_offset = self.fileobj.tell()
694 self.__write(gz_header (self.name if first is True else None))
698 """Write string s to the stream.
700 if self.comptype == "gz":
701 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
703 self.concat_pos += len(s)
704 if self.cmp is not None:
705 s = self.cmp.compress(s)
709 """Write what’s left in the buffer to the stream."""
710 self.__write (b"") # → len (buf) <= bufsiz
711 self.__enc_write (self.buf)
714 def __write(self, s):
715 """Writes (and encodes) string s to the stream blockwise
717 will wait with encoding/writing until block is complete
720 while len(self.buf) > self.bufsize:
721 self.__enc_write(self.buf[:self.bufsize])
722 self.buf = self.buf[self.bufsize:]
725 def __write_to_file(self, s, pos=None):
727 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
728 given, the stream will seek to that position first and back afterwards,
729 and the total of bytes written is not updated.
731 self.fileobj.write(s, pos)
733 self.bytes_written += len(s)
736 def __enc_write(self, s):
738 If encryption is active, the string s is encrypted before being written
743 if self.arcmode & ARCMODE_ENCRYPT:
746 n, ct = self.encryption.process(buf)
747 self.__write_to_file(ct)
750 # The entire plaintext was not consumed: The size limit
751 # for encrypted objects was reached. Transparently create
752 # a new encrypted object and continue processing the input.
753 self._finalize_write_encrypt ()
754 self._init_write_encrypt ()
756 self.__write_to_file(s)
759 def estim_file_size(self):
760 """ estimates size of file if closing it now
762 The result may differ greatly from the amount of data sent to write()
763 due to compression, encryption and buffering.
765 In tests the result (before calling close()) was up to 12k smaller than
766 the final file size if compression is being used because zlib/bz2
767 compressors do not allow inspection of their buffered data :-(
769 Still, we add what close() would add: 8 bytes for gz checksum, one
770 encryption block size if encryption is used and the size of our own
774 return self.bytes_written
776 result = self.bytes_written
778 result += len(self.buf)
779 if self.comptype == 'gz':
780 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
783 def close(self, close_fileobj=True):
784 """Close the _Stream object. No operation should be
785 done on it afterwards.
791 if close_fileobj is True:
794 if self.arcmode & ARCMODE_COMPRESS:
795 self._finalize_write_gz ()
796 # end of Tar archive marker (two empty blocks) was written
797 # finalize encryption last; no writes may be performed after
800 if self.arcmode & ARCMODE_ENCRYPT:
801 self._finalize_write_encrypt ()
803 if not self._extfileobj:
806 # read the zlib crc and length and check them
807 if self.mode == "r" and self.comptype == "gz":
808 read_crc = self.__read(4)
809 read_length = self.__read(4)
810 calculated_crc = self.crc
811 if struct.unpack("<L", read_crc)[0] != calculated_crc:
812 raise CompressionError("bad gzip crc")
816 def _init_read_gz(self):
817 """Initialize for reading a gzip compressed fileobj.
819 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
821 read2 = self.__read(2)
823 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
824 "%d" % self.fileobj.tell())
825 # taken from gzip.GzipFile with some alterations
826 if read2 != GZ_MAGIC_BYTES:
827 raise ReadError("not a gzip file")
829 read1 = self.__read(1)
831 raise CompressionError("unsupported compression method")
833 self.flags = flag = ord(self.__read(1))
837 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
842 if not s or s == NUL:
847 if not s or s == NUL:
852 def _init_read_encrypt (self):
853 """Initialize encryption for next entry in archive. Read a header and
854 notify the crypto context."""
855 if self.arcmode & ARCMODE_ENCRYPT:
856 lasthdr = self.fileobj.tell ()
858 hdr = crypto.hdr_read_stream (self.fileobj)
859 except crypto.EndOfFile:
861 except crypto.InvalidHeader as exn:
862 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
863 "processing %r at pos %d"
864 % (exn, self.fileobj, lasthdr)) \
866 if self.enccounter is not None:
867 # enforce that the iv counter in the header matches an
868 # explicitly requested one
869 iv = crypto.hdr_iv_counter (hdr)
870 if iv != self.enccounter:
871 raise DecryptionError ("expected IV counter %d, got %d"
872 % (self.enccounter, iv))
873 self.lasthdr = lasthdr
874 self.remainder = hdr ["ctsize"] # distance to next header
876 self.encryption.next (hdr)
877 except crypto.InvalidParameter as exn:
878 raise DecryptionError ("Crypto.next(): error “%s” "
879 "processing %r at pos %d"
880 % (exn, self.fileobj, lasthdr)) \
886 def _finalize_read_encrypt (self):
890 if self.arcmode & ARCMODE_ENCRYPT \
891 and self.lasthdr is not None :
892 assert self.remainder >= 0
893 if self.remainder > 0:
896 data = self.encryption.done ()
897 except crypto.InvalidGCMTag as exn:
898 raise DecryptionError ("decryption failed: %s" % exn)
903 """Return the stream's file pointer position.
907 def seek(self, pos=0):
908 """Set the stream's file pointer to pos. Negative seeking
911 if pos - self.pos >= 0:
912 blocks, remainder = divmod(pos - self.pos, self.bufsize)
913 for i in range(blocks):
914 self.read(self.bufsize)
917 raise StreamError("seeking backwards is not allowed")
920 def read(self, size=None):
921 """Return the next size number of bytes from the stream.
922 If size is not defined, return all bytes of the stream
928 buf = self._read(self.bufsize)
934 buf = self._read(size)
939 """Reads just one line, new line character included
941 # if \n in dbuf, no read neads to be done
942 if b'\n' in self.dbuf:
943 pos = self.dbuf.index(b'\n') + 1
944 ret = self.dbuf[:pos]
945 self.dbuf = self.dbuf[pos:]
950 chunk = self._read(self.bufsize)
952 # nothing more to read, so return the buffer
958 # if \n found, return the new line
961 pos = dbuf.index(b'\n') + 1
962 self.dbuf = dbuf[pos:] + self.dbuf
965 def _read(self, size):
966 """Return size bytes from the stream.
972 buf = self.__read(self.bufsize)
976 if self.cmp is not None:
978 buf = self.cmp.decompress(buf)
979 except self.exception as exn:
980 raise ReadError("invalid compressed data (%r)" % exn)
981 except Exception as e:
982 # happens at the end of the file
983 # _init_read_gz failed in the previous iteration so
984 # self.cmp.decompress fails here
985 if self.arcmode & ARCMODE_CONCAT:
988 raise ReadError("invalid compressed data")
989 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
990 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
991 if self.arcmode & ARCMODE_CONCAT \
992 and len(self.cmp.unused_data) != 0:
993 self.buf = self.cmp.unused_data + self.buf
994 self.close(close_fileobj=False)
998 # happens at the end of the file
1000 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1005 self.dbuf = t[size:]
1009 def __read(self, size):
1011 Return size bytes from stream. If internal buffer is empty, read
1012 another block from the stream.
1014 The function returns up to size bytes of data. When an error occurs
1015 during decryption, everything until the end of the last successfully
1016 finalized object is returned.
1020 good_crypto = len (t)
1023 if self.arcmode & ARCMODE_ENCRYPT:
1024 if self.remainder <= 0:
1025 # prepare next object
1027 if self._init_read_encrypt () is False: # EOF
1030 except DecryptionError:
1031 if self.tolerant is True:
1032 self.buf = b"".join (t [good_crypto:])
1033 return b"".join (t [:good_crypto])
1036 # only read up to the end of the encrypted object
1037 todo = min (size, self.remainder)
1038 buf = self.fileobj.read(todo)
1039 if self.arcmode & ARCMODE_ENCRYPT:
1041 buf = self.encryption.process (buf)
1042 if todo == self.remainder:
1043 # at the end of a crypto object; finalization will fail if
1044 # the GCM tag does not match
1046 trailing = self._finalize_read_encrypt ()
1047 except DecryptionError as exn:
1048 if self.tolerant is False:
1050 if good_crypto == 0:
1052 # some objects did validate; discard all data after it;
1053 # next call will start with the bad object and error
1055 self.buf = b"".join (t [good_crypto:])
1056 return b"".join (t [:good_crypto])
1057 good_crypto = len (t) + 1
1058 if len (trailing) > 0:
1062 self.remainder -= todo
1064 if not buf: ## XXX stream terminated prematurely; this should be an error
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
    detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Buffer one block up-front so the magic bytes can be inspected.
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # First call hands back the buffered block, then shadows itself
        # so subsequent reads go straight to the underlying fileobj.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the buffered magic bytes.

        NOTE(review): the return values were missing from the listing;
        reconstructed ("gz"/"bz2"/"xz"/"tar") from the standard tarfile
        implementation.
        """
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
1102 #------------------------
1103 # Extraction file object
1104 #------------------------
1105 class _FileInFile(object):
1106 """A thin wrapper around an existing file object that
1107 provides a part of its data as an individual file
1111 def __init__(self, fileobj, offset, size, blockinfo=None):
1112 self.fileobj = fileobj
1113 self.offset = offset
1116 self.name = getattr(fileobj, "name", None)
1119 if blockinfo is None:
1120 blockinfo = [(0, size)]
1122 # Construct a map with data and zero blocks.
1126 realpos = self.offset
1127 for offset, size in blockinfo:
1128 if offset > lastpos:
1129 self.map.append((False, lastpos, offset, None))
1130 self.map.append((True, offset, offset + size, realpos))
1132 lastpos = offset + size
1133 if lastpos < self.size:
1134 self.map.append((False, lastpos, self.size, None))
1146 return self.fileobj.seekable()
1149 """Return the current file position.
1151 return self.position
1153 def seek(self, position, whence=io.SEEK_SET):
1154 """Seek to a position in the file.
1156 if whence == io.SEEK_SET:
1157 self.position = min(max(position, 0), self.size)
1158 elif whence == io.SEEK_CUR:
1160 self.position = max(self.position + position, 0)
1162 self.position = min(self.position + position, self.size)
1163 elif whence == io.SEEK_END:
1164 self.position = max(min(self.size + position, self.size), 0)
1166 raise ValueError("Invalid argument")
1167 return self.position
1169 def read(self, size=None):
1170 """Read data from the file.
1173 size = self.size - self.position
1175 size = min(size, self.size - self.position)
1180 data, start, stop, offset = self.map[self.map_index]
1181 if start <= self.position < stop:
1185 if self.map_index == len(self.map):
1187 length = min(size, stop - self.position)
1189 self.fileobj.seek(offset + (self.position - start))
1190 buf += self.fileobj.read(length)
1194 self.position += length
1197 def readinto(self, b):
1198 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member,
    backed by a _FileInFile view over the archive's fileobj."""

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # __slots__ keeps per-member memory low; archives can hold many members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
1232 def __init__(self, name=""):
1233 """Construct a TarInfo object. name is the optional name
1236 self.name = name # member name
1237 self.mode = 0o644 # file permissions
1238 self.uid = 0 # user id
1239 self.gid = 0 # group id
1240 self.size = 0 # file size
1241 self.mtime = 0 # modification time
1242 self.chksum = 0 # header checksum
1243 self.type = REGTYPE # member type
1244 self.linkname = "" # link name
1245 self.uname = "" # user name
1246 self.gname = "" # group name
1247 self.devmajor = 0 # device major number
1248 self.devminor = 0 # device minor number
1250 self.offset = 0 # the tar header starts here
1251 self.offset_data = 0 # the file's data starts here
1252 self.volume_offset = 0 # the file's data corresponds with the data
1253 # starting at this position
1255 self.sparse = None # sparse member information
1256 self.pax_headers = {} # pax header information
1258 # In pax headers the "name" and "linkname" field are called
1259 # "path" and "linkpath".
1262 def _setpath(self, name):
1264 path = property(_getpath, _setpath)
    def _getlinkpath(self):
        # pax headers call this field "linkpath"; alias it to linkname.
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)
1273 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1275 def get_info(self, encoding=None, errors=None):
1276 """Return the TarInfo's attributes as a dictionary.
1280 "mode": self.mode & 0o7777,
1284 "mtime": self.mtime,
1285 "chksum": self.chksum,
1287 "linkname": self.linkname,
1288 "uname": self.uname,
1289 "gname": self.gname,
1290 "devmajor": self.devmajor,
1291 "devminor": self.devminor,
1292 "offset_data": self.offset_data,
1293 "volume_offset": self.volume_offset
1296 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
    # NOTE(review): the docstring terminator and the "else:" preceding the
    # final raise are missing from this copy.
    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
              errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        info = self.get_info(encoding, errors)
        # Dispatch on the requested archive format.
        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding, errors)
            raise ValueError("invalid format")
1316 def create_ustar_header(self, info, encoding, errors):
1317 """Return the object as a ustar header block.
1319 info["magic"] = POSIX_MAGIC
1321 if len(info["linkname"]) > LENGTH_LINK:
1322 raise ValueError("linkname is too long")
1324 if len(info["name"]) > LENGTH_NAME:
1325 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1327 return self._create_header(info, USTAR_FORMAT, encoding, errors)
    # NOTE(review): missing from this copy: the docstring terminator, the
    # "prefix = [" list opener inside the multivolume branch, the "buf = b''"
    # initializer, and the continuation arguments of the second
    # _create_gnu_long_header() call -- reconstruct from upstream before use.
    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        info["magic"] = GNU_MAGIC
        # Multivolume members carry extra numeric fields in the prefix area.
        if self.ismultivol():
                itn(info.get("atime", 0), 12, GNU_FORMAT),
                itn(info.get("ctime", 0), 12, GNU_FORMAT),
                itn(self.volume_offset, 12, GNU_FORMAT),
                itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
            info['prefix'] = b"".join(prefix)
            info['size'] = info['size'] - self.volume_offset
        # Over-long link targets and names get GNU long-header extensions.
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"],
                                                GNUTYPE_LONGLINK, encoding, errors)
        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
    # NOTE(review): missing from this copy: the docstring terminator, the
    # "continue" statements, the "try:" lines around the ASCII probe and the
    # number conversion (including the assignment of `val'), and the
    # "if pax_headers:"/"else: buf = b''" wrapper before the return --
    # reconstruct from upstream before use.
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()
        if self.ismultivol():
            info['size'] = info['size'] - self.volume_offset
        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME),
                ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32),
                ("gname", "gname", 32)):
            if hname in pax_headers:
                # The pax header has priority.
            # Try to encode the string as ASCII.
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
            if len(info[name]) > length:
                pax_headers[hname] = info[name]
        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
        # Create a pax extended header if necessary.
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
    # NOTE(review): the "@classmethod" decorator and the docstring terminator
    # appear to be missing from this copy (the cls-first signature implies a
    # classmethod) -- confirm against upstream.
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1414 def _posix_split_name(self, name):
1415 """Split a name longer than 100 chars into a prefix
1418 prefix = name[:LENGTH_PREFIX + 1]
1419 while prefix and prefix[-1] != "/":
1420 prefix = prefix[:-1]
1422 name = name[len(prefix):]
1423 prefix = prefix[:-1]
1425 if not prefix or len(name) > LENGTH_NAME:
1426 raise ValueError("name is too long")
    # NOTE(review): missing from this copy: the "@staticmethod" decorator, the
    # docstring terminator, the "parts = [" list opener and its closing "]",
    # and the final "return buf" -- reconstruct from upstream before use.
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b" ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            sbtn(info.get("prefix", ""), 155, encoding, errors)
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the checksum into bytes 148..155 of the header block.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1458 def _create_payload(payload):
1459 """Return the string payload filled with zero bytes
1460 up to the next 512 byte border.
1462 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1464 payload += (BLOCKSIZE - remainder) * NUL
    # NOTE(review): missing from this copy: the "@classmethod" decorator, the
    # docstring terminator, and the "info = {}" / "info['type'] = type" /
    # "info['mode'] = 0o644" statements that populate the synthetic header --
    # reconstruct from upstream before use.
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
        name = name.encode(encoding, errors) + NUL
        info["name"] = "././@LongLink"
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC
        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)
    # NOTE(review): missing from this copy: the "@classmethod" decorator, the
    # docstring terminator, the binary-detection try/else scaffolding, the
    # "records = b''" initializer, the binary/utf-8 branch around the value
    # encoding, the record-length fixpoint loop computing `p', and the
    # "info = {...}" dict for the synthetic header -- reconstruct from
    # upstream before use.
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        for keyword, value in pax_headers.items():
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"
        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
                value = value.encode("utf-8")
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info["name"] = "././@PaxHeader"
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC
        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
    # NOTE(review): missing from this copy: the "@classmethod" decorator, the
    # docstring terminator, the "if len(buf) == 0:" guard before the
    # EmptyHeaderError raise, the "obj = cls()" construction, the
    # "obj.chksum = chksum" assignment, the sparse-structure loop scaffolding
    # ("structs = []", "for i in range(4):", "pos += 24"), the
    # "if obj.isdir():" guard before the rstrip, the branch wrapping the
    # final offset_data read, and "return obj" -- reconstruct from upstream
    # before use.
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")
        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")
        # Decode the fixed-offset ustar header fields.
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)
        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
                structs.append((offset, numbytes))
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)
        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
        # Remove redundant slashes from directories.
            obj.name = obj.name.rstrip("/")
        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
            obj.offset_data = nti(buf[369:381])
1601 def fromtarfile(cls, tarfile):
1602 """Return the next TarInfo object from TarFile object
1605 buf = tarfile.fileobj.read(BLOCKSIZE)
1606 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1607 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1608 return obj._proc_member(tarfile)
1610 #--------------------------------------------------------------------------
1611 # The following are methods that are called depending on the type of a
1612 # member. The entry point is _proc_member() which can be overridden in a
1613 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1614 # implement the following
1616 # 1. Set self.offset_data to the position where the data blocks begin,
1617 # if there is data that follows.
1618 # 2. Set tarfile.offset to the position where the next member's header will
1620 # 3. Return self or another valid TarInfo object.
1621 def _proc_member(self, tarfile):
1622 """Choose the right processing method depending on
1623 the type and call it.
1625 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1626 return self._proc_gnulong(tarfile)
1627 elif self.type == GNUTYPE_SPARSE:
1628 return self._proc_sparse(tarfile)
1629 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1630 return self._proc_pax(tarfile)
1632 return self._proc_builtin(tarfile)
    # NOTE(review): the docstring terminator and the final "return self" are
    # missing from this copy (the _proc_*() contract above requires a TarInfo
    # to be returned) -- reconstruct from upstream before use.
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset
        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
    # NOTE(review): the docstring terminator, the "try:"/"except HeaderError:"
    # lines around fromtarfile() (the raise below belongs to that except
    # clause) and the final "return next" are missing from this copy --
    # reconstruct from upstream before use.
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
        buf = tarfile.fileobj.read(self._block(self.size))
        # Fetch the next header and process it.
            next = self.fromtarfile(tarfile)
            raise SubsequentHeaderError("missing or bad subsequent header")
        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
    # NOTE(review): the docstring terminator and the extended-header loop
    # scaffolding ("while isextended:", "pos = 0", "for i in range(21):",
    # the "try:"/"except ValueError: break" pair, "pos += 24") and the final
    # "return self" are missing from this copy -- reconstruct from upstream
    # before use.
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs
        # Collect sparse structures from extended header blocks.
            buf = tarfile.fileobj.read(BLOCKSIZE)
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                if offset and numbytes:
                    structs.append((offset, numbytes))
            isextended = bool(buf[504])
        self.sparse = structs
        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
    # NOTE(review): this copy is missing several statements: the docstring
    # terminator, the "else:" of the XGLTYPE branch, the "else: encoding =
    # 'utf-8'" of the hdrcharset branch, the "pos = 0"/"while True:" record
    # loop scaffolding with its termination test, the continuation arguments
    # of the _decode_pax_field() calls, the "else:" before the utf-8 value
    # decode, "pos += length", the "try:"/"except HeaderError:" pair around
    # fromtarfile(), and the final "return next" -- reconstruct from upstream
    # before use.
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))
        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
            pax_headers = tarfile.pax_headers.copy()
        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
            match = regex.match(buf, pos)
            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]
            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                value = self._decode_pax_field(value, "utf-8", "utf-8",
            pax_headers[keyword] = value
        # Fetch the next header.
            next = self.fromtarfile(tarfile)
            raise SubsequentHeaderError("missing or bad subsequent header")
        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)
        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)
        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)
        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset
            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset
        if next is not None:
            if "GNU.volume.filename" in pax_headers:
                if pax_headers["GNU.volume.filename"] == next.name:
                    if "GNU.volume.size" in pax_headers:
                        next.size = int(pax_headers["GNU.volume.size"])
                    if "GNU.volume.offset" in pax_headers:
                        next.volume_offset = int(pax_headers["GNU.volume.offset"])
                # NOTE(review): deleting from tarfile.pax_headers while
                # iterating pax_headers.keys() mutates the same dict when
                # self.type == XGLTYPE -- presumably a list() copy is taken
                # in the dropped upstream line; verify.
                for key in pax_headers.keys():
                    if key.startswith("GNU.volume"):
                        del tarfile.pax_headers[key]
1816 def _proc_gnusparse_00(self, next, pax_headers, buf):
1817 """Process a GNU tar extended sparse header, version 0.0.
1820 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1821 offsets.append(int(match.group(1)))
1823 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1824 numbytes.append(int(match.group(1)))
1825 next.sparse = list(zip(offsets, numbytes))
1827 def _proc_gnusparse_01(self, next, pax_headers):
1828 """Process a GNU tar extended sparse header, version 0.1.
1830 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1831 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1833 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1834 """Process a GNU tar extended sparse header, version 1.0.
1838 buf = tarfile.fileobj.read(BLOCKSIZE)
1839 fields, buf = buf.split(b"\n", 1)
1840 fields = int(fields)
1841 while len(sparse) < fields * 2:
1842 if b"\n" not in buf:
1843 buf += tarfile.fileobj.read(BLOCKSIZE)
1844 number, buf = buf.split(b"\n", 1)
1845 sparse.append(int(number))
1846 next.offset_data = tarfile.fileobj.tell()
1847 next.sparse = list(zip(sparse[::2], sparse[1::2]))
    # NOTE(review): the docstring terminator and the "try:"/"except
    # ValueError: value = 0" pair around the number-field conversion are
    # missing from this copy -- reconstruct from upstream before use.
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        for keyword, value in pax_headers.items():
            # GNU sparse records override the plain name/size fields.
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                if keyword == "path":
                    value = value.rstrip("/")    # pylint: disable=no-member
                setattr(self, keyword, value)
        self.pax_headers = pax_headers.copy()
1872 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1873 """Decode a single field from a pax record.
1876 return value.decode(encoding, "strict")
1877 except UnicodeDecodeError:
1878 return value.decode(fallback_encoding, fallback_errors)
1880 def _block(self, count):
1881 """Round up a byte count by BLOCKSIZE and return it,
1882 e.g. _block(834) => 1024.
1884 blocks, remainder = divmod(count, BLOCKSIZE)
1887 return blocks * BLOCKSIZE
        # NOTE(review): the "def is*(self):" header lines for these one-line
        # type predicates (isreg, isdir, issym, islnk, ischr, isblk, isfifo,
        # issparse, isdev -- in that order, judging by the tested constants)
        # are missing from this copy; only the return expressions survived.
        # Restore the defs from upstream tarfile before use.
        return self.type in REGULAR_TYPES
        return self.type == DIRTYPE
        return self.type == SYMTYPE
        return self.type == LNKTYPE
        return self.type == CHRTYPE
        return self.type == BLKTYPE
        return self.type == FIFOTYPE
        return self.sparse is not None
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1909 def ismultivol(self):
1910 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1911 "GNU.volume.offset" in self.pax_headers
# NOTE(review): the class docstring terminator is missing from this copy; the
# class body continues beyond this excerpt.
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)
    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.
    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.
    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes
    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed
    volume_number = 0           # current volume number, used for multi volume
    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.
    format = DEFAULT_FORMAT     # The format to use when creating an archive.
    encoding = ENCODING         # Encoding for 8-bit character strings.
    errors = None               # Error handler for unicode conversion.
    tarinfo = TarInfo           # The default TarInfo class to use.
    fileobject = ExFileObject   # The file-object for extractfile().
    arcmode = ARCMODE_PLAIN     # Object processing mode (“concat”, encryption,
    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
    # NOTE(review): this copy is missing several statements: the docstring
    # terminator, the "if fileobj is None:"/"else:" wrapper around the file
    # opening, the "self.mode = mode" assignment (self.mode is read below but
    # never visibly set), the "name = fileobj.name" assignment, the "else:"
    # before "self.pax_headers = {}", "self.debug = debug", the
    # "if max_volume_size:"/"else:" pair, the "try:" around the mode-specific
    # initialisation with its "while True:"/"break" loop and closing
    # "except: ... raise" cleanup, and "self._loaded = True" -- reconstruct
    # from upstream before use.
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, max_volume_size=None, new_volume_handler=None,
            concat=False, nacl=None,
            save_to_members=True):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.arcmode = arcmode_set (concat)
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
            if name is None and hasattr(fileobj, "name"):
            # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
            if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
                self._mode = fileobj.mode
            self._extfileobj = True
        # NOTE(review): self.name is assigned twice in a row here; the first
        # assignment is redundant with the second.
        self.name = os.path.abspath(name) if name else None
        self.base_name = self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
            self.pax_headers = {}
        if debug is not None:
        if errorlevel is not None:
            self.errorlevel = errorlevel
        # Init datastructures.
        if max_volume_size and max_volume_size < 3*BLOCKSIZE:
            raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
        if max_volume_size and not callable(new_volume_handler):
            raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
            self.max_volume_size = int(max_volume_size)
            self.max_volume_size = None
        self.save_to_members = save_to_members
        self.new_volume_handler = new_volume_handler
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()
            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                    self.fileobj.seek(self.offset)
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                    except HeaderError as e:
                        raise ReadError(str(e))
            if self.mode in "aw":
                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
            if not self._extfileobj:
                self.fileobj.close()
2074 #--------------------------------------------------------------------------
2075 # Below are the classmethods which act as alternate constructors to the
2076 # TarFile class. The open() method is the only one that is needed for
2077 # public use; it is the "super"-constructor and is able to select an
2078 # adequate "sub"-constructor for a particular compression using the mapping
2081 # This concept allows one to subclass TarFile without losing the comfort of
2082 # the super-constructor. A sub-constructor is registered and made available
2083 # by adding it to the mapping in OPEN_METH.
    # NOTE(review): this copy is missing several statements: the
    # "@classmethod" decorator, the docstring terminator, the "try:" before
    # the transparent-read attempt and the "continue" after the seek, the
    # "elif ':' in mode:" / "elif '|' in mode:" / "elif '#' in mode:" /
    # "elif mode in 'aw':" branch headers, the "else:" before the unknown
    # compression raise, the "**kwargs)" continuation and "return t" of the
    # stream branch, the "try:" before the concat construction, the
    # "stream.close()" in its except clause, and the "return t" of the concat
    # branch -- reconstruct from upstream before use.
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
             encryption=None, compresslevel=9, tolerant=False, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression
           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
           'r#gz'       open a stream of gzip compressed tar blocks for reading
           'w#gz'       open a stream of gzip compressed tar blocks for writing
        if not name and not fileobj:
            raise ValueError("nothing to open")
        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    # usually nothing exceptional but sometimes is
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
            raise ReadError("file could not be opened successfully")
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"
            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                raise CompressionError("unknown compression type %r" % comptype)
            # Pass on compression level for gzip / bzip2.
            if comptype == 'gz' or comptype == 'bz2':
                kwargs['compresslevel'] = compresslevel
            if 'max_volume_size' in kwargs:
                if comptype != 'tar' and filemode in 'wa' \
                        and kwargs['max_volume_size']:
                    warnings.warn('Only the first volume will be compressed '
                                  'for modes with "w:"!')
            return func(name, filemode, fileobj, **kwargs)
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"
            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")
            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize,
                            compresslevel=compresslevel),
            t._extfileobj = False
            filemode, comptype = mode.split("#", 1)
            filemode = filemode or "r"
            if filemode not in "rw":
                raise ValueError ("mode %s not compatible with concat "
                                  "archive; must be 'r' or 'w'" % mode)
            stream = _Stream(name, filemode, comptype, fileobj, bufsize,
                             concat=True, encryption=encryption,
                             compresslevel=compresslevel, tolerant=tolerant)
            kwargs ["concat"] = True
                t = cls(name, filemode, stream, **kwargs)
            except: # XXX except what?
                raise # XXX raise what?
            t._extfileobj = False
            return cls.taropen(name, mode, fileobj, **kwargs)
        raise ValueError("undiscernible mode %r" % mode)
2200 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2201 """Open uncompressed tar archive name for reading or writing.
2203 if len(mode) > 1 or mode not in "raw":
2204 raise ValueError("mode must be 'r', 'a' or 'w'")
2205 return cls(name, mode, fileobj, **kwargs)
    # NOTE(review): this copy is missing the "@classmethod" decorator, the
    # docstring terminator, the "try: import gzip" probe before the
    # ImportError handler, the "try:" around the GzipFile/taropen calls, the
    # except clauses around the "not a gzip file" raise with their
    # fileobj.close() cleanup, and the final "return t" -- reconstruct from
    # upstream before use.
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")
        extfileobj = fileobj is not None
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
            if not extfileobj and fileobj is not None:
            raise ReadError("not a gzip file")
            if not extfileobj and fileobj is not None:
        t._extfileobj = extfileobj
        return t
    # NOTE(review): this copy is missing the "@classmethod" decorator, the
    # docstring terminator, the "try: import bz2" probe before the raise, the
    # "try:" around the taropen call with its fileobj.close() cleanup, and
    # the final "return t" -- reconstruct from upstream before use.
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")
            raise CompressionError("bz2 module is not available")
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
    # NOTE(review): this copy is missing the "@classmethod" decorator, the
    # docstring terminator, the "try: import lzma" probe before the raise,
    # the "try:" around the taropen call with its fileobj.close() cleanup,
    # and the final "return t" -- reconstruct from upstream before use.
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")
            raise CompressionError("lzma module is not available")
        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            raise ReadError("not an lzma file")
        t._extfileobj = False
2285 # All *open() methods are registered here.
2287 "tar": "taropen", # uncompressed tar
2288 "gz": "gzopen", # gzip compressed tar
2289 "bz2": "bz2open", # bzip2 compressed tar
2290 "xz": "xzopen" # lzma compressed tar
2293 #--------------------------------------------------------------------------
2294 # The public methods which TarFile provides:
        # NOTE(review): the "def close(self):" header of this method is
        # missing from this copy, as are the docstring terminator, the
        # "if self.closed: return" guard, the "if remainder > 0:" wrapping the
        # record padding write, and the closing "self.closed = True" --
        # reconstruct from upstream before use.
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive. A special case are empty archives which are
           initialized accordingly so the two mandatory blocks of zeros are
           written abiding by the requested encryption and compression settings.
        if self.mode in "aw":
            if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
                self.fileobj.next ("")
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
                self.fileobj.write(NUL * (RECORDSIZE - remainder))
        if not self._extfileobj:
            self.fileobj.close()
    # NOTE(review): the docstring terminator, the "if tarinfo is None:" guard
    # before the raise and the final "return tarinfo" are missing from this
    # copy -- reconstruct from upstream before use.
    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        tarinfo = self._getmember(name)
            raise KeyError("filename %r not found" % name)
    # NOTE(review): the docstring terminator and the final
    # "return self.members" are missing from this copy -- reconstruct from
    # upstream before use.
    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
           list has the same order as the members in the archive.
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
           but when there's encryption or concat compression going on it's more
           complicated than that.
        """
        # NOTE(review): self.last_block_offset is maintained elsewhere in the
        # class (not visible in this excerpt) -- confirm it is always set
        # before this is called.
        return self.last_block_offset
# NOTE(review): the `def getnames(self):` line is elided from this listing.
2348 """Return the members of the archive as a list of their names. It has
2349 the same order as the list returned by getmembers().
2351 return [tarinfo.name for tarinfo in self.getmembers()]
# NOTE(review): this listing is elided — blank lines, several `else:`/`try:`
# lines, and the per-filetype `type = ...` assignments are not shown.
2353 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2354 """Create a TarInfo object for either the file `name' or the file
2355 object `fileobj' (using os.fstat on its file descriptor). You can
2356 modify some of the TarInfo's attributes before you add it using
2357 addfile(). If given, `arcname' specifies an alternative name for the
2358 file in the archive.
2362 # When fileobj is given, replace name by
2363 # fileobj's real name.
2364 if fileobj is not None:
2367 # Building the name of the member in the archive.
2368 # Backward slashes are converted to forward slashes,
2369 # Absolute paths are turned to relative paths.
2372 drv, arcname = os.path.splitdrive(arcname)
2373 arcname = arcname.replace(os.sep, "/")
2374 arcname = arcname.lstrip("/")
2376 # Now, fill the TarInfo object with
2377 # information specific for the file.
2378 tarinfo = self.tarinfo()
2379 tarinfo.tarfile = self
2381 # Use os.stat or os.lstat, depending on platform
2382 # and if symlinks shall be resolved.
# lstat keeps symlinks as symlink members; stat follows them (dereference).
2384 if hasattr(os, "lstat") and not self.dereference:
2385 statres = os.lstat(name)
2387 statres = os.stat(name)
2389 statres = os.fstat(fileobj.fileno())
2392 stmd = statres.st_mode
2393 if stat.S_ISREG(stmd):
# (st_ino, st_dev) uniquely identifies a file for hardlink detection.
2394 inode = (statres.st_ino, statres.st_dev)
2395 if not self.dereference and statres.st_nlink > 1 and \
2396 inode in self.inodes and arcname != self.inodes[inode]:
2397 # Is it a hardlink to an already
# archived file? Then store only a link member pointing at it.
2400 linkname = self.inodes[inode]
2402 # The inode is added only if it's valid.
2403 # For win32 it is always 0.
2405 if inode[0] and self.save_to_members:
2406 self.inodes[inode] = arcname
2407 elif stat.S_ISDIR(stmd):
2409 elif stat.S_ISFIFO(stmd):
2411 elif stat.S_ISLNK(stmd):
2413 linkname = os.readlink(name)
2414 elif stat.S_ISCHR(stmd):
2416 elif stat.S_ISBLK(stmd):
2421 # Fill the TarInfo object with all
2422 # information we can get.
2423 tarinfo.name = arcname
2425 tarinfo.uid = statres.st_uid
2426 tarinfo.gid = statres.st_gid
2428 tarinfo.size = statres.st_size
2431 tarinfo.mtime = statres.st_mtime
2433 tarinfo.linkname = linkname
# uid/gid -> name resolution is cached, because pwd/grp lookups are
# expensive when archiving many files owned by the same users.
2435 if tarinfo.uid in self.cache_uid2user:
2436 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2439 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2440 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2442 # remember user does not exist:
2443 # same default value as in tarinfo class
2444 self.cache_uid2user[tarinfo.uid] = ""
2446 if tarinfo.gid in self.cache_gid2group:
2447 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2450 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2451 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2453 # remember group does not exist:
2454 # same default value as in tarinfo class
2455 self.cache_gid2group[tarinfo.gid] = ""
# Device members additionally record the major/minor device numbers.
2457 if type in (CHRTYPE, BLKTYPE):
2458 if hasattr(os, "major") and hasattr(os, "minor"):
2459 tarinfo.devmajor = os.major(statres.st_rdev)
2460 tarinfo.devminor = os.minor(statres.st_rdev)
# NOTE(review): elided listing — the `if verbose:` guards and some branch
# keywords around the link-printing lines are not shown.
2463 def list(self, verbose=True):
2464 """Print a table of contents to sys.stdout. If `verbose' is False, only
2465 the names of the members are printed. If it is True, an `ls -l'-like
# output is produced (mode, owner/group, size, mtime, name, link target).
2470 for tarinfo in self:
2472 print(stat.filemode(tarinfo.mode), end=' ')
2473 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2474 tarinfo.gname or tarinfo.gid), end=' ')
# Devices show "major,minor" in the size column, like ls does.
2475 if tarinfo.ischr() or tarinfo.isblk():
2476 print("%10s" % ("%d,%d" \
2477 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2479 print("%10d" % tarinfo.size, end=' ')
2480 print("%d-%02d-%02d %02d:%02d:%02d" \
2481 % time.localtime(tarinfo.mtime)[:6], end=' ')
2483 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
# Symlinks print "-> target", hardlinks print "link to target".
2487 print("->", tarinfo.linkname, end=' ')
2489 print("link to", tarinfo.linkname, end=' ')
# NOTE(review): elided listing — the early `return` statements after the
# exclude/skip/filter checks and the `if tarinfo.isreg():` /
# `if recursive:` / final `else:` keywords are not shown.
2492 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2493 """Add the file `name' to the archive. `name' may be any type of file
2494 (directory, fifo, symbolic link, etc.). If given, `arcname'
2495 specifies an alternative name for the file in the archive.
2496 Directories are added recursively by default. This can be avoided by
2497 setting `recursive' to False. `exclude' is a function that should
2498 return True for each filename to be excluded. `filter' is a function
2499 that expects a TarInfo object argument and returns the changed
2500 TarInfo object, if it returns None the TarInfo object will be
2501 excluded from the archive.
2508 # Exclude pathnames.
# `exclude` is deprecated in favour of `filter` (matches CPython's change).
2509 if exclude is not None:
2511 warnings.warn("use the filter argument instead",
2512 DeprecationWarning, 2)
2514 self._dbg(2, "tarfile: Excluded %r" % name)
2517 # Skip if somebody tries to archive the archive...
2518 if self.name is not None and os.path.abspath(name) == self.name:
2519 self._dbg(2, "tarfile: Skipped %r" % name)
2524 # Create a TarInfo object from the file.
2525 tarinfo = self.gettarinfo(name, arcname)
# gettarinfo returns None for unsupported file types (sockets etc.).
2528 self._dbg(1, "tarfile: Unsupported type %r" % name)
2531 # Change or exclude the TarInfo object.
2532 if filter is not None:
2533 tarinfo = filter(tarinfo)
2535 self._dbg(2, "tarfile: Excluded %r" % name)
2538 # Append the tar header and data to the archive.
# Regular files need their data streamed in; open read-binary.
2540 with bltn_open(name, "rb") as f:
2541 self.addfile(tarinfo, f)
2543 elif tarinfo.isdir():
2544 self.addfile(tarinfo)
# Recurse into the directory, preserving the arcname mapping.
2546 for f in os.listdir(name):
2547 self.add(os.path.join(name, f), os.path.join(arcname, f),
2548 recursive, exclude, filter=filter)
# Everything else (fifo, symlink, device) is header-only.
2551 self.addfile(tarinfo)
2553 def _size_left_file(self):
2554 """Calculates size left in a volume with a maximum volume size.
2556 Assumes self.max_volume_size is set.
2557 If using compression through a _Stream, use _size_left_stream instead
2559 # left-over size = max_size - offset - 2 zero-blocks written in close
2560 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2561 # limit size left to a discrete number of blocks, because we won't
2562 # write only half a block when writing the end of a volume
2563 # and filling with zeros
# Floor to a whole number of BLOCKSIZE blocks (may be <= 0 when full).
2564 return BLOCKSIZE * (size_left // BLOCKSIZE)
2566 def _size_left_stream(self):
2567 """ Calculates size left in a volume if using compression/encryption
2569 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2570 (otherwise use _size_left_file)
2572 # left-over size = max_size - bytes written - 2 zero-blocks (close)
# estim_file_size() estimates the compressed/encrypted on-disk size, so
# the remaining capacity can only be approximated here.
2573 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
# NOTE(review): the continuation line (subtracting the two closing zero
# blocks, per the comment above) is elided from this listing.
2575 return BLOCKSIZE * (size_left // BLOCKSIZE)
# NOTE(review): elided listing — blank lines and several `else:` / guard
# lines (e.g. the `if fileobj is None:` early-finish check around 2605-2608
# and the `if remainder > 0:` guard before the final padding) are not shown.
2577 def addfile(self, tarinfo, fileobj=None):
2578 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2579 given, tarinfo.size bytes are read from it and added to the archive.
2580 You can create TarInfo objects using gettarinfo().
2581 On Windows platforms, `fileobj' should always be opened with mode
2582 'rb' to avoid irritation about the file size.
# Copy so per-volume mutations (type, volume_offset) don't leak to caller.
2586 tarinfo = copy.copy(tarinfo)
# In concat mode each member starts its own compression/encryption
# segment; _Stream.next() returns the segment's block offset.
2588 if self.arcmode & ARCMODE_CONCAT:
2589 self.last_block_offset = self.fileobj.next (tarinfo.name)
2591 self.last_block_offset = self.fileobj.tell()
2593 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2594 self.fileobj.write(buf)
2595 self.offset += len(buf)
# Pick the remaining-capacity estimator for the current backend; without
# a volume size limit the whole member always "fits".
2597 if self.max_volume_size:
2598 if isinstance(self.fileobj, _Stream):
2599 _size_left = self._size_left_stream
2601 _size_left = self._size_left_file
2603 _size_left = lambda: tarinfo.size
2605 # If there's no data to follow, finish
2607 if self.save_to_members:
2608 self.members.append(tarinfo)
2611 target_size_left = _size_left()
2612 source_size_left = tarinfo.size
2613 assert tarinfo.volume_offset == 0
2615 # we only split volumes in the middle of a file, that means we have
2616 # to write at least one block
2617 if target_size_left < BLOCKSIZE:
2618 target_size_left = BLOCKSIZE
2620 # loop over multiple volumes
2621 while source_size_left > 0:
2623 # Write as much data as possible from source into target.
2624 # When compressing data, we cannot easily predict how much data we
2625 # can write until target_size_left == 0 --> need to iterate
2626 size_can_write = min(target_size_left, source_size_left)
2628 while size_can_write > 0:
2629 copyfileobj(fileobj, self.fileobj, size_can_write)
2630 self.offset += size_can_write
2631 source_size_left -= size_can_write
2632 target_size_left = _size_left()
2633 size_can_write = min(target_size_left, source_size_left)
2635 # now target_size_left == 0 or source_size_left == 0
2637 # if there is data left to write, we need to create a new volume
2638 if source_size_left > 0:
2639 # Only finalize the crypto entry here if we’re continuing with
2640 # another one; otherwise, the encryption must include the block
# Mark the member as split across volumes (GNU multivolume type).
2642 tarinfo.type = GNUTYPE_MULTIVOL
2644 if not self.new_volume_handler or\
2645 not callable(self.new_volume_handler):
2646 raise Exception("We need to create a new volume and you "
2647 "didn't supply a new_volume_handler")
2650 # the new volume handler should do everything needed to
2651 # start working in a new volume. usually, the handler calls
2652 # to self.open_volume
2653 self.volume_number += 1
2655 # set to be used by open_volume, because in the case of a PAX
2656 # tar it needs to write information about the volume and offset
2657 # in the global header
2658 tarinfo.volume_offset = tarinfo.size - source_size_left
2659 self.volume_tarinfo = tarinfo
2661 # the “new_volume_handler” is supposed to call .close() on the
# current volume and reopen this TarFile on the next one.
2663 self.new_volume_handler(self, self.base_name, self.volume_number)
2665 self.volume_tarinfo = None
2667 if self.arcmode & ARCMODE_CONCAT:
2668 self.fileobj.next_volume (tarinfo.name)
2670 # write new volume header
2671 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2672 self.fileobj.write(buf)
2673 self.offset += len(buf)
2675 # adjust variables; open_volume should have reset self.offset
2676 # --> _size_left should be big again
2677 target_size_left = _size_left()
2678 size_can_write = min(target_size_left, source_size_left)
2679 self._dbg(3, 'new volume')
2681 # now, all data has been written. We may have to fill up the rest of
2682 # the block in target with 0s
# Only the portion written to THIS volume determines the padding.
2683 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2685 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2686 self.offset += BLOCKSIZE - remainder
2688 if self.save_to_members:
2689 self.members.append(tarinfo)
# NOTE(review): elided listing — the outer `try:` scaffolding, several
# `else:` lines, the mode-mangling around append-mode creation, and the
# `volume_info = {` opener before the GNU.volume.* entries are not shown.
2691 def open_volume(self, name="", fileobj=None, encryption=None):
2693 Called by the user to change this tar file to point to a new volume.
2695 # open the file using either fileobj or name
2697 if self.mode == "a" and not os.path.exists(name):
2698 # Create nonexistent files in append mode.
2701 self._extfileobj = False
# Reuse the previous volume's stream parameters (compression,
# encryption, concat mode) for the new _Stream-backed volume.
2703 if isinstance(self.fileobj, _Stream):
2704 self._dbg(3, 'open_volume: create a _Stream')
2705 fileobj = _Stream(name=name,
2706 mode=self.fileobj.mode,
2707 comptype=self.fileobj.comptype,
2709 bufsize=self.fileobj.bufsize,
2710 encryption=encryption or self.fileobj.encryption,
2711 concat=self.fileobj.arcmode & ARCMODE_CONCAT)
2713 # here, we lose information about compression/encryption!
2714 self._dbg(3, 'open_volume: builtin open')
2715 fileobj = bltn_open(name, self._mode)
# Caller supplied an external fileobj: adopt its name/mode and remember
# that we must not close it ourselves.
2717 if name is None and hasattr(fileobj, "name"):
2719 if hasattr(fileobj, "mode"):
2720 self._mode = fileobj.mode
2721 self._extfileobj = True
2722 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2723 self.name = os.path.abspath(name) if name else None
2724 self.fileobj = fileobj
2726 # init data structures
2728 self.members = [] # list of members as TarInfo objects
2729 self._loaded = False # flag if all members have been read
2730 self.offset = self.fileobj.tell()
2731 # current position in the archive file
2732 self.inodes = {} # dictionary caching the inodes of
2733 # archive members already added
2736 if self.mode == "r":
2737 self.firstmember = None
2738 self.firstmember = self.next()
2740 if self.mode == "a":
2741 # Move to the end of the archive,
2742 # before the first empty block.
2744 self.fileobj.seek(self.offset)
2746 tarinfo = self.tarinfo.fromtarfile(self)
2747 self.members.append(tarinfo)
2748 except EOFHeaderError:
2749 self.fileobj.seek(self.offset)
2751 except HeaderError as e:
2752 raise ReadError(str(e))
2754 if self.mode in "aw":
# PAX: record the split member's name/size/offset in the new volume's
# global header so extraction can stitch the volumes back together.
2757 if self.format == PAX_FORMAT:
2759 "GNU.volume.filename": str(self.volume_tarinfo.name),
2760 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2761 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2764 self.pax_headers.update(volume_info)
2766 if isinstance(self.fileobj, _Stream):
2767 self.fileobj._init_write_gz ()
2768 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2769 self.fileobj.write(buf)
2770 self.offset += len(buf)
# On any failure, close the file we opened (but never an external one).
2771 except Exception as exn:
2772 if not self._extfileobj:
2773 self.fileobj.close()
# NOTE(review): elided listing — `directories = []`, the default
# `members = self` fallback, `continue` statements, the
# `if tarinfo.isdir():` keyword and the inner `try:` are not shown.
2777 def extractall(self, path=".", members=None, filter=None):
2778 """Extract all members from the archive to the current working
2779 directory and set owner, modification time and permissions on
2780 directories afterwards. `path' specifies a different directory
2781 to extract to. `members' is optional and must be a subset of the
2782 list returned by getmembers().
2789 for tarinfo in members:
# On a later volume the leading member is the continuation of a file
# split at the previous volume boundary.
2790 if self.volume_number > 0 and tarinfo.ismultivol():
2793 if filter and not filter(tarinfo):
2797 # Extract directories with a safe mode.
2798 directories.append(tarinfo)
2799 tarinfo = copy.copy(tarinfo)
2800 tarinfo.mode = 0o0700
2801 # Do not set_attrs directories, as we will do that further down
2802 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
2804 # Reverse sort directories.
# Depth-first order so children get their attributes before parents,
# otherwise a restrictive parent mode could block the child updates.
2805 directories.sort(key=lambda a: a.name)
2806 directories.reverse()
2808 # Set correct owner, mtime and filemode on directories.
2809 for tarinfo in directories:
2810 dirpath = os.path.join(path, tarinfo.name)
2812 self.chown(tarinfo, dirpath)
2813 self.utime(tarinfo, dirpath)
2814 self.chmod(tarinfo, dirpath)
2815 except ExtractError as e:
2816 if self.errorlevel > 1:
2819 self._dbg(1, "tarfile: %s" % e)
# NOTE(review): elided listing — the `else:` taking `member` as a TarInfo,
# the `if tarinfo.islnk():` guard before _link_target, the `try:` opener,
# and the `raise` statements inside the error handlers are not shown.
2821 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
2822 """Extract a member from the archive to the current working directory,
2823 using its full name. Its file information is extracted as accurately
2824 as possible. `member' may be a filename or a TarInfo object. You can
2825 specify a different directory using `path'. File attributes (owner,
2826 mtime, mode) are set unless `set_attrs' is False.
2827 ``symlink_cb`` is a hook accepting a function that is passed the
2828 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2829 ``member`` indicates a symlink in which case only the callback
2830 passed will be applied, skipping the actual extraction. In case the
2831 callback is invoked, its return value is passed on to the caller.
2835 if isinstance(member, str):
2836 tarinfo = self.getmember(member)
2840 # Prepare the link target for makelink().
2842 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
# Symlink hook short-circuits extraction entirely.
2844 if symlink_cb is not None and tarinfo.issym():
2845 return symlink_cb(member, path, set_attrs)
2848 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2849 set_attrs=set_attrs)
# errorlevel policy: 0 = log only, >0 = re-raise OS errors,
# >1 = also re-raise ExtractError.
2850 except EnvironmentError as e:
2851 if self.errorlevel > 0:
2854 if e.filename is None:
2855 self._dbg(1, "tarfile: %s" % e.strerror)
2857 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2858 except ExtractError as e:
2859 if self.errorlevel > 1:
2862 self._dbg(1, "tarfile: %s" % e)
# NOTE(review): elided listing — the `else:` taking `member` as a TarInfo
# and the final `return None` are not shown.
2864 def extractfile(self, member):
2865 """Extract a member from the archive as a file object. `member' may be
2866 a filename or a TarInfo object. If `member' is a regular file or a
2867 link, an io.BufferedReader object is returned. Otherwise, None is
2872 if isinstance(member, str):
2873 tarinfo = self.getmember(member)
2877 if tarinfo.isreg() or tarinfo.ismultivol() or\
2878 tarinfo.type not in SUPPORTED_TYPES:
2879 # If a member's type is unknown, it is treated as a
# regular file, matching _extract_member's makeunknown behaviour.
2881 return self.fileobject(self, tarinfo)
2883 elif tarinfo.islnk() or tarinfo.issym():
2884 if isinstance(self.fileobj, _Stream):
2885 # A small but ugly workaround for the case that someone tries
2886 # to extract a (sym)link as a file-object from a non-seekable
2887 # stream of tar blocks.
2888 raise StreamError("cannot extract (sym)link as file object")
2890 # A (sym)link's file object is its target's file object.
2891 return self.extractfile(self._find_link_target(tarinfo))
2893 # If there's no data associated with the member (directory, chrdev,
2894 # blkdev, etc.), return None instead of a file object.
# NOTE(review): elided listing — the `else:` branches of the debug print
# and the dispatch, the leading `if tarinfo.isreg():`, and the
# `if set_attrs:` guard before the chown/chmod/utime calls are not shown.
2897 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2898 """Extract the TarInfo object tarinfo to a physical
2899 file called targetpath.
2901 # Fetch the TarInfo object for the given name
2902 # and build the destination pathname, replacing
2903 # forward slashes to platform specific separators.
2904 targetpath = targetpath.rstrip("/")
2905 targetpath = targetpath.replace("/", os.sep)
2907 # Create all upper directories.
2908 upperdirs = os.path.dirname(targetpath)
2909 if upperdirs and not os.path.exists(upperdirs):
2910 # Create directories that are not part of the archive with
2911 # default permissions.
2912 os.makedirs(upperdirs)
2914 if tarinfo.islnk() or tarinfo.issym():
2915 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2917 self._dbg(1, tarinfo.name)
# Dispatch on member type to the overridable make*() methods below.
2920 self.makefile(tarinfo, targetpath)
2921 elif tarinfo.isdir():
2922 self.makedir(tarinfo, targetpath)
2923 elif tarinfo.isfifo():
2924 self.makefifo(tarinfo, targetpath)
2925 elif tarinfo.ischr() or tarinfo.isblk():
2926 self.makedev(tarinfo, targetpath)
2927 elif tarinfo.islnk() or tarinfo.issym():
2928 self.makelink(tarinfo, targetpath)
2929 elif tarinfo.type not in SUPPORTED_TYPES:
2930 self.makeunknown(tarinfo, targetpath)
2932 self.makefile(tarinfo, targetpath)
2935 self.chown(tarinfo, targetpath)
# chmod/utime would follow the symlink target, so skip them for symlinks.
2936 if not tarinfo.issym():
2937 self.chmod(tarinfo, targetpath)
2938 self.utime(tarinfo, targetpath)
2940 #--------------------------------------------------------------------------
2941 # Below are the different file methods. They are called via
2942 # _extract_member() when extract() is called. They can be replaced in a
2943 # subclass to implement other functionality.
2945 def makedir(self, tarinfo, targetpath):
2946 """Make a directory called targetpath.
# NOTE(review): the `try:` opener and the except body (`pass` — an
# already-existing directory is fine) are elided from this listing.
2949 # Use a safe mode for the directory, the real mode is set
2950 # later in _extract_member().
2951 os.mkdir(targetpath, 0o0700)
2952 except FileExistsError:
# NOTE(review): elided listing — the `try:`/`else:` scaffolding, the
# sparse-member seek lines, the `except tarfile.HeaderError`-style handler
# opener before 2981, and the loop/close statements at the end are not shown.
2955 def makefile(self, tarinfo, targetpath):
2956 """Make a file called targetpath.
2958 source = self.fileobj
2959 source.seek(tarinfo.offset_data)
2962 target = bltn_open(targetpath, "wb")
# Sparse members: copy each data run, then seek to the logical size so
# the holes stay holes in the target file.
2964 if tarinfo.sparse is not None:
2966 for offset, size in tarinfo.sparse:
2968 copyfileobj(source, target, size)
2969 target.seek(tarinfo.size)
2978 copyfileobj(source, target, tarinfo.size)
2981 # only if we are extracting a multivolume this can be treated
2982 if not self.new_volume_handler:
2984 raise Exception("We need to read a new volume and you"
2985 " didn't supply a new_volume_handler")
2987 # the new volume handler should do everything needed to
2988 # start working in a new volume. usually, the handler calls
2989 # to self.open_volume
2990 self.volume_number += 1
2991 self.new_volume_handler(self, self.base_name, self.volume_number)
# Continue copying from the continuation member on the new volume.
2992 tarinfo = self.firstmember
2993 source = self.fileobj
2998 def makeunknown(self, tarinfo, targetpath):
2999 """Make a file from a TarInfo object with an unknown type
3002 self.makefile(tarinfo, targetpath)
3003 self._dbg(1, "tarfile: Unknown file type %r, " \
3004 "extracted as regular file." % tarinfo.type)
3006 def makefifo(self, tarinfo, targetpath):
3007 """Make a fifo called targetpath.
# os.mkfifo only exists on POSIX; elsewhere fifo members are an error.
3009 if hasattr(os, "mkfifo"):
3010 os.mkfifo(targetpath)
3012 raise ExtractError("fifo not supported by system")
3014 def makedev(self, tarinfo, targetpath):
3015 """Make a character or block device called targetpath.
3017 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3018 raise ExtractError("special devices not supported by system")
# NOTE(review): the `mode = tarinfo.mode` initialisation and the
# `if tarinfo.isblk():` / `else:` keywords are elided from this listing.
3022 mode |= stat.S_IFBLK
3024 mode |= stat.S_IFCHR
3026 os.mknod(targetpath, mode,
3027 os.makedev(tarinfo.devmajor, tarinfo.devminor))
# NOTE(review): elided listing — the nested `try:`/`else:`/`except KeyError:`
# scaffolding and the second argument of the _extract_member calls
# (the targetpath) are not shown.
3029 def makelink(self, tarinfo, targetpath):
3030 """Make a (symbolic) link called targetpath. If it cannot be created
3031 (platform limitation), we try to make a copy of the referenced file
3035 # For systems that support symbolic and hard links.
3037 os.symlink(tarinfo.linkname, targetpath)
# Hardlink: link to the already-extracted target if present, otherwise
# extract the linked member's content in place of the link.
3040 if os.path.exists(tarinfo._link_target):
3041 os.link(tarinfo._link_target, targetpath)
3043 self._extract_member(self._find_link_target(tarinfo),
# symlink_exception covers platforms/privilege levels where os.symlink
# is unavailable (see module header); fall back to copying the target.
3045 except symlink_exception:
3047 self._extract_member(self._find_link_target(tarinfo),
3050 raise ExtractError("unable to resolve link inside archive")
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo.

    Only attempted when running as root (os.geteuid() == 0); otherwise
    this is a silent no-op.  User/group names take precedence over the
    numeric ids; unknown names fall back to tarinfo.uid/gid.  Symlinks
    are chowned with os.lchown so the link itself is affected, not its
    target.

    Raises ExtractError (chained to the original OSError) on failure.
    """
    if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
        # We have to be root to do so.
        try:
            g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            g = tarinfo.gid
        try:
            u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            u = tarinfo.uid
        try:
            if tarinfo.issym() and hasattr(os, "lchown"):
                os.lchown(targetpath, u, g)
            else:
                os.chown(targetpath, u, g)
        except OSError as e:
            # Chain the original OSError so the root cause (EPERM,
            # ENOENT, ...) is not lost in the traceback.
            raise ExtractError("could not change owner") from e
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo.

    No-op on platforms without os.chmod.

    Raises ExtractError (chained to the original OSError) on failure.
    """
    if hasattr(os, 'chmod'):
        try:
            os.chmod(targetpath, tarinfo.mode)
        except OSError as e:
            # Chain the original OSError so the root cause is not lost.
            raise ExtractError("could not change mode") from e
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo.

    The access time is set to the same value as the modification time.
    No-op on platforms without os.utime.

    Raises ExtractError (chained to the original OSError) on failure.
    """
    if not hasattr(os, 'utime'):
        return
    try:
        os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
    except OSError as e:
        # Chain the original OSError so the root cause is not lost.
        raise ExtractError("could not change modification time") from e
3092 #--------------------------------------------------------------------------
# NOTE(review): elided listing — the `def next(self):` line, the outer
# `while True:`/`try:` loop scaffolding, the `continue`/`break`/`return`
# statements, and the final `return tarinfo` are not shown.
3094 """Return the next member of the archive as a TarInfo object, when
3095 TarFile is opened for reading. Return None if there is no more
# Serve the member pre-read by the constructor / open_volume first.
3099 if self.firstmember is not None:
3100 m = self.firstmember
3101 self.firstmember = None
3104 # Read the next block.
3105 self.fileobj.seek(self.offset)
3109 tarinfo = self.tarinfo.fromtarfile(self)
# With ignore_zeros, zero and invalid blocks are skipped one BLOCKSIZE
# at a time instead of terminating the scan.
3110 except EOFHeaderError as e:
3111 if self.ignore_zeros:
3112 self._dbg(2, "0x%X: %s" % (self.offset, e))
3113 self.offset += BLOCKSIZE
3115 except InvalidHeaderError as e:
3116 if self.ignore_zeros:
3117 self._dbg(2, "0x%X: %s" % (self.offset, e))
3118 self.offset += BLOCKSIZE
# An invalid first header means this is not a tar file at all.
3120 elif self.offset == 0:
3121 raise ReadError(str(e))
3122 except EmptyHeaderError:
3123 if self.offset == 0:
3124 raise ReadError("empty file")
3125 except TruncatedHeaderError as e:
3126 if self.offset == 0:
3127 raise ReadError(str(e))
3128 except SubsequentHeaderError as e:
3129 raise ReadError(str(e))
3132 if tarinfo is not None:
3133 if self.save_to_members:
3134 self.members.append(tarinfo)
3140 #--------------------------------------------------------------------------
3141 # Little helper methods:
# NOTE(review): elided listing — the `if normalize:` guards, the `else:`
# branch, the `return member`, and the trailing `return None` are not shown.
3143 def _getmember(self, name, tarinfo=None, normalize=False):
3144 """Find an archive member by name from bottom to top.
3145 If tarinfo is given, it is used as the starting point.
3147 # Ensure that all members have been loaded.
3148 members = self.getmembers()
3150 # Limit the member search list up to tarinfo.
3151 if tarinfo is not None:
3152 members = members[:members.index(tarinfo)]
# With normalize=True both sides are os.path.normpath'd so equivalent
# paths (trailing slashes, "./") still match.
3155 name = os.path.normpath(name)
# Reverse scan: the LAST occurrence of a name is the authoritative one.
3157 for member in reversed(members):
3159 member_name = os.path.normpath(member.name)
3161 member_name = member.name
3163 if name == member_name:
# NOTE(review): elided listing — the `def _load(self):` line, the
# `while True:` loop around next(), and the `self._loaded = True`
# epilogue are not shown.
3167 """Read through the entire archive file and look for readable
# next() appends each member to self.members as a side effect.
3171 tarinfo = self.next()
3176 def _check(self, mode=None):
3177 """Check if TarFile is still open, and if the operation's mode
3178 corresponds to TarFile's mode.
# NOTE(review): the `if self.closed:` guard preceding this raise is
# elided from this listing.
3181 raise OSError("%s is closed" % self.__class__.__name__)
# `mode` is a string of acceptable modes, e.g. "aw"; substring test.
3182 if mode is not None and self.mode not in mode:
3183 raise OSError("bad operation for mode %r" % self.mode)
# NOTE(review): elided listing — the `if tarinfo.issym():` / `else:`
# branch keywords, the `limit = ...` assignments, and the final
# `return member` are not shown.
3186 def _find_link_target(self, tarinfo):
3187 """Find the target member of a symlink or hardlink member in the
# Symlink case:
3190 # Always search the entire archive.
# Resolve the link name relative to the symlink's own directory.
3191 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
# Hardlink case:
3194 # Search the archive before the link, because a hard link is
3195 # just a reference to an already archived file.
3196 linkname = tarinfo.linkname
3199 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3201 raise KeyError("linkname %r not found" % linkname)
# NOTE(review): the `def __iter__(self):` line and the `if self._loaded:`
# guard choosing between the two returns are elided from this listing.
3205 """Provide an iterator object.
# Fully loaded: iterate the cached list; otherwise stream via TarIter.
3208 return iter(self.members)
3210 return TarIter(self)
3212 def _dbg(self, level, msg, *args):
3213 """Write debugging output to sys.stderr.
# Messages are str.format-style when *args are given; most callers
# pre-format with % and pass no args.
3215 if level <= self.debug:
3216 print(msg.format(*args), file=sys.stderr)
# Context-manager entry; the body (`self._check()` / `return self`) is
# elided from this listing.
3218 def __enter__(self):
3222 def __exit__(self, type, value, traceback):
# NOTE(review): the no-exception branch (`if type is None: self.close()`)
# and the trailing `self.closed = True` are elided from this listing.
3226 # An exception occurred. We must not call close() because
3227 # it would try to write end-of-archive blocks and padding.
3228 if not self._extfileobj:
3229 self.fileobj.close()
# NOTE(review): elided listing — the `class TarIter:` line, the
# `def __iter__`/`def __next__` headers, `self.index = 0`, the index
# increment, the StopIteration raise, and the `return tarinfo` tail
# (which runs past this excerpt) are not shown.
3236 for tarinfo in TarFile(...):
3240 def __init__(self, tarfile):
3241 """Construct a TarIter object.
3243 self.tarfile = tarfile
3246 """Return iterator object.
3250 """Return the next item using TarFile's next() method.
3251 When all members have been read, set TarFile as _loaded.
3253 # Fix for SF #1100429: Under rare circumstances it can
3254 # happen that getmembers() is called during iteration,
3255 # which will cause TarIter to stop prematurely.
# Three sources for the next member, in order: the pre-read firstmember,
# the already-cached members list, then reading ahead via next().
3257 if self.index == 0 and self.tarfile.firstmember is not None:
3258 tarinfo = self.tarfile.next()
3259 elif self.index < len(self.tarfile.members):
3260 tarinfo = self.tarfile.members[self.index]
3261 elif not self.tarfile._loaded:
3262 tarinfo = self.tarfile.next()
# next() returning None means the archive is exhausted.
3264 self.tarfile._loaded = True
3273 #--------------------
3274 # exported functions
3275 #--------------------
3276 def is_tarfile(name):
3277 """Return True if name points to a tar archive that we
3278 are able to handle, else return False.