2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
58 import traceback # XXX
67 # os.symlink on Windows prior to 6.0 raises NotImplementedError
68 symlink_exception = (AttributeError, NotImplementedError)
70 # OSError (winerror=1314) will be raised if the caller does not hold the
71 # SeCreateSymbolicLinkPrivilege privilege
72 symlink_exception += (OSError,)
76 # from tarfile import *
77 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79 from builtins import open as _open # Since 'open' is TarFile.open
81 #---------------------------------------------------------
83 #---------------------------------------------------------
84 NUL = b"\0" # the null character
85 BLOCKSIZE = 512 # length of processing blocks
86 RECORDSIZE = BLOCKSIZE * 20 # length of records
87 GNU_MAGIC = b"ustar \0" # magic gnu tar string
88 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
90 LENGTH_NAME = 100 # maximum length of a filename
91 LENGTH_LINK = 100 # maximum length of a linkname
92 LENGTH_PREFIX = 155 # maximum length of the prefix field
94 REGTYPE = b"0" # regular file
95 AREGTYPE = b"\0" # regular file
96 LNKTYPE = b"1" # link (inside tarfile)
97 SYMTYPE = b"2" # symbolic link
98 CHRTYPE = b"3" # character special device
99 BLKTYPE = b"4" # block special device
100 DIRTYPE = b"5" # directory
101 FIFOTYPE = b"6" # fifo special device
102 CONTTYPE = b"7" # contiguous file
104 GNUTYPE_LONGNAME = b"L" # GNU tar longname
105 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
106 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
107 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
110 XHDTYPE = b"x" # POSIX.1-2001 extended header
111 XGLTYPE = b"g" # POSIX.1-2001 global header
112 SOLARIS_XHDTYPE = b"X" # Solaris extended header
114 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
115 GNU_FORMAT = 1 # GNU tar format
116 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
117 DEFAULT_FORMAT = GNU_FORMAT
119 GZ_FMT_HEADER = b"<BBBBLBB"
120 GZ_HEADER_SIZE = 10 # not including the name
121 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
122 GZ_METHOD_DEFLATE = 0x08 # 0o10
123 GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
124 GZ_FLAG_FHCRC = 1 << 1 # CRC16
125 GZ_FLAG_FEXTRA = 1 << 2 # extra field
126 GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
127 GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
128 GZ_FLAG_RESERVED = 7 << 5 # unassigned
129 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
130 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
131 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
132 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
136 TOLERANCE_RECOVER = 1 # rely on offsets in index
137 TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
141 #---------------------------------------------------------
142 # archive handling mode
143 #---------------------------------------------------------
# Bitmask describing how an archive is handled; combinations are OR-ed.
ARCMODE_PLAIN = 0
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2

# NOTE(review): the source dump is truncated around these helpers; bodies
# reconstructed to match the visible lines — confirm against upstream deltatar.
def arcmode_fmt (m):
    """Render an archive-mode bitmask as a human-readable string."""
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    first = True
    ret = "["

    def chkappend (b, s):
        nonlocal first, ret
        if m & b:
            if first is True: first = False
            else: ret += " |"
            ret += " " + s

    chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
    chkappend (ARCMODE_COMPRESS, "COMPRESS")
    chkappend (ARCMODE_CONCAT, "CONCAT")
    return ret + " ]"


def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Compute the archive-mode bitmask implied by the given options."""
    ret = init
    if bool (concat) is True:
        ret |= ARCMODE_CONCAT
    if encryption is not None:
        ret |= ARCMODE_ENCRYPT
    if comptype == "gz":
        ret |= ARCMODE_COMPRESS
    return ret
179 #---------------------------------------------------------
181 #---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
215 #---------------------------------------------------------
217 #---------------------------------------------------------
219 if os.name in ("nt", "ce"):
222 ENCODING = sys.getfilesystemencoding()
224 #---------------------------------------------------------
225 # Some useful functions
226 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object of exactly
       *length* bytes (truncated or NUL-padded as needed).
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
       Everything from the first NUL byte onwards is discarded.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
       of exactly *length* bytes; str input is encoded first.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
251 """Convert a number field to a python number.
253 # There are two possible encodings for a number field, see
255 if s[0] in (0o200, 0o377):
257 for i in range(len(s) - 1):
261 n = -(256 ** (len(s) - 1) - n)
264 n = int(nts(s, "ascii", "strict") or "0", 8)
266 raise InvalidHeaderError("invalid header")
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # 148 bytes before the chksum field, 8 bytes chksum (counted as
    # spaces, i.e. 8 * 0x20 = 256), then the remaining 356 bytes.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
       Raises OSError if src runs out of data before *length* bytes.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    bufsize = 16 * 1024
    blocks, remainder = divmod(length, bufsize)
    for b in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
334 """Deprecated in this location; use stat.filemode."""
336 warnings.warn("deprecated in favor of stat.filemode",
337 DeprecationWarning, 2)
338 return stat.filemode(mode)
#---------------------------------------------------------
# exception hierarchy: TarError is the root; HeaderError
# groups the per-header failure modes used by frombuf().
#---------------------------------------------------------
class TarError(Exception):
    """Base exception."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

# crypto-layer errors (deltatar extension)
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""

class DecryptionError(TarError):
    """Exception for error during decryption."""

class EncryptionError(TarError):
    """Exception for error during encryption."""

class EndOfFile(Exception):
    """Signal end of file condition when they’re not an error."""
386 #---------------------------
387 # internal stream interface
388 #---------------------------
390 """Low-level file object. Supports reading and writing.
391 It is used instead of a regular file object for streaming
395 def __init__(self, name, mode):
398 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
400 if hasattr(os, "O_BINARY"):
401 _mode |= os.O_BINARY # pylint: disable=no-member
402 self.fd = os.open(name, _mode, 0o666)
408 def read(self, size):
409 ret = os.read(self.fd, size)
410 self.offset += len(ret)
413 def write(self, s, pos=None):
416 os.lseek (self.fd, pos, os.SEEK_SET)
417 n = os.write(self.fd, s)
419 self.offset += len(s)
421 append = pos + n - p0
423 self.offset += append
424 os.lseek (self.fd, p0, os.SEEK_SET)
429 def seek_set (self, pos):
430 os.lseek (self.fd, pos, os.SEEK_SET)
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952); appends the NUL-terminated
    FNAME field when *name* is given.  NOTE(review): reconstructed from a
    truncated dump — verify the suffix-stripping rules against upstream."""
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        # strip container suffixes so the stored name is the payload's
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return hdr + name
461 """Class that serves as an adapter between TarFile and
462 a stream-like object. The stream-like object only
463 needs to have a read() or write() method and is accessed
464 blockwise. Use of gzip or bzip2 compression is possible.
465 A stream-like object could be for example: sys.stdin,
466 sys.stdout, a socket, a tape device etc.
468 _Stream is intended to be used only internally but is
469 nevertherless used externally by Deltatar.
471 When encrypting, the ``enccounter`` will be used for
472 initializing the first cryptographic context. When
473 decrypting, its value will be compared to the decrypted
474 object. Decryption fails if the value does not match.
475 In effect, this means that a ``_Stream`` whose ctor was
476 passed ``enccounter`` can only be used to encrypt or
477 decrypt a single object.
480 remainder = -1 # track size in encrypted entries
481 tolerance = TOLERANCE_STRICT
483 def __init__(self, name, mode, comptype, fileobj, bufsize,
484 concat=False, encryption=None, enccounter=None,
485 compresslevel=9, tolerance=TOLERANCE_STRICT):
486 """Construct a _Stream object.
488 self.arcmode = arcmode_set (concat, encryption, comptype)
489 self.tolerance = tolerance
491 self._extfileobj = True
493 fileobj = _LowLevelFile(name, mode)
494 self._extfileobj = False
497 # Enable transparent compression detection for the
499 fileobj = _StreamProxy(fileobj)
500 comptype = fileobj.getcomptype()
504 self.enccounter = None
505 if self.arcmode & ARCMODE_ENCRYPT:
506 self.enccounter = enccounter
508 self.name = name or ""
510 self.comptype = comptype
512 self.fileobj = fileobj
513 self.bufsize = bufsize
519 self.last_block_offset = 0
520 self.dbuf = b"" # ???
521 self.exception = None # communicate decompression failure
522 self.compresslevel = compresslevel
523 self.bytes_written = 0
525 self.encryption = encryption
533 raise CompressionError("zlib module is not available")
536 self.exception = zlib.error
539 if not (self.arcmode & ARCMODE_CONCAT):
540 if self.arcmode & ARCMODE_ENCRYPT:
541 self._init_write_encrypt (name)
542 self._init_write_gz ()
543 self.crc = zlib.crc32(b"") & 0xFFFFffff
545 elif comptype == "bz2":
546 if self.arcmode & ARCMODE_ENCRYPT:
547 raise InvalidEncryptionError("encryption not available for "
548 "compression “%s”" % comptype)
552 raise CompressionError("bz2 module is not available")
555 self.cmp = bz2.BZ2Decompressor()
556 self.exception = OSError
558 self.cmp = bz2.BZ2Compressor()
560 elif comptype == 'xz':
561 if self.arcmode & ARCMODE_ENCRYPT:
562 raise InvalidEncryptionError("encryption not available for "
563 "compression “%s”" % comptype)
567 raise CompressionError("lzma module is not available")
570 self.cmp = lzma.LZMADecompressor()
571 self.exception = lzma.LZMAError
573 self.cmp = lzma.LZMACompressor()
575 elif comptype == "tar":
576 if not (self.arcmode & ARCMODE_CONCAT) \
578 and self.arcmode & ARCMODE_ENCRYPT:
579 self._init_write_encrypt (name)
582 if self.arcmode & ARCMODE_ENCRYPT:
583 raise InvalidEncryptionError("encryption not available for "
584 "compression “%s”" % comptype)
585 raise CompressionError("unknown compression type %r" % comptype)
588 if not self._extfileobj:
594 if hasattr(self, "closed") and not self.closed:
597 except crypto.InternalError:
598 # context already finalized due to abort but close() tried
    def next (self, name):
        """Start the next archive object *name* within the same stream and
        return the offset of its first block.  Finalizes any open gzip
        member and crypto object first."""
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # a compressor exists → close the current gzip member
                self._finalize_write_gz ()
        # NOTE(review): the source dump appears to omit a line here —
        # confirm against upstream before relying on exact behavior.
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset
    def next_volume (self, name):
        """Reset per-object compression/encryption state when the archive
        continues on a new volume."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
632 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
634 Save position for delayed write of header; fill the header location
637 # first thing, proclaim new object to the encryption context
638 # secondly, assemble the header with the updated parameters
639 # and commit it directly to the underlying stream, bypassing the
640 # encryption layer in .__write().
641 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
643 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
644 self.lasthdr = self.fileobj.tell()
645 self.__write_to_file(dummyhdr)
646 if set_last_block_offset is True:
647 self.last_block_offset = self.lasthdr
    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            # re-read the placeholder header written by _init_write_encrypt
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            # patch the real header in place, then append trailing data
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data
            # NOTE(review): source dump appears truncated after this point
            # (lasthdr reset is not visible) — verify against upstream.
672 def _finalize_write_gz (self):
673 if self.cmp is not None:
674 chunk = self.buf + self.cmp.flush()
676 if self.comptype == "gz":
677 # The native zlib crc is an unsigned 32-bit integer, but
678 # the Python wrapper implicitly casts that to a signed C
679 # long. So, on a 32-bit box self.crc may "look negative",
680 # while the same crc on a 64-bit box may "look positive".
681 # To avoid irksome warnings from the `struct` module, force
682 # it to look positive on all boxes.
683 chunk += struct.pack("<L", self.crc & 0xffffffff)
684 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
685 self.__enc_write (chunk)
689 def _init_write_gz (self, set_last_block_offset=False):
691 Add a new gzip block, closing last one
694 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
695 first = self.cmp is None
696 self.cmp = self.zlib.compressobj(self.compresslevel,
698 -self.zlib.MAX_WBITS,
699 self.zlib.DEF_MEM_LEVEL,
702 # if aes, we encrypt after compression
703 if set_last_block_offset is True:
704 self.last_block_offset = self.fileobj.tell()
706 self.__write(gz_header (self.name if first is True else None))
710 """Write string s to the stream.
712 if self.comptype == "gz":
713 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
715 self.concat_pos += len(s)
716 if self.cmp is not None:
717 s = self.cmp.compress(s)
721 """Write what’s left in the buffer to the stream."""
722 self.__write (b"") # → len (buf) <= bufsiz
723 self.__enc_write (self.buf)
726 def __write(self, s):
727 """Writes (and encodes) string s to the stream blockwise
729 will wait with encoding/writing until block is complete
732 while len(self.buf) > self.bufsize:
733 self.__enc_write(self.buf[:self.bufsize])
734 self.buf = self.buf[self.bufsize:]
737 def __write_to_file(self, s, pos=None):
739 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
740 given, the stream will seek to that position first and back afterwards,
741 and the total of bytes written is not updated.
743 self.fileobj.write(s, pos)
745 self.bytes_written += len(s)
748 def __enc_write(self, s):
750 If encryption is active, the string s is encrypted before being written
755 if self.arcmode & ARCMODE_ENCRYPT:
758 n, ct = self.encryption.process(buf)
759 self.__write_to_file(ct)
762 # The entire plaintext was not consumed: The size limit
763 # for encrypted objects was reached. Transparently create
764 # a new encrypted object and continue processing the input.
765 self._finalize_write_encrypt ()
766 self._init_write_encrypt ()
768 self.__write_to_file(s)
771 def estim_file_size(self):
772 """ estimates size of file if closing it now
774 The result may differ greatly from the amount of data sent to write()
775 due to compression, encryption and buffering.
777 In tests the result (before calling close()) was up to 12k smaller than
778 the final file size if compression is being used because zlib/bz2
779 compressors do not allow inspection of their buffered data :-(
781 Still, we add what close() would add: 8 bytes for gz checksum, one
782 encryption block size if encryption is used and the size of our own
786 return self.bytes_written
788 result = self.bytes_written
790 result += len(self.buf)
791 if self.comptype == 'gz':
792 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
795 def close(self, close_fileobj=True):
796 """Close the _Stream object. No operation should be
797 done on it afterwards.
803 if close_fileobj is True:
806 if self.arcmode & ARCMODE_COMPRESS:
807 self._finalize_write_gz ()
808 # end of Tar archive marker (two empty blocks) was written
809 # finalize encryption last; no writes may be performed after
812 if self.arcmode & ARCMODE_ENCRYPT:
813 self._finalize_write_encrypt ()
815 if not self._extfileobj:
818 # read the zlib crc and length and check them
819 if self.mode == "r" and self.comptype == "gz":
820 read_crc = self.__read(4)
821 read_length = self.__read(4)
822 calculated_crc = self.crc
823 if struct.unpack("<L", read_crc)[0] != calculated_crc:
824 raise CompressionError("bad gzip crc")
828 def _init_read_gz(self):
829 """Initialize for reading a gzip compressed fileobj.
831 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
833 read2 = self.__read(2)
835 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
836 "%d" % self.fileobj.tell())
837 # taken from gzip.GzipFile with some alterations
838 if read2 != GZ_MAGIC_BYTES:
839 raise ReadError("not a gzip file")
841 read1 = ord (self.__read(1))
842 if read1 != GZ_METHOD_DEFLATE:
843 raise CompressionError("unsupported compression method")
845 self.flags = flag = ord(self.__read(1))
846 self.__read(6) # discard timestamp[4], deflate flags, os code
848 if flag & GZ_FLAG_FEXTRA:
849 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
851 if flag & GZ_FLAG_FNAME:
854 if not s or s == NUL:
856 if flag & GZ_FLAG_FCOMMENT:
859 if not s or s == NUL:
861 if flag & GZ_FLAG_FHCRC:
864 def _init_read_encrypt (self):
865 """Initialize encryption for next entry in archive. Read a header and
866 notify the crypto context."""
867 if self.arcmode & ARCMODE_ENCRYPT:
868 lasthdr = self.fileobj.tell ()
870 hdr = crypto.hdr_read_stream (self.fileobj)
871 except crypto.EndOfFile:
873 except crypto.InvalidHeader as exn:
874 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
875 "processing %r at pos %d"
876 % (exn, self.fileobj, lasthdr)) \
878 if self.enccounter is not None:
879 # enforce that the iv counter in the header matches an
880 # explicitly requested one
881 iv = crypto.hdr_iv_counter (hdr)
882 if iv != self.enccounter:
883 raise DecryptionError ("expected IV counter %d, got %d"
884 % (self.enccounter, iv))
885 self.lasthdr = lasthdr
886 self.remainder = hdr ["ctsize"] # distance to next header
888 self.encryption.next (hdr)
889 except crypto.InvalidParameter as exn:
890 raise DecryptionError ("Crypto.next(): error “%s” "
891 "processing %r at pos %d"
892 % (exn, self.fileobj, lasthdr)) \
898 def _read_encrypt (self, buf):
900 Demote a program error to a decryption error in tolerant mode. This
901 allows recovery from corrupted headers and invalid data.
904 return self.encryption.process (buf)
905 except RuntimeError as exn:
906 if self.tolerance != TOLERANCE_STRICT:
907 raise DecryptionError (exn)
911 def _finalize_read_encrypt (self):
915 if self.arcmode & ARCMODE_ENCRYPT \
916 and self.lasthdr is not None :
917 assert self.remainder >= 0
918 if self.remainder > 0:
921 data = self.encryption.done ()
922 except crypto.InvalidGCMTag as exn:
923 raise DecryptionError ("decryption failed: %s" % exn)
928 """Return the stream's file pointer position.
932 def seek(self, pos=0):
933 """Set the stream's file pointer to pos. Negative seeking
936 if pos - self.pos >= 0:
937 blocks, remainder = divmod(pos - self.pos, self.bufsize)
938 for i in range(blocks):
939 self.read(self.bufsize)
942 raise StreamError("seeking backwards is not allowed")
945 def read(self, size=None):
946 """Return the next size number of bytes from the stream.
947 If size is not defined, return all bytes of the stream
953 buf = self._read(self.bufsize)
959 buf = self._read(size)
964 """Reads just one line, new line character included
966 # if \n in dbuf, no read neads to be done
967 if b'\n' in self.dbuf:
968 pos = self.dbuf.index(b'\n') + 1
969 ret = self.dbuf[:pos]
970 self.dbuf = self.dbuf[pos:]
975 chunk = self._read(self.bufsize)
977 # nothing more to read, so return the buffer
983 # if \n found, return the new line
986 pos = dbuf.index(b'\n') + 1
987 self.dbuf = dbuf[pos:] + self.dbuf
990 def _read(self, size):
991 """Return size bytes from the stream.
997 buf = self.__read(self.bufsize)
1001 if self.cmp is not None:
1003 buf = self.cmp.decompress(buf)
1004 except self.exception as exn:
1005 raise ReadError("invalid compressed data (%r)" % exn)
1006 except Exception as e:
1007 # happens at the end of the file
1008 # _init_read_gz failed in the previous iteration so
1009 # self.cmp.decompress fails here
1010 if self.arcmode & ARCMODE_CONCAT:
1013 raise ReadError("invalid compressed data")
1014 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1015 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1016 if self.arcmode & ARCMODE_CONCAT \
1017 and len(self.cmp.unused_data) != 0:
1018 self.buf = self.cmp.unused_data + self.buf
1019 self.close(close_fileobj=False)
1021 self._init_read_gz()
1022 except DecryptionError:
1023 if self.tolerance != TOLERANCE_STRICT:
1024 # return whatever data was processed successfully
1030 except ReadError: # gzip troubles
1031 if self.tolerance == TOLERANCE_RESCUE:
1038 # happens at the end of the file
1040 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1045 self.dbuf = t[size:]
1049 def __read(self, size):
1051 Return size bytes from stream. If internal buffer is empty, read
1052 another block from the stream.
1054 The function returns up to size bytes of data. When an error occurs
1055 during decryption, everything until the end of the last successfully
1056 finalized object is returned.
1059 t = [self.buf] if c > 0 else []
1060 good_crypto = len (t)
1065 if self.arcmode & ARCMODE_ENCRYPT:
1066 if self.remainder <= 0:
1067 # prepare next object
1068 if self._init_read_encrypt () is False: # EOF
1072 # only read up to the end of the encrypted object
1073 todo = min (size, self.remainder)
1074 buf = self.fileobj.read(todo)
1075 if self.arcmode & ARCMODE_ENCRYPT:
1077 buf = self._read_encrypt (buf)
1078 if todo == self.remainder:
1079 # at the end of a crypto object; finalization will fail if
1080 # the GCM tag does not match
1081 trailing = self._finalize_read_encrypt ()
1082 good_crypto = len (t) + 1
1083 if len (trailing) > 0:
1087 self.remainder -= todo
1088 except DecryptionError:
1089 if self.tolerance == TOLERANCE_STRICT:
1091 self.encryption.drop ()
1092 if self.tolerance == TOLERANCE_RECOVER:
1093 if good_crypto == 0:
1095 # this may occur at any of the three crypto operations above.
1096 # some objects did validate; discard all data after it; next
1097 # call will start with the bad object and error out immediately
1098 self.buf = b"".join (t [good_crypto:])
1099 return b"".join (t [:good_crypto])
1100 elif self.tolerance == TOLERANCE_RESCUE:
1101 # keep what we have so far despite the finalization issue
1106 raise RuntimeError("internal error: bad tolerance level")
1108 if not buf: ## XXX stream terminated prematurely; this should be an error
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
       Buffers the first block so it can be sniffed and then replayed.
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size): # pylint: disable=method-hidden
        # First call returns the sniffed block, then delegates directly.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from magic bytes in the first block."""
        if self.buf.startswith(GZ_MAGIC_DEFLATE):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
1146 #------------------------
1147 # Extraction file object
1148 #------------------------
1149 class _FileInFile(object):
1150 """A thin wrapper around an existing file object that
1151 provides a part of its data as an individual file
1155 def __init__(self, fileobj, offset, size, blockinfo=None):
1156 self.fileobj = fileobj
1157 self.offset = offset
1160 self.name = getattr(fileobj, "name", None)
1163 if blockinfo is None:
1164 blockinfo = [(0, size)]
1166 # Construct a map with data and zero blocks.
1170 realpos = self.offset
1171 for offset, size in blockinfo:
1172 if offset > lastpos:
1173 self.map.append((False, lastpos, offset, None))
1174 self.map.append((True, offset, offset + size, realpos))
1176 lastpos = offset + size
1177 if lastpos < self.size:
1178 self.map.append((False, lastpos, self.size, None))
1190 return self.fileobj.seekable()
1193 """Return the current file position.
1195 return self.position
1197 def seek(self, position, whence=io.SEEK_SET):
1198 """Seek to a position in the file.
1200 if whence == io.SEEK_SET:
1201 self.position = min(max(position, 0), self.size)
1202 elif whence == io.SEEK_CUR:
1204 self.position = max(self.position + position, 0)
1206 self.position = min(self.position + position, self.size)
1207 elif whence == io.SEEK_END:
1208 self.position = max(min(self.size + position, self.size), 0)
1210 raise ValueError("Invalid argument")
1211 return self.position
1213 def read(self, size=None):
1214 """Read data from the file.
1217 size = self.size - self.position
1219 size = min(size, self.size - self.position)
1224 data, start, stop, offset = self.map[self.map_index]
1225 if start <= self.position < stop:
1229 if self.map_index == len(self.map):
1231 length = min(size, stop - self.position)
1233 self.fileobj.seek(offset + (self.position - start))
1234 buf += self.fileobj.read(length)
1238 self.position += length
1241 def readinto(self, b):
1242 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    # Buffered reader over a member's byte range: wraps the member via
    # _FileInFile (offset_data/size/sparse taken from the TarInfo).

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # __slots__ keeps per-member memory small; archives can hold many members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    self.name = name        # member name
    self.mode = 0o644       # file permissions
    self.uid = 0            # user id
    self.gid = 0            # group id
    self.size = 0           # file size
    self.mtime = 0          # modification time
    self.chksum = 0         # header checksum
    self.type = REGTYPE     # member type
    self.linkname = ""      # link name
    self.uname = ""         # user name
    self.gname = ""         # group name
    self.devmajor = 0       # device major number
    self.devminor = 0       # device minor number

    self.offset = 0         # the tar header starts here
    self.offset_data = 0    # the file's data starts here
    self.volume_offset = 0  # the file's data corresponds with the data
                            # starting at this position

    self.sparse = None      # sparse member information
    self.pax_headers = {}   # pax header information

# In pax headers the "name" and "linkname" field are called
# "path" and "linkpath".
def _setpath(self, name):
    """Setter for the pax-style 'path' alias of the 'name' field.

    The setter body was missing from the source; restored to the
    standard one-line assignment mirroring _setlinkpath below.
    """
    self.name = name
path = property(_getpath, _setpath)
def _getlinkpath(self):
    """Getter for the pax-style 'linkpath' alias of 'linkname'."""
    return self.linkname

def _setlinkpath(self, linkname):
    """Setter for the pax-style 'linkpath' alias of 'linkname'."""
    self.linkname = linkname

# Expose linkname under the name pax headers use for it.
linkpath = property(_getlinkpath, _setlinkpath)
def __repr__(self):
    """Return a debug representation: class name, member name, identity."""
    return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
def get_info(self, encoding=None, errors=None):
    """Return the TarInfo's attributes as a dictionary.

    NOTE(review): encoding/errors are accepted for call-site
    compatibility (tobuf passes them) but are not used here — confirm.
    """
    info = {
        "name":     self.name,
        "mode":     self.mode & 0o7777,   # strip file-type bits, keep perms
        "uid":      self.uid,
        "gid":      self.gid,
        "size":     self.size,
        "mtime":    self.mtime,
        "chksum":   self.chksum,
        "type":     self.type,
        "linkname": self.linkname,
        "uname":    self.uname,
        "gname":    self.gname,
        "devmajor": self.devmajor,
        "devminor": self.devminor,
        "offset_data": self.offset_data,
        "volume_offset": self.volume_offset
    }

    # Directories are stored with a trailing slash per tar convention.
    if info["type"] == DIRTYPE and not info["name"].endswith("/"):
        info["name"] += "/"

    return info
def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
          errors="surrogateescape"):
    """Return a tar header as a string of 512 byte blocks.

    Dispatches to the creator method matching the requested format.
    Raises ValueError for an unknown format constant.
    """
    info = self.get_info(encoding, errors)

    if format == USTAR_FORMAT:
        return self.create_ustar_header(info, encoding, errors)
    elif format == GNU_FORMAT:
        return self.create_gnu_header(info, encoding, errors)
    elif format == PAX_FORMAT:
        return self.create_pax_header(info, encoding, errors)
    else:
        raise ValueError("invalid format")
def create_ustar_header(self, info, encoding, errors):
    """Return the object as a ustar header block.

    ustar has hard field limits: linkname must fit in 100 chars,
    and an over-long name may be split across the prefix field.
    """
    info["magic"] = POSIX_MAGIC

    if len(info["linkname"]) > LENGTH_LINK:
        raise ValueError("linkname is too long")

    if len(info["name"]) > LENGTH_NAME:
        info["prefix"], info["name"] = self._posix_split_name(info["name"])

    return self._create_header(info, USTAR_FORMAT, encoding, errors)
def create_gnu_header(self, info, encoding, errors):
    """Return the object as a GNU header block sequence.

    For multivolume members the GNU header's prefix area is reused to
    carry atime/ctime/volume offset, and the stored size is reduced to
    the portion contained in this volume.
    """
    info["magic"] = GNU_MAGIC

    if self.ismultivol():
        prefix = [
            itn(info.get("atime", 0), 12, GNU_FORMAT),
            itn(info.get("ctime", 0), 12, GNU_FORMAT),
            itn(self.volume_offset, 12, GNU_FORMAT),
            itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
        ]
        info['prefix'] = b"".join(prefix)
        # Only the part of the file stored in this volume counts here.
        info['size'] = info['size'] - self.volume_offset

    buf = b""
    if len(info["linkname"]) > LENGTH_LINK:
        # Over-long link targets get a preceding GNU long-link record.
        buf += self._create_gnu_long_header(info["linkname"],
            GNUTYPE_LONGLINK, encoding, errors)

    if len(info["name"]) > LENGTH_NAME:
        # Over-long names get a preceding GNU long-name record.
        buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
                                            encoding, errors)

    return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
def create_pax_header(self, info, encoding, errors):
    """Return the object as a ustar header block. If it cannot be
       represented this way, prepend a pax extended header sequence
       with supplement information.
    """
    info["magic"] = POSIX_MAGIC
    pax_headers = self.pax_headers.copy()
    if self.ismultivol():
        # Only the part of the file stored in this volume counts here.
        info['size'] = info['size'] - self.volume_offset

    # Test string fields for values that exceed the field length or cannot
    # be represented in ASCII encoding.
    for name, hname, length in (
            ("name", "path", LENGTH_NAME),
            ("linkname", "linkpath", LENGTH_LINK),
            ("uname", "uname", 32),
            ("gname", "gname", 32)):

        if hname in pax_headers:
            # The pax header has priority.
            continue

        # Try to encode the string as ASCII.
        try:
            info[name].encode("ascii", "strict")
        except UnicodeEncodeError:
            pax_headers[hname] = info[name]
            continue

        if len(info[name]) > length:
            pax_headers[hname] = info[name]

    # Test number fields for values that exceed the field limit or values
    # that like to be stored as float.
    for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
        if name in pax_headers:
            # The pax header has priority. Avoid overflow.
            info[name] = 0
            continue

        val = info[name]
        # Octal fields hold digits-1 characters plus a terminator.
        if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
            pax_headers[name] = str(val)
            info[name] = 0

    # Create a pax extended header if necessary.
    if pax_headers:
        buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
    else:
        buf = b""

    # The ustar fallback header is ASCII with lossy replacement; the real
    # values live in the pax records above.
    return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
@classmethod
def create_pax_global_header(cls, pax_headers):
    """Return the object as a pax global header block sequence.

    Global headers (XGLTYPE) apply to all following members.
    """
    return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
def _posix_split_name(self, name):
    """Split a name longer than 100 chars into a prefix
       and a name part.

    The split must fall on a "/" so both halves remain valid paths;
    raises ValueError if no such split fits the ustar field limits.
    """
    prefix = name[:LENGTH_PREFIX + 1]
    # Back up to the last slash within the prefix field.
    while prefix and prefix[-1] != "/":
        prefix = prefix[:-1]

    name = name[len(prefix):]
    prefix = prefix[:-1]   # drop the trailing slash itself

    if not prefix or len(name) > LENGTH_NAME:
        raise ValueError("name is too long")
    return prefix, name
@staticmethod
def _create_header(info, format, encoding, errors):
    """Return a header block. info is a dictionary with file
       information, format must be one of the *_FORMAT constants.
    """
    parts = [
        stn(info.get("name", ""), 100, encoding, errors),
        itn(info.get("mode", 0) & 0o7777, 8, format),
        itn(info.get("uid", 0), 8, format),
        itn(info.get("gid", 0), 8, format),
        itn(info.get("size", 0), 12, format),
        itn(info.get("mtime", 0), 12, format),
        b"        ", # checksum field, filled in below
        info.get("type", REGTYPE),
        stn(info.get("linkname", ""), 100, encoding, errors),
        info.get("magic", POSIX_MAGIC),
        stn(info.get("uname", ""), 32, encoding, errors),
        stn(info.get("gname", ""), 32, encoding, errors),
        itn(info.get("devmajor", 0), 8, format),
        itn(info.get("devminor", 0), 8, format),
        sbtn(info.get("prefix", ""), 155, encoding, errors)
    ]

    buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
    chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
    # Splice the checksum into its field: 512-364 = offset 148,
    # 512-357 = offset 155 (6 octal digits + NUL).
    buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
    return buf
@staticmethod
def _create_payload(payload):
    """Return the string payload filled with zero bytes
       up to the next 512 byte border.
    """
    blocks, remainder = divmod(len(payload), BLOCKSIZE)
    if remainder > 0:
        payload += (BLOCKSIZE - remainder) * NUL
    return payload
@classmethod
def _create_gnu_long_header(cls, name, type, encoding, errors):
    """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
       for name.
    """
    # The long string is carried as the data payload of a pseudo member.
    name = name.encode(encoding, errors) + NUL

    info = {}
    info["name"] = "././@LongLink"   # conventional pseudo-member name
    info["type"] = type
    info["size"] = len(name)
    info["magic"] = GNU_MAGIC

    # create extended header + name blocks.
    return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
            cls._create_payload(name)
@classmethod
def _create_pax_generic_header(cls, pax_headers, type, encoding):
    """Return a POSIX.1-2008 extended or global header sequence
       that contains a list of keyword, value pairs. The values
       must be strings.
    """
    # Check if one of the fields contains surrogate characters and thereby
    # forces hdrcharset=BINARY, see _proc_pax() for more information.
    binary = False
    for keyword, value in pax_headers.items():
        try:
            value.encode("utf-8", "strict")
        except UnicodeEncodeError:
            binary = True
            break

    records = b""
    if binary:
        # Put the hdrcharset field at the beginning of the header.
        records += b"21 hdrcharset=BINARY\n"

    for keyword, value in pax_headers.items():
        keyword = keyword.encode("utf-8")
        if binary:
            # Try to restore the original byte representation of `value'.
            # Needless to say, that the encoding must match the string.
            value = value.encode(encoding, "surrogateescape")
        else:
            value = value.encode("utf-8")

        # Each record is "%d %s=%s\n" where the leading number is the
        # total record length INCLUDING itself — hence the fixpoint loop.
        l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
        n = p = 0
        while True:
            n = l + len(str(p))
            if n == p:
                break
            p = n
        records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

    # We use a hardcoded "././@PaxHeader" name like star does
    # instead of the one that POSIX recommends.
    info = {}
    info["name"] = "././@PaxHeader"
    info["type"] = type
    info["size"] = len(records)
    info["magic"] = POSIX_MAGIC

    # Create pax header + record blocks.
    return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
            cls._create_payload(records)
@classmethod
def frombuf(cls, buf, encoding, errors):
    """Construct a TarInfo object from a 512 byte bytes object.

    Raises EmptyHeaderError / TruncatedHeaderError / EOFHeaderError /
    InvalidHeaderError so callers can distinguish normal archive end
    from corruption.
    """
    if len(buf) == 0:
        raise EmptyHeaderError("empty header")
    if len(buf) != BLOCKSIZE:
        raise TruncatedHeaderError("truncated header")
    if buf.count(NUL) == BLOCKSIZE:
        raise EOFHeaderError("end of file header")

    chksum = nti(buf[148:156])
    if chksum not in calc_chksums(buf):
        raise InvalidHeaderError("bad checksum")

    obj = cls()
    obj.name = nts(buf[0:100], encoding, errors)
    obj.mode = nti(buf[100:108])
    obj.uid = nti(buf[108:116])
    obj.gid = nti(buf[116:124])
    obj.size = nti(buf[124:136])
    obj.mtime = nti(buf[136:148])
    obj.chksum = chksum
    obj.type = buf[156:157]
    obj.linkname = nts(buf[157:257], encoding, errors)
    obj.uname = nts(buf[265:297], encoding, errors)
    obj.gname = nts(buf[297:329], encoding, errors)
    obj.devmajor = nti(buf[329:337])
    obj.devminor = nti(buf[337:345])
    prefix = nts(buf[345:500], encoding, errors)

    # The old GNU sparse format occupies some of the unused
    # space in the buffer for up to 4 sparse structures.
    # Save the them for later processing in _proc_sparse().
    if obj.type == GNUTYPE_SPARSE:
        pos = 386
        structs = []
        for i in range(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[482])
        origsize = nti(buf[483:495])
        obj._sparse_structs = (structs, isextended, origsize)

    # Old V7 tar format represents a directory as a regular
    # file with a trailing slash.
    if obj.type == AREGTYPE and obj.name.endswith("/"):
        obj.type = DIRTYPE

    # Remove redundant slashes from directories.
    if obj.isdir():
        obj.name = obj.name.rstrip("/")

    # Reconstruct a ustar longname.
    if prefix and obj.type not in GNU_TYPES:
        obj.name = prefix + "/" + obj.name
    else:
        # NOTE(review): multivolume fork — GNU headers keep the member's
        # data offset in the prefix area (see create_gnu_header); confirm
        # this branch against the original file.
        obj.offset_data = nti(buf[369:381])
    return obj
@classmethod
def fromtarfile(cls, tarfile):
    """Return the next TarInfo object from TarFile object
       tarfile.
    """
    buf = tarfile.fileobj.read(BLOCKSIZE)
    obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
    # Header starts one block before the current read position.
    obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
    return obj._proc_member(tarfile)
1654 #--------------------------------------------------------------------------
1655 # The following are methods that are called depending on the type of a
1656 # member. The entry point is _proc_member() which can be overridden in a
1657 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1658 # implement the following
1660 # 1. Set self.offset_data to the position where the data blocks begin,
1661 # if there is data that follows.
1662 # 2. Set tarfile.offset to the position where the next member's header will
1664 # 3. Return self or another valid TarInfo object.
def _proc_member(self, tarfile):
    """Choose the right processing method depending on
       the type and call it.
    """
    if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
        return self._proc_gnulong(tarfile)
    elif self.type == GNUTYPE_SPARSE:
        return self._proc_sparse(tarfile)
    elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
        return self._proc_pax(tarfile)
    else:
        return self._proc_builtin(tarfile)
def _proc_builtin(self, tarfile):
    """Process a builtin type or an unknown type which
       will be treated as a regular file.
    """
    self.offset_data = tarfile.fileobj.tell()
    offset = self.offset_data
    if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
        # Skip the following data blocks.
        offset += self._block(self.size)
    tarfile.offset = offset   # where the next header starts

    # Patch the TarInfo object with saved global
    # header information.
    self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

    return self
def _proc_gnulong(self, tarfile):
    """Process the blocks that hold a GNU longname
       or longlink member.
    """
    buf = tarfile.fileobj.read(self._block(self.size))

    # Fetch the next header and process it.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Patch the TarInfo object from the next header with
    # the longname information.
    next.offset = self.offset
    if self.type == GNUTYPE_LONGNAME:
        next.name = nts(buf, tarfile.encoding, tarfile.errors)
    elif self.type == GNUTYPE_LONGLINK:
        next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

    return next
def _proc_sparse(self, tarfile):
    """Process a GNU sparse header plus extra headers."""
    # We already collected some sparse structures in frombuf().
    structs, isextended, origsize = self._sparse_structs
    del self._sparse_structs

    # Collect sparse structures from extended header blocks.
    while isextended:
        buf = tarfile.fileobj.read(BLOCKSIZE)
        pos = 0
        # Each extension block holds up to 21 (offset, numbytes) pairs.
        for i in range(21):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            if offset and numbytes:
                structs.append((offset, numbytes))
            pos += 24
        isextended = bool(buf[504])   # flag: more extension blocks follow
    self.sparse = structs

    self.offset_data = tarfile.fileobj.tell()
    tarfile.offset = self.offset_data + self._block(self.size)
    self.size = origsize   # logical (expanded) size, not on-disk size
    return self
def _proc_pax(self, tarfile):
    """Process an extended or global header as described in
       POSIX.1-2008.
    """
    # Read the header information.
    buf = tarfile.fileobj.read(self._block(self.size))

    # A pax header stores supplemental information for either
    # the following file (extended) or all following files
    # (global).
    if self.type == XGLTYPE:
        pax_headers = tarfile.pax_headers
    else:
        pax_headers = tarfile.pax_headers.copy()

    # Check if the pax header contains a hdrcharset field. This tells us
    # the encoding of the path, linkpath, uname and gname fields. Normally,
    # these fields are UTF-8 encoded but since POSIX.1-2008 tar
    # implementations are allowed to store them as raw binary strings if
    # the translation to UTF-8 fails.
    match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
    if match is not None:
        pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

    # For the time being, we don't care about anything other than "BINARY".
    # The only other value that is currently allowed by the standard is
    # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
    hdrcharset = pax_headers.get("hdrcharset")
    if hdrcharset == "BINARY":
        encoding = tarfile.encoding
    else:
        encoding = "utf-8"

    # Parse pax header information. A record looks like that:
    # "%d %s=%s\n" % (length, keyword, value). length is the size
    # of the complete record including the length field itself and
    # the newline. keyword and value are both UTF-8 encoded strings.
    regex = re.compile(br"(\d+) ([^=]+)=")
    pos = 0
    while True:
        match = regex.match(buf, pos)
        if not match:
            break

        length, keyword = match.groups()
        length = int(length)
        value = buf[match.end(2) + 1:match.start(1) + length - 1]

        # Normally, we could just use "utf-8" as the encoding and "strict"
        # as the error handler, but we better not take the risk. For
        # example, GNU tar <= 1.23 is known to store filenames it cannot
        # translate to UTF-8 as raw strings (unfortunately without a
        # hdrcharset=BINARY header).
        # We first try the strict standard encoding, and if that fails we
        # fall back on the user's encoding and error handler.
        keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                tarfile.errors)
        if keyword in PAX_NAME_FIELDS:
            value = self._decode_pax_field(value, encoding, tarfile.encoding,
                    tarfile.errors)
        else:
            value = self._decode_pax_field(value, "utf-8", "utf-8",
                    tarfile.errors)

        pax_headers[keyword] = value
        pos += length

    # Fetch the next header.
    try:
        next = self.fromtarfile(tarfile)
    except HeaderError:
        raise SubsequentHeaderError("missing or bad subsequent header")

    # Process GNU sparse information.
    if "GNU.sparse.map" in pax_headers:
        # GNU extended sparse format version 0.1.
        self._proc_gnusparse_01(next, pax_headers)

    elif "GNU.sparse.size" in pax_headers:
        # GNU extended sparse format version 0.0.
        self._proc_gnusparse_00(next, pax_headers, buf)

    elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
        # GNU extended sparse format version 1.0.
        self._proc_gnusparse_10(next, pax_headers, tarfile)

    if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
        # Patch the TarInfo object with the extended header info.
        next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
        next.offset = self.offset

        if "size" in pax_headers:
            # If the extended header replaces the size field,
            # we need to recalculate the offset where the next
            # header starts.
            offset = next.offset_data
            if next.isreg() or next.type not in SUPPORTED_TYPES:
                offset += next._block(next.size)
            tarfile.offset = offset

    if next is not None:
        # Multivolume fork: "GNU.volume.*" keys describe the continuation
        # of a member split across volumes; apply them to the next member
        # and then drop them so they don't leak into later members.
        if "GNU.volume.filename" in pax_headers:
            if pax_headers["GNU.volume.filename"] == next.name:
                if "GNU.volume.size" in pax_headers:
                    next.size = int(pax_headers["GNU.volume.size"])
                if "GNU.volume.offset" in pax_headers:
                    next.volume_offset = int(pax_headers["GNU.volume.offset"])

            # NOTE(review): iterating pax_headers.keys() while deleting from
            # tarfile.pax_headers — when both are the same dict (global
            # header, XGLTYPE) this raises RuntimeError in Python 3; a
            # list(...) snapshot would be safer. Confirm intended behavior.
            for key in pax_headers.keys():
                if key.startswith("GNU.volume"):
                    del tarfile.pax_headers[key]

    return next
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    Offsets and sizes are stored as repeated pax records in buf.
    """
    offsets = []
    for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
        offsets.append(int(match.group(1)))
    numbytes = []
    for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
        numbytes.append(int(match.group(1)))
    next.sparse = list(zip(offsets, numbytes))
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The map is one comma-separated list of alternating offset/size ints.
    """
    sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The map is stored in the member's data area as newline-separated
    decimal numbers: a count, then offset/size pairs.
    """
    fields = None
    sparse = []
    buf = tarfile.fileobj.read(BLOCKSIZE)
    fields, buf = buf.split(b"\n", 1)
    fields = int(fields)
    while len(sparse) < fields * 2:
        if b"\n" not in buf:
            # Map spills over into the next block.
            buf += tarfile.fileobj.read(BLOCKSIZE)
        number, buf = buf.split(b"\n", 1)
        sparse.append(int(number))
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
def _apply_pax_info(self, pax_headers, encoding, errors):
    """Replace fields with supplemental information from a previous
       pax extended or global header.
    """
    for keyword, value in pax_headers.items():
        if keyword == "GNU.sparse.name":
            setattr(self, "path", value)
        elif keyword == "GNU.sparse.size":
            setattr(self, "size", int(value))
        elif keyword == "GNU.sparse.realsize":
            setattr(self, "size", int(value))
        elif keyword in PAX_FIELDS:
            if keyword in PAX_NUMBER_FIELDS:
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    # Unparsable numeric field: fall back to 0 rather
                    # than aborting the whole member.
                    value = 0
            if keyword == "path":
                value = value.rstrip("/")  # pylint: disable=no-member
            setattr(self, keyword, value)

    self.pax_headers = pax_headers.copy()
def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
    """Decode a single field from a pax record.

    Try the strict primary encoding first, then fall back to the
    user-supplied encoding/error handler (see _proc_pax for rationale).
    """
    try:
        return value.decode(encoding, "strict")
    except UnicodeDecodeError:
        return value.decode(fallback_encoding, fallback_errors)
def _block(self, count):
    """Round up a byte count by BLOCKSIZE and return it,
       e.g. _block(834) => 1024.

    The rounding branch was missing from the source; restored to the
    standard divmod-based implementation.
    """
    blocks, remainder = divmod(count, BLOCKSIZE)
    if remainder:
        blocks += 1
    return blocks * BLOCKSIZE
def isreg(self):
    """True if the member is a regular file (any REGULAR_TYPES)."""
    return self.type in REGULAR_TYPES
def isfile(self):
    """Alias of isreg()."""
    return self.isreg()
def isdir(self):
    """True if the member is a directory."""
    return self.type == DIRTYPE
def issym(self):
    """True if the member is a symbolic link."""
    return self.type == SYMTYPE
def islnk(self):
    """True if the member is a hard link."""
    return self.type == LNKTYPE
def ischr(self):
    """True if the member is a character device."""
    return self.type == CHRTYPE
def isblk(self):
    """True if the member is a block device."""
    return self.type == BLKTYPE
def isfifo(self):
    """True if the member is a FIFO."""
    return self.type == FIFOTYPE
def issparse(self):
    """True if the member carries a sparse map."""
    return self.sparse is not None
def isdev(self):
    """True if the member is any device or FIFO node."""
    return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
def ismultivol(self):
    """Return True if this member is part of a multivolume archive.

    A member counts as multivolume if it carries the GNU multivolume
    type flag, a non-zero volume offset, or a pending pax volume-offset
    header.
    """
    if self.type == GNUTYPE_MULTIVOL:
        return True
    if self.volume_offset > 0:
        return True
    return "GNU.volume.offset" in self.pax_headers
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    max_volume_size = None      # If different from None, establishes maximum
                                # size of tar volumes

    new_volume_handler = None   # function handler to be executed before when
                                # a new volume is needed

    volume_number = 0           # current volume number, used for multi volume
                                # support

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    arcmode = ARCMODE_PLAIN     # Object processing mode ("concat", encryption,
                                # compression)

    save_to_members = True      # If new members are saved. This can be disabled
                                # if you manage lots of files and don't want
                                # to have high memory usage

    # NOTE(review): class-level mutable dicts are shared across ALL TarFile
    # instances (and threads) by design here — confirm that is intended.
    cache_uid2user = {}         # cache to avoid getpwuid calls. It always parses /etc/passwd.
    cache_gid2group = {}        # same cache for groups
2003 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2004 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2005 errors="surrogateescape", pax_headers=None, debug=None,
2006 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2007 concat=False, nacl=None,
2008 save_to_members=True):
2009 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2010 read from an existing archive, 'a' to append data to an existing
2011 file or 'w' to create a new file overwriting an existing one. `mode'
2013 If `fileobj' is given, it is used for reading or writing data. If it
2014 can be determined, `mode' is overridden by `fileobj's mode.
2015 `fileobj' is not closed, when TarFile is closed.
2017 if len(mode) > 1 or mode not in "raw":
2018 raise ValueError("mode must be 'r', 'a' or 'w'")
2020 self.arcmode = arcmode_set (concat)
2022 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2025 if self.mode == "a" and not os.path.exists(name):
2026 # Create nonexistent files in append mode.
2029 fileobj = bltn_open(name, self._mode)
2030 self._extfileobj = False
2032 if name is None and hasattr(fileobj, "name"):
2034 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2035 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2036 self._mode = fileobj.mode
2037 self._extfileobj = True
2038 self.name = os.path.abspath(name) if name else None
2039 self.base_name = self.name = os.path.abspath(name) if name else None
2040 self.fileobj = fileobj
2043 if format is not None:
2044 self.format = format
2045 if tarinfo is not None:
2046 self.tarinfo = tarinfo
2047 if dereference is not None:
2048 self.dereference = dereference
2049 if ignore_zeros is not None:
2050 self.ignore_zeros = ignore_zeros
2051 if encoding is not None:
2052 self.encoding = encoding
2054 self.errors = errors
2056 if pax_headers is not None and self.format == PAX_FORMAT:
2057 self.pax_headers = pax_headers
2059 self.pax_headers = {}
2061 if debug is not None:
2063 if errorlevel is not None:
2064 self.errorlevel = errorlevel
2066 # Init datastructures.
2067 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2068 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2069 if max_volume_size and not callable(new_volume_handler):
2070 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2072 self.max_volume_size = int(max_volume_size)
2074 self.max_volume_size = None
2076 self.save_to_members = save_to_members
2077 self.new_volume_handler = new_volume_handler
2079 self.members = [] # list of members as TarInfo objects
2080 self._loaded = False # flag if all members have been read
2081 self.offset = self.fileobj.tell()
2082 # current position in the archive file
2083 self.inodes = {} # dictionary caching the inodes of
2084 # archive members already added
2087 if self.mode == "r":
2088 self.firstmember = None
2089 self.firstmember = self.next()
2091 if self.mode == "a":
2092 # Move to the end of the archive,
2093 # before the first empty block.
2095 self.fileobj.seek(self.offset)
2097 tarinfo = self.tarinfo.fromtarfile(self)
2098 self.members.append(tarinfo)
2099 except EOFHeaderError:
2100 self.fileobj.seek(self.offset)
2102 except HeaderError as e:
2103 raise ReadError(str(e))
2105 if self.mode in "aw":
2108 if self.pax_headers:
2109 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2110 self.fileobj.write(buf)
2111 self.offset += len(buf)
2113 if not self._extfileobj:
2114 self.fileobj.close()
2118 #--------------------------------------------------------------------------
2119 # Below are the classmethods which act as alternate constructors to the
2120 # TarFile class. The open() method is the only one that is needed for
2121 # public use; it is the "super"-constructor and is able to select an
2122 # adequate "sub"-constructor for a particular compression using the mapping
2125 # This concept allows one to subclass TarFile without losing the comfort of
2126 # the super-constructor. A sub-constructor is registered and made available
2127 # by adding it to the mapping in OPEN_METH.
2130 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2131 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2133 """Open a tar archive for reading, writing or appending. Return
2134 an appropriate TarFile class.
2137 'r' or 'r:*' open for reading with transparent compression
2138 'r:' open for reading exclusively uncompressed
2139 'r:gz' open for reading with gzip compression
2140 'r:bz2' open for reading with bzip2 compression
2141 'r:xz' open for reading with lzma compression
2142 'a' or 'a:' open for appending, creating the file if necessary
2143 'w' or 'w:' open for writing without compression
2144 'w:gz' open for writing with gzip compression
2145 'w:bz2' open for writing with bzip2 compression
2146 'w:xz' open for writing with lzma compression
2148 'r|*' open a stream of tar blocks with transparent compression
2149 'r|' open an uncompressed stream of tar blocks for reading
2150 'r|gz' open a gzip compressed stream of tar blocks
2151 'r|bz2' open a bzip2 compressed stream of tar blocks
2152 'r|xz' open an lzma compressed stream of tar blocks
2153 'w|' open an uncompressed stream for writing
2154 'w|gz' open a gzip compressed stream for writing
2155 'w|bz2' open a bzip2 compressed stream for writing
2156 'w|xz' open an lzma compressed stream for writing
2158 'r#gz' open a stream of gzip compressed tar blocks for reading
2159 'w#gz' open a stream of gzip compressed tar blocks for writing
2161 if not name and not fileobj:
2162 raise ValueError("nothing to open")
2164 if mode in ("r", "r:*"):
2165 # Find out which *open() is appropriate for opening the file.
2166 for comptype in cls.OPEN_METH:
2167 func = getattr(cls, cls.OPEN_METH[comptype])
2168 if fileobj is not None:
2169 saved_pos = fileobj.tell()
2171 return func(name, "r", fileobj, **kwargs)
2172 except (ReadError, CompressionError) as e:
2173 # usually nothing exceptional but sometimes is
2174 if fileobj is not None:
2175 fileobj.seek(saved_pos)
2177 raise ReadError("file could not be opened successfully")
2180 filemode, comptype = mode.split(":", 1)
2181 filemode = filemode or "r"
2182 comptype = comptype or "tar"
2184 # Select the *open() function according to
2185 # given compression.
2186 if comptype in cls.OPEN_METH:
2187 func = getattr(cls, cls.OPEN_METH[comptype])
2189 raise CompressionError("unknown compression type %r" % comptype)
2191 # Pass on compression level for gzip / bzip2.
2192 if comptype == 'gz' or comptype == 'bz2':
2193 kwargs['compresslevel'] = compresslevel
2195 if 'max_volume_size' in kwargs:
2196 if comptype != 'tar' and filemode in 'wa' \
2197 and kwargs['max_volume_size']:
2199 warnings.warn('Only the first volume will be compressed '
2200 'for modes with "w:"!')
2202 return func(name, filemode, fileobj, **kwargs)
2205 filemode, comptype = mode.split("|", 1)
2206 filemode = filemode or "r"
2207 comptype = comptype or "tar"
2209 if filemode not in "rw":
2210 raise ValueError("mode must be 'r' or 'w'")
2212 t = cls(name, filemode,
2213 _Stream(name, filemode, comptype, fileobj, bufsize,
2214 compresslevel=compresslevel),
2216 t._extfileobj = False
2220 filemode, comptype = mode.split("#", 1)
2221 filemode = filemode or "r"
2223 if filemode not in "rw":
2224 raise ValueError ("mode %s not compatible with concat "
2225 "archive; must be 'r' or 'w'" % mode)
2227 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2228 concat=True, encryption=encryption,
2229 compresslevel=compresslevel, tolerance=tolerance)
2230 kwargs ["concat"] = True
2232 t = cls(name, filemode, stream, **kwargs)
2233 except: # XXX except what?
2235 raise # XXX raise what?
2236 t._extfileobj = False
2240 return cls.taropen(name, mode, fileobj, **kwargs)
2242 raise ValueError("undiscernible mode %r" % mode)
@classmethod
def open_at_offset(cls, offset, *a, **kwa):
    """
    Same as ``.open()``, but start reading at the given offset. Assumes a
    seekable file object.
    """
    fileobj = kwa.get ("fileobj")
    if fileobj is not None:
        fileobj.seek (offset)
    return cls.open (*a, **kwa)
@classmethod
def taropen(cls, name, mode="r", fileobj=None, **kwargs):
    """Open uncompressed tar archive name for reading or writing.
    """
    if len(mode) > 1 or mode not in "raw":
        raise ValueError("mode must be 'r', 'a' or 'w'")
    return cls(name, mode, fileobj, **kwargs)
@classmethod
def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open gzip compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if len(mode) > 1 or mode not in "rw":
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import gzip
        gzip.GzipFile
    except (ImportError, AttributeError):
        raise CompressionError("gzip module is not available")

    extfileobj = fileobj is not None
    try:
        fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except OSError:
        # Close the GzipFile we created ourselves; a caller-supplied
        # fileobj stays open (we don't own it).
        if not extfileobj and fileobj is not None:
            fileobj.close()
        if fileobj is None:
            raise
        raise ReadError("not a gzip file")
    except:
        if not extfileobj and fileobj is not None:
            fileobj.close()
        raise
    t._extfileobj = extfileobj
    return t
@classmethod
def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
    """Open bzip2 compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if len(mode) > 1 or mode not in "rw":
        raise ValueError("mode must be 'r' or 'w'.")

    try:
        import bz2
    except ImportError:
        raise CompressionError("bz2 module is not available")

    fileobj = bz2.BZ2File(fileobj or name, mode,
                          compresslevel=compresslevel)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (OSError, EOFError):
        fileobj.close()
        raise ReadError("not a bzip2 file")
    t._extfileobj = False
    return t
@classmethod
def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
    """Open lzma compressed tar archive name for reading or writing.
       Appending is not allowed.
    """
    if mode not in ("r", "w"):
        raise ValueError("mode must be 'r' or 'w'")

    try:
        import lzma
    except ImportError:
        raise CompressionError("lzma module is not available")

    fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

    try:
        t = cls.taropen(name, mode, fileobj, **kwargs)
    except (lzma.LZMAError, EOFError):
        fileobj.close()
        raise ReadError("not an lzma file")
    t._extfileobj = False
    return t
# All *open() methods are registered here.
# Maps the compression suffix used in mode strings ("r:gz" etc.) to the
# name of the classmethod implementing it; open() dispatches via getattr.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
#--------------------------------------------------------------------------
# The public methods which TarFile provides:

def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive. A special case are empty archives which are
       initialized accordingly so the two mandatory blocks of zeros are
       written abiding by the requested encryption and compression settings.
    """
    if self.closed:
        return

    if self.mode in "aw":
        if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
            # Empty concat archive: start an (empty) object so the
            # trailing zero blocks get the right encryption/compression.
            self.fileobj.next ("")
        self.fileobj.write(NUL * (BLOCKSIZE * 2))
        self.offset += (BLOCKSIZE * 2)
        # fill up the end with zero-blocks
        # (like option -b20 for tar does)
        blocks, remainder = divmod(self.offset, RECORDSIZE)
        if remainder > 0:
            self.fileobj.write(NUL * (RECORDSIZE - remainder))

    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
2377 def getmember(self, name):
2378 """Return a TarInfo object for member `name'. If `name' can not be
2379 found in the archive, KeyError is raised. If a member occurs more
2380 than once in the archive, its last occurrence is assumed to be the
2381 most up-to-date version.
2383 tarinfo = self._getmember(name)
2385 raise KeyError("filename %r not found" % name)
2388 def getmembers(self):
2389 """Return the members of the archive as a list of TarInfo objects. The
2390 list has the same order as the members in the archive.
2393 if not self._loaded: # if we want to obtain a list of
2394 self._load() # all members, we first have to
2395 # scan the whole archive.
2398 def get_last_member_offset(self):
2399 """Return the last member offset. Usually this is self.fileobj.tell(),
2400 but when there's encryption or concat compression going on it's more
2401 complicated than that.
2403 return self.last_block_offset
2406 """Return the members of the archive as a list of their names. It has
2407 the same order as the list returned by getmembers().
2409 return [tarinfo.name for tarinfo in self.getmembers()]
2411 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2412 """Create a TarInfo object for either the file `name' or the file
2413 object `fileobj' (using os.fstat on its file descriptor). You can
2414 modify some of the TarInfo's attributes before you add it using
2415 addfile(). If given, `arcname' specifies an alternative name for the
2416 file in the archive.
2420 # When fileobj is given, replace name by
2421 # fileobj's real name.
2422 if fileobj is not None:
2425 # Building the name of the member in the archive.
2426 # Backward slashes are converted to forward slashes,
2427 # Absolute paths are turned to relative paths.
2430 drv, arcname = os.path.splitdrive(arcname)
2431 arcname = arcname.replace(os.sep, "/")
2432 arcname = arcname.lstrip("/")
2434 # Now, fill the TarInfo object with
2435 # information specific for the file.
2436 tarinfo = self.tarinfo()
2437 tarinfo.tarfile = self
2439 # Use os.stat or os.lstat, depending on platform
2440 # and if symlinks shall be resolved.
2442 if hasattr(os, "lstat") and not self.dereference:
2443 statres = os.lstat(name)
2445 statres = os.stat(name)
2447 statres = os.fstat(fileobj.fileno())
2450 stmd = statres.st_mode
2451 if stat.S_ISREG(stmd):
2452 inode = (statres.st_ino, statres.st_dev)
2453 if not self.dereference and statres.st_nlink > 1 and \
2454 inode in self.inodes and arcname != self.inodes[inode]:
2455 # Is it a hardlink to an already
2458 linkname = self.inodes[inode]
2460 # The inode is added only if its valid.
2461 # For win32 it is always 0.
2463 if inode[0] and self.save_to_members:
2464 self.inodes[inode] = arcname
2465 elif stat.S_ISDIR(stmd):
2467 elif stat.S_ISFIFO(stmd):
2469 elif stat.S_ISLNK(stmd):
2471 linkname = os.readlink(name)
2472 elif stat.S_ISCHR(stmd):
2474 elif stat.S_ISBLK(stmd):
2479 # Fill the TarInfo object with all
2480 # information we can get.
2481 tarinfo.name = arcname
2483 tarinfo.uid = statres.st_uid
2484 tarinfo.gid = statres.st_gid
2486 tarinfo.size = statres.st_size
2489 tarinfo.mtime = statres.st_mtime
2491 tarinfo.linkname = linkname
2493 if tarinfo.uid in self.cache_uid2user:
2494 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2497 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2498 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2500 # remember user does not exist:
2501 # same default value as in tarinfo class
2502 self.cache_uid2user[tarinfo.uid] = ""
2504 if tarinfo.gid in self.cache_gid2group:
2505 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2508 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2509 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2511 # remember group does not exist:
2512 # same default value as in tarinfo class
2513 self.cache_gid2group[tarinfo.gid] = ""
2515 if type in (CHRTYPE, BLKTYPE):
2516 if hasattr(os, "major") and hasattr(os, "minor"):
2517 tarinfo.devmajor = os.major(statres.st_rdev)
2518 tarinfo.devminor = os.minor(statres.st_rdev)
2521 def list(self, verbose=True):
2522 """Print a table of contents to sys.stdout. If `verbose' is False, only
2523 the names of the members are printed. If it is True, an `ls -l'-like
2528 for tarinfo in self:
2530 print(stat.filemode(tarinfo.mode), end=' ')
2531 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2532 tarinfo.gname or tarinfo.gid), end=' ')
2533 if tarinfo.ischr() or tarinfo.isblk():
2534 print("%10s" % ("%d,%d" \
2535 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2537 print("%10d" % tarinfo.size, end=' ')
2538 print("%d-%02d-%02d %02d:%02d:%02d" \
2539 % time.localtime(tarinfo.mtime)[:6], end=' ')
2541 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2545 print("->", tarinfo.linkname, end=' ')
2547 print("link to", tarinfo.linkname, end=' ')
2550 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2551 """Add the file `name' to the archive. `name' may be any type of file
2552 (directory, fifo, symbolic link, etc.). If given, `arcname'
2553 specifies an alternative name for the file in the archive.
2554 Directories are added recursively by default. This can be avoided by
2555 setting `recursive' to False. `exclude' is a function that should
2556 return True for each filename to be excluded. `filter' is a function
2557 that expects a TarInfo object argument and returns the changed
2558 TarInfo object, if it returns None the TarInfo object will be
2559 excluded from the archive.
2566 # Exclude pathnames.
2567 if exclude is not None:
2569 warnings.warn("use the filter argument instead",
2570 DeprecationWarning, 2)
2572 self._dbg(2, "tarfile: Excluded %r" % name)
2575 # Skip if somebody tries to archive the archive...
2576 if self.name is not None and os.path.abspath(name) == self.name:
2577 self._dbg(2, "tarfile: Skipped %r" % name)
2582 # Create a TarInfo object from the file.
2583 tarinfo = self.gettarinfo(name, arcname)
2586 self._dbg(1, "tarfile: Unsupported type %r" % name)
2589 # Change or exclude the TarInfo object.
2590 if filter is not None:
2591 tarinfo = filter(tarinfo)
2593 self._dbg(2, "tarfile: Excluded %r" % name)
2596 # Append the tar header and data to the archive.
2598 with bltn_open(name, "rb") as f:
2599 self.addfile(tarinfo, f)
2601 elif tarinfo.isdir():
2602 self.addfile(tarinfo)
2604 for f in os.listdir(name):
2605 self.add(os.path.join(name, f), os.path.join(arcname, f),
2606 recursive, exclude, filter=filter)
2609 self.addfile(tarinfo)
2611 def _size_left_file(self):
2612 """Calculates size left in a volume with a maximum volume size.
2614 Assumes self.max_volume_size is set.
2615 If using compression through a _Stream, use _size_left_stream instead
2617 # left-over size = max_size - offset - 2 zero-blocks written in close
2618 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2619 # limit size left to a discrete number of blocks, because we won't
2620 # write only half a block when writting the end of a volume
2621 # and filling with zeros
2622 return BLOCKSIZE * (size_left // BLOCKSIZE)
2624 def _size_left_stream(self):
2625 """ Calculates size left in a volume if using comression/encryption
2627 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2628 (otherwise use _size_left_file)
2630 # left-over size = max_size - bytes written - 2 zero-blocks (close)
2631 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2633 return BLOCKSIZE * (size_left // BLOCKSIZE)
2635 def addfile(self, tarinfo, fileobj=None):
2636 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2637 given, tarinfo.size bytes are read from it and added to the archive.
2638 You can create TarInfo objects using gettarinfo().
2639 On Windows platforms, `fileobj' should always be opened with mode
2640 'rb' to avoid irritation about the file size.
2644 tarinfo = copy.copy(tarinfo)
2646 if self.arcmode & ARCMODE_CONCAT:
2647 self.last_block_offset = self.fileobj.next (tarinfo.name)
2649 self.last_block_offset = self.fileobj.tell()
2651 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2652 self.fileobj.write(buf)
2653 self.offset += len(buf)
2655 if self.max_volume_size:
2656 if isinstance(self.fileobj, _Stream):
2657 _size_left = self._size_left_stream
2659 _size_left = self._size_left_file
2661 _size_left = lambda: tarinfo.size
2663 # If there's no data to follow, finish
2665 if self.save_to_members:
2666 self.members.append(tarinfo)
2669 target_size_left = _size_left()
2670 source_size_left = tarinfo.size
2671 assert tarinfo.volume_offset == 0
2673 # we only split volumes in the middle of a file, that means we have
2674 # to write at least one block
2675 if target_size_left < BLOCKSIZE:
2676 target_size_left = BLOCKSIZE
2678 # loop over multiple volumes
2679 while source_size_left > 0:
2681 # Write as much data as possble from source into target.
2682 # When compressing data, we cannot easily predict how much data we
2683 # can write until target_size_left == 0 --> need to iterate
2684 size_can_write = min(target_size_left, source_size_left)
2686 while size_can_write > 0:
2687 copyfileobj(fileobj, self.fileobj, size_can_write)
2688 self.offset += size_can_write
2689 source_size_left -= size_can_write
2690 target_size_left = _size_left()
2691 size_can_write = min(target_size_left, source_size_left)
2693 # now target_size_left == 0 or source_size_left == 0
2695 # if there is data left to write, we need to create a new volume
2696 if source_size_left > 0:
2697 # Only finalize the crypto entry here if we’re continuing with
2698 # another one; otherwise, the encryption must include the block
2700 tarinfo.type = GNUTYPE_MULTIVOL
2702 if not self.new_volume_handler or\
2703 not callable(self.new_volume_handler):
2704 raise Exception("We need to create a new volume and you "
2705 "didn't supply a new_volume_handler")
2708 # the new volume handler should do everything needed to
2709 # start working in a new volume. usually, the handler calls
2710 # to self.open_volume
2711 self.volume_number += 1
2713 # set to be used by open_volume, because in the case of a PAX
2714 # tar it needs to write information about the volume and offset
2715 # in the global header
2716 tarinfo.volume_offset = tarinfo.size - source_size_left
2717 self.volume_tarinfo = tarinfo
2719 # the “new_volume_handler” is supposed to call .close() on the
2721 self.new_volume_handler(self, self.base_name, self.volume_number)
2723 self.volume_tarinfo = None
2725 if self.arcmode & ARCMODE_CONCAT:
2726 self.fileobj.next_volume (tarinfo.name)
2728 # write new volume header
2729 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2730 self.fileobj.write(buf)
2731 self.offset += len(buf)
2733 # adjust variables; open_volume should have reset self.offset
2734 # --> _size_left should be big again
2735 target_size_left = _size_left()
2736 size_can_write = min(target_size_left, source_size_left)
2737 self._dbg(3, 'new volume')
2739 # now, all data has been written. We may have to fill up the rest of
2740 # the block in target with 0s
2741 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2743 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2744 self.offset += BLOCKSIZE - remainder
2746 if self.save_to_members:
2747 self.members.append(tarinfo)
2749 def open_volume(self, name="", fileobj=None, encryption=None):
2751 Called by the user to change this tar file to point to a new volume.
2753 # open the file using either fileobj or name
2755 if self.mode == "a" and not os.path.exists(name):
2756 # Create nonexistent files in append mode.
2759 self._extfileobj = False
2761 if isinstance(self.fileobj, _Stream):
2762 self._dbg(3, 'open_volume: create a _Stream')
2763 fileobj = _Stream(name=name,
2764 mode=self.fileobj.mode,
2765 comptype=self.fileobj.comptype,
2767 bufsize=self.fileobj.bufsize,
2768 encryption=encryption or self.fileobj.encryption,
2769 concat=self.fileobj.arcmode & ARCMODE_CONCAT)
2771 # here, we lose information about compression/encryption!
2772 self._dbg(3, 'open_volume: builtin open')
2773 fileobj = bltn_open(name, self._mode)
2775 if name is None and hasattr(fileobj, "name"):
2777 if hasattr(fileobj, "mode"):
2778 self._mode = fileobj.mode
2779 self._extfileobj = True
2780 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2781 self.name = os.path.abspath(name) if name else None
2782 self.fileobj = fileobj
2784 # init data structures
2786 self.members = [] # list of members as TarInfo objects
2787 self._loaded = False # flag if all members have been read
2788 self.offset = self.fileobj.tell()
2789 # current position in the archive file
2790 self.inodes = {} # dictionary caching the inodes of
2791 # archive members already added
2794 if self.mode == "r":
2795 self.firstmember = None
2796 self.firstmember = self.next()
2798 if self.mode == "a":
2799 # Move to the end of the archive,
2800 # before the first empty block.
2802 self.fileobj.seek(self.offset)
2804 tarinfo = self.tarinfo.fromtarfile(self)
2805 self.members.append(tarinfo)
2806 except EOFHeaderError:
2807 self.fileobj.seek(self.offset)
2809 except HeaderError as e:
2810 raise ReadError(str(e))
2812 if self.mode in "aw":
2815 if self.format == PAX_FORMAT:
2817 "GNU.volume.filename": str(self.volume_tarinfo.name),
2818 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2819 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2822 self.pax_headers.update(volume_info)
2824 if isinstance(self.fileobj, _Stream):
2825 self.fileobj._init_write_gz ()
2826 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2827 self.fileobj.write(buf)
2828 self.offset += len(buf)
2829 except Exception as exn:
2830 if not self._extfileobj:
2831 self.fileobj.close()
2835 def extractall(self, path=".", members=None, filter=None):
2836 """Extract all members from the archive to the current working
2837 directory and set owner, modification time and permissions on
2838 directories afterwards. `path' specifies a different directory
2839 to extract to. `members' is optional and must be a subset of the
2840 list returned by getmembers().
2847 for tarinfo in members:
2848 if self.volume_number > 0 and tarinfo.ismultivol():
2851 if filter and not filter(tarinfo):
2855 # Extract directories with a safe mode.
2856 directories.append(tarinfo)
2857 tarinfo = copy.copy(tarinfo)
2858 tarinfo.mode = 0o0700
2859 # Do not set_attrs directories, as we will do that further down
2860 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
2862 # Reverse sort directories.
2863 directories.sort(key=lambda a: a.name)
2864 directories.reverse()
2866 # Set correct owner, mtime and filemode on directories.
2867 for tarinfo in directories:
2868 dirpath = os.path.join(path, tarinfo.name)
2870 self.chown(tarinfo, dirpath)
2871 self.utime(tarinfo, dirpath)
2872 self.chmod(tarinfo, dirpath)
2873 except ExtractError as e:
2874 if self.errorlevel > 1:
2877 self._dbg(1, "tarfile: %s" % e)
2879 def extract(self, member, path="", set_attrs=True, symlink_cb=None):
2880 """Extract a member from the archive to the current working directory,
2881 using its full name. Its file information is extracted as accurately
2882 as possible. `member' may be a filename or a TarInfo object. You can
2883 specify a different directory using `path'. File attributes (owner,
2884 mtime, mode) are set unless `set_attrs' is False.
2885 ``symlink_cb`` is a hook accepting a function that is passed the
2886 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2887 ``member`` indicates a symlink in which case only the callback
2888 passed will be applied, skipping the actual extraction. In case the
2889 callback is invoked, its return value is passed on to the caller.
2893 if isinstance(member, str):
2894 tarinfo = self.getmember(member)
2898 # Prepare the link target for makelink().
2900 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2902 if symlink_cb is not None and tarinfo.issym():
2903 return symlink_cb(member, path, set_attrs)
2906 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2907 set_attrs=set_attrs)
2908 except EnvironmentError as e:
2909 if self.errorlevel > 0:
2912 if e.filename is None:
2913 self._dbg(1, "tarfile: %s" % e.strerror)
2915 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2916 except ExtractError as e:
2917 if self.errorlevel > 1:
2920 self._dbg(1, "tarfile: %s" % e)
2922 def extractfile(self, member):
2923 """Extract a member from the archive as a file object. `member' may be
2924 a filename or a TarInfo object. If `member' is a regular file or a
2925 link, an io.BufferedReader object is returned. Otherwise, None is
2930 if isinstance(member, str):
2931 tarinfo = self.getmember(member)
2935 if tarinfo.isreg() or tarinfo.ismultivol() or\
2936 tarinfo.type not in SUPPORTED_TYPES:
2937 # If a member's type is unknown, it is treated as a
2939 return self.fileobject(self, tarinfo)
2941 elif tarinfo.islnk() or tarinfo.issym():
2942 if isinstance(self.fileobj, _Stream):
2943 # A small but ugly workaround for the case that someone tries
2944 # to extract a (sym)link as a file-object from a non-seekable
2945 # stream of tar blocks.
2946 raise StreamError("cannot extract (sym)link as file object")
2948 # A (sym)link's file object is its target's file object.
2949 return self.extractfile(self._find_link_target(tarinfo))
2951 # If there's no data associated with the member (directory, chrdev,
2952 # blkdev, etc.), return None instead of a file object.
2955 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2956 """Extract the TarInfo object tarinfo to a physical
2957 file called targetpath.
2959 # Fetch the TarInfo object for the given name
2960 # and build the destination pathname, replacing
2961 # forward slashes to platform specific separators.
2962 targetpath = targetpath.rstrip("/")
2963 targetpath = targetpath.replace("/", os.sep)
2965 # Create all upper directories.
2966 upperdirs = os.path.dirname(targetpath)
2967 if upperdirs and not os.path.exists(upperdirs):
2968 # Create directories that are not part of the archive with
2969 # default permissions.
2970 os.makedirs(upperdirs)
2972 if tarinfo.islnk() or tarinfo.issym():
2973 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2975 self._dbg(1, tarinfo.name)
2978 self.makefile(tarinfo, targetpath)
2979 elif tarinfo.isdir():
2980 self.makedir(tarinfo, targetpath)
2981 elif tarinfo.isfifo():
2982 self.makefifo(tarinfo, targetpath)
2983 elif tarinfo.ischr() or tarinfo.isblk():
2984 self.makedev(tarinfo, targetpath)
2985 elif tarinfo.islnk() or tarinfo.issym():
2986 self.makelink(tarinfo, targetpath)
2987 elif tarinfo.type not in SUPPORTED_TYPES:
2988 self.makeunknown(tarinfo, targetpath)
2990 self.makefile(tarinfo, targetpath)
2993 self.chown(tarinfo, targetpath)
2994 if not tarinfo.issym():
2995 self.chmod(tarinfo, targetpath)
2996 self.utime(tarinfo, targetpath)
2998 #--------------------------------------------------------------------------
2999 # Below are the different file methods. They are called via
3000 # _extract_member() when extract() is called. They can be replaced in a
3001 # subclass to implement other functionality.
3003 def makedir(self, tarinfo, targetpath):
3004 """Make a directory called targetpath.
3007 # Use a safe mode for the directory, the real mode is set
3008 # later in _extract_member().
3009 os.mkdir(targetpath, 0o0700)
3010 except FileExistsError:
3013 def makefile(self, tarinfo, targetpath):
3014 """Make a file called targetpath.
3016 source = self.fileobj
3017 source.seek(tarinfo.offset_data)
3020 target = bltn_open(targetpath, "wb")
3022 if tarinfo.sparse is not None:
3024 for offset, size in tarinfo.sparse:
3026 copyfileobj(source, target, size)
3027 target.seek(tarinfo.size)
3036 copyfileobj(source, target, tarinfo.size)
3039 # only if we are extracting a multivolume this can be treated
3040 if not self.new_volume_handler:
3042 raise Exception("We need to read a new volume and you"
3043 " didn't supply a new_volume_handler")
3045 # the new volume handler should do everything needed to
3046 # start working in a new volume. usually, the handler calls
3047 # to self.open_volume
3048 self.volume_number += 1
3049 self.new_volume_handler(self, self.base_name, self.volume_number)
3050 tarinfo = self.firstmember
3051 source = self.fileobj
3056 def makeunknown(self, tarinfo, targetpath):
3057 """Make a file from a TarInfo object with an unknown type
3060 self.makefile(tarinfo, targetpath)
3061 self._dbg(1, "tarfile: Unknown file type %r, " \
3062 "extracted as regular file." % tarinfo.type)
3064 def makefifo(self, tarinfo, targetpath):
3065 """Make a fifo called targetpath.
3067 if hasattr(os, "mkfifo"):
3068 os.mkfifo(targetpath)
3070 raise ExtractError("fifo not supported by system")
3072 def makedev(self, tarinfo, targetpath):
3073 """Make a character or block device called targetpath.
3075 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3076 raise ExtractError("special devices not supported by system")
3080 mode |= stat.S_IFBLK
3082 mode |= stat.S_IFCHR
3084 os.mknod(targetpath, mode,
3085 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3087 def makelink(self, tarinfo, targetpath):
3088 """Make a (symbolic) link called targetpath. If it cannot be created
3089 (platform limitation), we try to make a copy of the referenced file
3093 # For systems that support symbolic and hard links.
3095 os.symlink(tarinfo.linkname, targetpath)
3098 if os.path.exists(tarinfo._link_target):
3099 os.link(tarinfo._link_target, targetpath)
3101 self._extract_member(self._find_link_target(tarinfo),
3103 except symlink_exception:
3105 self._extract_member(self._find_link_target(tarinfo),
3108 raise ExtractError("unable to resolve link inside archive")
3110 def chown(self, tarinfo, targetpath):
3111 """Set owner of targetpath according to tarinfo.
3113 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3114 # We have to be root to do so.
3116 g = grp.getgrnam(tarinfo.gname)[2]
3120 u = pwd.getpwnam(tarinfo.uname)[2]
3124 if tarinfo.issym() and hasattr(os, "lchown"):
3125 os.lchown(targetpath, u, g)
3127 os.chown(targetpath, u, g)
3128 except OSError as e:
3129 raise ExtractError("could not change owner")
3131 def chmod(self, tarinfo, targetpath):
3132 """Set file permissions of targetpath according to tarinfo.
3134 if hasattr(os, 'chmod'):
3136 os.chmod(targetpath, tarinfo.mode)
3137 except OSError as e:
3138 raise ExtractError("could not change mode")
3140 def utime(self, tarinfo, targetpath):
3141 """Set modification time of targetpath according to tarinfo.
3143 if not hasattr(os, 'utime'):
3146 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3147 except OSError as e:
3148 raise ExtractError("could not change modification time")
3150 #--------------------------------------------------------------------------
3152 """Return the next member of the archive as a TarInfo object, when
3153 TarFile is opened for reading. Return None if there is no more
3157 if self.firstmember is not None:
3158 m = self.firstmember
3159 self.firstmember = None
3162 # Read the next block.
3163 self.fileobj.seek(self.offset)
3167 tarinfo = self.tarinfo.fromtarfile(self)
3168 except EOFHeaderError as e:
3169 if self.ignore_zeros:
3170 self._dbg(2, "0x%X: %s" % (self.offset, e))
3171 self.offset += BLOCKSIZE
3173 except InvalidHeaderError as e:
3174 if self.ignore_zeros:
3175 self._dbg(2, "0x%X: %s" % (self.offset, e))
3176 self.offset += BLOCKSIZE
3178 elif self.offset == 0:
3179 raise ReadError(str(e))
3180 except EmptyHeaderError:
3181 if self.offset == 0:
3182 raise ReadError("empty file")
3183 except TruncatedHeaderError as e:
3184 if self.offset == 0:
3185 raise ReadError(str(e))
3186 except SubsequentHeaderError as e:
3187 raise ReadError(str(e))
3190 if tarinfo is not None:
3191 if self.save_to_members:
3192 self.members.append(tarinfo)
3198 #--------------------------------------------------------------------------
3199 # Little helper methods:
3201 def _getmember(self, name, tarinfo=None, normalize=False):
3202 """Find an archive member by name from bottom to top.
3203 If tarinfo is given, it is used as the starting point.
3205 # Ensure that all members have been loaded.
3206 members = self.getmembers()
3208 # Limit the member search list up to tarinfo.
3209 if tarinfo is not None:
3210 members = members[:members.index(tarinfo)]
3213 name = os.path.normpath(name)
3215 for member in reversed(members):
3217 member_name = os.path.normpath(member.name)
3219 member_name = member.name
3221 if name == member_name:
3225 """Read through the entire archive file and look for readable
3229 tarinfo = self.next()
3234 def _check(self, mode=None):
3235 """Check if TarFile is still open, and if the operation's mode
3236 corresponds to TarFile's mode.
3239 raise OSError("%s is closed" % self.__class__.__name__)
3240 if mode is not None and self.mode not in mode:
3241 raise OSError("bad operation for mode %r" % self.mode)
3243 def _find_link_target(self, tarinfo):
3244 """Find the target member of a symlink or hardlink member in the
3248 # Always search the entire archive.
3249 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3252 # Search the archive before the link, because a hard link is
3253 # just a reference to an already archived file.
3254 linkname = tarinfo.linkname
3257 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3259 raise KeyError("linkname %r not found" % linkname)
3263 """Provide an iterator object.
3266 return iter(self.members)
3268 return TarIter(self)
3270 def _dbg(self, level, msg, *args):
3271 """Write debugging output to sys.stderr.
3273 if level <= self.debug:
3274 print(msg.format(*args), file=sys.stderr)
3276 def __enter__(self):
3280 def __exit__(self, type, value, traceback):
3284 # An exception occurred. We must not call close() because
3285 # it would try to write end-of-archive blocks and padding.
3286 if not self._extfileobj:
3287 self.fileobj.close()
3294 for tarinfo in TarFile(...):
3298 def __init__(self, tarfile):
3299 """Construct a TarIter object.
3301 self.tarfile = tarfile
3304 """Return iterator object.
3308 """Return the next item using TarFile's next() method.
3309 When all members have been read, set TarFile as _loaded.
3311 # Fix for SF #1100429: Under rare circumstances it can
3312 # happen that getmembers() is called during iteration,
3313 # which will cause TarIter to stop prematurely.
3315 if self.index == 0 and self.tarfile.firstmember is not None:
3316 tarinfo = self.tarfile.next()
3317 elif self.index < len(self.tarfile.members):
3318 tarinfo = self.tarfile.members[self.index]
3319 elif not self.tarfile._loaded:
3320 tarinfo = self.tarfile.next()
3322 self.tarfile._loaded = True
3330 #---------------------------------------------------------
3331 # support functionality for rescue mode
3332 #---------------------------------------------------------
3334 TAR_FMT_HDR = (# See tar(5):
3336 "100s" # ← char name[100]; /* 100 */
3337 "8s" # ← char mode[8]; /* 108 */
3338 "8s" # ← char uid[8]; /* 116 */
3339 "8s" # ← char gid[8]; /* 124 */
3340 "12s" # ← char size[12]; /* 136 */
3341 "12s" # ← char mtime[12]; /* 148 */
3342 "8s" # ← char checksum[8]; /* 156 */
3343 "B" # ← char typeflag[1]; /* 157 */
3344 "100s" # ← char linkname[100]; /* 257 */
3345 "6s" # ← char magic[6]; /* 263 */
3346 "2s" # ← char version[2]; /* 265 */
3347 "32s" # ← char uname[32]; /* 297 */
3348 "32s" # ← char gname[32]; /* 329 */
3349 "8s" # ← char devmajor[8]; /* 337 */
3350 "8s" # ← char devminor[8]; /* 345 */
3351 "12s" # ← char atime[12]; /* 357 */
3352 "12s" # ← char ctime[12]; /* 369 */
3353 "12s" # ← char offset[12]; /* 381 */
3354 "4s" # ← char longnames[4]; /* 385 */
3355 "B" # ← char unused[1]; /* 386 */
3357 "12s" # ← char offset[12];
3358 "12s" # ← char numbytes[12];
3359 "12s" # ← char offset[12];
3360 "12s" # ← char numbytes[12];
3361 "12s" # ← char offset[12];
3362 "12s" # ← char numbytes[12];
3363 "12s" # ← char offset[12];
3364 "12s" # ← char numbytes[12];
3365 "" # } sparse[4]; /* 482 */
3366 "B" # ← char isextended[1]; /* 483 */
3367 "12s" # ← char realsize[12]; /* 495 */
3368 "17s" # ← char pad[17]; /* 512 */
3371 # The “magic” and “version” fields are special:
3374 # magic The magic field holds the five characters “ustar” followed by a
3375 # space. Note that POSIX ustar archives have a trailing null.
3379 # /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3380 # Found in an archive, it indicates an old GNU header format, which will be
3381 # hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3382 # valid, though the header is not truly POSIX conforming. */
3385 TAR_FMT_OLDGNU_MAGIC = b"ustar "
3387 def read_gnu_tar_hdr (data):
3388 if len (data) != BLOCKSIZE: # header requires one complete block
3409 offset1, numbytes1, \
3410 offset2, numbytes2, \
3411 offset3, numbytes3, \
3412 offset4, numbytes4, \
3415 pad = struct.unpack (TAR_FMT_HDR, data)
3416 except struct.error:
3419 if magic != TAR_FMT_OLDGNU_MAGIC:
3422 # return all except “unused” and “pad”
3424 { "name" : name, "mode" : mode
3425 , "uid" : uid , "gid" : gid
3426 , "size" : size, "mtime" : mtime
3427 , "checksum" : checksum
3428 , "typeflag" : typeflag
3429 , "linkname" : linkname
3431 , "version" : version
3432 , "uname" : uname, "gname" : gname
3433 , "devmajor" : devmajor, "devminor" : devminor
3434 , "atime" : atime, "ctime" : ctime
3436 , "longnames" : longnames
3437 , "offset1" : offset1, "numbytes1" : numbytes1
3438 , "offset2" : offset2, "numbytes2" : numbytes2
3439 , "offset3" : offset3, "numbytes3" : numbytes3
3440 , "offset4" : offset4, "numbytes4" : numbytes4
3441 , "isextended" : isextended
3442 , "realsize" : realsize
3446 def readable_tar_objects_offsets (ifd):
3448 Traverse blocks in file, trying to extract tar headers.
3454 blk = os.read (ifd, BLOCKSIZE)
3455 if len (blk) != BLOCKSIZE:
3457 hdr = read_gnu_tar_hdr (blk)
3459 offsets.append (pos)
3465 def locate_gz_hdr_candidates (fd):
3467 Walk over instances of the GZ magic in the payload, collecting their
3468 positions. If the offset of the first found instance is not zero, the file
3469 begins with leading garbage.
3471 Note that since the GZ magic consists of only two bytes, we expect a lot of
3472 false positives inside binary data.
3474 :return: The list of offsets in the file.
3478 mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3481 pos = mm.find (GZ_MAGIC_BYTES, pos)
3485 pos += len (GZ_MAGIC_BYTES)
3490 HDR_CAND_GOOD = 0 # header marks begin of valid object
3491 HDR_CAND_FISHY = 1 # inconclusive
3492 HDR_CAND_JUNK = 2 # not a header / object unreadable
3495 def read_cstring (fd, max=-1, encoding=None):
3497 Read one NUL-terminated string from *fd* into a Python string. If *max* is
3498 non-negative, reading will terminate after the specified number of bytes.
3500 Optionally, an *encoding* may be specified to interpret the data as.
3502 :returns: *None* if parsing failed or the maximum number of bytes has been
3503 exceeded; a Python string with the data otherwise.
3512 if max >= 0 and l > max:
3516 if encoding is not None:
3517 buf = buf.decode (encoding)
# Validate a candidate gzip member header (RFC 1952) at *off* in *fd*.
# Returns (verdict, parsed-header) where verdict is one of the HDR_CAND_*
# constants; JUNK verdicts are paired with None.
# NOTE(review): the final return of the parsed header dict (after line 3594)
# is elided in this excerpt.
3522 def inspect_gz_hdr (fd, off):
3524     Attempt to parse a Gzip header in *fd* at position *off*. The format is
3525     documented as RFC1952.
3527     Returns a verdict about the quality of that header plus the parsed header
3528     when readable. Problematic sizes such as fields running past the EOF are
3529     treated as garbage. Properties in which the header merely doesn’t conform
3530     to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
3531     validation is possible on embedded strings because they are single-byte
# Start optimistic; individual checks below may only downgrade the verdict.
3539     verdict = HDR_CAND_GOOD
# Seek to the candidate and verify the seek actually landed there.
3541     os.lseek (fd, off, os.SEEK_SET)
3542     if os.lseek (fd, 0, os.SEEK_CUR) != off:
3543     return HDR_CAND_JUNK, None
# Fixed-size portion of the header; a short read means EOF inside the header.
3545     raw = os.read (fd, GZ_HEADER_SIZE)
3546     if len (raw) != GZ_HEADER_SIZE:
3547     return HDR_CAND_JUNK, None
# Decode magic, compression method, flag byte, mtime, extra flags, OS code.
3551     _m1, _m2, meth, flags, mtime, dflags, oscode = \
3552     struct.unpack (GZ_FMT_HEADER, raw)
3553     if meth != GZ_METHOD_DEFLATE: # only deflate is supported
3554     return HDR_CAND_JUNK, None
# An unparseable fixed header is junk, not merely fishy.
3555     except struct.error as exn:
3556     return HDR_CAND_JUNK, None
# Timestamps in the future are suspicious but not fatal.
3558     if mtime > int (time.time ()):
3559     verdict = HDR_CAND_FISHY
# Unexpected XFL / OS bytes likewise only downgrade the verdict.
3561     if dflags != GZ_DEFLATE_FLAGS:
3562     verdict = HDR_CAND_FISHY
3564     if oscode != GZ_OS_CODE:
3565     verdict = HDR_CAND_FISHY
3567     if flags & GZ_FLAG_FTEXT: # created by some contrarian
3568     verdict = HDR_CAND_FISHY
# FEXTRA: a little-endian u16 length followed by that many payload bytes.
3569     if flags & GZ_FLAG_FEXTRA:
# NOTE(review): struct.unpack always returns a tuple, so *xlen* here is a
# 1-tuple and the following os.read (fd, xlen) would raise TypeError at
# runtime; this almost certainly needs a trailing ``[0]``. Left untouched
# because the function's tail is not visible in this excerpt.
3570     xlen = struct.unpack ("<H", os.read (fd, 2))
3571     xtra = os.read (fd, xlen)
3572     if len (xtra) != xlen: # eof inside header
3573     return HDR_CAND_JUNK, None
3574     if flags & GZ_FLAG_FNAME:
3575     # read up to the next NUL byte, not exceeding the maximum path length
3577     fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3578     encoding="iso-8859-1")
3580     return HDR_CAND_JUNK, None
# FCOMMENT is parsed the same way as FNAME.
# NOTE(review): this branch reuses the variable *fname* for the comment
# field — harmless if the header dict doesn't carry the comment, but worth
# confirming against the (elided) return value.
3581     if flags & GZ_FLAG_FCOMMENT:
3582     fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3583     encoding="iso-8859-1")
3585     return HDR_CAND_JUNK, None
3586     if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
3587     crc16 = os.read (fd, 2)
3588     if len (crc16) != 2: # eof inside header
3589     return HDR_CAND_JUNK, None
3590     if flags & GZ_FLAG_RESERVED:
3591     # according to the RFC, these must not be set
3592     verdict = HDR_CAND_FISHY
# Total header length actually consumed: current position minus start offset.
3594     hlen = os.lseek (fd, 0, os.SEEK_CUR) - off
# Feed the compressed object at *off* in *ifd* through zlib, returning the
# decompressed size and the number of input bytes consumed (upper bound).
# NOTE(review): the chunk-reading loop structure and the dlen accumulation
# are elided in this excerpt (numbering jumps 3617→3619→3621→3623).
3606 def try_decompress (ifd, off, hdr):
3608     Attempt to process the object starting at *off* with gzip.
3610     :returns: A pair containing the values of the decompressed data and
3611     the length of the input consumed. Note that the latter value
3612     may exceed the length of the compressed data because the
3613     *zlib* module does not provide a means to query how much
3614     of the input it processed before the end of an object.
# Negative wbits selects a raw deflate stream: the gzip header was already
# consumed by inspect_gz_hdr, so no wrapper is expected here.
3617     decmp = zlib.decompressobj (-zlib.MAX_WBITS)
3619     dlen = 0 # size of decompressed data
# Position the descriptor at the current read offset for this chunk.
3621     os.lseek (ifd, pos, os.SEEK_SET)
3623     cnk = os.read (ifd, BUFSIZE)
3626     data = decmp.decompress (cnk)
# A zlib error mid-stream ends the object without propagating the failure.
3627     except zlib.error as exn: # probably CRC32 mismatch; terminate softly
# decompressobj.eof becomes True once the end of the deflate stream is seen.
3630     if decmp.eof is True:
3632     if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
3635     return dlen, pos - off
# Filter gzip-header *cands* in *ifd* down to offsets that actually begin a
# decompressible object.
# NOTE(review): the result-list setup, its append on success, and the return
# are elided in this excerpt.
3637 def readable_gz_objects_offsets (ifd, cands):
3639     Inspect header candidates for parseable *ifd* gzipped objects.
3646         vdt, hdr = inspect_gz_hdr (ifd, cand)
3647         if vdt == HDR_CAND_JUNK:
3648             pass # ignore unreadable ones
# GOOD and FISHY headers both get a decompression attempt; the payload
# starts right after the parsed header (hlen bytes past the candidate).
3649         elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
3650             off0 = cand + hdr ["hlen"]
3651             dlen, clen = try_decompress (ifd, off0, hdr)
# Only keep candidates that yielded actual output from actual input.
3652             if dlen > 0 and clen > 0:
# Rebuild the list of gzip object offsets for *fname* by scanning for magic
# bytes and vetting each candidate.
# NOTE(review): no os.close for *ifd* is visible here — presumably it lives
# in an elided try/finally (numbering jumps 3668→3673); confirm the fd is
# not leaked.
3658 def reconstruct_offsets_gz (fname):
3660     From the given file, retrieve all GZ header-like offsets (“candidates”).
3661     Then check each of those locations whether they can be processed as
3664     ifd = os.open (fname, os.O_RDONLY)
3667     cands = locate_gz_hdr_candidates (ifd)
3668     return readable_gz_objects_offsets (ifd, cands)
# Rebuild the list of tar object offsets for *fname* (uncompressed archives).
# NOTE(review): as with reconstruct_offsets_gz, the os.close of *ifd* is not
# visible in this excerpt — confirm it happens in the elided lines.
3673 def reconstruct_offsets_tar (fname):
3675     From the given file, retrieve all tar header-like offsets (“candidates”).
3676     Then check each of those locations whether they can be processed as tar
3679     ifd = os.open (fname, os.O_RDONLY)
3682     return readable_tar_objects_offsets (ifd)
# Open *fileobj* as a TarFile anchored at *offset* and return its first
# member, decrypting first when a *secret* (kind, value) pair is supplied.
# NOTE(review): the construction of the TarFile keyword arguments between
# lines 3702 and 3708, and any error handling, are elided in this excerpt.
3687 def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
3690     if secret is not None:
# secret is a (kind, value) pair: kind selects password- vs. key-based
# decryption; the key variant is hex-encoded and must be unhexlified.
3693     if ks == crypto.PDTCRYPT_SECRET_PW:
3694     decr = crypto.Decrypt (password=secret [1])
3695     elif ks == crypto.PDTCRYPT_SECRET_KEY:
3696     key = binascii.unhexlify (secret [1])
3697     decr = crypto.Decrypt (key=key)
# Rescue mode: don't track members in memory and tolerate damaged input.
3702     TarFile.open_at_offset (offset,
3708     save_to_members=False,
3709     tolerance=TOLERANCE_RESCUE)
# Return the first member found at that offset.
3711     return tarobj.next ()
# Convert a TarInfo into the dict shape used by the backup index, filling
# fields TarInfo cannot supply with neutral placeholders.
3714 def idxent_of_tarinfo (tarinfo):
3716     Scrape the information relevant for the index from a *TarInfo* object.
3717     Keys like the inode number that lack a corresponding field in a TarInfo
3718     will be set to some neutral value.
3723     , "path" : "snapshot://annotations.db"
3727     , "ctime" : 1502798115
3728     , "mtime" : 1502196423
3737     { "inode" : 0 # ignored when reading the index
3738     , "uid" : tarinfo.uid
3739     , "gid" : tarinfo.gid
3740     , "path" : tarinfo.name # keeping URI scheme
3741     , "offset" : 0 # to be added by the caller
3742     , "volume" : tarinfo.volume_offset
3743     , "mode" : tarinfo.mode
# NOTE(review): ctime is deliberately(?) populated from mtime — presumably
# because TarInfo carries no ctime field; confirm index consumers accept this.
3744     , "ctime" : tarinfo.mtime
3745     , "mtime" : tarinfo.mtime
3746     , "size" : tarinfo.size
3747     , "type" : tarinfo.type
# Build a pseudo-index for a damaged backup by brute-force locating object
# offsets (encrypted / gzip / plain tar, chosen by *mode*) and reading one
# tar member at each offset.
# NOTE(review): the mode-dispatch conditionals around lines 3755-3762 and
# the *aux* helper used at line 3772 are elided in this excerpt.
3751 def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
3752     psidx = [] # pseudo index, return value
# make_secret presumably returns None when neither password nor key is
# given — TODO confirm against crypto module.
3754     secret = crypto.make_secret (password=password, key=key)
# Encrypted archives: offsets come from the crypto layer.
3756     if secret is not None:
3757     offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
# Otherwise scan for gzip or raw tar objects depending on *mode*.
3759     offsets = reconstruct_offsets_gz (backup_tar_path)
3761     offsets = reconstruct_offsets_tar (backup_tar_path)
3763     raise TarError ("no rescue handling for mode “%s”" % mode)
3765     fileobj = bltn_open (backup_tar_path, "rb")
# Pair every offset with the tar member read there.
3766     infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
3767     for off in offsets ]
3769     ie = idxent_of_tarinfo (ti)
3772     psidx = [ aux (o, ti) for o, ti in infos ]
3776 #--------------------
3777 # exported functions
3778 #--------------------
3779 def is_tarfile(name):
3780 """Return True if name points to a tar archive that we
3781 are able to handle, else return False.