2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
"""Read from and write to tar format archives."""
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
58 import traceback # XXX
67 # os.symlink on Windows prior to 6.0 raises NotImplementedError
68 symlink_exception = (AttributeError, NotImplementedError)
70 # OSError (winerror=1314) will be raised if the caller does not hold the
71 # SeCreateSymbolicLinkPrivilege privilege
72 symlink_exception += (OSError,)
76 # from tarfile import *
77 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
79 from builtins import open as _open # Since 'open' is TarFile.open
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar \0"         # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

# Field width limits of the classic ustar header.
LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Values of the one-byte "typeflag" header field.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

# GNU extension type flags.
GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M"         # GNU tar continuation of a file that began on
                                # another volume

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Header format identifiers understood by TarFile/TarInfo.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
# gzip member header fields, per RFC 1952.
GZ_FMT_HEADER = b"<BBBBLBB"     # magic(2), method, flags, mtime, xfl, os
GZ_HEADER_SIZE = 10             # not including the name
GZ_MAGIC = (0x1f, 0x8b)         # 0o37, 0o213
GZ_METHOD_DEFLATE = 0x08        # 0o10
GZ_FLAG_FTEXT = 1 << 0          # ASCII payload
GZ_FLAG_FHCRC = 1 << 1          # CRC16
GZ_FLAG_FEXTRA = 1 << 2         # extra field
GZ_FLAG_FNAME = 1 << 3          # set by default in gzip
GZ_FLAG_FCOMMENT = 1 << 4       # NUL-terminated comment
GZ_FLAG_RESERVED = 7 << 5       # unassigned
GZ_DEFLATE_FLAGS = 0x00         # 0o00, never read (deflate.c)
GZ_OS_CODE = 0x03               # 0o03, default in gzip (tailor.h)
GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
                                GZ_METHOD_DEFLATE)

# Error tolerance levels used when reading damaged archives.
TOLERANCE_STRICT = 0            # propagate errors immediately (default)
TOLERANCE_RECOVER = 1           # rely on offsets in index
TOLERANCE_RESCUE = 2            # deduce metadata from archive contents
#---------------------------------------------------------
# archive handling mode
#---------------------------------------------------------
# Bitmask describing how a _Stream treats the archive; the flags
# combine freely (see arcmode_set() below).
ARCMODE_PLAIN = 0               # no encryption, compression or concatenation
ARCMODE_ENCRYPT = 1 << 0
ARCMODE_COMPRESS = 1 << 1
ARCMODE_CONCAT = 1 << 2
def arcmode_fmt (m):
    """Return a human-readable rendering of archive mode bitmask *m*,
    e.g. "[ ENCRYPT | CONCAT ]"; ARCMODE_PLAIN renders as "PLAIN"."""
    if m == ARCMODE_PLAIN:
        return "PLAIN"
    first = True
    ret = "["

    def chkappend (b, s):
        # append flag name *s* when bit *b* is set in *m*
        nonlocal first, ret
        if m & b:
            if first is True: first = False
            else: ret += " |"
            ret += " " + s

    chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
    chkappend (ARCMODE_COMPRESS, "COMPRESS")
    chkappend (ARCMODE_CONCAT, "CONCAT")
    return ret + " ]"
def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
    """Derive the archive mode bitmask from _Stream constructor-style
    arguments: *concat* sets CONCAT, a non-None *encryption* context sets
    ENCRYPT, and comptype "gz" sets COMPRESS. *init* seeds the mask."""
    ret = init
    if bool (concat) is True:
        ret |= ARCMODE_CONCAT
    if encryption is not None:
        ret |= ARCMODE_ENCRYPT
    if comptype == "gz":
        ret |= ARCMODE_COMPRESS
    return ret
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Default encoding for member names: on nt/ce use UTF-8, elsewhere
# honour the file system encoding.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
224 #---------------------------------------------------------
225 # Some useful functions
226 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

    The encoded result is truncated to *length* bytes and padded
    with NULs up to *length*.
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte on is discarded before decoding.
    """
    # Truncate at the first NUL; header fields are NUL-padded.
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object
    of the given length.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nti(s):
    """Convert a number field to a python number.

    Raises InvalidHeaderError when the field holds neither a base-256
    value nor valid octal digits.
    """
    # There are two possible encodings for a number field, see
    # itn() below: GNU base-256 (marked by a leading 0o200/0o377
    # byte) and NUL-terminated octal digits.
    if s[0] in (0o200, 0o377):
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            # 0o377 marks a negative number in two's complement.
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            n = int(nts(s, "ascii", "strict") or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # two's complement over the full field width
            n = 256 ** digits + n
        # emit the big-endian base-256 digits after the marker byte
        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
        s = bytes(s)
    else:
        raise ValueError("overflow in number field")
    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # 148 bytes before the chksum field, 8 skipped (counted as 8 spaces
    # via the constant 256 = 8 * 0o40), then the remaining 356 bytes.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises OSError when src is exhausted before *length* bytes
       were read.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)
# Exception hierarchy: everything raised by this module derives from
# TarError, except EndOfFile which deliberately subclasses Exception
# directly because it signals a non-error condition.
class TarError(Exception):
    """Base exception."""
class ExtractError(TarError):
    """General exception for extract errors."""
class ReadError(TarError):
    """Exception for unreadable tar archives."""
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

# Header errors: raised while parsing individual 512-byte header blocks.
class HeaderError(TarError):
    """Base exception for header errors."""
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

# Crypto errors specific to this (deltatar) variant of tarfile.
class InvalidEncryptionError(TarError):
    """Exception for undefined crypto modes and combinations."""
class DecryptionError(TarError):
    """Exception for error during decryption."""
class EncryptionError(TarError):
    """Exception for error during encryption."""

class EndOfFile(Exception):
    """Signal an end-of-file condition that is not an error."""
385 #---------------------------
386 # internal stream interface
387 #---------------------------
389 """Low-level file object. Supports reading and writing.
390 It is used instead of a regular file object for streaming
394 def __init__(self, name, mode):
397 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
399 if hasattr(os, "O_BINARY"):
400 _mode |= os.O_BINARY # pylint: disable=no-member
401 self.fd = os.open(name, _mode, 0o666)
407 def read(self, size):
408 ret = os.read(self.fd, size)
409 self.offset += len(ret)
412 def write(self, s, pos=None):
415 os.lseek (self.fd, pos, os.SEEK_SET)
416 n = os.write(self.fd, s)
418 self.offset += len(s)
420 append = pos + n - p0
422 self.offset += append
423 os.lseek (self.fd, p0, os.SEEK_SET)
428 def seek_set (self, pos):
429 os.lseek (self.fd, pos, os.SEEK_SET)
def gz_header (name=None):
    """Assemble a gzip member header (RFC 1952), optionally carrying
    *name* in the FNAME field.

    NOTE(review): reconstructed from a truncated copy — the exact
    suffix-stripping rules should be confirmed against upstream."""
    timestamp = int(time.time())
    flags = 0x0

    if name is None:
        name = b""
    else:
        flags |= GZ_FLAG_FNAME
        if type(name) is str:
            name = name.encode("iso-8859-1", "replace")
        # strip suffixes that describe the container, not the payload
        if name.endswith(b".pdtcrypt"):
            name = name[:-9]
        if name.endswith(b".gz"):
            name = name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        name += NUL

    hdr = struct.pack (GZ_FMT_HEADER,
                       GZ_MAGIC [0], GZ_MAGIC [1],
                       GZ_METHOD_DEFLATE, flags,
                       timestamp,
                       GZ_DEFLATE_FLAGS, GZ_OS_CODE)
    return hdr + name
460 """Class that serves as an adapter between TarFile and
461 a stream-like object. The stream-like object only
462 needs to have a read() or write() method and is accessed
463 blockwise. Use of gzip or bzip2 compression is possible.
464 A stream-like object could be for example: sys.stdin,
465 sys.stdout, a socket, a tape device etc.
467 _Stream is intended to be used only internally but is
468 nevertherless used externally by Deltatar.
470 When encrypting, the ``enccounter`` will be used for
471 initializing the first cryptographic context. When
472 decrypting, its value will be compared to the decrypted
473 object. Decryption fails if the value does not match.
474 In effect, this means that a ``_Stream`` whose ctor was
475 passed ``enccounter`` can only be used to encrypt or
476 decrypt a single object.
    # ciphertext still to be read in the current encrypted object;
    # -1 means "not currently inside an object"
    remainder = -1 # track size in encrypted entries
    tolerance = TOLERANCE_STRICT

    def __init__(self, name, mode, comptype, fileobj, bufsize,
                 concat=False, encryption=None, enccounter=None,
                 compresslevel=9, tolerance=TOLERANCE_STRICT):
        """Construct a _Stream object.
        """
        # NOTE(review): this constructor is truncated in this copy of the
        # file — the ``fileobj is None`` guard, the try/except blocks
        # around the zlib/bz2/lzma imports and several branch headers are
        # missing. The statements below are preserved exactly as found.
        self.arcmode = arcmode_set (concat, encryption, comptype)
        self.tolerance = tolerance
        self._extfileobj = True
        fileobj = _LowLevelFile(name, mode)
        self._extfileobj = False
        # Enable transparent compression detection for the
        fileobj = _StreamProxy(fileobj)
        comptype = fileobj.getcomptype()
        self.enccounter = None
        if self.arcmode & ARCMODE_ENCRYPT:
            self.enccounter = enccounter
        self.name = name or ""
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.last_block_offset = 0
        self.dbuf = b"" # ???
        self.exception = None # communicate decompression failure
        self.compresslevel = compresslevel
        self.bytes_written = 0
        self.encryption = encryption
        raise CompressionError("zlib module is not available")
        self.exception = zlib.error
        if not (self.arcmode & ARCMODE_CONCAT):
            if self.arcmode & ARCMODE_ENCRYPT:
                self._init_write_encrypt (name)
            self._init_write_gz ()
        self.crc = zlib.crc32(b"") & 0xFFFFffff
        elif comptype == "bz2":
            if self.arcmode & ARCMODE_ENCRYPT:
                raise InvalidEncryptionError("encryption not available for "
                                             "compression “%s”" % comptype)
            raise CompressionError("bz2 module is not available")
            self.cmp = bz2.BZ2Decompressor()
            self.exception = OSError
            self.cmp = bz2.BZ2Compressor()
        elif comptype == 'xz':
            if self.arcmode & ARCMODE_ENCRYPT:
                raise InvalidEncryptionError("encryption not available for "
                                             "compression “%s”" % comptype)
            raise CompressionError("lzma module is not available")
            self.cmp = lzma.LZMADecompressor()
            self.exception = lzma.LZMAError
            self.cmp = lzma.LZMACompressor()
        elif comptype == "tar":
            if not (self.arcmode & ARCMODE_CONCAT) \
               and self.arcmode & ARCMODE_ENCRYPT:
                self._init_write_encrypt (name)
            if self.arcmode & ARCMODE_ENCRYPT:
                raise InvalidEncryptionError("encryption not available for "
                                             "compression “%s”" % comptype)
            raise CompressionError("unknown compression type %r" % comptype)
        if not self._extfileobj:
        # NOTE(review): the surrounding ``__del__``/cleanup scaffolding is
        # missing from this copy; fragments preserved as found.
        if hasattr(self, "closed") and not self.closed:
            except crypto.InternalError:
                # context already finalized due to abort but close() tried
                # to use it
    def next (self, name):
        """Start a new archive entry named *name*: finalize any open
        gzip/encryption object and initialize fresh ones. Returns
        self.last_block_offset."""
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                self._finalize_write_gz ()
        # any mode bit besides ENCRYPT/COMPRESS (i.e. CONCAT) set:
        # record where the new object begins
        if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
            self.last_block_offset = self.fileobj.tell()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._finalize_write_encrypt ()
            self._init_write_encrypt (name, set_last_block_offset=True)
        if self.arcmode & ARCMODE_COMPRESS:
            # when also encrypting, the offset was already taken above
            self._init_write_gz (set_last_block_offset =
                                 not (self.arcmode & ARCMODE_ENCRYPT))
        return self.last_block_offset

    def next_volume (self, name):
        """Re-arm the write-side compression/encryption contexts for a
        new volume."""
        # with non-concat modes, this is taken care by the _Stream
        # ctor as invoked by the newvol handler
        if self.arcmode & ARCMODE_COMPRESS:
            if getattr (self, "cmp", None) is not None:
                # e. g. compressed PAX header written
                self._finalize_write_gz ()
        if self.arcmode & ARCMODE_ENCRYPT:
            self._init_write_encrypt (name)
        if self.arcmode & ARCMODE_COMPRESS:
            self._init_write_gz ()
    def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
        """
        Save position for delayed write of header; fill the header location
        with dummy bytes.
        """
        # NOTE(review): truncated copy — the guard around the "bad dummy
        # header" raise is missing; statements preserved as found.
        # first thing, proclaim new object to the encryption context
        # secondly, assemble the header with the updated parameters
        # and commit it directly to the underlying stream, bypassing the
        # encryption layer in .__write().
        dummyhdr = self.encryption.next (entry, counter=self.enccounter)
        raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
        self.lasthdr = self.fileobj.tell()
        self.__write_to_file(dummyhdr)
        if set_last_block_offset is True:
            self.last_block_offset = self.lasthdr

    def _finalize_write_encrypt (self):
        """
        Seek back to header position, read dummy bytes, finalize crypto
        obtaining the actual header, write header, seek back to current
        position.

        Returns the list of IV fixed parts as used during encryption.
        """
        if self.lasthdr is not None:
            pos0 = self.fileobj.tell ()
            self.fileobj.seek_set (self.lasthdr)
            dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
            pos1 = self.fileobj.tell ()
            dpos = pos1 - self.lasthdr
            assert dpos == crypto.PDTCRYPT_HDR_SIZE
            self.fileobj.seek_set (pos0)
            data, hdr, _ = self.encryption.done (dummy)
            self.__write_to_file(hdr, pos=self.lasthdr)
            self.__write_to_file(data) # append remainder of data

    def _finalize_write_gz (self):
        # flush the compressor and, for gzip, emit the member trailer
        # (crc32 + uncompressed length)
        if self.cmp is not None:
            chunk = self.buf + self.cmp.flush()
            if self.comptype == "gz":
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long. So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                chunk += struct.pack("<L", self.crc & 0xffffffff)
                chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
            self.__enc_write (chunk)

    def _init_write_gz (self, set_last_block_offset=False):
        """
        Add a new gzip block, closing last one
        """
        # NOTE(review): the compressobj() call below is truncated in this
        # copy (argument list incomplete); preserved as found.
        self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        first = self.cmp is None
        self.cmp = self.zlib.compressobj(self.compresslevel,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
        # if aes, we encrypt after compression
        if set_last_block_offset is True:
            self.last_block_offset = self.fileobj.tell()
        # only the first gzip member of a stream carries the FNAME field
        self.__write(gz_header (self.name if first is True else None))
        # NOTE(review): the enclosing ``def write(self, s):`` line is
        # missing from this copy; docstring and body preserved as found.
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            # keep a running crc of the uncompressed payload
            self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
        self.concat_pos += len(s)
        if self.cmp is not None:
            s = self.cmp.compress(s)

        # NOTE(review): the ``def flush(self):`` line is missing here.
        """Write what’s left in the buffer to the stream."""
        self.__write (b"") # → len (buf) <= bufsiz
        self.__enc_write (self.buf)

    def __write(self, s):
        """Writes (and encodes) string s to the stream blockwise;
        will wait with encoding/writing until block is complete
        """
        while len(self.buf) > self.bufsize:
            self.__enc_write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def __write_to_file(self, s, pos=None):
        """
        Writes directly to the fileobj; updates self.bytes_written. If “pos” is
        given, the stream will seek to that position first and back afterwards,
        and the total of bytes written is not updated.
        """
        self.fileobj.write(s, pos)
        self.bytes_written += len(s)

    def __enc_write(self, s):
        """
        If encryption is active, the string s is encrypted before being written
        to the file.
        """
        # NOTE(review): truncated — the loop splitting ``s`` into ``buf``
        # chunks is missing from this copy.
        if self.arcmode & ARCMODE_ENCRYPT:
            n, ct = self.encryption.process(buf)
            self.__write_to_file(ct)
            # The entire plaintext was not consumed: The size limit
            # for encrypted objects was reached. Transparently create
            # a new encrypted object and continue processing the input.
            self._finalize_write_encrypt ()
            self._init_write_encrypt ()
        self.__write_to_file(s)

    def estim_file_size(self):
        """ estimates size of file if closing it now

        The result may differ greatly from the amount of data sent to write()
        due to compression, encryption and buffering.

        In tests the result (before calling close()) was up to 12k smaller than
        the final file size if compression is being used because zlib/bz2
        compressors do not allow inspection of their buffered data :-(

        Still, we add what close() would add: 8 bytes for gz checksum, one
        encryption block size if encryption is used and the size of our own
        buffer.
        """
        # NOTE(review): the guard preceding this early return is missing
        # in this copy, leaving the code below it unreachable as shown.
        return self.bytes_written
        result = self.bytes_written
        result += len(self.buf)
        if self.comptype == 'gz':
            result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
    def close(self, close_fileobj=True):
        """Close the _Stream object. No operation should be
        done on it afterwards.
        """
        # NOTE(review): several guard/cleanup lines of this method are
        # missing from this copy; statements preserved as found.
        if close_fileobj is True:
            if self.arcmode & ARCMODE_COMPRESS:
                self._finalize_write_gz ()
            # end of Tar archive marker (two empty blocks) was written
            # finalize encryption last; no writes may be performed after
            # this point
            if self.arcmode & ARCMODE_ENCRYPT:
                self._finalize_write_encrypt ()
            if not self._extfileobj:
        # read the zlib crc and length and check them
        if self.mode == "r" and self.comptype == "gz":
            read_crc = self.__read(4)
            read_length = self.__read(4)
            calculated_crc = self.crc
            if struct.unpack("<L", read_crc)[0] != calculated_crc:
                raise CompressionError("bad gzip crc")

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        # NOTE(review): truncated — the empty-read guard before the raise
        # and the field-skipping loops are incomplete in this copy.
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        read2 = self.__read(2)
        raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
                         "%d" % self.fileobj.tell())
        # taken from gzip.GzipFile with some alterations
        if read2 != GZ_MAGIC_BYTES:
            raise ReadError("not a gzip file")
        read1 = ord (self.__read(1))
        if read1 != GZ_METHOD_DEFLATE:
            raise CompressionError("unsupported compression method")
        self.flags = flag = ord(self.__read(1))
        self.__read(6) # discard timestamp[4], deflate flags, os code
        if flag & GZ_FLAG_FEXTRA:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
        if flag & GZ_FLAG_FNAME:
            if not s or s == NUL:
        if flag & GZ_FLAG_FCOMMENT:
            if not s or s == NUL:
        if flag & GZ_FLAG_FHCRC:

    def _init_read_encrypt (self):
        """Initialize encryption for next entry in archive. Read a header and
        notify the crypto context."""
        # NOTE(review): the ``try:`` lines matching the except clauses
        # below are missing from this copy.
        if self.arcmode & ARCMODE_ENCRYPT:
            lasthdr = self.fileobj.tell ()
            hdr = crypto.hdr_read_stream (self.fileobj)
            except crypto.EndOfFile:
            except crypto.InvalidHeader as exn:
                raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \
            if self.enccounter is not None:
                # enforce that the iv counter in the header matches an
                # explicitly requested one
                iv = crypto.hdr_iv_counter (hdr)
                if iv != self.enccounter:
                    raise DecryptionError ("expected IV counter %d, got %d"
                                           % (self.enccounter, iv))
            self.lasthdr = lasthdr
            self.remainder = hdr ["ctsize"] # distance to next header
            self.encryption.next (hdr)
            except crypto.InvalidParameter as exn:
                raise DecryptionError ("Crypto.next(): error “%s” "
                                       "processing %r at pos %d"
                                       % (exn, self.fileobj, lasthdr)) \

    def _read_encrypt (self, buf):
        """
        Demote a program error to a decryption error in tolerant mode. This
        allows recovery from corrupted headers and invalid data.
        """
        return self.encryption.process (buf)
        except RuntimeError as exn:
            if self.tolerance != TOLERANCE_STRICT:
                raise DecryptionError (exn)

    def _finalize_read_encrypt (self):
        """
        Finalize the current crypto object; a failed validation surfaces
        as DecryptionError.
        """
        if self.arcmode & ARCMODE_ENCRYPT \
           and self.lasthdr is not None :
            assert self.remainder >= 0
            if self.remainder > 0:
                data = self.encryption.done ()
                except crypto.InvalidGCMTag as exn:
                    raise DecryptionError ("decryption failed: %s" % exn)
        # NOTE(review): this whole read-side region is truncated in this
        # copy — several ``def``/``try`` lines and return statements are
        # missing. Statements are preserved exactly as found.
        """Return the stream's file pointer position.
        """

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
        is forbidden.
        """
        if pos - self.pos >= 0:
            # forward seeking is emulated by reading and discarding
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
        raise StreamError("seeking backwards is not allowed")

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
        If size is not defined, return all bytes of the stream
        up to EOF.
        """
        buf = self._read(self.bufsize)
        buf = self._read(size)

        # NOTE(review): the ``def readline(self):`` line is missing here.
        """Reads just one line, new line character included
        """
        # if \n in dbuf, no read neads to be done
        if b'\n' in self.dbuf:
            pos = self.dbuf.index(b'\n') + 1
            ret = self.dbuf[:pos]
            self.dbuf = self.dbuf[pos:]
        chunk = self._read(self.bufsize)
        # nothing more to read, so return the buffer
        # if \n found, return the new line
        pos = dbuf.index(b'\n') + 1
        self.dbuf = dbuf[pos:] + self.dbuf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        buf = self.__read(self.bufsize)
        if self.cmp is not None:
            buf = self.cmp.decompress(buf)
            except self.exception as exn:
                raise ReadError("invalid compressed data (%r)" % exn)
            except Exception as e:
                # happens at the end of the file
                # _init_read_gz failed in the previous iteration so
                # self.cmp.decompress fails here
                if self.arcmode & ARCMODE_CONCAT:
                    raise ReadError("invalid compressed data")
            if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
                self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
            if self.arcmode & ARCMODE_CONCAT \
               and len(self.cmp.unused_data) != 0:
                # a concatenated gzip member follows: recycle the leftover
                # bytes and restart the decompressor
                self.buf = self.cmp.unused_data + self.buf
                self.close(close_fileobj=False)
                self._init_read_gz()
                except DecryptionError:
                    if self.tolerance != TOLERANCE_STRICT:
                        # return whatever data was processed successfully
                # happens at the end of the file
                self.crc = self.zlib.crc32(b"") & 0xFFFFffff
        self.dbuf = t[size:]

    def __read(self, size):
        """
        Return size bytes from stream. If internal buffer is empty, read
        another block from the stream.

        The function returns up to size bytes of data. When an error occurs
        during decryption, everything until the end of the last successfully
        finalized object is returned.
        """
        t = [self.buf] if c > 0 else []
        good_crypto = len (t)
        if self.arcmode & ARCMODE_ENCRYPT:
            if self.remainder <= 0:
                # prepare next object
                if self._init_read_encrypt () is False: # EOF
        # only read up to the end of the encrypted object
        todo = min (size, self.remainder)
        buf = self.fileobj.read(todo)
        if self.arcmode & ARCMODE_ENCRYPT:
            buf = self._read_encrypt (buf)
            if todo == self.remainder:
                # at the end of a crypto object; finalization will fail if
                # the GCM tag does not match
                trailing = self._finalize_read_encrypt ()
                good_crypto = len (t) + 1
                if len (trailing) > 0:
                self.remainder -= todo
        except DecryptionError:
            if self.tolerance == TOLERANCE_STRICT:
            self.encryption.drop ()
            if good_crypto == 0:
            # this may occur at any of the three crypto operations above.
            # some objects did validate; discard all data after it; next
            # call will start with the bad object and error out immediately
            self.buf = b"".join (t [good_crypto:])
            return b"".join (t [:good_crypto])
        if not buf: ## XXX stream terminated prematurely; this should be an error
1103 class _StreamProxy(object):
1104 """Small proxy class that enables transparent compression
1105 detection for the Stream interface (mode 'r|*').
1108 def __init__(self, fileobj):
1109 self.fileobj = fileobj
1110 self.buf = self.fileobj.read(BLOCKSIZE)
1112 def read(self, size): # pylint: disable=method-hidden
1113 self.read = self.fileobj.read
1116 def getcomptype(self):
1117 if self.buf.startswith(GZ_MAGIC_DEFLATE):
1119 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
1121 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
1127 self.fileobj.close()
1130 #------------------------
1131 # Extraction file object
1132 #------------------------
1133 class _FileInFile(object):
1134 """A thin wrapper around an existing file object that
1135 provides a part of its data as an individual file
1139 def __init__(self, fileobj, offset, size, blockinfo=None):
1140 self.fileobj = fileobj
1141 self.offset = offset
1144 self.name = getattr(fileobj, "name", None)
1147 if blockinfo is None:
1148 blockinfo = [(0, size)]
1150 # Construct a map with data and zero blocks.
1154 realpos = self.offset
1155 for offset, size in blockinfo:
1156 if offset > lastpos:
1157 self.map.append((False, lastpos, offset, None))
1158 self.map.append((True, offset, offset + size, realpos))
1160 lastpos = offset + size
1161 if lastpos < self.size:
1162 self.map.append((False, lastpos, self.size, None))
1174 return self.fileobj.seekable()
1177 """Return the current file position.
1179 return self.position
1181 def seek(self, position, whence=io.SEEK_SET):
1182 """Seek to a position in the file.
1184 if whence == io.SEEK_SET:
1185 self.position = min(max(position, 0), self.size)
1186 elif whence == io.SEEK_CUR:
1188 self.position = max(self.position + position, 0)
1190 self.position = min(self.position + position, self.size)
1191 elif whence == io.SEEK_END:
1192 self.position = max(min(self.size + position, self.size), 0)
1194 raise ValueError("Invalid argument")
1195 return self.position
1197 def read(self, size=None):
1198 """Read data from the file.
1201 size = self.size - self.position
1203 size = min(size, self.size - self.position)
1208 data, start, stop, offset = self.map[self.map_index]
1209 if start <= self.position < stop:
1213 if self.map_index == len(self.map):
1215 length = min(size, stop - self.position)
1217 self.fileobj.seek(offset + (self.position - start))
1218 buf += self.fileobj.read(length)
1222 self.position += length
1225 def readinto(self, b):
1226 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member's
    data region (offset_data/size, honouring sparse maps) to callers."""

    def __init__(self, tarfile, tarinfo):
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # Fixed attribute set (no per-instance __dict__); covers the header
    # fields plus bookkeeping used while reading/writing archives.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor", "volume_offset",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")
    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member. All other fields start out with regular-file
           defaults and may be assigned afterwards.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here
        self.volume_offset = 0  # the file's data corresponds with the data
                                # starting at this position

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information
1286 # In pax headers the "name" and "linkname" field are called
1287 # "path" and "linkpath".
1290 def _setpath(self, name):
1292 path = property(_getpath, _setpath)
1294 def _getlinkpath(self):
1295 return self.linkname
1296 def _setlinkpath(self, linkname):
1297 self.linkname = linkname
1298 linkpath = property(_getlinkpath, _setlinkpath)
1301 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1303 def get_info(self, encoding=None, errors=None):
1304 """Return the TarInfo's attributes as a dictionary.
1308 "mode": self.mode & 0o7777,
1312 "mtime": self.mtime,
1313 "chksum": self.chksum,
1315 "linkname": self.linkname,
1316 "uname": self.uname,
1317 "gname": self.gname,
1318 "devmajor": self.devmajor,
1319 "devminor": self.devminor,
1320 "offset_data": self.offset_data,
1321 "volume_offset": self.volume_offset
1324 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1329 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1330 errors="surrogateescape"):
1331 """Return a tar header as a string of 512 byte blocks.
1333 info = self.get_info(encoding, errors)
1335 if format == USTAR_FORMAT:
1336 return self.create_ustar_header(info, encoding, errors)
1337 elif format == GNU_FORMAT:
1338 return self.create_gnu_header(info, encoding, errors)
1339 elif format == PAX_FORMAT:
1340 return self.create_pax_header(info, encoding, errors)
1342 raise ValueError("invalid format")
1344 def create_ustar_header(self, info, encoding, errors):
1345 """Return the object as a ustar header block.
1347 info["magic"] = POSIX_MAGIC
1349 if len(info["linkname"]) > LENGTH_LINK:
1350 raise ValueError("linkname is too long")
1352 if len(info["name"]) > LENGTH_NAME:
1353 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1355 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1357 def create_gnu_header(self, info, encoding, errors):
1358 """Return the object as a GNU header block sequence.
1360 info["magic"] = GNU_MAGIC
1362 if self.ismultivol():
1364 itn(info.get("atime", 0), 12, GNU_FORMAT),
1365 itn(info.get("ctime", 0), 12, GNU_FORMAT),
1366 itn(self.volume_offset, 12, GNU_FORMAT),
1367 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1369 info['prefix'] = b"".join(prefix)
1370 info['size'] = info['size'] - self.volume_offset
1373 if len(info["linkname"]) > LENGTH_LINK:
1374 buf += self._create_gnu_long_header(info["linkname"],
1375 GNUTYPE_LONGLINK, encoding, errors)
1377 if len(info["name"]) > LENGTH_NAME:
1378 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1381 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1383 def create_pax_header(self, info, encoding, errors):
1384 """Return the object as a ustar header block. If it cannot be
1385 represented this way, prepend a pax extended header sequence
1386 with supplement information.
1388 info["magic"] = POSIX_MAGIC
1389 pax_headers = self.pax_headers.copy()
1390 if self.ismultivol():
1391 info['size'] = info['size'] - self.volume_offset
1393 # Test string fields for values that exceed the field length or cannot
1394 # be represented in ASCII encoding.
1395 for name, hname, length in (
1396 ("name", "path", LENGTH_NAME),
1397 ("linkname", "linkpath", LENGTH_LINK),
1398 ("uname", "uname", 32),
1399 ("gname", "gname", 32)):
1401 if hname in pax_headers:
1402 # The pax header has priority.
1405 # Try to encode the string as ASCII.
1407 info[name].encode("ascii", "strict")
1408 except UnicodeEncodeError:
1409 pax_headers[hname] = info[name]
1412 if len(info[name]) > length:
1413 pax_headers[hname] = info[name]
1415 # Test number fields for values that exceed the field limit or values
1416 # that like to be stored as float.
1417 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1418 if name in pax_headers:
1419 # The pax header has priority. Avoid overflow.
1424 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1425 pax_headers[name] = str(val)
1428 # Create a pax extended header if necessary.
1430 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1434 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1437 def create_pax_global_header(cls, pax_headers):
1438 """Return the object as a pax global header block sequence.
1440 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1442 def _posix_split_name(self, name):
1443 """Split a name longer than 100 chars into a prefix
1446 prefix = name[:LENGTH_PREFIX + 1]
1447 while prefix and prefix[-1] != "/":
1448 prefix = prefix[:-1]
1450 name = name[len(prefix):]
1451 prefix = prefix[:-1]
1453 if not prefix or len(name) > LENGTH_NAME:
1454 raise ValueError("name is too long")
1458 def _create_header(info, format, encoding, errors):
1459 """Return a header block. info is a dictionary with file
1460 information, format must be one of the *_FORMAT constants.
1463 stn(info.get("name", ""), 100, encoding, errors),
1464 itn(info.get("mode", 0) & 0o7777, 8, format),
1465 itn(info.get("uid", 0), 8, format),
1466 itn(info.get("gid", 0), 8, format),
1467 itn(info.get("size", 0), 12, format),
1468 itn(info.get("mtime", 0), 12, format),
1469 b" ", # checksum field
1470 info.get("type", REGTYPE),
1471 stn(info.get("linkname", ""), 100, encoding, errors),
1472 info.get("magic", POSIX_MAGIC),
1473 stn(info.get("uname", ""), 32, encoding, errors),
1474 stn(info.get("gname", ""), 32, encoding, errors),
1475 itn(info.get("devmajor", 0), 8, format),
1476 itn(info.get("devminor", 0), 8, format),
1477 sbtn(info.get("prefix", ""), 155, encoding, errors)
1480 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1481 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1482 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1486 def _create_payload(payload):
1487 """Return the string payload filled with zero bytes
1488 up to the next 512 byte border.
1490 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1492 payload += (BLOCKSIZE - remainder) * NUL
1496 def _create_gnu_long_header(cls, name, type, encoding, errors):
1497 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1500 name = name.encode(encoding, errors) + NUL
1503 info["name"] = "././@LongLink"
1505 info["size"] = len(name)
1506 info["magic"] = GNU_MAGIC
1508 # create extended header + name blocks.
1509 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1510 cls._create_payload(name)
1513 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1514 """Return a POSIX.1-2008 extended or global header sequence
1515 that contains a list of keyword, value pairs. The values
1518 # Check if one of the fields contains surrogate characters and thereby
1519 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1521 for keyword, value in pax_headers.items():
1523 value.encode("utf-8", "strict")
1524 except UnicodeEncodeError:
1530 # Put the hdrcharset field at the beginning of the header.
1531 records += b"21 hdrcharset=BINARY\n"
1533 for keyword, value in pax_headers.items():
1534 keyword = keyword.encode("utf-8")
1536 # Try to restore the original byte representation of `value'.
1537 # Needless to say, that the encoding must match the string.
1538 value = value.encode(encoding, "surrogateescape")
1540 value = value.encode("utf-8")
1542 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1549 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1551 # We use a hardcoded "././@PaxHeader" name like star does
1552 # instead of the one that POSIX recommends.
1554 info["name"] = "././@PaxHeader"
1556 info["size"] = len(records)
1557 info["magic"] = POSIX_MAGIC
1559 # Create pax header + record blocks.
1560 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1561 cls._create_payload(records)
1564 def frombuf(cls, buf, encoding, errors):
1565 """Construct a TarInfo object from a 512 byte bytes object.
1568 raise EmptyHeaderError("empty header")
1569 if len(buf) != BLOCKSIZE:
1570 raise TruncatedHeaderError("truncated header")
1571 if buf.count(NUL) == BLOCKSIZE:
1572 raise EOFHeaderError("end of file header")
1574 chksum = nti(buf[148:156])
1575 if chksum not in calc_chksums(buf):
1576 raise InvalidHeaderError("bad checksum")
1579 obj.name = nts(buf[0:100], encoding, errors)
1580 obj.mode = nti(buf[100:108])
1581 obj.uid = nti(buf[108:116])
1582 obj.gid = nti(buf[116:124])
1583 obj.size = nti(buf[124:136])
1584 obj.mtime = nti(buf[136:148])
1586 obj.type = buf[156:157]
1587 obj.linkname = nts(buf[157:257], encoding, errors)
1588 obj.uname = nts(buf[265:297], encoding, errors)
1589 obj.gname = nts(buf[297:329], encoding, errors)
1590 obj.devmajor = nti(buf[329:337])
1591 obj.devminor = nti(buf[337:345])
1592 prefix = nts(buf[345:500], encoding, errors)
1594 # The old GNU sparse format occupies some of the unused
1595 # space in the buffer for up to 4 sparse structures.
1596 # Save the them for later processing in _proc_sparse().
1597 if obj.type == GNUTYPE_SPARSE:
1602 offset = nti(buf[pos:pos + 12])
1603 numbytes = nti(buf[pos + 12:pos + 24])
1606 structs.append((offset, numbytes))
1608 isextended = bool(buf[482])
1609 origsize = nti(buf[483:495])
1610 obj._sparse_structs = (structs, isextended, origsize)
1612 # Old V7 tar format represents a directory as a regular
1613 # file with a trailing slash.
1614 if obj.type == AREGTYPE and obj.name.endswith("/"):
1617 # Remove redundant slashes from directories.
1619 obj.name = obj.name.rstrip("/")
1621 # Reconstruct a ustar longname.
1622 if prefix and obj.type not in GNU_TYPES:
1623 obj.name = prefix + "/" + obj.name
1625 obj.offset_data = nti(buf[369:381])
1629 def fromtarfile(cls, tarfile):
1630 """Return the next TarInfo object from TarFile object
1633 buf = tarfile.fileobj.read(BLOCKSIZE)
1634 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1635 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1636 return obj._proc_member(tarfile)
1638 #--------------------------------------------------------------------------
1639 # The following are methods that are called depending on the type of a
1640 # member. The entry point is _proc_member() which can be overridden in a
1641 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1642 # implement the following
1644 # 1. Set self.offset_data to the position where the data blocks begin,
1645 # if there is data that follows.
1646 # 2. Set tarfile.offset to the position where the next member's header will
1648 # 3. Return self or another valid TarInfo object.
1649 def _proc_member(self, tarfile):
1650 """Choose the right processing method depending on
1651 the type and call it.
1653 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1654 return self._proc_gnulong(tarfile)
1655 elif self.type == GNUTYPE_SPARSE:
1656 return self._proc_sparse(tarfile)
1657 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1658 return self._proc_pax(tarfile)
1660 return self._proc_builtin(tarfile)
1662 def _proc_builtin(self, tarfile):
1663 """Process a builtin type or an unknown type which
1664 will be treated as a regular file.
1666 self.offset_data = tarfile.fileobj.tell()
1667 offset = self.offset_data
1668 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1669 # Skip the following data blocks.
1670 offset += self._block(self.size)
1671 tarfile.offset = offset
1673 # Patch the TarInfo object with saved global
1674 # header information.
1675 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1679 def _proc_gnulong(self, tarfile):
1680 """Process the blocks that hold a GNU longname
1683 buf = tarfile.fileobj.read(self._block(self.size))
1685 # Fetch the next header and process it.
1687 next = self.fromtarfile(tarfile)
1689 raise SubsequentHeaderError("missing or bad subsequent header")
1691 # Patch the TarInfo object from the next header with
1692 # the longname information.
1693 next.offset = self.offset
1694 if self.type == GNUTYPE_LONGNAME:
1695 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1696 elif self.type == GNUTYPE_LONGLINK:
1697 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1701 def _proc_sparse(self, tarfile):
1702 """Process a GNU sparse header plus extra headers.
1704 # We already collected some sparse structures in frombuf().
1705 structs, isextended, origsize = self._sparse_structs
1706 del self._sparse_structs
1708 # Collect sparse structures from extended header blocks.
1710 buf = tarfile.fileobj.read(BLOCKSIZE)
1714 offset = nti(buf[pos:pos + 12])
1715 numbytes = nti(buf[pos + 12:pos + 24])
1718 if offset and numbytes:
1719 structs.append((offset, numbytes))
1721 isextended = bool(buf[504])
1722 self.sparse = structs
1724 self.offset_data = tarfile.fileobj.tell()
1725 tarfile.offset = self.offset_data + self._block(self.size)
1726 self.size = origsize
1729 def _proc_pax(self, tarfile):
1730 """Process an extended or global header as described in
1733 # Read the header information.
1734 buf = tarfile.fileobj.read(self._block(self.size))
1736 # A pax header stores supplemental information for either
1737 # the following file (extended) or all following files
1739 if self.type == XGLTYPE:
1740 pax_headers = tarfile.pax_headers
1742 pax_headers = tarfile.pax_headers.copy()
1744 # Check if the pax header contains a hdrcharset field. This tells us
1745 # the encoding of the path, linkpath, uname and gname fields. Normally,
1746 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1747 # implementations are allowed to store them as raw binary strings if
1748 # the translation to UTF-8 fails.
1749 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1750 if match is not None:
1751 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1753 # For the time being, we don't care about anything other than "BINARY".
1754 # The only other value that is currently allowed by the standard is
1755 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1756 hdrcharset = pax_headers.get("hdrcharset")
1757 if hdrcharset == "BINARY":
1758 encoding = tarfile.encoding
1762 # Parse pax header information. A record looks like that:
1763 # "%d %s=%s\n" % (length, keyword, value). length is the size
1764 # of the complete record including the length field itself and
1765 # the newline. keyword and value are both UTF-8 encoded strings.
1766 regex = re.compile(br"(\d+) ([^=]+)=")
1769 match = regex.match(buf, pos)
1773 length, keyword = match.groups()
1774 length = int(length)
1775 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1777 # Normally, we could just use "utf-8" as the encoding and "strict"
1778 # as the error handler, but we better not take the risk. For
1779 # example, GNU tar <= 1.23 is known to store filenames it cannot
1780 # translate to UTF-8 as raw strings (unfortunately without a
1781 # hdrcharset=BINARY header).
1782 # We first try the strict standard encoding, and if that fails we
1783 # fall back on the user's encoding and error handler.
1784 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1786 if keyword in PAX_NAME_FIELDS:
1787 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1790 value = self._decode_pax_field(value, "utf-8", "utf-8",
1793 pax_headers[keyword] = value
1797 # Fetch the next header.
1799 next = self.fromtarfile(tarfile)
1801 raise SubsequentHeaderError("missing or bad subsequent header")
1803 # Process GNU sparse information.
1804 if "GNU.sparse.map" in pax_headers:
1805 # GNU extended sparse format version 0.1.
1806 self._proc_gnusparse_01(next, pax_headers)
1808 elif "GNU.sparse.size" in pax_headers:
1809 # GNU extended sparse format version 0.0.
1810 self._proc_gnusparse_00(next, pax_headers, buf)
1812 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1813 # GNU extended sparse format version 1.0.
1814 self._proc_gnusparse_10(next, pax_headers, tarfile)
1816 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1817 # Patch the TarInfo object with the extended header info.
1818 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1819 next.offset = self.offset
1821 if "size" in pax_headers:
1822 # If the extended header replaces the size field,
1823 # we need to recalculate the offset where the next
1825 offset = next.offset_data
1826 if next.isreg() or next.type not in SUPPORTED_TYPES:
1827 offset += next._block(next.size)
1828 tarfile.offset = offset
1830 if next is not None:
1831 if "GNU.volume.filename" in pax_headers:
1832 if pax_headers["GNU.volume.filename"] == next.name:
1833 if "GNU.volume.size" in pax_headers:
1834 next.size = int(pax_headers["GNU.volume.size"])
1835 if "GNU.volume.offset" in pax_headers:
1836 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1838 for key in pax_headers.keys():
1839 if key.startswith("GNU.volume"):
1840 del tarfile.pax_headers[key]
1844 def _proc_gnusparse_00(self, next, pax_headers, buf):
1845 """Process a GNU tar extended sparse header, version 0.0.
1848 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1849 offsets.append(int(match.group(1)))
1851 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1852 numbytes.append(int(match.group(1)))
1853 next.sparse = list(zip(offsets, numbytes))
1855 def _proc_gnusparse_01(self, next, pax_headers):
1856 """Process a GNU tar extended sparse header, version 0.1.
1858 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1859 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1861 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1862 """Process a GNU tar extended sparse header, version 1.0.
1866 buf = tarfile.fileobj.read(BLOCKSIZE)
1867 fields, buf = buf.split(b"\n", 1)
1868 fields = int(fields)
1869 while len(sparse) < fields * 2:
1870 if b"\n" not in buf:
1871 buf += tarfile.fileobj.read(BLOCKSIZE)
1872 number, buf = buf.split(b"\n", 1)
1873 sparse.append(int(number))
1874 next.offset_data = tarfile.fileobj.tell()
1875 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1877 def _apply_pax_info(self, pax_headers, encoding, errors):
1878 """Replace fields with supplemental information from a previous
1879 pax extended or global header.
1881 for keyword, value in pax_headers.items():
1882 if keyword == "GNU.sparse.name":
1883 setattr(self, "path", value)
1884 elif keyword == "GNU.sparse.size":
1885 setattr(self, "size", int(value))
1886 elif keyword == "GNU.sparse.realsize":
1887 setattr(self, "size", int(value))
1888 elif keyword in PAX_FIELDS:
1889 if keyword in PAX_NUMBER_FIELDS:
1891 value = PAX_NUMBER_FIELDS[keyword](value)
1894 if keyword == "path":
1895 value = value.rstrip("/") # pylint: disable=no-member
1896 setattr(self, keyword, value)
1898 self.pax_headers = pax_headers.copy()
1900 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1901 """Decode a single field from a pax record.
1904 return value.decode(encoding, "strict")
1905 except UnicodeDecodeError:
1906 return value.decode(fallback_encoding, fallback_errors)
1908 def _block(self, count):
1909 """Round up a byte count by BLOCKSIZE and return it,
1910 e.g. _block(834) => 1024.
1912 blocks, remainder = divmod(count, BLOCKSIZE)
1915 return blocks * BLOCKSIZE
1918 return self.type in REGULAR_TYPES
1922 return self.type == DIRTYPE
1924 return self.type == SYMTYPE
1926 return self.type == LNKTYPE
1928 return self.type == CHRTYPE
1930 return self.type == BLKTYPE
1932 return self.type == FIFOTYPE
1934 return self.sparse is not None
1936 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1937 def ismultivol(self):
1938 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1939 "GNU.volume.offset" in self.pax_headers
1942 class TarFile(object):
1943 """The TarFile Class provides an interface to tar archives.
# Class-level defaults; __init__ overrides them per instance when the
# matching keyword argument is not None.
1946 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1948 dereference = False # If true, add content of linked file to the
1949 # tar file, else the link.
1951 ignore_zeros = False # If true, skips empty or invalid blocks and
1952 # continues processing.
# Multivolume support: a size cap per volume plus a callback invoked
# when a new volume must be started.
1954 max_volume_size = None # If different from None, establishes maximum
1955 # size of tar volumes
1957 new_volume_handler = None # function handler to be executed before when
1958 # a new volume is needed
1960 volume_number = 0 # current volume number, used for multi volume
1963 errorlevel = 1 # If 0, fatal errors only appear in debug
1964 # messages (if debug >= 0). If > 0, errors
1965 # are passed to the caller as exceptions.
1967 format = DEFAULT_FORMAT # The format to use when creating an archive.
1969 encoding = ENCODING # Encoding for 8-bit character strings.
1971 errors = None # Error handler for unicode conversion.
1973 tarinfo = TarInfo # The default TarInfo class to use.
1975 fileobject = ExFileObject # The file-object for extractfile().
1977 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
1980 save_to_members = True # If new members are saved. This can be disabled
1981 # if you manage lots of files and don't want
1982 # to have high memory usage
# NOTE(review): mutable dicts defined at class level are shared across
# ALL TarFile instances -- apparently intentional, as process-wide caches.
1984 cache_uid2user = {} # cache to avoid getpwuid calls. It always parses /etc/passwd.
1985 cache_gid2group = {} # same cache for groups
# Open or create a tar archive.  Validates the single-character mode,
# wires up the file object (own or external), applies per-instance
# overrides of the class defaults, validates multivolume parameters,
# and primes the member list (read mode) / seeks to the end (append).
1987 def __init__(self, name=None, mode="r", fileobj=None, format=None,
1988 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1989 errors="surrogateescape", pax_headers=None, debug=None,
1990 errorlevel=None, max_volume_size=None, new_volume_handler=None,
1991 concat=False, nacl=None,
1992 save_to_members=True):
1993 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1994 read from an existing archive, 'a' to append data to an existing
1995 file or 'w' to create a new file overwriting an existing one. `mode'
1997 If `fileobj' is given, it is used for reading or writing data. If it
1998 can be determined, `mode' is overridden by `fileobj's mode.
1999 `fileobj' is not closed, when TarFile is closed.
# Only the exact single characters 'r', 'a', 'w' are accepted.
2001 if len(mode) > 1 or mode not in "raw":
2002 raise ValueError("mode must be 'r', 'a' or 'w'")
2004 self.arcmode = arcmode_set (concat)
2006 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2009 if self.mode == "a" and not os.path.exists(name):
2010 # Create nonexistent files in append mode.
2013 fileobj = bltn_open(name, self._mode)
2014 self._extfileobj = False
2016 if name is None and hasattr(fileobj, "name"):
2018 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2019 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2020 self._mode = fileobj.mode
# External file objects are never closed by TarFile.close().
2021 self._extfileobj = True
2022 self.name = os.path.abspath(name) if name else None
2023 self.base_name = self.name = os.path.abspath(name) if name else None
2024 self.fileobj = fileobj
# Per-instance overrides of the class-level defaults.
2027 if format is not None:
2028 self.format = format
2029 if tarinfo is not None:
2030 self.tarinfo = tarinfo
2031 if dereference is not None:
2032 self.dereference = dereference
2033 if ignore_zeros is not None:
2034 self.ignore_zeros = ignore_zeros
2035 if encoding is not None:
2036 self.encoding = encoding
2038 self.errors = errors
2040 if pax_headers is not None and self.format == PAX_FORMAT:
2041 self.pax_headers = pax_headers
2043 self.pax_headers = {}
2045 if debug is not None:
2047 if errorlevel is not None:
2048 self.errorlevel = errorlevel
2050 # Init datastructures.
# Multivolume sanity checks: a volume must hold at least a header plus
# the two end-of-archive blocks, and splitting requires a callback.
2051 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2052 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2053 if max_volume_size and not callable(new_volume_handler):
2054 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2056 self.max_volume_size = int(max_volume_size)
2058 self.max_volume_size = None
2060 self.save_to_members = save_to_members
2061 self.new_volume_handler = new_volume_handler
2063 self.members = [] # list of members as TarInfo objects
2064 self._loaded = False # flag if all members have been read
2065 self.offset = self.fileobj.tell()
2066 # current position in the archive file
2067 self.inodes = {} # dictionary caching the inodes of
2068 # archive members already added
2071 if self.mode == "r":
2072 self.firstmember = None
2073 self.firstmember = self.next()
2075 if self.mode == "a":
2076 # Move to the end of the archive,
2077 # before the first empty block.
2079 self.fileobj.seek(self.offset)
2081 tarinfo = self.tarinfo.fromtarfile(self)
2082 self.members.append(tarinfo)
2083 except EOFHeaderError:
2084 self.fileobj.seek(self.offset)
2086 except HeaderError as e:
2087 raise ReadError(str(e))
2089 if self.mode in "aw":
2092 if self.pax_headers:
2093 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2094 self.fileobj.write(buf)
2095 self.offset += len(buf)
# NOTE(review): the lines below look like error-path cleanup (close the
# file we opened ourselves) -- the enclosing try/except lines are
# missing from this extract; confirm against the full file.
2097 if not self._extfileobj:
2098 self.fileobj.close()
2102 #--------------------------------------------------------------------------
2103 # Below are the classmethods which act as alternate constructors to the
2104 # TarFile class. The open() method is the only one that is needed for
2105 # public use; it is the "super"-constructor and is able to select an
2106 # adequate "sub"-constructor for a particular compression using the mapping
2109 # This concept allows one to subclass TarFile without losing the comfort of
2110 # the super-constructor. A sub-constructor is registered and made available
2111 # by adding it to the mapping in OPEN_METH.
# Super-constructor: parses the mode string ("r:gz", "w|bz2", "r#gz", ...)
# and dispatches to the matching *open() sub-constructor from OPEN_METH.
# NOTE(review): takes ``cls`` -- presumably decorated @classmethod on a
# line missing from this extract.
2114 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2115 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2117 """Open a tar archive for reading, writing or appending. Return
2118 an appropriate TarFile class.
2121 'r' or 'r:*' open for reading with transparent compression
2122 'r:' open for reading exclusively uncompressed
2123 'r:gz' open for reading with gzip compression
2124 'r:bz2' open for reading with bzip2 compression
2125 'r:xz' open for reading with lzma compression
2126 'a' or 'a:' open for appending, creating the file if necessary
2127 'w' or 'w:' open for writing without compression
2128 'w:gz' open for writing with gzip compression
2129 'w:bz2' open for writing with bzip2 compression
2130 'w:xz' open for writing with lzma compression
2132 'r|*' open a stream of tar blocks with transparent compression
2133 'r|' open an uncompressed stream of tar blocks for reading
2134 'r|gz' open a gzip compressed stream of tar blocks
2135 'r|bz2' open a bzip2 compressed stream of tar blocks
2136 'r|xz' open an lzma compressed stream of tar blocks
2137 'w|' open an uncompressed stream for writing
2138 'w|gz' open a gzip compressed stream for writing
2139 'w|bz2' open a bzip2 compressed stream for writing
2140 'w|xz' open an lzma compressed stream for writing
2142 'r#gz' open a stream of gzip compressed tar blocks for reading
2143 'w#gz' open a stream of gzip compressed tar blocks for writing
2145 if not name and not fileobj:
2146 raise ValueError("nothing to open")
# Transparent-read mode: probe every registered opener in turn,
# rewinding the external fileobj between failed attempts.
2148 if mode in ("r", "r:*"):
2149 # Find out which *open() is appropriate for opening the file.
2150 for comptype in cls.OPEN_METH:
2151 func = getattr(cls, cls.OPEN_METH[comptype])
2152 if fileobj is not None:
2153 saved_pos = fileobj.tell()
2155 return func(name, "r", fileobj, **kwargs)
2156 except (ReadError, CompressionError) as e:
2157 # usually nothing exceptional but sometimes is
2158 if fileobj is not None:
2159 fileobj.seek(saved_pos)
2161 raise ReadError("file could not be opened successfully")
# "mode:comptype" -- seekable archive with explicit compression.
2164 filemode, comptype = mode.split(":", 1)
2165 filemode = filemode or "r"
2166 comptype = comptype or "tar"
2168 # Select the *open() function according to
2169 # given compression.
2170 if comptype in cls.OPEN_METH:
2171 func = getattr(cls, cls.OPEN_METH[comptype])
2173 raise CompressionError("unknown compression type %r" % comptype)
2175 # Pass on compression level for gzip / bzip2.
2176 if comptype == 'gz' or comptype == 'bz2':
2177 kwargs['compresslevel'] = compresslevel
# Multivolume + whole-file compression only compresses volume one.
2179 if 'max_volume_size' in kwargs:
2180 if comptype != 'tar' and filemode in 'wa' \
2181 and kwargs['max_volume_size']:
2183 warnings.warn('Only the first volume will be compressed '
2184 'for modes with "w:"!')
2186 return func(name, filemode, fileobj, **kwargs)
# "mode|comptype" -- non-seekable stream via _Stream.
2189 filemode, comptype = mode.split("|", 1)
2190 filemode = filemode or "r"
2191 comptype = comptype or "tar"
2193 if filemode not in "rw":
2194 raise ValueError("mode must be 'r' or 'w'")
2196 t = cls(name, filemode,
2197 _Stream(name, filemode, comptype, fileobj, bufsize,
2198 compresslevel=compresslevel),
2200 t._extfileobj = False
# "mode#comptype" -- concat mode (this fork's extension): each member
# is its own compressed/encrypted object inside the stream.
2204 filemode, comptype = mode.split("#", 1)
2205 filemode = filemode or "r"
2207 if filemode not in "rw":
2208 raise ValueError ("mode %s not compatible with concat "
2209 "archive; must be 'r' or 'w'" % mode)
2211 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2212 concat=True, encryption=encryption,
2213 compresslevel=compresslevel, tolerance=tolerance)
2214 kwargs ["concat"] = True
2216 t = cls(name, filemode, stream, **kwargs)
2217 except: # XXX except what?
2219 raise # XXX raise what?
2220 t._extfileobj = False
# Plain single-character modes fall through to the uncompressed opener.
2224 return cls.taropen(name, mode, fileobj, **kwargs)
2226 raise ValueError("undiscernible mode %r" % mode)
# Convenience wrapper around open(): seek the supplied fileobj to
# ``offset`` first, then delegate.  Requires a seekable fileobj.
2230 def open_at_offset(cls, offset, *a, **kwa):
2232 Same as ``.open()``, but start reading at the given offset. Assumes a
2233 seekable file object.
2235 fileobj = kwa.get ("fileobj")
2236 if fileobj is not None:
2237 fileobj.seek (offset)
2238 return cls.open (*a, **kwa)
# Sub-constructor for plain, uncompressed archives; mirrors __init__'s
# single-character mode validation.
2242 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2243 """Open uncompressed tar archive name for reading or writing.
2245 if len(mode) > 1 or mode not in "raw":
2246 raise ValueError("mode must be 'r', 'a' or 'w'")
2247 return cls(name, mode, fileobj, **kwargs)
# Sub-constructor for gzip-compressed archives.  Wraps name/fileobj in
# a gzip.GzipFile; only closes the GzipFile itself when we created it.
2250 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2251 """Open gzip compressed tar archive name for reading or writing.
2252 Appending is not allowed.
2254 if len(mode) > 1 or mode not in "rw":
2255 raise ValueError("mode must be 'r' or 'w'")
2260 except (ImportError, AttributeError):
2261 raise CompressionError("gzip module is not available")
2263 extfileobj = fileobj is not None
2265 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2266 t = cls.taropen(name, mode, fileobj, **kwargs)
# NOTE(review): the except-clause lines around here are missing from
# this extract; the cleanup below runs on the failure path(s).
2268 if not extfileobj and fileobj is not None:
2272 raise ReadError("not a gzip file")
2274 if not extfileobj and fileobj is not None:
2277 t._extfileobj = extfileobj
# Sub-constructor for bzip2-compressed archives via bz2.BZ2File.
2281 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2282 """Open bzip2 compressed tar archive name for reading or writing.
2283 Appending is not allowed.
2285 if len(mode) > 1 or mode not in "rw":
2286 raise ValueError("mode must be 'r' or 'w'.")
2291 raise CompressionError("bz2 module is not available")
2293 fileobj = bz2.BZ2File(fileobj or name, mode,
2294 compresslevel=compresslevel)
2297 t = cls.taropen(name, mode, fileobj, **kwargs)
# A failed open of the decompressor means the input is not bzip2 data.
2298 except (OSError, EOFError):
2300 raise ReadError("not a bzip2 file")
2301 t._extfileobj = False
# Sub-constructor for lzma/xz-compressed archives via lzma.LZMAFile.
2305 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2306 """Open lzma compressed tar archive name for reading or writing.
2307 Appending is not allowed.
2309 if mode not in ("r", "w"):
2310 raise ValueError("mode must be 'r' or 'w'")
2315 raise CompressionError("lzma module is not available")
2317 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2320 t = cls.taropen(name, mode, fileobj, **kwargs)
# LZMAError on first read means the input is not an lzma stream.
2321 except (lzma.LZMAError, EOFError):
2323 raise ReadError("not an lzma file")
2324 t._extfileobj = False
2327 # All *open() methods are registered here.
# Maps the mode-string compression suffix to the sub-constructor name;
# open() iterates this mapping for transparent "r:*" detection.
2329 "tar": "taropen", # uncompressed tar
2330 "gz": "gzopen", # gzip compressed tar
2331 "bz2": "bz2open", # bzip2 compressed tar
2332 "xz": "xzopen" # lzma compressed tar
2335 #--------------------------------------------------------------------------
2336 # The public methods which TarFile provides:
# Close the archive.  NOTE(review): the ``def close(self):`` line itself
# is missing from this extract; only the docstring and body survive.
2339 """Close the TarFile. In write-mode, two finishing zero blocks are
2340 appended to the archive. A special case are empty archives which are
2341 initialized accordingly so the two mandatory blocks of zeros are
2342 written abiding by the requested encryption and compression settings.
2347 if self.mode in "aw":
# Concat mode: an empty archive still needs one stream object started
# so the trailing zero blocks are compressed/encrypted consistently.
2348 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2349 self.fileobj.next ("")
2350 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2351 self.offset += (BLOCKSIZE * 2)
2352 # fill up the end with zero-blocks
2353 # (like option -b20 for tar does)
2354 blocks, remainder = divmod(self.offset, RECORDSIZE)
2356 self.fileobj.write(NUL * (RECORDSIZE - remainder))
# Only close file objects we opened ourselves.
2357 if not self._extfileobj:
2358 self.fileobj.close()
    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
        found in the archive, KeyError is raised. If a member occurs more
        than once in the archive, its last occurrence is assumed to be the
        most up-to-date version.
        tarinfo = self._getmember(name)  # bottom-to-top search over loaded members
            raise KeyError("filename %r not found" % name)
    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
        list has the same order as the members in the archive.
        if not self._loaded:            # if we want to obtain a list of
            self._load()                # all members, we first have to
                                        # scan the whole archive.
    def get_last_member_offset(self):
        """Return the last member offset. Usually this is self.fileobj.tell(),
        but when there's encryption or concat compression going on it's more
        complicated than that.
        return self.last_block_offset  # maintained by addfile()/next()
        """Return the members of the archive as a list of their names. It has
        the same order as the list returned by getmembers().
        return [tarinfo.name for tarinfo in self.getmembers()]  # forces a full archive scan if not loaded
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
        object `fileobj' (using os.fstat on its file descriptor). You can
        modify some of the TarInfo's attributes before you add it using
        addfile(). If given, `arcname' specifies an alternative name for the
        file in the archive.
        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        drv, arcname = os.path.splitdrive(arcname)  # drop any Windows drive letter
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")  # archive member names are always relative
        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self
        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)  # do not follow symlinks
                statres = os.stat(name)
            statres = os.fstat(fileobj.fileno())  # stat via the open file object
        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                linkname = self.inodes[inode]  # store as a hardlink to the first occurrence
                # The inode is added only if its valid.
                # For win32 it is always 0.
                if inode[0] and self.save_to_members:
                    self.inodes[inode] = arcname  # remember inode -> archive name for hardlink detection
        elif stat.S_ISDIR(stmd):
        elif stat.S_ISFIFO(stmd):
        elif stat.S_ISLNK(stmd):
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
        elif stat.S_ISBLK(stmd):
        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        tarinfo.size = statres.st_size
        tarinfo.mtime = statres.st_mtime
        tarinfo.linkname = linkname
        if tarinfo.uid in self.cache_uid2user:  # avoid repeated pwd lookups
            tarinfo.uname = self.cache_uid2user[tarinfo.uid]
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                self.cache_uid2user[tarinfo.uid] = tarinfo.uname
                # remember user does not exist:
                # same default value as in tarinfo class
                self.cache_uid2user[tarinfo.uid] = ""
        if tarinfo.gid in self.cache_gid2group:  # avoid repeated grp lookups
            tarinfo.gname = self.cache_gid2group[tarinfo.gid]
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                self.cache_gid2group[tarinfo.gid] = tarinfo.gname
                # remember group does not exist:
                # same default value as in tarinfo class
                self.cache_gid2group[tarinfo.gid] = ""
        if type in (CHRTYPE, BLKTYPE):  # device files also need major/minor numbers
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
        the names of the members are printed. If it is True, an `ls -l'-like
        for tarinfo in self:
                print(stat.filemode(tarinfo.mode), end=' ')  # e.g. '-rw-r--r--'
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')
            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
                    print("->", tarinfo.linkname, end=' ')
                    print("link to", tarinfo.linkname, end=' ')
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
        (directory, fifo, symbolic link, etc.). If given, `arcname'
        specifies an alternative name for the file in the archive.
        Directories are added recursively by default. This can be avoided by
        setting `recursive' to False. `exclude' is a function that should
        return True for each filename to be excluded. `filter' is a function
        that expects a TarInfo object argument and returns the changed
        TarInfo object, if it returns None the TarInfo object will be
        excluded from the archive.
        # Exclude pathnames.
        if exclude is not None:  # `exclude' is deprecated in favor of `filter'
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
                self._dbg(2, "tarfile: Excluded %r" % name)
        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)
            self._dbg(1, "tarfile: Unsupported type %r" % name)
        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)  # filter may mutate or drop the member
                self._dbg(2, "tarfile: Excluded %r" % name)
        # Append the tar header and data to the archive.
            with bltn_open(name, "rb") as f:  # regular files carry payload data
                self.addfile(tarinfo, f)
        elif tarinfo.isdir():
            self.addfile(tarinfo)
                for f in os.listdir(name):  # recurse into directory entries
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)
            self.addfile(tarinfo)  # other types (fifo, link, device) have no payload
2595 def _size_left_file(self):
2596 """Calculates size left in a volume with a maximum volume size.
2598 Assumes self.max_volume_size is set.
2599 If using compression through a _Stream, use _size_left_stream instead
2601 # left-over size = max_size - offset - 2 zero-blocks written in close
2602 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2603 # limit size left to a discrete number of blocks, because we won't
2604 # write only half a block when writting the end of a volume
2605 # and filling with zeros
2606 return BLOCKSIZE * (size_left // BLOCKSIZE)
    def _size_left_stream(self):
        """ Calculates size left in a volume if using comression/encryption

        Assumes self.max_volume_size is set and self.fileobj is a _Stream
        (otherwise use _size_left_file)
        # left-over size = max_size - bytes written - 2 zero-blocks (close)
        size_left = self.max_volume_size - self.fileobj.estim_file_size() \
        return BLOCKSIZE * (size_left // BLOCKSIZE)  # round down to whole blocks
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
        given, tarinfo.size bytes are read from it and added to the archive.
        You can create TarInfo objects using gettarinfo().
        On Windows platforms, `fileobj' should always be opened with mode
        'rb' to avoid irritation about the file size.
        tarinfo = copy.copy(tarinfo)  # never mutate the caller's TarInfo
        if self.arcmode & ARCMODE_CONCAT:
            self.last_block_offset = self.fileobj.next (tarinfo.name)  # start a new concat object for this member
            self.last_block_offset = self.fileobj.tell()
        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)  # member header
        self.offset += len(buf)
        if self.max_volume_size:
            if isinstance(self.fileobj, _Stream):
                _size_left = self._size_left_stream  # compressed/encrypted output: estimate via the stream
                _size_left = self._size_left_file
            _size_left = lambda: tarinfo.size  # no volume limit: never split
        # If there's no data to follow, finish
            if self.save_to_members:
                self.members.append(tarinfo)
        target_size_left = _size_left()
        source_size_left = tarinfo.size
        assert tarinfo.volume_offset == 0  # a fresh member always starts at offset 0
        # we only split volumes in the middle of a file, that means we have
        # to write at least one block
        if target_size_left < BLOCKSIZE:
            target_size_left = BLOCKSIZE
        # loop over multiple volumes
        while source_size_left > 0:
            # Write as much data as possble from source into target.
            # When compressing data, we cannot easily predict how much data we
            # can write until target_size_left == 0 --> need to iterate
            size_can_write = min(target_size_left, source_size_left)
            while size_can_write > 0:
                copyfileobj(fileobj, self.fileobj, size_can_write)
                self.offset += size_can_write
                source_size_left -= size_can_write
                target_size_left = _size_left()  # re-estimate; compression changes the budget
                size_can_write = min(target_size_left, source_size_left)
            # now target_size_left == 0 or source_size_left == 0
            # if there is data left to write, we need to create a new volume
            if source_size_left > 0:
                # Only finalize the crypto entry here if we’re continuing with
                # another one; otherwise, the encryption must include the block
                tarinfo.type = GNUTYPE_MULTIVOL  # mark the member as continued on the next volume
                if not self.new_volume_handler or\
                        not callable(self.new_volume_handler):
                    raise Exception("We need to create a new volume and you "
                                    "didn't supply a new_volume_handler")
                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                # set to be used by open_volume, because in the case of a PAX
                # tar it needs to write information about the volume and offset
                # in the global header
                tarinfo.volume_offset = tarinfo.size - source_size_left  # bytes of this member already written
                self.volume_tarinfo = tarinfo
                # the “new_volume_handler” is supposed to call .close() on the
                self.new_volume_handler(self, self.base_name, self.volume_number)
                self.volume_tarinfo = None
                if self.arcmode & ARCMODE_CONCAT:
                    self.fileobj.next_volume (tarinfo.name)
                # write new volume header
                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
                self.fileobj.write(buf)
                self.offset += len(buf)
                # adjust variables; open_volume should have reset self.offset
                # --> _size_left should be big again
                target_size_left = _size_left()
                size_can_write = min(target_size_left, source_size_left)
                self._dbg(3, 'new volume')
        # now, all data has been written. We may have to fill up the rest of
        # the block in target with 0s
        remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
            self.fileobj.write(NUL * (BLOCKSIZE - remainder))  # zero-pad the last partial block
            self.offset += BLOCKSIZE - remainder
        if self.save_to_members:
            self.members.append(tarinfo)
    def open_volume(self, name="", fileobj=None, encryption=None):
        """
        Called by the user to change this tar file to point to a new volume.
        # open the file using either fileobj or name
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
            self._extfileobj = False  # we open the volume ourselves
            if isinstance(self.fileobj, _Stream):
                self._dbg(3, 'open_volume: create a _Stream')
                fileobj = _Stream(name=name,
                                  mode=self.fileobj.mode,
                                  comptype=self.fileobj.comptype,
                                  bufsize=self.fileobj.bufsize,
                                  encryption=encryption or self.fileobj.encryption,
                                  concat=self.fileobj.arcmode & ARCMODE_CONCAT)
                # here, we lose information about compression/encryption!
                self._dbg(3, 'open_volume: builtin open')
                fileobj = bltn_open(name, self._mode)
            if name is None and hasattr(fileobj, "name"):
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True  # caller owns the file object
            self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj
        # init data structures
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()  # prime the iterator with the first member
            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                    self.fileobj.seek(self.offset)
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)  # rewind to before the empty block
                    except HeaderError as e:
                        raise ReadError(str(e))
            if self.mode in "aw":
                if self.format == PAX_FORMAT:
                    # record which member continues into this volume and where
                        "GNU.volume.filename": str(self.volume_tarinfo.name),
                        "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                        "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                    self.pax_headers.update(volume_info)
                    if isinstance(self.fileobj, _Stream):
                        self.fileobj._init_write_gz ()  # presumably re-initializes the compressor for this volume — confirm
                    buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except Exception as exn:
            if not self._extfileobj:
                self.fileobj.close()  # do not leak the volume we just opened
    def extractall(self, path=".", members=None, filter=None):
        """Extract all members from the archive to the current working
        directory and set owner, modification time and permissions on
        directories afterwards. `path' specifies a different directory
        to extract to. `members' is optional and must be a subset of the
        list returned by getmembers().
        for tarinfo in members:
            if self.volume_number > 0 and tarinfo.ismultivol():
            if filter and not filter(tarinfo):
                # Extract directories with a safe mode.
                directories.append(tarinfo)  # remember for attribute fix-up below
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o0700
            # Do not set_attrs directories, as we will do that further down
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
        # Reverse sort directories.
        directories.sort(key=lambda a: a.name)
        directories.reverse()  # deepest paths first, so children are finalized before parents
        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
                self.chown(tarinfo, dirpath)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                if self.errorlevel > 1:
                    self._dbg(1, "tarfile: %s" % e)  # best-effort: log instead of raising
    def extract(self, member, path="", set_attrs=True, symlink_cb=None):
        """Extract a member from the archive to the current working directory,
        using its full name. Its file information is extracted as accurately
        as possible. `member' may be a filename or a TarInfo object. You can
        specify a different directory using `path'. File attributes (owner,
        mtime, mode) are set unless `set_attrs' is False.
        ``symlink_cb`` is a hook accepting a function that is passed the
        ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
        ``member`` indicates a symlink in which case only the callback
        passed will be applied, skipping the actual extraction. In case the
        callback is invoked, its return value is passed on to the caller.
        if isinstance(member, str):
            tarinfo = self.getmember(member)  # resolve name to TarInfo
        # Prepare the link target for makelink().
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)
        if symlink_cb is not None and tarinfo.issym():
            return symlink_cb(member, path, set_attrs)  # delegate symlink handling entirely
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs)
        except EnvironmentError as e:
            if self.errorlevel > 0:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            if self.errorlevel > 1:
                self._dbg(1, "tarfile: %s" % e)  # low errorlevel: log only
    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
        a filename or a TarInfo object. If `member' is a regular file or a
        link, an io.BufferedReader object is returned. Otherwise, None is
        if isinstance(member, str):
            tarinfo = self.getmember(member)
        if tarinfo.isreg() or tarinfo.ismultivol() or\
            tarinfo.type not in SUPPORTED_TYPES:
            # If a member's type is unknown, it is treated as a
            return self.fileobject(self, tarinfo)  # regular-file-like reader over the member's data
        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        # If there's no data associated with the member (directory, chrdev,
        # blkdev, etc.), return None instead of a file object.
    def _extract_member(self, tarinfo, targetpath, set_attrs=True):
        """Extract the TarInfo object tarinfo to a physical
        file called targetpath.
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)
        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)
        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
            self._dbg(1, tarinfo.name)
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)  # unknown types fall back to a regular file
            self.makefile(tarinfo, targetpath)
            self.chown(tarinfo, targetpath)
            if not tarinfo.issym():  # symlink permissions/mtime are not meaningful on most systems
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)
2982 #--------------------------------------------------------------------------
2983 # Below are the different file methods. They are called via
2984 # _extract_member() when extract() is called. They can be replaced in a
2985 # subclass to implement other functionality.
    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
            os.mkdir(targetpath, 0o0700)
        except FileExistsError:
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        source = self.fileobj
        source.seek(tarinfo.offset_data)  # jump to the member's payload
        target = bltn_open(targetpath, "wb")
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:  # write each data region at its hole offset
                copyfileobj(source, target, size)
            target.seek(tarinfo.size)  # extend the file to its full (sparse) size
                copyfileobj(source, target, tarinfo.size)  # may hit EOF mid-member at a volume boundary
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")
                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                tarinfo = self.firstmember  # continuation header of the same member on the new volume
                source = self.fileobj
    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
        self.makefile(tarinfo, targetpath)  # best effort: treat it as a regular file
        self._dbg(1, "tarfile: Unknown file type %r, " \
                     "extracted as regular file." % tarinfo.type)
    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        if hasattr(os, "mkfifo"):  # not available on all platforms
            os.mkfifo(targetpath)
            raise ExtractError("fifo not supported by system")
    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")
            mode |= stat.S_IFBLK  # block device
            mode |= stat.S_IFCHR  # character device
        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
        (platform limitation), we try to make a copy of the referenced file
            # For systems that support symbolic and hard links.
                os.symlink(tarinfo.linkname, targetpath)
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)  # hard link to the already-extracted target
                    self._extract_member(self._find_link_target(tarinfo),
        except symlink_exception:  # platform cannot create links; fall back to a file copy
                self._extract_member(self._find_link_target(tarinfo),
                raise ExtractError("unable to resolve link inside archive")
    def chown(self, tarinfo, targetpath):
        """Set owner of targetpath according to tarinfo.
        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
                g = grp.getgrnam(tarinfo.gname)[2]  # prefer symbolic names over numeric ids
                u = pwd.getpwnam(tarinfo.uname)[2]
                if tarinfo.issym() and hasattr(os, "lchown"):
                    os.lchown(targetpath, u, g)  # change the link itself, not its target
                    os.chown(targetpath, u, g)
            except OSError as e:
                raise ExtractError("could not change owner")
    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        if hasattr(os, 'chmod'):  # skip silently on platforms without chmod
                os.chmod(targetpath, tarinfo.mode)
            except OSError as e:
                raise ExtractError("could not change mode")
    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        if not hasattr(os, 'utime'):  # skip silently on platforms without utime
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))  # atime and mtime both set to the archived mtime
        except OSError as e:
            raise ExtractError("could not change modification time")
3134 #--------------------------------------------------------------------------
        """Return the next member of the archive as a TarInfo object, when
        TarFile is opened for reading. Return None if there is no more
        if self.firstmember is not None:  # a member was pre-read (e.g. by open_volume)
            m = self.firstmember
            self.firstmember = None
        # Read the next block.
        self.fileobj.seek(self.offset)
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:  # tolerate zero blocks mid-archive and keep scanning
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                elif self.offset == 0:  # a bad header at offset 0 means this is not a tar file
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
        if tarinfo is not None:
            if self.save_to_members:
                self.members.append(tarinfo)
3182 #--------------------------------------------------------------------------
3183 # Little helper methods:
    def _getmember(self, name, tarinfo=None, normalize=False):
        """Find an archive member by name from bottom to top.
        If tarinfo is given, it is used as the starting point.
        # Ensure that all members have been loaded.
        members = self.getmembers()
        # Limit the member search list up to tarinfo.
        if tarinfo is not None:
            members = members[:members.index(tarinfo)]
            name = os.path.normpath(name)  # normalize so equivalent paths compare equal
        for member in reversed(members):  # later occurrences win
                member_name = os.path.normpath(member.name)
                member_name = member.name
            if name == member_name:
        """Read through the entire archive file and look for readable
            tarinfo = self.next()  # advances through headers until exhausted
    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
        corresponds to TarFile's mode.
            raise OSError("%s is closed" % self.__class__.__name__)
        if mode is not None and self.mode not in mode:
            raise OSError("bad operation for mode %r" % self.mode)
    def _find_link_target(self, tarinfo):
        """Find the target member of a symlink or hardlink member in the
            # Always search the entire archive.
            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))  # symlink target is relative to the member's directory
            # Search the archive before the link, because a hard link is
            # just a reference to an already archived file.
            linkname = tarinfo.linkname
        member = self._getmember(linkname, tarinfo=limit, normalize=True)
            raise KeyError("linkname %r not found" % linkname)
        """Provide an iterator object.
            return iter(self.members)  # fully loaded: iterate the cached list
            return TarIter(self)  # lazy: read members on demand
    def _dbg(self, level, msg, *args):
        """Write debugging output to sys.stderr.
        if level <= self.debug:  # only emit when verbose enough
            print(msg.format(*args), file=sys.stderr)
    def __enter__(self):  # context-manager entry; returns the open TarFile
    def __exit__(self, type, value, traceback):
            # An exception occurred. We must not call close() because
            # it would try to write end-of-archive blocks and padding.
            if not self._extfileobj:
                self.fileobj.close()  # still release the underlying file we own
3278 for tarinfo in TarFile(...):
    def __init__(self, tarfile):
        """Construct a TarIter object.
        self.tarfile = tarfile  # the TarFile being iterated lazily
3288 """Return iterator object.
3292 """Return the next item using TarFile's next() method.
3293 When all members have been read, set TarFile as _loaded.
3295 # Fix for SF #1100429: Under rare circumstances it can
3296 # happen that getmembers() is called during iteration,
3297 # which will cause TarIter to stop prematurely.
        if self.index == 0 and self.tarfile.firstmember is not None:
            tarinfo = self.tarfile.next()  # consume the pre-read first member
        elif self.index < len(self.tarfile.members):
            tarinfo = self.tarfile.members[self.index]  # already cached
        elif not self.tarfile._loaded:
            tarinfo = self.tarfile.next()  # read the next header from the archive
                self.tarfile._loaded = True
3314 #---------------------------------------------------------
3315 # support functionality for rescue mode
3316 #---------------------------------------------------------
def locate_gz_hdr_candidates (fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)  # map the whole file read-only
        pos = mm.find (GZ_MAGIC_BYTES, pos)  # -1 when no further magic is found
        pos += len (GZ_MAGIC_BYTES)  # continue the scan after this hit
# Verdicts returned by inspect_gz_hdr() for a header candidate:
HDR_CAND_GOOD       = 0 # header marks begin of valid object
HDR_CAND_FISHY      = 1 # inconclusive
HDR_CAND_JUNK       = 2 # not a header / object unreadable
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
        if max >= 0 and l > max:  # length budget exhausted before the NUL
    if encoding is not None:
        buf = buf.decode (encoding)  # otherwise the raw bytes are returned
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn’t conform
    to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
    validation is possible on embedded strings because they are single-byte
    verdict = HDR_CAND_GOOD  # downgraded to FISHY/JUNK as checks fail
    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off:  # seek failed (offset past EOF etc.)
        return HDR_CAND_JUNK, None
    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE:  # truncated header
        return HDR_CAND_JUNK, None
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error as exn:
        return HDR_CAND_JUNK, None
    if mtime > int (time.time ()):  # timestamps from the future are suspicious
        verdict = HDR_CAND_FISHY
    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY
    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        xlen = struct.unpack ("<H", os.read (fd, 2))  # NOTE(review): struct.unpack returns a tuple — this likely needs [0] before being used as a length below; confirm against upstream
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off  # total header length actually consumed
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)  # raw deflate: the gzip header was parsed separately
    dlen = 0 # size of decompressed data
        os.lseek (ifd, pos, os.SEEK_SET)
        cnk = os.read (ifd, BUFSIZE)
            data = decmp.decompress (cnk)
        except zlib.error as exn: # probably CRC32 mismatch; terminate softly
        if decmp.eof is True:  # end of this deflate stream reached
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
    return dlen, pos - off
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            pass # ignore unreadable ones
        elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
            off0 = cand + hdr ["hlen"]  # payload begins right after the parsed header
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0:  # only keep candidates that actually decompress
3511 def reconstruct_offsets_gz (fname):
3513     From the given file, retrieve all GZ header-like offsets (“candidates”).
3514     Then check each of those locations whether they can be processed as
# Open read-only; the raw fd is handed to the scanning helpers below.
# NOTE(review): cleanup of *ifd* (likely a try/finally with os.close) is not
# visible in this view — confirm against the full source.
3517     ifd = os.open (fname, os.O_RDONLY)
# First collect byte offsets that look like gzip headers, then filter them
# down to the ones that decompress successfully.
3520         cands = locate_gz_hdr_candidates (ifd)
3521         return readable_gz_objects_offsets (ifd, cands)
3526 def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
# Build a decryption context when a secret is supplied; *secret* appears to
# be a (kind, value) pair — confirm against crypto.make_secret.
3529     if secret is not None:
3532         if ks == crypto.PDTCRYPT_SECRET_PW:
# Password-based secret: the crypto layer derives the key itself.
3533             decr = crypto.Decrypt (password=secret [1])
3534         elif ks == crypto.PDTCRYPT_SECRET_KEY:
# Key secrets arrive hex-encoded; decode to raw bytes first.
3535             key = binascii.unhexlify (secret [1])
3536             decr = crypto.Decrypt (key=key)
# Open the archive at the given byte offset; TOLERANCE_RESCUE makes the
# reader maximally forgiving so damaged archives can still be scanned.
# NOTE(review): several keyword arguments of this call are not visible in
# this view.
3541             TarFile.open_at_offset (offset,
3547                                     save_to_members=False,
3548                                     tolerance=TOLERANCE_RESCUE)
# Return the first member found at that offset.
3550     return tarobj.next ()
3553 def idxent_of_tarinfo (tarinfo):
3555     Scrape the information relevant for the index from a *TarInfo* object.
3556     Keys like the inode number that lack a corresponding field in a TarInfo
3557     will be set to some neutral value.
3562         , "path" : "snapshot://annotations.db"
3566         , "ctime" : 1502798115
3567         , "mtime" : 1502196423
# The returned dict mirrors the on-disk index entry format; fields TarInfo
# cannot supply are filled with neutral placeholders.
3576     { "inode" : 0 # ignored when reading the index
3577     , "uid" : tarinfo.uid
3578     , "gid" : tarinfo.gid
3579     , "path" : tarinfo.name # keeping URI scheme
3580     , "offset" : 0 # to be added by the caller
3581     , "volume" : tarinfo.volume_offset
3582     , "mode" : tarinfo.mode
# TarInfo has no ctime field, so mtime is reused for both timestamps.
3583     , "ctime" : tarinfo.mtime
3584     , "mtime" : tarinfo.mtime
3585     , "size" : tarinfo.size
3586     , "type" : tarinfo.type
3590 def gen_rescue_index (backup_tar_path, mode, password=None, key=None):
3591     psidx = [] # pseudo index, return value
3593     secret = crypto.make_secret (password=password, key=key)
# Encrypted archives: member offsets are recovered through the crypto layer;
# otherwise fall back to scanning the file for gzip headers.
3595     if secret is not None:
3596         offsets = crypto.reconstruct_offsets (backup_tar_path, secret)
3598         offsets = reconstruct_offsets_gz (backup_tar_path)
# Re-read a TarInfo at every recovered offset; each entry is paired with its
# offset so the index entry can record where the member starts.
3600     fileobj = bltn_open (backup_tar_path, "rb")
3601     infos = [ (off, read_tarobj_at_offset (fileobj, off, mode, secret=secret))
3602               for off in offsets ]
# Convert each TarInfo into an index entry (helper definition not visible in
# this view).
3604         ie = idxent_of_tarinfo (ti)
3607     psidx = [ aux (o, ti) for o, ti in infos ]
3611 #--------------------
3612 # exported functions
3613 #--------------------
3614 def is_tarfile(name):
3615 """Return True if name points to a tar archive that we
3616 are able to handle, else return False.