2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
59 import traceback # XXX
68 # os.symlink on Windows prior to 6.0 raises NotImplementedError
69 symlink_exception = (AttributeError, NotImplementedError)
71 # OSError (winerror=1314) will be raised if the caller does not hold the
72 # SeCreateSymbolicLinkPrivilege privilege
73 symlink_exception += (OSError,)
77 # from tarfile import *
78 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
80 from builtins import open as _open # Since 'open' is TarFile.open
82 #---------------------------------------------------------
84 #---------------------------------------------------------
85 NUL = b"\0" # the null character
86 BLOCKSIZE = 512 # length of processing blocks
87 RECORDSIZE = BLOCKSIZE * 20 # length of records
88 GNU_MAGIC = b"ustar \0" # magic gnu tar string
89 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
91 LENGTH_NAME = 100 # maximum length of a filename
92 LENGTH_LINK = 100 # maximum length of a linkname
93 LENGTH_PREFIX = 155 # maximum length of the prefix field
95 REGTYPE = b"0" # regular file
96 AREGTYPE = b"\0" # regular file
97 LNKTYPE = b"1" # link (inside tarfile)
98 SYMTYPE = b"2" # symbolic link
99 CHRTYPE = b"3" # character special device
100 BLKTYPE = b"4" # block special device
101 DIRTYPE = b"5" # directory
102 FIFOTYPE = b"6" # fifo special device
103 CONTTYPE = b"7" # contiguous file
105 GNUTYPE_LONGNAME = b"L" # GNU tar longname
106 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
107 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
108 GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on
111 XHDTYPE = b"x" # POSIX.1-2001 extended header
112 XGLTYPE = b"g" # POSIX.1-2001 global header
113 SOLARIS_XHDTYPE = b"X" # Solaris extended header
115 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
116 GNU_FORMAT = 1 # GNU tar format
117 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
118 DEFAULT_FORMAT = GNU_FORMAT
120 GZ_FMT_HEADER = b"<BBBBLBB"
121 GZ_HEADER_SIZE = 10 # not including the name
122 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
123 GZ_METHOD_DEFLATE = 0x08 # 0o10
124 GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
125 GZ_FLAG_FHCRC = 1 << 1 # CRC16
126 GZ_FLAG_FEXTRA = 1 << 2 # extra field
127 GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
128 GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
129 GZ_FLAG_RESERVED = 7 << 5 # unassigned
130 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
131 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
132 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
133 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
137 TOLERANCE_RECOVER = 1 # rely on offsets in index
138 TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
142 #---------------------------------------------------------
143 # archive handling mode
144 #---------------------------------------------------------
147 ARCMODE_ENCRYPT = 1 << 0
148 ARCMODE_COMPRESS = 1 << 1
149 ARCMODE_CONCAT = 1 << 2
152 if m == ARCMODE_PLAIN:
156 def chkappend (b, s):
161 if first is True: first = False
164 chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
165 chkappend (ARCMODE_COMPRESS, "COMPRESS")
166 chkappend (ARCMODE_CONCAT, "CONCAT")
170 def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
172 if bool (concat) is True:
173 ret |= ARCMODE_CONCAT
174 if encryption is not None:
175 ret |= ARCMODE_ENCRYPT
177 ret |= ARCMODE_COMPRESS
180 #---------------------------------------------------------
182 #---------------------------------------------------------
183 # File types that tarfile supports:
184 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
185 SYMTYPE, DIRTYPE, FIFOTYPE,
186 CONTTYPE, CHRTYPE, BLKTYPE,
187 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
188 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
190 # File types that will be treated as a regular file.
191 REGULAR_TYPES = (REGTYPE, AREGTYPE,
192 CONTTYPE, GNUTYPE_SPARSE)
194 # File types that are part of the GNU tar format.
195 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
196 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
198 # Fields from a pax header that override a TarInfo attribute.
199 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
200 "uid", "gid", "uname", "gname")
202 # Fields from a pax header that are affected by hdrcharset.
203 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
205 # Fields in a pax header that are numbers, all other fields
206 # are treated as strings.
207 PAX_NUMBER_FIELDS = {
216 #---------------------------------------------------------
218 #---------------------------------------------------------
220 if os.name in ("nt", "ce"):
223 ENCODING = sys.getfilesystemencoding()
225 #---------------------------------------------------------
226 # Some useful functions
227 #---------------------------------------------------------
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

    The encoded string is truncated to *length* bytes and padded
    with NULs up to that length (see the return expression).
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

    Everything from the first NUL byte onwards is discarded before
    decoding with the given encoding and error handler; without the
    truncation the trailing NUL padding would leak into the result.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)
def sbtn(s, length, encoding, errors):
    """Convert a string or a bunch of bytes to a null-terminated bytes object.

    str input is encoded first; the result is truncated to *length*
    bytes and NUL-padded, like stn() but accepting bytes unchanged.
    """
    if isinstance(s, str):
        s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL
252 """Convert a number field to a python number.
254 # There are two possible encodings for a number field, see
256 if s[0] in (0o200, 0o377):
258 for i in range(len(s) - 1):
262 n = -(256 ** (len(s) - 1) - n)
265 n = int(nts(s, "ascii", "strict") or "0", 8)
267 raise InvalidHeaderError("invalid header")
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # store negative values as two's complement of 256**digits
            n = 256 ** digits + n

        for i in range(digits - 1):
            # emit base-256 digits big-endian by inserting after the marker
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
    characters except for the chksum field which is treated as if
    it was filled with spaces. According to the GNU tar sources,
    some tars (Sun and NeXT) calculate chksum with signed char,
    which will be different if there are chars in the buffer with
    the high bit set. So we calculate two checksums, unsigned and
    signed.
    """
    # 148B/148b = bytes before the chksum field, 8x skips the 8-byte
    # chksum field itself, 356B/356b = the rest of the 512-byte header.
    # 256 accounts for the skipped field counted as 8 spaces (8 * 0x20).
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
    If length is None, copy the entire content.

    Raises OSError if src is exhausted before *length* bytes were read.
    """
    if length == 0:
        return
    if length is None:
        # unbounded copy: delegate to the stdlib helper
        shutil.copyfileobj(src, dst)
        return

    blocks, remainder = divmod(length, BUFSIZE)
    for b in range(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise OSError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise OSError("end of file reached")
        dst.write(buf)
    return
335 """Deprecated in this location; use stat.filemode."""
337 warnings.warn("deprecated in favor of stat.filemode",
338 DeprecationWarning, 2)
339 return stat.filemode(mode)
class TarError(Exception):
    """Root of the tar error hierarchy; all archive errors derive from it."""
class ExtractError(TarError):
    """Raised for general errors while extracting members."""
class ReadError(TarError):
    """Raised when a tar archive cannot be read or parsed."""
class CompressionError(TarError):
    """Raised when a required compression method is unavailable."""
class StreamError(TarError):
    """Raised for operations unsupported on stream-like TarFiles."""
class HeaderError(TarError):
    """Base class for all member-header errors."""
class EmptyHeaderError(HeaderError):
    """Raised when a header block is entirely empty."""
class TruncatedHeaderError(HeaderError):
    """Raised when a header block is cut short."""
class EOFHeaderError(HeaderError):
    """Raised when a header block signals end of file."""
class InvalidHeaderError(HeaderError):
    """Raised when a header block fails validation."""
class SubsequentHeaderError(HeaderError):
    """Raised for missing or invalid extended headers."""
class InvalidEncryptionError(TarError):
    """Raised for undefined crypto modes and combinations."""
class DecryptionError(TarError):
    """Raised when decrypting archive data fails."""
class EncryptionError(TarError):
    """Raised when encrypting archive data fails."""
class EndOfFile(Exception):
    """Signals an end-of-file condition that is not an error."""
387 #---------------------------
388 # internal stream interface
389 #---------------------------
391 """Low-level file object. Supports reading and writing.
392 It is used instead of a regular file object for streaming
396 def __init__(self, name, mode):
399 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
401 if hasattr(os, "O_BINARY"):
402 _mode |= os.O_BINARY # pylint: disable=no-member
403 self.fd = os.open(name, _mode, 0o666)
409 def read(self, size):
410 ret = os.read(self.fd, size)
411 self.offset += len(ret)
414 def write(self, s, pos=None):
417 os.lseek (self.fd, pos, os.SEEK_SET)
418 n = os.write(self.fd, s)
420 self.offset += len(s)
422 append = pos + n - p0
424 self.offset += append
425 os.lseek (self.fd, p0, os.SEEK_SET)
430 def seek_set (self, pos):
431 os.lseek (self.fd, pos, os.SEEK_SET)
435 def gz_header (name=None):
436 timestamp = int(time.time())
442 flags |= GZ_FLAG_FNAME
443 if type(name) is str:
444 name = name.encode("iso-8859-1", "replace")
445 if name.endswith(b".pdtcrypt"):
447 if name.endswith(b".gz"):
449 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
452 hdr = struct.pack (GZ_FMT_HEADER,
453 GZ_MAGIC [0], GZ_MAGIC [1],
454 GZ_METHOD_DEFLATE, flags,
456 GZ_DEFLATE_FLAGS, GZ_OS_CODE)
462 """Class that serves as an adapter between TarFile and
463 a stream-like object. The stream-like object only
464 needs to have a read() or write() method and is accessed
465 blockwise. Use of gzip or bzip2 compression is possible.
466 A stream-like object could be for example: sys.stdin,
467 sys.stdout, a socket, a tape device etc.
469 _Stream is intended to be used only internally but is
470 nevertherless used externally by Deltatar.
472 When encrypting, the ``enccounter`` will be used for
473 initializing the first cryptographic context. When
474 decrypting, its value will be compared to the decrypted
475 object. Decryption fails if the value does not match.
476 In effect, this means that a ``_Stream`` whose ctor was
477 passed ``enccounter`` can only be used to encrypt or
478 decrypt a single object.
481 remainder = -1 # track size in encrypted entries
482 tolerance = TOLERANCE_STRICT
484 def __init__(self, name, mode, comptype, fileobj, bufsize,
485 concat=False, encryption=None, enccounter=None,
486 compresslevel=9, tolerance=TOLERANCE_STRICT):
487 """Construct a _Stream object.
489 self.arcmode = arcmode_set (concat, encryption, comptype)
490 self.tolerance = tolerance
492 self._extfileobj = True
494 fileobj = _LowLevelFile(name, mode)
495 self._extfileobj = False
498 # Enable transparent compression detection for the
500 fileobj = _StreamProxy(fileobj)
501 comptype = fileobj.getcomptype()
505 self.enccounter = None
506 if self.arcmode & ARCMODE_ENCRYPT:
507 self.enccounter = enccounter
509 self.name = name or ""
511 self.comptype = comptype
513 self.fileobj = fileobj
514 self.bufsize = bufsize
520 self.last_block_offset = 0
521 self.dbuf = b"" # ???
522 self.exception = None # communicate decompression failure
523 self.compresslevel = compresslevel
524 self.bytes_written = 0
526 self.encryption = encryption
534 raise CompressionError("zlib module is not available")
537 self.exception = zlib.error
540 if not (self.arcmode & ARCMODE_CONCAT):
541 if self.arcmode & ARCMODE_ENCRYPT:
542 self._init_write_encrypt (name)
543 self._init_write_gz ()
544 self.crc = zlib.crc32(b"") & 0xFFFFffff
546 elif comptype == "bz2":
547 if self.arcmode & ARCMODE_ENCRYPT:
548 raise InvalidEncryptionError("encryption not available for "
549 "compression “%s”" % comptype)
553 raise CompressionError("bz2 module is not available")
556 self.cmp = bz2.BZ2Decompressor()
557 self.exception = OSError
559 self.cmp = bz2.BZ2Compressor()
561 elif comptype == 'xz':
562 if self.arcmode & ARCMODE_ENCRYPT:
563 raise InvalidEncryptionError("encryption not available for "
564 "compression “%s”" % comptype)
568 raise CompressionError("lzma module is not available")
571 self.cmp = lzma.LZMADecompressor()
572 self.exception = lzma.LZMAError
574 self.cmp = lzma.LZMACompressor()
576 elif comptype == "tar":
577 if not (self.arcmode & ARCMODE_CONCAT) \
579 and self.arcmode & ARCMODE_ENCRYPT:
580 self._init_write_encrypt (name)
583 if self.arcmode & ARCMODE_ENCRYPT:
584 raise InvalidEncryptionError("encryption not available for "
585 "compression “%s”" % comptype)
586 raise CompressionError("unknown compression type %r" % comptype)
589 if not self._extfileobj:
595 if hasattr(self, "closed") and not self.closed:
598 except crypto.InternalError:
599 # context already finalized due to abort but close() tried
604 def next (self, name):
605 if self.arcmode & ARCMODE_COMPRESS:
606 if getattr (self, "cmp", None) is not None:
607 self._finalize_write_gz ()
609 if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
610 self.last_block_offset = self.fileobj.tell()
611 if self.arcmode & ARCMODE_ENCRYPT:
612 self._finalize_write_encrypt ()
613 self._init_write_encrypt (name, set_last_block_offset=True)
614 if self.arcmode & ARCMODE_COMPRESS:
615 self._init_write_gz (set_last_block_offset =
616 not (self.arcmode & ARCMODE_ENCRYPT))
617 return self.last_block_offset
def next_volume (self, name):
    """Prepare the stream for a new archive volume named *name*.

    Finalizes any gzip member still in progress, then starts fresh
    encryption and/or compression contexts depending on arcmode.
    NOTE(review): appears to be used only in concat mode -- with
    non-concat modes the ctor handles this (see comment below); confirm.
    """
    # with non-concat modes, this is taken care by the _Stream
    # ctor as invoked by the newvol handler
    if self.arcmode & ARCMODE_COMPRESS:
        if getattr (self, "cmp", None) is not None:
            # e. g. compressed PAX header written
            self._finalize_write_gz ()
    if self.arcmode & ARCMODE_ENCRYPT:
        self._init_write_encrypt (name)
    if self.arcmode & ARCMODE_COMPRESS:
        self._init_write_gz ()
633 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
635 Save position for delayed write of header; fill the header location
638 # first thing, proclaim new object to the encryption context
639 # secondly, assemble the header with the updated parameters
640 # and commit it directly to the underlying stream, bypassing the
641 # encryption layer in .__write().
642 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
644 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
645 self.lasthdr = self.fileobj.tell()
646 self.__write_to_file(dummyhdr)
647 if set_last_block_offset is True:
648 self.last_block_offset = self.lasthdr
651 def _finalize_write_encrypt (self):
653 Seek back to header position, read dummy bytes, finalize crypto
654 obtaining the actual header, write header, seek back to current
657 Returns the list of IV fixed parts as used during encryption.
659 if self.lasthdr is not None:
660 pos0 = self.fileobj.tell ()
661 self.fileobj.seek_set (self.lasthdr)
662 dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
663 pos1 = self.fileobj.tell ()
664 dpos = pos1 - self.lasthdr
665 assert dpos == crypto.PDTCRYPT_HDR_SIZE
666 self.fileobj.seek_set (pos0)
667 data, hdr, _ = self.encryption.done (dummy)
668 self.__write_to_file(hdr, pos=self.lasthdr)
669 self.__write_to_file(data) # append remainder of data
673 def _finalize_write_gz (self):
674 if self.cmp is not None:
675 chunk = self.buf + self.cmp.flush()
677 if self.comptype == "gz":
678 # The native zlib crc is an unsigned 32-bit integer, but
679 # the Python wrapper implicitly casts that to a signed C
680 # long. So, on a 32-bit box self.crc may "look negative",
681 # while the same crc on a 64-bit box may "look positive".
682 # To avoid irksome warnings from the `struct` module, force
683 # it to look positive on all boxes.
684 chunk += struct.pack("<L", self.crc & 0xffffffff)
685 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
686 self.__enc_write (chunk)
690 def _init_write_gz (self, set_last_block_offset=False):
692 Add a new gzip block, closing last one
695 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
696 first = self.cmp is None
697 self.cmp = self.zlib.compressobj(self.compresslevel,
699 -self.zlib.MAX_WBITS,
700 self.zlib.DEF_MEM_LEVEL,
703 # if aes, we encrypt after compression
704 if set_last_block_offset is True:
705 self.last_block_offset = self.fileobj.tell()
707 self.__write(gz_header (self.name if first is True else None))
711 """Write string s to the stream.
713 if self.comptype == "gz":
714 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
716 self.concat_pos += len(s)
717 if self.cmp is not None:
718 s = self.cmp.compress(s)
722 """Write what’s left in the buffer to the stream."""
723 self.__write (b"") # → len (buf) <= bufsiz
724 self.__enc_write (self.buf)
727 def __write(self, s):
728 """Writes (and encodes) string s to the stream blockwise
730 will wait with encoding/writing until block is complete
733 while len(self.buf) > self.bufsize:
734 self.__enc_write(self.buf[:self.bufsize])
735 self.buf = self.buf[self.bufsize:]
738 def __write_to_file(self, s, pos=None):
740 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
741 given, the stream will seek to that position first and back afterwards,
742 and the total of bytes written is not updated.
744 self.fileobj.write(s, pos)
746 self.bytes_written += len(s)
749 def __enc_write(self, s):
751 If encryption is active, the string s is encrypted before being written
756 if self.arcmode & ARCMODE_ENCRYPT:
759 n, ct = self.encryption.process(buf)
760 self.__write_to_file(ct)
763 # The entire plaintext was not consumed: The size limit
764 # for encrypted objects was reached. Transparently create
765 # a new encrypted object and continue processing the input.
766 self._finalize_write_encrypt ()
767 self._init_write_encrypt ()
769 self.__write_to_file(s)
772 def estim_file_size(self):
773 """ estimates size of file if closing it now
775 The result may differ greatly from the amount of data sent to write()
776 due to compression, encryption and buffering.
778 In tests the result (before calling close()) was up to 12k smaller than
779 the final file size if compression is being used because zlib/bz2
780 compressors do not allow inspection of their buffered data :-(
782 Still, we add what close() would add: 8 bytes for gz checksum, one
783 encryption block size if encryption is used and the size of our own
787 return self.bytes_written
789 result = self.bytes_written
791 result += len(self.buf)
792 if self.comptype == 'gz':
793 result += 8 # 2 longs = 8 byte (no extra info written for bzip2)
796 def close(self, close_fileobj=True):
797 """Close the _Stream object. No operation should be
798 done on it afterwards.
804 if close_fileobj is True:
807 if self.arcmode & ARCMODE_COMPRESS:
808 self._finalize_write_gz ()
809 # end of Tar archive marker (two empty blocks) was written
810 # finalize encryption last; no writes may be performed after
813 if self.arcmode & ARCMODE_ENCRYPT:
814 self._finalize_write_encrypt ()
816 if not self._extfileobj:
819 # read the zlib crc and length and check them
820 if self.mode == "r" and self.comptype == "gz":
821 read_crc = self.__read(4)
822 read_length = self.__read(4)
823 calculated_crc = self.crc
824 if struct.unpack("<L", read_crc)[0] != calculated_crc:
825 raise CompressionError("bad gzip crc")
829 def _init_read_gz(self):
830 """Initialize for reading a gzip compressed fileobj.
832 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
834 read2 = self.__read(2)
836 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
837 "%d" % self.fileobj.tell())
838 # taken from gzip.GzipFile with some alterations
839 if read2 != GZ_MAGIC_BYTES:
840 raise ReadError("not a gzip file")
842 read1 = self.__read(1)
844 raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
845 "gzip header at pos %d" % self.fileobj.tell())
846 if ord (read1) != GZ_METHOD_DEFLATE:
847 raise CompressionError("unsupported compression method")
849 self.flags = flag = ord(self.__read(1))
850 self.__read(6) # discard timestamp[4], deflate flags, os code
852 if flag & GZ_FLAG_FEXTRA:
853 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
855 if flag & GZ_FLAG_FNAME:
858 if not s or s == NUL:
860 if flag & GZ_FLAG_FCOMMENT:
863 if not s or s == NUL:
865 if flag & GZ_FLAG_FHCRC:
868 def _init_read_encrypt (self):
869 """Initialize encryption for next entry in archive. Read a header and
870 notify the crypto context."""
871 if self.arcmode & ARCMODE_ENCRYPT:
872 lasthdr = self.fileobj.tell ()
874 hdr = crypto.hdr_read_stream (self.fileobj)
875 except crypto.EndOfFile:
877 except crypto.InvalidHeader as exn:
878 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
879 "processing %r at pos %d"
880 % (exn, self.fileobj, lasthdr)) \
882 if self.enccounter is not None:
883 # enforce that the iv counter in the header matches an
884 # explicitly requested one
885 iv = crypto.hdr_iv_counter (hdr)
886 if iv != self.enccounter:
887 raise DecryptionError ("expected IV counter %d, got %d"
888 % (self.enccounter, iv))
889 self.lasthdr = lasthdr
890 self.remainder = hdr ["ctsize"] # distance to next header
892 self.encryption.next (hdr)
893 except crypto.InvalidParameter as exn:
894 raise DecryptionError ("Crypto.next(): error “%s” "
895 "processing %r at pos %d"
896 % (exn, self.fileobj, lasthdr)) \
902 def _read_encrypt (self, buf):
904 Demote a program error to a decryption error in tolerant mode. This
905 allows recovery from corrupted headers and invalid data.
908 return self.encryption.process (buf)
909 except RuntimeError as exn:
910 if self.tolerance != TOLERANCE_STRICT:
911 raise DecryptionError (exn)
915 def _finalize_read_encrypt (self):
919 if self.arcmode & ARCMODE_ENCRYPT \
920 and self.lasthdr is not None :
921 assert self.remainder >= 0
922 if self.remainder > 0:
925 data = self.encryption.done ()
926 except crypto.InvalidGCMTag as exn:
927 raise DecryptionError ("decryption failed: %s" % exn)
932 """Return the stream's file pointer position.
936 def seek(self, pos=0):
937 """Set the stream's file pointer to pos. Negative seeking
940 if pos - self.pos >= 0:
941 blocks, remainder = divmod(pos - self.pos, self.bufsize)
942 for i in range(blocks):
943 self.read(self.bufsize)
946 raise StreamError("seeking backwards is not allowed")
949 def read(self, size=None):
950 """Return the next size number of bytes from the stream.
951 If size is not defined, return all bytes of the stream
957 buf = self._read(self.bufsize)
963 buf = self._read(size)
968 """Reads just one line, new line character included
970 # if \n in dbuf, no read neads to be done
971 if b'\n' in self.dbuf:
972 pos = self.dbuf.index(b'\n') + 1
973 ret = self.dbuf[:pos]
974 self.dbuf = self.dbuf[pos:]
979 chunk = self._read(self.bufsize)
981 # nothing more to read, so return the buffer
987 # if \n found, return the new line
990 pos = dbuf.index(b'\n') + 1
991 self.dbuf = dbuf[pos:] + self.dbuf
994 def _read(self, size):
995 """Return size bytes from the stream.
1001 buf = self.__read(self.bufsize)
1005 if self.cmp is not None:
1007 buf = self.cmp.decompress(buf)
1008 except self.exception as exn:
1009 raise ReadError("invalid compressed data (%r)" % exn)
1010 except Exception as e:
1011 # happens at the end of the file
1012 # _init_read_gz failed in the previous iteration so
1013 # self.cmp.decompress fails here
1014 if self.arcmode & ARCMODE_CONCAT:
1017 raise ReadError("invalid compressed data")
1018 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1019 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1020 if self.arcmode & ARCMODE_CONCAT \
1021 and len(self.cmp.unused_data) != 0:
1022 self.buf = self.cmp.unused_data + self.buf
1023 self.close(close_fileobj=False)
1025 self._init_read_gz()
1026 except DecryptionError:
1027 if self.tolerance != TOLERANCE_STRICT:
1028 # return whatever data was processed successfully
1034 except ReadError: # gzip troubles
1035 if self.tolerance == TOLERANCE_RESCUE:
1042 # happens at the end of the file
1044 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1049 self.dbuf = t[size:]
1053 def __read(self, size):
1055 Return size bytes from stream. If internal buffer is empty, read
1056 another block from the stream.
1058 The function returns up to size bytes of data. When an error occurs
1059 during decryption, everything until the end of the last successfully
1060 finalized object is returned.
1063 t = [self.buf] if c > 0 else []
1064 good_crypto = len (t)
1069 if self.arcmode & ARCMODE_ENCRYPT:
1070 if self.remainder <= 0:
1071 # prepare next object
1072 if self._init_read_encrypt () is False: # EOF
1076 # only read up to the end of the encrypted object
1077 todo = min (size, self.remainder)
1078 buf = self.fileobj.read(todo)
1079 if self.arcmode & ARCMODE_ENCRYPT:
1081 buf = self._read_encrypt (buf)
1082 if todo == self.remainder:
1083 # at the end of a crypto object; finalization will fail if
1084 # the GCM tag does not match
1085 trailing = self._finalize_read_encrypt ()
1086 good_crypto = len (t) + 1
1087 if len (trailing) > 0:
1091 self.remainder -= todo
1092 except DecryptionError:
1093 if self.tolerance == TOLERANCE_STRICT:
1095 self.encryption.drop ()
1096 if self.tolerance == TOLERANCE_RECOVER:
1097 if good_crypto == 0:
1099 # this may occur at any of the three crypto operations above.
1100 # some objects did validate; discard all data after it; next
1101 # call will start with the bad object and error out immediately
1102 self.buf = b"".join (t [good_crypto:])
1103 return b"".join (t [:good_crypto])
1104 elif self.tolerance == TOLERANCE_RESCUE:
1105 # keep what we have so far despite the finalization issue
1110 raise RuntimeError("internal error: bad tolerance level")
1112 if not buf: ## XXX stream terminated prematurely; this should be an error
1123 class _StreamProxy(object):
1124 """Small proxy class that enables transparent compression
1125 detection for the Stream interface (mode 'r|*').
1128 def __init__(self, fileobj):
1129 self.fileobj = fileobj
1130 self.buf = self.fileobj.read(BLOCKSIZE)
1132 def read(self, size): # pylint: disable=method-hidden
1133 self.read = self.fileobj.read
1136 def getcomptype(self):
1137 if self.buf.startswith(GZ_MAGIC_DEFLATE):
1139 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
1141 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
1147 self.fileobj.close()
1150 #------------------------
1151 # Extraction file object
1152 #------------------------
1153 class _FileInFile(object):
1154 """A thin wrapper around an existing file object that
1155 provides a part of its data as an individual file
1159 def __init__(self, fileobj, offset, size, blockinfo=None):
1160 self.fileobj = fileobj
1161 self.offset = offset
1164 self.name = getattr(fileobj, "name", None)
1167 if blockinfo is None:
1168 blockinfo = [(0, size)]
1170 # Construct a map with data and zero blocks.
1174 realpos = self.offset
1175 for offset, size in blockinfo:
1176 if offset > lastpos:
1177 self.map.append((False, lastpos, offset, None))
1178 self.map.append((True, offset, offset + size, realpos))
1180 lastpos = offset + size
1181 if lastpos < self.size:
1182 self.map.append((False, lastpos, self.size, None))
1194 return self.fileobj.seekable()
1197 """Return the current file position.
1199 return self.position
1201 def seek(self, position, whence=io.SEEK_SET):
1202 """Seek to a position in the file.
1204 if whence == io.SEEK_SET:
1205 self.position = min(max(position, 0), self.size)
1206 elif whence == io.SEEK_CUR:
1208 self.position = max(self.position + position, 0)
1210 self.position = min(self.position + position, self.size)
1211 elif whence == io.SEEK_END:
1212 self.position = max(min(self.size + position, self.size), 0)
1214 raise ValueError("Invalid argument")
1215 return self.position
1217 def read(self, size=None):
1218 """Read data from the file.
1221 size = self.size - self.position
1223 size = min(size, self.size - self.position)
1228 data, start, stop, offset = self.map[self.map_index]
1229 if start <= self.position < stop:
1233 if self.map_index == len(self.map):
1235 length = min(size, stop - self.position)
1237 self.fileobj.seek(offset + (self.position - start))
1238 buf += self.fileobj.read(length)
1242 self.position += length
1245 def readinto(self, b):
1246 buf = self.read(len(b))
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member."""

    def __init__(self, tarfile, tarinfo):
        # Present the member's byte range (honoring sparse maps) as a
        # raw file, then let BufferedReader add buffering on top.
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
1266 class TarInfo(object):
1267 """Informational class which holds the details about an
1268 archive member given by a tar header block.
1269 TarInfo objects are returned by TarFile.getmember(),
1270 TarFile.getmembers() and TarFile.gettarinfo() and are
1271 usually created internally.
1274 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1275 "chksum", "type", "linkname", "uname", "gname",
1276 "devmajor", "devminor", "volume_offset",
1277 "offset", "offset_data", "pax_headers", "sparse",
1278 "tarfile", "_sparse_structs", "_link_target")
def __init__(self, name=""):
    """Construct a TarInfo object. name is the optional name
       of the member.
    """
    self.name = name        # member name
    self.mode = 0o644       # file permissions
    self.uid = 0            # user id
    self.gid = 0            # group id
    self.size = 0           # file size
    self.mtime = 0          # modification time
    self.chksum = 0         # header checksum
    self.type = REGTYPE     # member type
    self.linkname = ""      # link name
    self.uname = ""         # user name
    self.gname = ""         # group name
    self.devmajor = 0       # device major number
    self.devminor = 0       # device minor number
    self.offset = 0         # the tar header starts here
    self.offset_data = 0    # the file's data starts here
    self.volume_offset = 0  # the file's data corresponds with the data
                            # starting at this position
    self.sparse = None      # sparse member information
    self.pax_headers = {}   # pax header information
1306 # In pax headers the "name" and "linkname" field are called
1307 # "path" and "linkpath".
1310 def _setpath(self, name):
1312 path = property(_getpath, _setpath)
    def _getlinkpath(self):
        # In pax headers the "linkname" field is called "linkpath".
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # Alias so pax-handling code can address the field uniformly.
    linkpath = property(_getlinkpath, _setlinkpath)
1321 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1323 def get_info(self, encoding=None, errors=None):
1324 """Return the TarInfo's attributes as a dictionary.
1328 "mode": self.mode & 0o7777,
1332 "mtime": self.mtime,
1333 "chksum": self.chksum,
1335 "linkname": self.linkname,
1336 "uname": self.uname,
1337 "gname": self.gname,
1338 "devmajor": self.devmajor,
1339 "devminor": self.devminor,
1340 "offset_data": self.offset_data,
1341 "volume_offset": self.volume_offset
1344 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1349 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1350 errors="surrogateescape"):
1351 """Return a tar header as a string of 512 byte blocks.
1353 info = self.get_info(encoding, errors)
1355 if format == USTAR_FORMAT:
1356 return self.create_ustar_header(info, encoding, errors)
1357 elif format == GNU_FORMAT:
1358 return self.create_gnu_header(info, encoding, errors)
1359 elif format == PAX_FORMAT:
1360 return self.create_pax_header(info, encoding, errors)
1362 raise ValueError("invalid format")
1364 def create_ustar_header(self, info, encoding, errors):
1365 """Return the object as a ustar header block.
1367 info["magic"] = POSIX_MAGIC
1369 if len(info["linkname"]) > LENGTH_LINK:
1370 raise ValueError("linkname is too long")
1372 if len(info["name"]) > LENGTH_NAME:
1373 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1375 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1377 def create_gnu_header(self, info, encoding, errors):
1378 """Return the object as a GNU header block sequence.
1380 info["magic"] = GNU_MAGIC
1382 if self.ismultivol():
1384 itn(info.get("atime", 0), 12, GNU_FORMAT),
1385 itn(info.get("ctime", 0), 12, GNU_FORMAT),
1386 itn(self.volume_offset, 12, GNU_FORMAT),
1387 itn(0, 119, GNU_FORMAT), # stuff unused in this tar implementation, set to zero
1389 info['prefix'] = b"".join(prefix)
1390 info['size'] = info['size'] - self.volume_offset
1393 if len(info["linkname"]) > LENGTH_LINK:
1394 buf += self._create_gnu_long_header(info["linkname"],
1395 GNUTYPE_LONGLINK, encoding, errors)
1397 if len(info["name"]) > LENGTH_NAME:
1398 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1401 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1403 def create_pax_header(self, info, encoding, errors):
1404 """Return the object as a ustar header block. If it cannot be
1405 represented this way, prepend a pax extended header sequence
1406 with supplement information.
1408 info["magic"] = POSIX_MAGIC
1409 pax_headers = self.pax_headers.copy()
1410 if self.ismultivol():
1411 info['size'] = info['size'] - self.volume_offset
1413 # Test string fields for values that exceed the field length or cannot
1414 # be represented in ASCII encoding.
1415 for name, hname, length in (
1416 ("name", "path", LENGTH_NAME),
1417 ("linkname", "linkpath", LENGTH_LINK),
1418 ("uname", "uname", 32),
1419 ("gname", "gname", 32)):
1421 if hname in pax_headers:
1422 # The pax header has priority.
1425 # Try to encode the string as ASCII.
1427 info[name].encode("ascii", "strict")
1428 except UnicodeEncodeError:
1429 pax_headers[hname] = info[name]
1432 if len(info[name]) > length:
1433 pax_headers[hname] = info[name]
1435 # Test number fields for values that exceed the field limit or values
1436 # that like to be stored as float.
1437 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1438 if name in pax_headers:
1439 # The pax header has priority. Avoid overflow.
1444 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1445 pax_headers[name] = str(val)
1448 # Create a pax extended header if necessary.
1450 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1454 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1457 def create_pax_global_header(cls, pax_headers):
1458 """Return the object as a pax global header block sequence.
1460 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1462 def _posix_split_name(self, name):
1463 """Split a name longer than 100 chars into a prefix
1466 prefix = name[:LENGTH_PREFIX + 1]
1467 while prefix and prefix[-1] != "/":
1468 prefix = prefix[:-1]
1470 name = name[len(prefix):]
1471 prefix = prefix[:-1]
1473 if not prefix or len(name) > LENGTH_NAME:
1474 raise ValueError("name is too long")
1478 def _create_header(info, format, encoding, errors):
1479 """Return a header block. info is a dictionary with file
1480 information, format must be one of the *_FORMAT constants.
1483 stn(info.get("name", ""), 100, encoding, errors),
1484 itn(info.get("mode", 0) & 0o7777, 8, format),
1485 itn(info.get("uid", 0), 8, format),
1486 itn(info.get("gid", 0), 8, format),
1487 itn(info.get("size", 0), 12, format),
1488 itn(info.get("mtime", 0), 12, format),
1489 b" ", # checksum field
1490 info.get("type", REGTYPE),
1491 stn(info.get("linkname", ""), 100, encoding, errors),
1492 info.get("magic", POSIX_MAGIC),
1493 stn(info.get("uname", ""), 32, encoding, errors),
1494 stn(info.get("gname", ""), 32, encoding, errors),
1495 itn(info.get("devmajor", 0), 8, format),
1496 itn(info.get("devminor", 0), 8, format),
1497 sbtn(info.get("prefix", ""), 155, encoding, errors)
1500 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1501 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1502 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1506 def _create_payload(payload):
1507 """Return the string payload filled with zero bytes
1508 up to the next 512 byte border.
1510 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1512 payload += (BLOCKSIZE - remainder) * NUL
1516 def _create_gnu_long_header(cls, name, type, encoding, errors):
1517 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1520 name = name.encode(encoding, errors) + NUL
1523 info["name"] = "././@LongLink"
1525 info["size"] = len(name)
1526 info["magic"] = GNU_MAGIC
1528 # create extended header + name blocks.
1529 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1530 cls._create_payload(name)
1533 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1534 """Return a POSIX.1-2008 extended or global header sequence
1535 that contains a list of keyword, value pairs. The values
1538 # Check if one of the fields contains surrogate characters and thereby
1539 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1541 for keyword, value in pax_headers.items():
1543 value.encode("utf-8", "strict")
1544 except UnicodeEncodeError:
1550 # Put the hdrcharset field at the beginning of the header.
1551 records += b"21 hdrcharset=BINARY\n"
1553 for keyword, value in pax_headers.items():
1554 keyword = keyword.encode("utf-8")
1556 # Try to restore the original byte representation of `value'.
1557 # Needless to say, that the encoding must match the string.
1558 value = value.encode(encoding, "surrogateescape")
1560 value = value.encode("utf-8")
1562 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1569 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1571 # We use a hardcoded "././@PaxHeader" name like star does
1572 # instead of the one that POSIX recommends.
1574 info["name"] = "././@PaxHeader"
1576 info["size"] = len(records)
1577 info["magic"] = POSIX_MAGIC
1579 # Create pax header + record blocks.
1580 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1581 cls._create_payload(records)
1584 def frombuf(cls, buf, encoding, errors):
1585 """Construct a TarInfo object from a 512 byte bytes object.
1588 raise EmptyHeaderError("empty header")
1589 if len(buf) != BLOCKSIZE:
1590 raise TruncatedHeaderError("truncated header")
1591 if buf.count(NUL) == BLOCKSIZE:
1592 raise EOFHeaderError("end of file header")
1594 chksum = nti(buf[148:156])
1595 if chksum not in calc_chksums(buf):
1596 raise InvalidHeaderError("bad checksum")
1599 obj.name = nts(buf[0:100], encoding, errors)
1600 obj.mode = nti(buf[100:108])
1601 obj.uid = nti(buf[108:116])
1602 obj.gid = nti(buf[116:124])
1603 obj.size = nti(buf[124:136])
1604 obj.mtime = nti(buf[136:148])
1606 obj.type = buf[156:157]
1607 obj.linkname = nts(buf[157:257], encoding, errors)
1608 obj.uname = nts(buf[265:297], encoding, errors)
1609 obj.gname = nts(buf[297:329], encoding, errors)
1610 obj.devmajor = nti(buf[329:337])
1611 obj.devminor = nti(buf[337:345])
1612 prefix = nts(buf[345:500], encoding, errors)
1614 # The old GNU sparse format occupies some of the unused
1615 # space in the buffer for up to 4 sparse structures.
1616 # Save the them for later processing in _proc_sparse().
1617 if obj.type == GNUTYPE_SPARSE:
1622 offset = nti(buf[pos:pos + 12])
1623 numbytes = nti(buf[pos + 12:pos + 24])
1626 structs.append((offset, numbytes))
1628 isextended = bool(buf[482])
1629 origsize = nti(buf[483:495])
1630 obj._sparse_structs = (structs, isextended, origsize)
1632 # Old V7 tar format represents a directory as a regular
1633 # file with a trailing slash.
1634 if obj.type == AREGTYPE and obj.name.endswith("/"):
1637 # Remove redundant slashes from directories.
1639 obj.name = obj.name.rstrip("/")
1641 # Reconstruct a ustar longname.
1642 if prefix and obj.type not in GNU_TYPES:
1643 obj.name = prefix + "/" + obj.name
1645 obj.offset_data = nti(buf[369:381])
1649 def fromtarfile(cls, tarfile):
1650 """Return the next TarInfo object from TarFile object
1653 buf = tarfile.fileobj.read(BLOCKSIZE)
1654 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1655 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1656 return obj._proc_member(tarfile)
1658 #--------------------------------------------------------------------------
1659 # The following are methods that are called depending on the type of a
1660 # member. The entry point is _proc_member() which can be overridden in a
1661 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1662 # implement the following
1664 # 1. Set self.offset_data to the position where the data blocks begin,
1665 # if there is data that follows.
1666 # 2. Set tarfile.offset to the position where the next member's header will
1668 # 3. Return self or another valid TarInfo object.
1669 def _proc_member(self, tarfile):
1670 """Choose the right processing method depending on
1671 the type and call it.
1673 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1674 return self._proc_gnulong(tarfile)
1675 elif self.type == GNUTYPE_SPARSE:
1676 return self._proc_sparse(tarfile)
1677 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1678 return self._proc_pax(tarfile)
1680 return self._proc_builtin(tarfile)
1682 def _proc_builtin(self, tarfile):
1683 """Process a builtin type or an unknown type which
1684 will be treated as a regular file.
1686 self.offset_data = tarfile.fileobj.tell()
1687 offset = self.offset_data
1688 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1689 # Skip the following data blocks.
1690 offset += self._block(self.size)
1691 tarfile.offset = offset
1693 # Patch the TarInfo object with saved global
1694 # header information.
1695 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1699 def _proc_gnulong(self, tarfile):
1700 """Process the blocks that hold a GNU longname
1703 buf = tarfile.fileobj.read(self._block(self.size))
1705 # Fetch the next header and process it.
1707 next = self.fromtarfile(tarfile)
1709 raise SubsequentHeaderError("missing or bad subsequent header")
1711 # Patch the TarInfo object from the next header with
1712 # the longname information.
1713 next.offset = self.offset
1714 if self.type == GNUTYPE_LONGNAME:
1715 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1716 elif self.type == GNUTYPE_LONGLINK:
1717 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1721 def _proc_sparse(self, tarfile):
1722 """Process a GNU sparse header plus extra headers.
1724 # We already collected some sparse structures in frombuf().
1725 structs, isextended, origsize = self._sparse_structs
1726 del self._sparse_structs
1728 # Collect sparse structures from extended header blocks.
1730 buf = tarfile.fileobj.read(BLOCKSIZE)
1734 offset = nti(buf[pos:pos + 12])
1735 numbytes = nti(buf[pos + 12:pos + 24])
1738 if offset and numbytes:
1739 structs.append((offset, numbytes))
1741 isextended = bool(buf[504])
1742 self.sparse = structs
1744 self.offset_data = tarfile.fileobj.tell()
1745 tarfile.offset = self.offset_data + self._block(self.size)
1746 self.size = origsize
1749 def _proc_pax(self, tarfile):
1750 """Process an extended or global header as described in
1753 # Read the header information.
1754 buf = tarfile.fileobj.read(self._block(self.size))
1756 # A pax header stores supplemental information for either
1757 # the following file (extended) or all following files
1759 if self.type == XGLTYPE:
1760 pax_headers = tarfile.pax_headers
1762 pax_headers = tarfile.pax_headers.copy()
1764 # Check if the pax header contains a hdrcharset field. This tells us
1765 # the encoding of the path, linkpath, uname and gname fields. Normally,
1766 # these fields are UTF-8 encoded but since POSIX.1-2008 tar
1767 # implementations are allowed to store them as raw binary strings if
1768 # the translation to UTF-8 fails.
1769 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1770 if match is not None:
1771 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1773 # For the time being, we don't care about anything other than "BINARY".
1774 # The only other value that is currently allowed by the standard is
1775 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1776 hdrcharset = pax_headers.get("hdrcharset")
1777 if hdrcharset == "BINARY":
1778 encoding = tarfile.encoding
1782 # Parse pax header information. A record looks like that:
1783 # "%d %s=%s\n" % (length, keyword, value). length is the size
1784 # of the complete record including the length field itself and
1785 # the newline. keyword and value are both UTF-8 encoded strings.
1786 regex = re.compile(br"(\d+) ([^=]+)=")
1789 match = regex.match(buf, pos)
1793 length, keyword = match.groups()
1794 length = int(length)
1795 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1797 # Normally, we could just use "utf-8" as the encoding and "strict"
1798 # as the error handler, but we better not take the risk. For
1799 # example, GNU tar <= 1.23 is known to store filenames it cannot
1800 # translate to UTF-8 as raw strings (unfortunately without a
1801 # hdrcharset=BINARY header).
1802 # We first try the strict standard encoding, and if that fails we
1803 # fall back on the user's encoding and error handler.
1804 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1806 if keyword in PAX_NAME_FIELDS:
1807 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1810 value = self._decode_pax_field(value, "utf-8", "utf-8",
1813 pax_headers[keyword] = value
1817 # Fetch the next header.
1819 next = self.fromtarfile(tarfile)
1821 raise SubsequentHeaderError("missing or bad subsequent header")
1823 # Process GNU sparse information.
1824 if "GNU.sparse.map" in pax_headers:
1825 # GNU extended sparse format version 0.1.
1826 self._proc_gnusparse_01(next, pax_headers)
1828 elif "GNU.sparse.size" in pax_headers:
1829 # GNU extended sparse format version 0.0.
1830 self._proc_gnusparse_00(next, pax_headers, buf)
1832 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1833 # GNU extended sparse format version 1.0.
1834 self._proc_gnusparse_10(next, pax_headers, tarfile)
1836 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1837 # Patch the TarInfo object with the extended header info.
1838 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1839 next.offset = self.offset
1841 if "size" in pax_headers:
1842 # If the extended header replaces the size field,
1843 # we need to recalculate the offset where the next
1845 offset = next.offset_data
1846 if next.isreg() or next.type not in SUPPORTED_TYPES:
1847 offset += next._block(next.size)
1848 tarfile.offset = offset
1850 if next is not None:
1851 if "GNU.volume.filename" in pax_headers:
1852 if pax_headers["GNU.volume.filename"] == next.name:
1853 if "GNU.volume.size" in pax_headers:
1854 next.size = int(pax_headers["GNU.volume.size"])
1855 if "GNU.volume.offset" in pax_headers:
1856 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1858 for key in pax_headers.keys():
1859 if key.startswith("GNU.volume"):
1860 del tarfile.pax_headers[key]
1864 def _proc_gnusparse_00(self, next, pax_headers, buf):
1865 """Process a GNU tar extended sparse header, version 0.0.
1868 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1869 offsets.append(int(match.group(1)))
1871 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1872 numbytes.append(int(match.group(1)))
1873 next.sparse = list(zip(offsets, numbytes))
1875 def _proc_gnusparse_01(self, next, pax_headers):
1876 """Process a GNU tar extended sparse header, version 0.1.
1878 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1879 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1881 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1882 """Process a GNU tar extended sparse header, version 1.0.
1886 buf = tarfile.fileobj.read(BLOCKSIZE)
1887 fields, buf = buf.split(b"\n", 1)
1888 fields = int(fields)
1889 while len(sparse) < fields * 2:
1890 if b"\n" not in buf:
1891 buf += tarfile.fileobj.read(BLOCKSIZE)
1892 number, buf = buf.split(b"\n", 1)
1893 sparse.append(int(number))
1894 next.offset_data = tarfile.fileobj.tell()
1895 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1897 def _apply_pax_info(self, pax_headers, encoding, errors):
1898 """Replace fields with supplemental information from a previous
1899 pax extended or global header.
1901 for keyword, value in pax_headers.items():
1902 if keyword == "GNU.sparse.name":
1903 setattr(self, "path", value)
1904 elif keyword == "GNU.sparse.size":
1905 setattr(self, "size", int(value))
1906 elif keyword == "GNU.sparse.realsize":
1907 setattr(self, "size", int(value))
1908 elif keyword in PAX_FIELDS:
1909 if keyword in PAX_NUMBER_FIELDS:
1911 value = PAX_NUMBER_FIELDS[keyword](value)
1914 if keyword == "path":
1915 value = value.rstrip("/") # pylint: disable=no-member
1916 setattr(self, keyword, value)
1918 self.pax_headers = pax_headers.copy()
1920 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1921 """Decode a single field from a pax record.
1924 return value.decode(encoding, "strict")
1925 except UnicodeDecodeError:
1926 return value.decode(fallback_encoding, fallback_errors)
1928 def _block(self, count):
1929 """Round up a byte count by BLOCKSIZE and return it,
1930 e.g. _block(834) => 1024.
1932 blocks, remainder = divmod(count, BLOCKSIZE)
1935 return blocks * BLOCKSIZE
1938 return self.type in REGULAR_TYPES
1942 return self.type == DIRTYPE
1944 return self.type == SYMTYPE
1946 return self.type == LNKTYPE
1948 return self.type == CHRTYPE
1950 return self.type == BLKTYPE
1952 return self.type == FIFOTYPE
1954 return self.sparse is not None
1956 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1957 def ismultivol(self):
1958 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1959 "GNU.volume.offset" in self.pax_headers
1962 class TarFile(object):
1963 """The TarFile Class provides an interface to tar archives.
1966 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1968 dereference = False # If true, add content of linked file to the
1969 # tar file, else the link.
1971 ignore_zeros = False # If true, skips empty or invalid blocks and
1972 # continues processing.
1974 max_volume_size = None # If different from None, establishes maximum
1975 # size of tar volumes
1977 new_volume_handler = None # function handler to be executed before when
1978 # a new volume is needed
1980 volume_number = 0 # current volume number, used for multi volume
1983 errorlevel = 1 # If 0, fatal errors only appear in debug
1984 # messages (if debug >= 0). If > 0, errors
1985 # are passed to the caller as exceptions.
1987 format = DEFAULT_FORMAT # The format to use when creating an archive.
1989 encoding = ENCODING # Encoding for 8-bit character strings.
1991 errors = None # Error handler for unicode conversion.
1993 tarinfo = TarInfo # The default TarInfo class to use.
1995 fileobject = ExFileObject # The file-object for extractfile().
1997 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
2000 save_to_members = True # If new members are saved. This can be disabled
2001 # if you manage lots of files and don't want
2002 # to have high memory usage
2004 cache_uid2user = {} # cache to avoid getpwuid calls. It always parses /etc/passwd.
2005 cache_gid2group = {} # same cache for groups
2007 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2008 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2009 errors="surrogateescape", pax_headers=None, debug=None,
2010 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2011 concat=False, nacl=None,
2012 save_to_members=True):
2013 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2014 read from an existing archive, 'a' to append data to an existing
2015 file or 'w' to create a new file overwriting an existing one. `mode'
2017 If `fileobj' is given, it is used for reading or writing data. If it
2018 can be determined, `mode' is overridden by `fileobj's mode.
2019 `fileobj' is not closed, when TarFile is closed.
2021 if len(mode) > 1 or mode not in "raw":
2022 raise ValueError("mode must be 'r', 'a' or 'w'")
2024 self.arcmode = arcmode_set (concat)
2026 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2029 if self.mode == "a" and not os.path.exists(name):
2030 # Create nonexistent files in append mode.
2033 fileobj = bltn_open(name, self._mode)
2034 self._extfileobj = False
2036 if name is None and hasattr(fileobj, "name"):
2038 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2039 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2040 self._mode = fileobj.mode
2041 self._extfileobj = True
2042 self.name = os.path.abspath(name) if name else None
2043 self.base_name = self.name = os.path.abspath(name) if name else None
2044 self.fileobj = fileobj
2047 if format is not None:
2048 self.format = format
2049 if tarinfo is not None:
2050 self.tarinfo = tarinfo
2051 if dereference is not None:
2052 self.dereference = dereference
2053 if ignore_zeros is not None:
2054 self.ignore_zeros = ignore_zeros
2055 if encoding is not None:
2056 self.encoding = encoding
2058 self.errors = errors
2060 if pax_headers is not None and self.format == PAX_FORMAT:
2061 self.pax_headers = pax_headers
2063 self.pax_headers = {}
2065 if debug is not None:
2067 if errorlevel is not None:
2068 self.errorlevel = errorlevel
2070 # Init datastructures.
2071 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2072 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2073 if max_volume_size and not callable(new_volume_handler):
2074 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2076 self.max_volume_size = int(max_volume_size)
2078 self.max_volume_size = None
2080 self.save_to_members = save_to_members
2081 self.new_volume_handler = new_volume_handler
2083 self.members = [] # list of members as TarInfo objects
2084 self._loaded = False # flag if all members have been read
2085 self.offset = self.fileobj.tell()
2086 # current position in the archive file
2087 self.inodes = {} # dictionary caching the inodes of
2088 # archive members already added
2091 if self.mode == "r":
2092 self.firstmember = None
2093 self.firstmember = self.next()
2095 if self.mode == "a":
2096 # Move to the end of the archive,
2097 # before the first empty block.
2099 self.fileobj.seek(self.offset)
2101 tarinfo = self.tarinfo.fromtarfile(self)
2102 self.members.append(tarinfo)
2103 except EOFHeaderError:
2104 self.fileobj.seek(self.offset)
2106 except HeaderError as e:
2107 raise ReadError(str(e))
2109 if self.mode in "aw":
2112 if self.pax_headers:
2113 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2114 self.fileobj.write(buf)
2115 self.offset += len(buf)
2117 if not self._extfileobj:
2118 self.fileobj.close()
2122 #--------------------------------------------------------------------------
2123 # Below are the classmethods which act as alternate constructors to the
2124 # TarFile class. The open() method is the only one that is needed for
2125 # public use; it is the "super"-constructor and is able to select an
2126 # adequate "sub"-constructor for a particular compression using the mapping
2129 # This concept allows one to subclass TarFile without losing the comfort of
2130 # the super-constructor. A sub-constructor is registered and made available
2131 # by adding it to the mapping in OPEN_METH.
2134 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2135 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2137 """Open a tar archive for reading, writing or appending. Return
2138 an appropriate TarFile class.
2141 'r' or 'r:*' open for reading with transparent compression
2142 'r:' open for reading exclusively uncompressed
2143 'r:gz' open for reading with gzip compression
2144 'r:bz2' open for reading with bzip2 compression
2145 'r:xz' open for reading with lzma compression
2146 'a' or 'a:' open for appending, creating the file if necessary
2147 'w' or 'w:' open for writing without compression
2148 'w:gz' open for writing with gzip compression
2149 'w:bz2' open for writing with bzip2 compression
2150 'w:xz' open for writing with lzma compression
2152 'r|*' open a stream of tar blocks with transparent compression
2153 'r|' open an uncompressed stream of tar blocks for reading
2154 'r|gz' open a gzip compressed stream of tar blocks
2155 'r|bz2' open a bzip2 compressed stream of tar blocks
2156 'r|xz' open an lzma compressed stream of tar blocks
2157 'w|' open an uncompressed stream for writing
2158 'w|gz' open a gzip compressed stream for writing
2159 'w|bz2' open a bzip2 compressed stream for writing
2160 'w|xz' open an lzma compressed stream for writing
2162 'r#gz' open a stream of gzip compressed tar blocks for reading
2163 'w#gz' open a stream of gzip compressed tar blocks for writing
2165 if not name and not fileobj:
2166 raise ValueError("nothing to open")
2168 if mode in ("r", "r:*"):
2169 # Find out which *open() is appropriate for opening the file.
2170 for comptype in cls.OPEN_METH:
2171 func = getattr(cls, cls.OPEN_METH[comptype])
2172 if fileobj is not None:
2173 saved_pos = fileobj.tell()
2175 return func(name, "r", fileobj, **kwargs)
2176 except (ReadError, CompressionError) as e:
2177 # usually nothing exceptional but sometimes is
2178 if fileobj is not None:
2179 fileobj.seek(saved_pos)
2181 raise ReadError("file could not be opened successfully")
2184 filemode, comptype = mode.split(":", 1)
2185 filemode = filemode or "r"
2186 comptype = comptype or "tar"
2188 # Select the *open() function according to
2189 # given compression.
2190 if comptype in cls.OPEN_METH:
2191 func = getattr(cls, cls.OPEN_METH[comptype])
2193 raise CompressionError("unknown compression type %r" % comptype)
2195 # Pass on compression level for gzip / bzip2.
2196 if comptype == 'gz' or comptype == 'bz2':
2197 kwargs['compresslevel'] = compresslevel
2199 if 'max_volume_size' in kwargs:
2200 if comptype != 'tar' and filemode in 'wa' \
2201 and kwargs['max_volume_size']:
2203 warnings.warn('Only the first volume will be compressed '
2204 'for modes with "w:"!')
2206 return func(name, filemode, fileobj, **kwargs)
2209 filemode, comptype = mode.split("|", 1)
2210 filemode = filemode or "r"
2211 comptype = comptype or "tar"
2213 if filemode not in "rw":
2214 raise ValueError("mode must be 'r' or 'w'")
2216 t = cls(name, filemode,
2217 _Stream(name, filemode, comptype, fileobj, bufsize,
2218 compresslevel=compresslevel),
2220 t._extfileobj = False
2224 filemode, comptype = mode.split("#", 1)
2225 filemode = filemode or "r"
2227 if filemode not in "rw":
2228 raise ValueError ("mode %s not compatible with concat "
2229 "archive; must be 'r' or 'w'" % mode)
2231 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2232 concat=True, encryption=encryption,
2233 compresslevel=compresslevel, tolerance=tolerance)
2234 kwargs ["concat"] = True
2236 t = cls(name, filemode, stream, **kwargs)
2237 except: # XXX except what?
2239 raise # XXX raise what?
2240 t._extfileobj = False
2244 return cls.taropen(name, mode, fileobj, **kwargs)
2246 raise ValueError("undiscernible mode %r" % mode)
2250 def open_at_offset(cls, offset, *a, **kwa):
2252 Same as ``.open()``, but start reading at the given offset. Assumes a
2253 seekable file object. Returns *None* if opening failed due to a read
2256 fileobj = kwa.get ("fileobj")
2257 if fileobj is not None:
2258 fileobj.seek (offset)
2260 return cls.open (*a, **kwa)
2264 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2265 """Open uncompressed tar archive name for reading or writing.
2267 if len(mode) > 1 or mode not in "raw":
2268 raise ValueError("mode must be 'r', 'a' or 'w'")
2269 return cls(name, mode, fileobj, **kwargs)
2272 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2273 """Open gzip compressed tar archive name for reading or writing.
2274 Appending is not allowed.
2276 if len(mode) > 1 or mode not in "rw":
2277 raise ValueError("mode must be 'r' or 'w'")
2282 except (ImportError, AttributeError):
2283 raise CompressionError("gzip module is not available")
2285 extfileobj = fileobj is not None
2287 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2288 t = cls.taropen(name, mode, fileobj, **kwargs)
2290 if not extfileobj and fileobj is not None:
2294 raise ReadError("not a gzip file")
2296 if not extfileobj and fileobj is not None:
2299 t._extfileobj = extfileobj
2303 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2304 """Open bzip2 compressed tar archive name for reading or writing.
2305 Appending is not allowed.
2307 if len(mode) > 1 or mode not in "rw":
2308 raise ValueError("mode must be 'r' or 'w'.")
2313 raise CompressionError("bz2 module is not available")
2315 fileobj = bz2.BZ2File(fileobj or name, mode,
2316 compresslevel=compresslevel)
2319 t = cls.taropen(name, mode, fileobj, **kwargs)
2320 except (OSError, EOFError):
2322 raise ReadError("not a bzip2 file")
2323 t._extfileobj = False
2327 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2328 """Open lzma compressed tar archive name for reading or writing.
2329 Appending is not allowed.
2331 if mode not in ("r", "w"):
2332 raise ValueError("mode must be 'r' or 'w'")
2337 raise CompressionError("lzma module is not available")
2339 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2342 t = cls.taropen(name, mode, fileobj, **kwargs)
2343 except (lzma.LZMAError, EOFError):
2345 raise ReadError("not an lzma file")
2346 t._extfileobj = False
# All *open() methods are registered here.
OPEN_METH = {
    "tar": "taropen",   # uncompressed tar
    "gz":  "gzopen",    # gzip compressed tar
    "bz2": "bz2open",   # bzip2 compressed tar
    "xz":  "xzopen"     # lzma compressed tar
}
#--------------------------------------------------------------------------
# The public methods which TarFile provides:

def close(self):
    """Close the TarFile. In write-mode, two finishing zero blocks are
       appended to the archive. A special case are empty archives which are
       initialized accordingly so the two mandatory blocks of zeros are
       written abiding by the requested encryption and compression settings.
    """
    # NOTE(review): early-return guard restored from upstream tarfile —
    # confirm the original used the same idempotency check.
    if self.closed:
        return

    if self.mode in "aw":
        if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell() == 0:
            # Empty archive: open a (compressed/encrypted) object so the
            # trailing zero blocks land inside it.
            self.fileobj.next("")
        self.fileobj.write(NUL * (BLOCKSIZE * 2))
        self.offset += (BLOCKSIZE * 2)
        # fill up the end with zero-blocks
        # (like option -b20 for tar does)
        blocks, remainder = divmod(self.offset, RECORDSIZE)
        if remainder > 0:
            self.fileobj.write(NUL * (RECORDSIZE - remainder))

    if not self._extfileobj:
        self.fileobj.close()
    self.closed = True
def getmember(self, name):
    """Return a TarInfo object for member `name'. If `name' can not be
       found in the archive, KeyError is raised. If a member occurs more
       than once in the archive, its last occurrence is assumed to be the
       most up-to-date version.
    """
    tarinfo = self._getmember(name)
    if tarinfo is None:
        raise KeyError("filename %r not found" % name)
    return tarinfo
def getmembers(self):
    """Return the members of the archive as a list of TarInfo objects. The
       list has the same order as the members in the archive.
    """
    self._check()
    if not self._loaded:    # if we want to obtain a list of
        self._load()        # all members, we first have to
                            # scan the whole archive.
    return self.members
def get_last_member_offset(self):
    """Return the last member offset. Usually this is self.fileobj.tell(),
       but when there's encryption or concat compression going on it's more
       complicated than that.
    """
    return self.last_block_offset
2412 """Return the members of the archive as a list of their names. It has
2413 the same order as the list returned by getmembers().
2415 return [tarinfo.name for tarinfo in self.getmembers()]
def gettarinfo(self, name=None, arcname=None, fileobj=None):
    """Create a TarInfo object for either the file `name' or the file
       object `fileobj' (using os.fstat on its file descriptor). You can
       modify some of the TarInfo's attributes before you add it using
       addfile(). If given, `arcname' specifies an alternative name for the
       file in the archive.
    """
    self._check("aw")

    # When fileobj is given, replace name by
    # fileobj's real name.
    if fileobj is not None:
        name = fileobj.name

    # Building the name of the member in the archive.
    # Backward slashes are converted to forward slashes,
    # Absolute paths are turned to relative paths.
    if arcname is None:
        arcname = name
    drv, arcname = os.path.splitdrive(arcname)
    arcname = arcname.replace(os.sep, "/")
    arcname = arcname.lstrip("/")

    # Now, fill the TarInfo object with
    # information specific for the file.
    tarinfo = self.tarinfo()
    tarinfo.tarfile = self

    # Use os.stat or os.lstat, depending on platform
    # and if symlinks shall be resolved.
    if fileobj is None:
        if hasattr(os, "lstat") and not self.dereference:
            statres = os.lstat(name)
        else:
            statres = os.stat(name)
    else:
        statres = os.fstat(fileobj.fileno())
    linkname = ""

    stmd = statres.st_mode
    if stat.S_ISREG(stmd):
        inode = (statres.st_ino, statres.st_dev)
        if not self.dereference and statres.st_nlink > 1 and \
                inode in self.inodes and arcname != self.inodes[inode]:
            # Is it a hardlink to an already
            # archived file?
            type = LNKTYPE
            linkname = self.inodes[inode]
        else:
            # The inode is added only if its valid.
            # For win32 it is always 0.
            type = REGTYPE
            if inode[0] and self.save_to_members:
                self.inodes[inode] = arcname
    elif stat.S_ISDIR(stmd):
        type = DIRTYPE
    elif stat.S_ISFIFO(stmd):
        type = FIFOTYPE
    elif stat.S_ISLNK(stmd):
        type = SYMTYPE
        linkname = os.readlink(name)
    elif stat.S_ISCHR(stmd):
        type = CHRTYPE
    elif stat.S_ISBLK(stmd):
        type = BLKTYPE
    else:
        return None

    # Fill the TarInfo object with all
    # information we can get.
    tarinfo.name = arcname
    tarinfo.mode = stmd
    tarinfo.uid = statres.st_uid
    tarinfo.gid = statres.st_gid
    if type == REGTYPE:
        tarinfo.size = statres.st_size
    else:
        tarinfo.size = 0
    tarinfo.mtime = statres.st_mtime
    tarinfo.type = type
    tarinfo.linkname = linkname

    # uid→uname lookups are expensive; memoize them per TarFile.
    if pwd:
        if tarinfo.uid in self.cache_uid2user:
            tarinfo.uname = self.cache_uid2user[tarinfo.uid]
        else:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
                self.cache_uid2user[tarinfo.uid] = tarinfo.uname
            except KeyError:
                # remember user does not exist:
                # same default value as in tarinfo class
                self.cache_uid2user[tarinfo.uid] = ""
    if grp:
        if tarinfo.gid in self.cache_gid2group:
            tarinfo.gname = self.cache_gid2group[tarinfo.gid]
        else:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
                self.cache_gid2group[tarinfo.gid] = tarinfo.gname
            except KeyError:
                # remember group does not exist:
                # same default value as in tarinfo class
                self.cache_gid2group[tarinfo.gid] = ""

    if type in (CHRTYPE, BLKTYPE):
        if hasattr(os, "major") and hasattr(os, "minor"):
            tarinfo.devmajor = os.major(statres.st_rdev)
            tarinfo.devminor = os.minor(statres.st_rdev)
    return tarinfo
def list(self, verbose=True):
    """Print a table of contents to sys.stdout. If `verbose' is False, only
       the names of the members are printed. If it is True, an `ls -l'-like
       output is produced.
    """
    self._check()

    for tarinfo in self:
        if verbose:
            print(stat.filemode(tarinfo.mode), end=' ')
            print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                             tarinfo.gname or tarinfo.gid), end=' ')
            if tarinfo.ischr() or tarinfo.isblk():
                print("%10s" % ("%d,%d" \
                                % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
            else:
                print("%10d" % tarinfo.size, end=' ')
            print("%d-%02d-%02d %02d:%02d:%02d" \
                  % time.localtime(tarinfo.mtime)[:6], end=' ')

        print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

        if verbose:
            if tarinfo.issym():
                print("->", tarinfo.linkname, end=' ')
            if tarinfo.islnk():
                print("link to", tarinfo.linkname, end=' ')
        print()
def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
    """Add the file `name' to the archive. `name' may be any type of file
       (directory, fifo, symbolic link, etc.). If given, `arcname'
       specifies an alternative name for the file in the archive.
       Directories are added recursively by default. This can be avoided by
       setting `recursive' to False. `exclude' is a function that should
       return True for each filename to be excluded. `filter' is a function
       that expects a TarInfo object argument and returns the changed
       TarInfo object, if it returns None the TarInfo object will be
       excluded from the archive.
    """
    self._check("aw")

    if arcname is None:
        arcname = name

    # Exclude pathnames.
    if exclude is not None:
        import warnings
        warnings.warn("use the filter argument instead",
                      DeprecationWarning, 2)
        if exclude(name):
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Skip if somebody tries to archive the archive...
    if self.name is not None and os.path.abspath(name) == self.name:
        self._dbg(2, "tarfile: Skipped %r" % name)
        return

    self._dbg(1, name)

    # Create a TarInfo object from the file.
    tarinfo = self.gettarinfo(name, arcname)
    if tarinfo is None:
        self._dbg(1, "tarfile: Unsupported type %r" % name)
        return

    # Change or exclude the TarInfo object.
    if filter is not None:
        tarinfo = filter(tarinfo)
        if tarinfo is None:
            self._dbg(2, "tarfile: Excluded %r" % name)
            return

    # Append the tar header and data to the archive.
    if tarinfo.isreg():
        with bltn_open(name, "rb") as f:
            self.addfile(tarinfo, f)
    elif tarinfo.isdir():
        self.addfile(tarinfo)
        if recursive:
            for f in os.listdir(name):
                self.add(os.path.join(name, f), os.path.join(arcname, f),
                         recursive, exclude, filter=filter)
    else:
        self.addfile(tarinfo)
def _size_left_file(self):
    """Calculates size left in a volume with a maximum volume size.

       Assumes self.max_volume_size is set.
       If using compression through a _Stream, use _size_left_stream instead
    """
    # left-over size = max_size - offset - 2 zero-blocks written in close
    size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
    # limit size left to a discrete number of blocks, because we won't
    # write only half a block when writting the end of a volume
    # and filling with zeros
    return BLOCKSIZE * (size_left // BLOCKSIZE)
def _size_left_stream(self):
    """ Calculates size left in a volume if using comression/encryption

        Assumes self.max_volume_size is set and self.fileobj is a _Stream
        (otherwise use _size_left_file)
    """
    # left-over size = max_size - bytes written - 2 zero-blocks (close)
    size_left = self.max_volume_size - self.fileobj.estim_file_size() \
        - 2*BLOCKSIZE
    return BLOCKSIZE * (size_left // BLOCKSIZE)
def addfile(self, tarinfo, fileobj=None):
    """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
       given, tarinfo.size bytes are read from it and added to the archive.
       You can create TarInfo objects using gettarinfo().
       On Windows platforms, `fileobj' should always be opened with mode
       'rb' to avoid irritation about the file size.

       NOTE(review): restored from a garbled dump — verify the volume-split
       loop against the project history before relying on it.
    """
    self._check("aw")

    tarinfo = copy.copy(tarinfo)

    if self.arcmode & ARCMODE_CONCAT:
        self.last_block_offset = self.fileobj.next(tarinfo.name)
    else:
        self.last_block_offset = self.fileobj.tell()

    buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
    self.fileobj.write(buf)
    self.offset += len(buf)

    if self.max_volume_size:
        if isinstance(self.fileobj, _Stream):
            _size_left = self._size_left_stream
        else:
            _size_left = self._size_left_file
    else:
        _size_left = lambda: tarinfo.size

    # If there's no data to follow, finish
    if not fileobj:
        if self.save_to_members:
            self.members.append(tarinfo)
        return

    target_size_left = _size_left()
    source_size_left = tarinfo.size
    assert tarinfo.volume_offset == 0

    # we only split volumes in the middle of a file, that means we have
    # to write at least one block
    if target_size_left < BLOCKSIZE:
        target_size_left = BLOCKSIZE

    # loop over multiple volumes
    while source_size_left > 0:

        # Write as much data as possble from source into target.
        # When compressing data, we cannot easily predict how much data we
        # can write until target_size_left == 0 --> need to iterate
        size_can_write = min(target_size_left, source_size_left)

        while size_can_write > 0:
            copyfileobj(fileobj, self.fileobj, size_can_write)
            self.offset += size_can_write
            source_size_left -= size_can_write
            target_size_left = _size_left()
            size_can_write = min(target_size_left, source_size_left)

        # now target_size_left == 0 or source_size_left == 0

        # if there is data left to write, we need to create a new volume
        if source_size_left > 0:
            # Only finalize the crypto entry here if we’re continuing with
            # another one; otherwise, the encryption must include the block
            # padding below.
            tarinfo.type = GNUTYPE_MULTIVOL

            if not self.new_volume_handler or \
                    not callable(self.new_volume_handler):
                raise Exception("We need to create a new volume and you "
                                "didn't supply a new_volume_handler")

            # the new volume handler should do everything needed to
            # start working in a new volume. usually, the handler calls
            # to self.open_volume
            self.volume_number += 1

            # set to be used by open_volume, because in the case of a PAX
            # tar it needs to write information about the volume and offset
            # in the global header
            tarinfo.volume_offset = tarinfo.size - source_size_left
            self.volume_tarinfo = tarinfo

            # the “new_volume_handler” is supposed to call .close() on the
            # “fileobj” _Stream
            self.new_volume_handler(self, self.base_name, self.volume_number)

            self.volume_tarinfo = None

            if self.arcmode & ARCMODE_CONCAT:
                self.fileobj.next_volume(tarinfo.name)

            # write new volume header
            buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
            self.fileobj.write(buf)
            self.offset += len(buf)

            # adjust variables; open_volume should have reset self.offset
            # --> _size_left should be big again
            target_size_left = _size_left()
            size_can_write = min(target_size_left, source_size_left)
            self._dbg(3, 'new volume')

    # now, all data has been written. We may have to fill up the rest of
    # the block in target with 0s
    remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
    if remainder > 0:
        self.fileobj.write(NUL * (BLOCKSIZE - remainder))
        self.offset += BLOCKSIZE - remainder

    if self.save_to_members:
        self.members.append(tarinfo)
def open_volume(self, name="", fileobj=None, encryption=None):
    """
    Called by the user to change this tar file to point to a new volume.

    NOTE(review): restored from a garbled dump — verify against project
    history, in particular the PAX global-header handling.
    """
    # open the file using either fileobj or name
    if not fileobj:
        if self.mode == "a" and not os.path.exists(name):
            # Create nonexistent files in append mode.
            self.mode = "w"
            self._mode = "wb"
        self._extfileobj = False

        if isinstance(self.fileobj, _Stream):
            self._dbg(3, 'open_volume: create a _Stream')
            fileobj = _Stream(name=name,
                              mode=self.fileobj.mode,
                              comptype=self.fileobj.comptype,
                              fileobj=None,
                              bufsize=self.fileobj.bufsize,
                              encryption=encryption or self.fileobj.encryption,
                              concat=self.fileobj.arcmode & ARCMODE_CONCAT,
                              tolerance=self.fileobj.tolerance)
        else:
            # here, we lose information about compression/encryption!
            self._dbg(3, 'open_volume: builtin open')
            fileobj = bltn_open(name, self._mode)
    else:
        if name is None and hasattr(fileobj, "name"):
            name = fileobj.name
        if hasattr(fileobj, "mode"):
            self._mode = fileobj.mode
        self._extfileobj = True
        self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
    self.name = os.path.abspath(name) if name else None
    self.fileobj = fileobj

    # init data structures
    self.closed = False
    self.members = []       # list of members as TarInfo objects
    self._loaded = False    # flag if all members have been read
    self.offset = self.fileobj.tell()
                            # current position in the archive file
    self.inodes = {}        # dictionary caching the inodes of
                            # archive members already added

    try:
        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            while True:
                self.fileobj.seek(self.offset)
                try:
                    tarinfo = self.tarinfo.fromtarfile(self)
                    self.members.append(tarinfo)
                except EOFHeaderError:
                    self.fileobj.seek(self.offset)
                    break
                except HeaderError as e:
                    raise ReadError(str(e))

        if self.mode in "aw":
            self._loaded = True

            if self.format == PAX_FORMAT:
                volume_info = {
                    "GNU.volume.filename": str(self.volume_tarinfo.name),
                    "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
                    "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
                }

                self.pax_headers.update(volume_info)

                if isinstance(self.fileobj, _Stream):
                    self.fileobj._init_write_gz()
                buf = self.tarinfo.create_pax_global_header(volume_info.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    except Exception as exn:
        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
        raise
def extractall(self, path=".", members=None, filter=None):
    """Extract all members from the archive to the current working
       directory and set owner, modification time and permissions on
       directories afterwards. `path' specifies a different directory
       to extract to. `members' is optional and must be a subset of the
       list returned by getmembers().
    """
    directories = []

    if members is None:
        members = self

    for tarinfo in members:
        # Continuation parts of a multivolume member live in later
        # volumes; skip them here.
        if self.volume_number > 0 and tarinfo.ismultivol():
            continue

        if filter and not filter(tarinfo):
            continue

        if tarinfo.isdir():
            # Extract directories with a safe mode.
            directories.append(tarinfo)
            tarinfo = copy.copy(tarinfo)
            tarinfo.mode = 0o0700
        # Do not set_attrs directories, as we will do that further down
        self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())

    # Reverse sort directories.
    directories.sort(key=lambda a: a.name)
    directories.reverse()

    # Set correct owner, mtime and filemode on directories.
    for tarinfo in directories:
        dirpath = os.path.join(path, tarinfo.name)
        try:
            self.chown(tarinfo, dirpath)
            self.utime(tarinfo, dirpath)
            self.chmod(tarinfo, dirpath)
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
def extract(self, member, path="", set_attrs=True, symlink_cb=None):
    """Extract a member from the archive to the current working directory,
       using its full name. Its file information is extracted as accurately
       as possible. `member' may be a filename or a TarInfo object. You can
       specify a different directory using `path'. File attributes (owner,
       mtime, mode) are set unless `set_attrs' is False.
       ``symlink_cb`` is a hook accepting a function that is passed the
       ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
       ``member`` indicates a symlink in which case only the callback
       passed will be applied, skipping the actual extraction. In case the
       callback is invoked, its return value is passed on to the caller.
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    # Prepare the link target for makelink().
    if tarinfo.islnk():
        tarinfo._link_target = os.path.join(path, tarinfo.linkname)

    if symlink_cb is not None and tarinfo.issym():
        return symlink_cb(member, path, set_attrs)

    try:
        self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                             set_attrs=set_attrs)
    except EnvironmentError as e:
        if self.errorlevel > 0:
            raise
        else:
            if e.filename is None:
                self._dbg(1, "tarfile: %s" % e.strerror)
            else:
                self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
    except ExtractError as e:
        if self.errorlevel > 1:
            raise
        else:
            self._dbg(1, "tarfile: %s" % e)
def extractfile(self, member):
    """Extract a member from the archive as a file object. `member' may be
       a filename or a TarInfo object. If `member' is a regular file or a
       link, an io.BufferedReader object is returned. Otherwise, None is
       returned.
    """
    self._check("r")

    if isinstance(member, str):
        tarinfo = self.getmember(member)
    else:
        tarinfo = member

    if tarinfo.isreg() or tarinfo.ismultivol() or \
            tarinfo.type not in SUPPORTED_TYPES:
        # If a member's type is unknown, it is treated as a
        # regular file.
        return self.fileobject(self, tarinfo)

    elif tarinfo.islnk() or tarinfo.issym():
        if isinstance(self.fileobj, _Stream):
            # A small but ugly workaround for the case that someone tries
            # to extract a (sym)link as a file-object from a non-seekable
            # stream of tar blocks.
            raise StreamError("cannot extract (sym)link as file object")
        else:
            # A (sym)link's file object is its target's file object.
            return self.extractfile(self._find_link_target(tarinfo))
    else:
        # If there's no data associated with the member (directory, chrdev,
        # blkdev, etc.), return None instead of a file object.
        return None
2963 def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2964 """Extract the TarInfo object tarinfo to a physical
2965 file called targetpath.
2967 # Fetch the TarInfo object for the given name
2968 # and build the destination pathname, replacing
2969 # forward slashes to platform specific separators.
2970 targetpath = targetpath.rstrip("/")
2971 targetpath = targetpath.replace("/", os.sep)
2973 # Create all upper directories.
2974 upperdirs = os.path.dirname(targetpath)
2975 if upperdirs and not os.path.exists(upperdirs):
2976 # Create directories that are not part of the archive with
2977 # default permissions.
2978 os.makedirs(upperdirs)
2980 if tarinfo.islnk() or tarinfo.issym():
2981 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2983 self._dbg(1, tarinfo.name)
2986 self.makefile(tarinfo, targetpath)
2987 elif tarinfo.isdir():
2988 self.makedir(tarinfo, targetpath)
2989 elif tarinfo.isfifo():
2990 self.makefifo(tarinfo, targetpath)
2991 elif tarinfo.ischr() or tarinfo.isblk():
2992 self.makedev(tarinfo, targetpath)
2993 elif tarinfo.islnk() or tarinfo.issym():
2994 self.makelink(tarinfo, targetpath)
2995 elif tarinfo.type not in SUPPORTED_TYPES:
2996 self.makeunknown(tarinfo, targetpath)
2998 self.makefile(tarinfo, targetpath)
3001 self.chown(tarinfo, targetpath)
3002 if not tarinfo.issym():
3003 self.chmod(tarinfo, targetpath)
3004 self.utime(tarinfo, targetpath)
#--------------------------------------------------------------------------
# Below are the different file methods. They are called via
# _extract_member() when extract() is called. They can be replaced in a
# subclass to implement other functionality.

def makedir(self, tarinfo, targetpath):
    """Make a directory called targetpath.
    """
    try:
        # Use a safe mode for the directory, the real mode is set
        # later in _extract_member().
        os.mkdir(targetpath, 0o0700)
    except FileExistsError:
        pass
def makefile(self, tarinfo, targetpath):
    """Make a file called targetpath.

       NOTE(review): restored from a garbled dump — the multivolume retry
       loop (re-reading from the next volume after the handler swaps
       self.fileobj) should be verified against project history.
    """
    source = self.fileobj
    source.seek(tarinfo.offset_data)
    iterate = True
    target = bltn_open(targetpath, "wb")

    try:
        if tarinfo.sparse is not None:
            for offset, size in tarinfo.sparse:
                target.seek(offset)
                copyfileobj(source, target, size)
            target.seek(tarinfo.size)
            target.truncate()
            return

        while iterate:
            iterate = False
            try:
                copyfileobj(source, target, tarinfo.size)
            except OSError:
                source.close()
                # only if we are extracting a multivolume this can be treated
                if not self.new_volume_handler:
                    raise Exception("We need to read a new volume and you"
                                    " didn't supply a new_volume_handler")

                # the new volume handler should do everything needed to
                # start working in a new volume. usually, the handler calls
                # to self.open_volume
                self.volume_number += 1
                self.new_volume_handler(self, self.base_name, self.volume_number)
                tarinfo = self.firstmember
                source = self.fileobj
                iterate = True
    finally:
        target.close()
def makeunknown(self, tarinfo, targetpath):
    """Make a file from a TarInfo object with an unknown type
       at targetpath.
    """
    self.makefile(tarinfo, targetpath)
    self._dbg(1, "tarfile: Unknown file type %r, " \
                 "extracted as regular file." % tarinfo.type)
def makefifo(self, tarinfo, targetpath):
    """Make a fifo called targetpath.
    """
    if hasattr(os, "mkfifo"):
        os.mkfifo(targetpath)
    else:
        raise ExtractError("fifo not supported by system")
def makedev(self, tarinfo, targetpath):
    """Make a character or block device called targetpath.
    """
    if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
        raise ExtractError("special devices not supported by system")

    mode = tarinfo.mode
    if tarinfo.isblk():
        mode |= stat.S_IFBLK
    else:
        mode |= stat.S_IFCHR

    os.mknod(targetpath, mode,
             os.makedev(tarinfo.devmajor, tarinfo.devminor))
def makelink(self, tarinfo, targetpath):
    """Make a (symbolic) link called targetpath. If it cannot be created
       (platform limitation), we try to make a copy of the referenced file
       instead of a link.
    """
    try:
        # For systems that support symbolic and hard links.
        if tarinfo.issym():
            os.symlink(tarinfo.linkname, targetpath)
        else:
            # See extract(): _link_target was prepared there.
            if os.path.exists(tarinfo._link_target):
                os.link(tarinfo._link_target, targetpath)
            else:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
    except symlink_exception:
        try:
            self._extract_member(self._find_link_target(tarinfo),
                                 targetpath)
        except KeyError:
            raise ExtractError("unable to resolve link inside archive")
def chown(self, tarinfo, targetpath):
    """Set owner of targetpath according to tarinfo.
    """
    if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
        # We have to be root to do so.
        try:
            g = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            g = tarinfo.gid
        try:
            u = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            u = tarinfo.uid
        try:
            if tarinfo.issym() and hasattr(os, "lchown"):
                os.lchown(targetpath, u, g)
            else:
                os.chown(targetpath, u, g)
        except OSError as e:
            raise ExtractError("could not change owner")
def chmod(self, tarinfo, targetpath):
    """Set file permissions of targetpath according to tarinfo.
    """
    if hasattr(os, 'chmod'):
        try:
            os.chmod(targetpath, tarinfo.mode)
        except OSError as e:
            raise ExtractError("could not change mode")
def utime(self, tarinfo, targetpath):
    """Set modification time of targetpath according to tarinfo.
    """
    if not hasattr(os, 'utime'):
        return
    try:
        os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
    except OSError as e:
        raise ExtractError("could not change modification time")
#--------------------------------------------------------------------------

def next(self):
    """Return the next member of the archive as a TarInfo object, when
       TarFile is opened for reading. Return None if there is no more
       available.
    """
    self._check("ra")
    if self.firstmember is not None:
        m = self.firstmember
        self.firstmember = None
        return m

    # Read the next block.
    self.fileobj.seek(self.offset)
    tarinfo = None
    while True:
        try:
            tarinfo = self.tarinfo.fromtarfile(self)
        except EOFHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
        except InvalidHeaderError as e:
            if self.ignore_zeros:
                self._dbg(2, "0x%X: %s" % (self.offset, e))
                self.offset += BLOCKSIZE
                continue
            elif self.offset == 0:
                raise ReadError(str(e))
        except EmptyHeaderError:
            if self.offset == 0:
                raise ReadError("empty file")
        except TruncatedHeaderError as e:
            if self.offset == 0:
                raise ReadError(str(e))
        except SubsequentHeaderError as e:
            raise ReadError(str(e))
        break

    if tarinfo is not None:
        if self.save_to_members:
            self.members.append(tarinfo)
    else:
        self._loaded = True

    return tarinfo
3206 #--------------------------------------------------------------------------
3207 # Little helper methods:
3209 def _getmember(self, name, tarinfo=None, normalize=False):
3210 """Find an archive member by name from bottom to top.
3211 If tarinfo is given, it is used as the starting point.
3213 # Ensure that all members have been loaded.
3214 members = self.getmembers()
3216 # Limit the member search list up to tarinfo.
3217 if tarinfo is not None:
3218 members = members[:members.index(tarinfo)]
3221 name = os.path.normpath(name)
3223 for member in reversed(members):
3225 member_name = os.path.normpath(member.name)
3227 member_name = member.name
3229 if name == member_name:
3233 """Read through the entire archive file and look for readable
3237 tarinfo = self.next()
3242 def _check(self, mode=None):
3243 """Check if TarFile is still open, and if the operation's mode
3244 corresponds to TarFile's mode.
3247 raise OSError("%s is closed" % self.__class__.__name__)
3248 if mode is not None and self.mode not in mode:
3249 raise OSError("bad operation for mode %r" % self.mode)
3251 def _find_link_target(self, tarinfo):
3252 """Find the target member of a symlink or hardlink member in the
3256 # Always search the entire archive.
3257 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3260 # Search the archive before the link, because a hard link is
3261 # just a reference to an already archived file.
3262 linkname = tarinfo.linkname
3265 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3267 raise KeyError("linkname %r not found" % linkname)
3271 """Provide an iterator object.
3274 return iter(self.members)
3276 return TarIter(self)
3278 def _dbg(self, level, msg, *args):
3279 """Write debugging output to sys.stderr.
3281 if level <= self.debug:
3282 print(msg.format(*args), file=sys.stderr)
3284 def __enter__(self):
3288 def __exit__(self, type, value, traceback):
3292 # An exception occurred. We must not call close() because
3293 # it would try to write end-of-archive blocks and padding.
3294 if not self._extfileobj:
3295 self.fileobj.close()
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.

        if self.index == 0 and self.tarfile.firstmember is not None:
            tarinfo = self.tarfile.next()
        elif self.index < len(self.tarfile.members):
            tarinfo = self.tarfile.members[self.index]
        elif not self.tarfile._loaded:
            tarinfo = self.tarfile.next()
            if not tarinfo:
                self.tarfile._loaded = True
                raise StopIteration
        else:
            raise StopIteration
        self.index += 1
        return tarinfo
#---------------------------------------------------------
# support functionality for rescue mode
#---------------------------------------------------------

# struct(3) format of a 512-byte old-GNU tar header; "<" forces
# byte-exact, unaligned layout. Field offsets in the comments are
# cumulative byte positions (cf. tar(5)).
TAR_FMT_HDR = (# See tar(5):
    "<"
    "100s" # ← char name[100];          /* 100 */
    "8s"   # ← char mode[8];            /* 108 */
    "8s"   # ← char uid[8];             /* 116 */
    "8s"   # ← char gid[8];             /* 124 */
    "12s"  # ← char size[12];           /* 136 */
    "12s"  # ← char mtime[12];          /* 148 */
    "8s"   # ← char checksum[8];        /* 156 */
    "B"    # ← char typeflag[1];        /* 157 */
    "100s" # ← char linkname[100];      /* 257 */
    "6s"   # ← char magic[6];           /* 263 */
    "2s"   # ← char version[2];         /* 265 */
    "32s"  # ← char uname[32];          /* 297 */
    "32s"  # ← char gname[32];          /* 329 */
    "8s"   # ← char devmajor[8];        /* 337 */
    "8s"   # ← char devminor[8];        /* 345 */
    "12s"  # ← char atime[12];          /* 357 */
    "12s"  # ← char ctime[12];          /* 369 */
    "12s"  # ← char offset[12];         /* 381 */
    "4s"   # ← char longnames[4];       /* 385 */
    "B"    # ← char unused[1];          /* 386 */
    ""     #   struct {
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    "12s"  # ← char offset[12];
    "12s"  # ← char numbytes[12];
    ""     # } sparse[4];               /* 482 */
    "B"    # ← char isextended[1];      /* 483 */
    "12s"  # ← char realsize[12];       /* 495 */
    "17s"  # ← char pad[17];            /* 512 */
)

# The “magic” and “version” fields are special:
#
# tar(5)
#    magic   The magic field holds the five characters “ustar” followed by a
#            space.  Note that POSIX ustar archives have a trailing null.
#
# however, “tar.h”:
#
#   /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
#      Found in an archive, it indicates an old GNU header format, which will be
#      hopefully become obsolescent.  With OLDGNU_MAGIC, uname and gname are
#      valid, though the header is not truly POSIX conforming.  */
#
TAR_HDR_OFF_MAGIC    = 257
TAR_FMT_OLDGNU_MAGIC = b"ustar "
def read_gnu_tar_hdr(data):
    """Unpack one 512-byte block as an old-GNU tar header.

       Returns a dict of the header fields (all except the “unused” and
       “pad” filler), or None if the block is the wrong size, cannot be
       unpacked, or does not carry the old-GNU magic.
    """
    if len(data) != BLOCKSIZE: # header requires one complete block
        return None

    try:
        name, mode, \
            uid, gid, \
            size, mtime, \
            checksum, typeflag, \
            linkname, magic, \
            version, uname, \
            gname, devmajor, \
            devminor, atime, \
            ctime, offset, \
            longnames, unused, \
            offset1, numbytes1, \
            offset2, numbytes2, \
            offset3, numbytes3, \
            offset4, numbytes4, \
            isextended, realsize, \
            pad = struct.unpack(TAR_FMT_HDR, data)
    except struct.error:
        return None

    if magic != TAR_FMT_OLDGNU_MAGIC:
        return None

    # return all except “unused” and “pad”
    return \
        { "name"       : name,     "mode"      : mode
        , "uid"        : uid ,     "gid"       : gid
        , "size"       : size,     "mtime"     : mtime
        , "checksum"   : checksum
        , "typeflag"   : typeflag
        , "linkname"   : linkname
        , "magic"      : magic
        , "version"    : version
        , "uname"      : uname,    "gname"     : gname
        , "devmajor"   : devmajor, "devminor"  : devminor
        , "atime"      : atime,    "ctime"     : ctime
        , "offset"     : offset
        , "longnames"  : longnames
        , "offset1"    : offset1,  "numbytes1" : numbytes1
        , "offset2"    : offset2,  "numbytes2" : numbytes2
        , "offset3"    : offset3,  "numbytes3" : numbytes3
        , "offset4"    : offset4,  "numbytes4" : numbytes4
        , "isextended" : isextended
        , "realsize"   : realsize
        }
def tar_hdr_check_chksum(data):
    """Return True iff *data* parses as an old-GNU tar header whose stored
       checksum matches one of the recomputed checksums."""
    hdr = read_gnu_tar_hdr(data)
    if hdr is None:
        return False
    s = calc_chksums(data)
    return nti(hdr["checksum"]) in s
def readable_tar_objects_offsets(ifd):
    """
    Traverse blocks in file, trying to extract tar headers.

    Returns the list of offsets of blocks whose checksum validates as an
    old-GNU tar header.
    """
    pos = 0
    offsets = []

    mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
    # The magic sits at a fixed offset inside the header, so start
    # searching from there.
    pos = TAR_HDR_OFF_MAGIC

    while True:
        pos = mm.find(TAR_FMT_OLDGNU_MAGIC, pos)
        if pos == -1:
            break
        off = pos - TAR_HDR_OFF_MAGIC
        mm.seek(off)
        blk = mm.read(BLOCKSIZE)
        if tar_hdr_check_chksum(blk) is True:
            offsets.append(off)
        pos += 1

    return offsets
def locate_gz_hdr_candidates(fd):
    """
    Walk over instances of the GZ magic in the payload, collecting their
    positions. If the offset of the first found instance is not zero, the file
    begins with leading garbage.

    Note that since the GZ magic consists of only two bytes, we expect a lot of
    false positives inside binary data.

    :return: The list of offsets in the file.
    """
    pos = 0
    cands = []

    mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)

    while True:
        pos = mm.find(GZ_MAGIC_BYTES, pos)
        if pos == -1:
            break
        cands.append(pos)
        pos += len(GZ_MAGIC_BYTES)

    return cands
# Verdicts for gzip header candidates found by the rescue scanner.
HDR_CAND_GOOD  = 0 # header marks begin of valid object
HDR_CAND_FISHY = 1 # inconclusive
HDR_CAND_JUNK  = 2 # not a header / object unreadable
def read_cstring (fd, max=-1, encoding=None):
    """
    Read one NUL-terminated string from *fd* into a Python string. If *max* is
    non-negative, reading will terminate after the specified number of bytes.

    Optionally, an *encoding* may be specified to interpret the data as.

    :returns: *None* if parsing failed or the maximum number of bytes has been
              exceeded; a Python string with the data otherwise.
    """
    buf = b""
    l = 0

    while True:
        c = os.read (fd, 1)
        if len (c) != 1: # EOF before NUL terminator: parse failure
            # (without this guard, os.read() returning b"" would loop forever)
            return None
        if c == b"\0": # NUL terminator found
            break
        if max >= 0 and l > max:
            return None
        buf += c
        l += 1

    if encoding is not None:
        buf = buf.decode (encoding)

    return buf
def inspect_gz_hdr (fd, off):
    """
    Attempt to parse a Gzip header in *fd* at position *off*. The format is
    documented as RFC1952.

    Returns a verdict about the quality of that header plus the parsed header
    when readable. Problematic sizes such as fields running past the EOF are
    treated as garbage. Properties in which the header merely doesn't conform
    to the spec (garbage flag bits, bogus timestamp) are considered "fishy".
    No validation is possible on embedded strings because they are single-byte
    encoded.

    :param fd:  readable, seekable file descriptor.
    :param off: byte offset of the candidate header.
    :returns: pair of (verdict, header-dict); the dict is *None* unless the
              verdict is GOOD or FISHY. "hlen" in the dict is the total
              header length, i.e. the distance from *off* to the payload.
    """
    verdict = HDR_CAND_GOOD

    os.lseek (fd, off, os.SEEK_SET)
    if os.lseek (fd, 0, os.SEEK_CUR) != off: # seek failed -> not addressable
        return HDR_CAND_JUNK, None

    raw = os.read (fd, GZ_HEADER_SIZE)
    if len (raw) != GZ_HEADER_SIZE: # eof inside fixed header
        return HDR_CAND_JUNK, None

    try:
        _m1, _m2, meth, flags, mtime, dflags, oscode = \
            struct.unpack (GZ_FMT_HEADER, raw)
        if meth != GZ_METHOD_DEFLATE: # only deflate is supported
            return HDR_CAND_JUNK, None
    except struct.error:
        return HDR_CAND_JUNK, None

    if mtime > int (time.time ()): # timestamp in the future
        verdict = HDR_CAND_FISHY

    if dflags != GZ_DEFLATE_FLAGS:
        verdict = HDR_CAND_FISHY

    if oscode != GZ_OS_CODE:
        verdict = HDR_CAND_FISHY

    if flags & GZ_FLAG_FTEXT: # created by some contrarian
        verdict = HDR_CAND_FISHY
    if flags & GZ_FLAG_FEXTRA:
        raw16 = os.read (fd, 2)
        if len (raw16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
        # struct.unpack returns a tuple; take the scalar length out of it
        xlen = struct.unpack ("<H", raw16)[0]
        xtra = os.read (fd, xlen)
        if len (xtra) != xlen: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FNAME:
        # read up to the next NUL byte, not exceeding the maximum path length
        fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                              encoding="iso-8859-1")
        if fname is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FCOMMENT:
        # same bound as FNAME; separate local so the file name is not clobbered
        fcomment = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
                                 encoding="iso-8859-1")
        if fcomment is None:
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
        crc16 = os.read (fd, 2)
        if len (crc16) != 2: # eof inside header
            return HDR_CAND_JUNK, None
    if flags & GZ_FLAG_RESERVED:
        # according to the RFC, these must not be set
        verdict = HDR_CAND_FISHY

    # total header length = how far we have advanced past *off*
    hlen = os.lseek (fd, 0, os.SEEK_CUR) - off

    # NOTE(review): callers only consume "hlen"; the remaining keys are
    # informational -- confirm against the original header dict if available
    return verdict, \
           { "hlen"   : hlen
           , "flags"  : flags
           , "mtime"  : mtime
           , "oscode" : oscode
           }
def try_decompress (ifd, off, hdr):
    """
    Attempt to process the object starting at *off* with gzip.

    :returns: A pair containing the values of the decompressed data and
              the length of the input consumed. Note that the latter value
              may exceed the length of the compressed data because the
              *zlib* module does not provide a means to query how much
              of the input it processed before the end of an object.
    """
    # raw deflate stream: negative wbits suppresses the zlib header
    decmp = zlib.decompressobj (-zlib.MAX_WBITS)
    pos   = off # current read position in the input
    dlen  = 0   # size of decompressed data

    os.lseek (ifd, pos, os.SEEK_SET)
    while True:
        cnk = os.read (ifd, BUFSIZE)
        pos += len (cnk)
        try:
            data = decmp.decompress (cnk)
        except zlib.error: # probably CRC32 mismatch; terminate softly
            break
        dlen += len (data)
        if decmp.eof is True: # end of this deflate object
            break
        if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
            break

    return dlen, pos - off
def readable_gz_objects_offsets (ifd, cands):
    """
    Inspect header candidates for parseable *ifd* gzipped objects.

    :param ifd:   readable, seekable file descriptor.
    :param cands: candidate offsets as found by locate_gz_hdr_candidates().
    :returns: the subset of *cands* whose headers parse and whose payload
              actually decompresses.
    """
    good = []

    for cand in cands:
        vdt, hdr = inspect_gz_hdr (ifd, cand)
        if vdt == HDR_CAND_JUNK:
            pass # ignore unreadable ones
        elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
            # payload starts right after the variable-length header
            off0 = cand + hdr ["hlen"]
            dlen, clen = try_decompress (ifd, off0, hdr)
            if dlen > 0 and clen > 0: # produced output from real input
                good.append (cand)

    return good
def reconstruct_offsets_gz (fname):
    """
    From the given file, retrieve all GZ header-like offsets ("candidates").
    Then check each of those locations whether they can be processed as
    gzipped objects.

    :param fname: path of the volume to scan.
    :returns: list of offsets of readable gzipped objects.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        cands = locate_gz_hdr_candidates (ifd)
        return readable_gz_objects_offsets (ifd, cands)
    finally:
        os.close (ifd) # do not leak the descriptor on any path
def reconstruct_offsets_tar (fname):
    """
    From the given file, retrieve all tar header-like offsets ("candidates").
    Then check each of those locations whether they can be processed as tar
    headers.

    :param fname: path of the volume to scan.
    :returns: list of offsets of readable tar headers.
    """
    ifd = os.open (fname, os.O_RDONLY)
    try:
        return readable_tar_objects_offsets (ifd)
    finally:
        os.close (ifd) # do not leak the descriptor on any path
def read_tarobj_at_offset (fileobj, offset, mode, secret=None):
    """
    Open *fileobj* as a tar archive at *offset* and return the first member
    found there.

    :param fileobj: seekable file object of the volume.
    :param offset:  byte offset of the candidate object.
    :param mode:    archive mode string as used by the rescue caller.
    :param secret:  optional (kind, value) pair for decryption; kind is one
                    of the crypto.PDTCRYPT_SECRET_* constants.
    :returns: a TarInfo object on success, *None* if the data at *offset*
              cannot be read as a tar object.
    """
    decr = None

    if secret is not None:
        ks = secret [0]

        if ks == crypto.PDTCRYPT_SECRET_PW:
            decr = crypto.Decrypt (password=secret [1])
        elif ks == crypto.PDTCRYPT_SECRET_KEY:
            # key secrets are passed hex-encoded
            key = binascii.unhexlify (secret [1])
            decr = crypto.Decrypt (key=key)
        else:
            raise RuntimeError ("read_tarobj_at_offset: unknown secret kind")

    try:
        # NOTE(review): keyword arguments reconstructed from the rescue
        # call sites -- confirm against TarFile.open_at_offset's signature
        tarobj = \
            TarFile.open_at_offset (offset,
                                    mode="r" + mode,
                                    fileobj=fileobj,
                                    format=GNU_FORMAT,
                                    ignore_zeros=True,
                                    encryption=decr,
                                    save_to_members=False,
                                    tolerance=TOLERANCE_RESCUE)
    except (ReadError, EndOfFile):
        return None # not a readable object at this offset

    return tarobj.next ()
def idxent_of_tarinfo (tarinfo):
    """
    Scrape the information relevant for the index from a *TarInfo* object.
    Keys like the inode number that lack a corresponding field in a TarInfo
    will be set to some neutral value.

    Example output:

        { "inode"  : 0
        , "uid"    : 0
        , "gid"    : 0
        , "path"   : "snapshot://annotations.db"
        , "offset" : 0
        , "volume" : 0
        , "mode"   : 33152
        , "ctime"  : 1502798115
        , "mtime"  : 1502196423
        , "size"   : 144
        , "type"   : "file"
        }

    :param tarinfo: member object to convert.
    :returns: dict in pseudo-index-entry shape.
    """
    return \
        { "inode"  : 0            # ignored when reading the index
        , "uid"    : tarinfo.uid
        , "gid"    : tarinfo.gid
        , "path"   : tarinfo.name # keeping URI scheme
        , "offset" : 0            # to be added by the caller
        , "volume" : tarinfo.volume_offset
        , "mode"   : tarinfo.mode
        , "ctime"  : tarinfo.mtime # tar headers carry no ctime; reuse mtime
        , "mtime"  : tarinfo.mtime
        , "size"   : tarinfo.size
        , "type"   : tarinfo.type
        }
def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
    """
    Reconstruct a pseudo index from possibly damaged volumes.

    Volume paths are produced by *gen_volume_name(n)* for increasing volume
    numbers until a volume is missing. For each volume, candidate object
    offsets are recovered (encrypted, gzipped, or plain tar, depending on
    *secret*/*mode*), then each offset is probed for a readable tar member.

    :param gen_volume_name: callable mapping a volume number to its path.
    :param mode:            archive mode string selecting the rescue handler.
    :param maxvol:          optional explicit number of volumes; missing
                            volumes below this number are skipped.
    :param password:        optional password for encrypted archives.
    :param key:             optional raw key for encrypted archives.
    :returns: list of pseudo index entries (dicts).
    :raises TarError: when no rescue handler exists for *mode*.
    """
    psidx   = [] # pseudo index, return value
    offsets = None
    secret  = crypto.make_secret (password=password, key=key)
    infos   = [] # accumulated (offset, volume, TarInfo) triples

    i = 0
    while True:
        nvol  = i
        i    += 1
        vpath = gen_volume_name (nvol)
        try:
            if secret is not None:
                offsets = crypto.reconstruct_offsets (vpath, secret)
            # NOTE(review): mode literals reconstructed -- confirm against
            # the mode strings used by the callers
            elif mode == "#gz":
                offsets = reconstruct_offsets_gz (vpath)
            elif mode == "#tar":
                offsets = reconstruct_offsets_tar (vpath)
            else:
                raise TarError ("no rescue handling for mode “%s”" % mode)
        except FileNotFoundError:
            # volume does not exist
            if maxvol is not None and i < maxvol:
                continue # explicit volume number specified, ignore missing ones
            break # first missing volume terminates the scan

        # NOTE(review): fileobj is intentionally kept open while the member
        # objects are alive -- confirm whether it may be closed here
        fileobj = bltn_open (vpath, "rb")

        def aux (acc, off):
            # probe one candidate offset; unreadable ones yield None
            obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret)
            if obj is not None:
                acc.append ((off, nvol, obj))
            return acc

        infos += functools.reduce (aux, offsets, [])

    def aux (o, nvol, ti):
        # convert one triple to an index entry, filling in the fields that
        # idxent_of_tarinfo() leaves for the caller
        ie = idxent_of_tarinfo (ti)
        ie ["offset"] = o
        ie ["volume"] = nvol
        return ie

    psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]

    return psidx
3823 #--------------------
3824 # exported functions
3825 #--------------------
3826 def is_tarfile(name):
3827 """Return True if name points to a tar archive that we
3828 are able to handle, else return False.