2 #-------------------------------------------------------------------
4 #-------------------------------------------------------------------
5 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
8 # Permission is hereby granted, free of charge, to any person
9 # obtaining a copy of this software and associated documentation
10 # files (the "Software"), to deal in the Software without
11 # restriction, including without limitation the rights to use,
12 # copy, modify, merge, publish, distribute, sublicense, and/or sell
13 # copies of the Software, and to permit persons to whom the
14 # Software is furnished to do so, subject to the following
17 # The above copyright notice and this permission notice shall be
18 # included in all copies or substantial portions of the Software.
20 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22 # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24 # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27 # OTHER DEALINGS IN THE SOFTWARE.
29 """Read from and write to tar format archives.
32 __version__ = "$Revision: 85213 $"
36 __author__ = "Lars Gustäbel (lars@gustaebel.de)"
39 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend, Eduardo Robles."
59 import traceback # XXX
68 # os.symlink on Windows prior to 6.0 raises NotImplementedError
69 symlink_exception = (AttributeError, NotImplementedError)
71 # OSError (winerror=1314) will be raised if the caller does not hold the
72 # SeCreateSymbolicLinkPrivilege privilege
73 symlink_exception += (OSError,)
77 # from tarfile import *
78 __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
from builtins import open as bltn_open # Since 'open' is TarFile.open
82 #---------------------------------------------------------
84 #---------------------------------------------------------
85 NUL = b"\0" # the null character
86 BLOCKSIZE = 512 # length of processing blocks
87 RECORDSIZE = BLOCKSIZE * 20 # length of records
88 GNU_MAGIC = b"ustar \0" # magic gnu tar string
89 POSIX_MAGIC = b"ustar\x0000" # magic posix tar string
91 LENGTH_NAME = 100 # maximum length of a filename
92 LENGTH_LINK = 100 # maximum length of a linkname
93 LENGTH_PREFIX = 155 # maximum length of the prefix field
95 REGTYPE = b"0" # regular file
96 AREGTYPE = b"\0" # regular file
97 LNKTYPE = b"1" # link (inside tarfile)
98 SYMTYPE = b"2" # symbolic link
99 CHRTYPE = b"3" # character special device
100 BLKTYPE = b"4" # block special device
101 DIRTYPE = b"5" # directory
102 FIFOTYPE = b"6" # fifo special device
103 CONTTYPE = b"7" # contiguous file
105 GNUTYPE_LONGNAME = b"L" # GNU tar longname
106 GNUTYPE_LONGLINK = b"K" # GNU tar longlink
107 GNUTYPE_SPARSE = b"S" # GNU tar sparse file
GNUTYPE_MULTIVOL = b"M" # GNU tar continuation of a file that began on another volume
111 XHDTYPE = b"x" # POSIX.1-2001 extended header
112 XGLTYPE = b"g" # POSIX.1-2001 global header
113 SOLARIS_XHDTYPE = b"X" # Solaris extended header
115 USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format
116 GNU_FORMAT = 1 # GNU tar format
117 PAX_FORMAT = 2 # POSIX.1-2001 (pax) format
118 DEFAULT_FORMAT = GNU_FORMAT
120 GZ_FMT_HEADER = b"<BBBBLBB"
121 GZ_HEADER_SIZE = 10 # not including the name
122 GZ_MAGIC = (0x1f, 0x8b) # 0o37, 0o213
123 GZ_METHOD_DEFLATE = 0x08 # 0o10
124 GZ_FLAG_FTEXT = 1 << 0 # ASCII payload
125 GZ_FLAG_FHCRC = 1 << 1 # CRC16
126 GZ_FLAG_FEXTRA = 1 << 2 # extra field
127 GZ_FLAG_FNAME = 1 << 3 # set by default in gzip
128 GZ_FLAG_FCOMMENT = 1 << 4 # NUL-terminated comment
129 GZ_FLAG_RESERVED = 7 << 5 # unassigned
130 GZ_DEFLATE_FLAGS = 0x00 # 0o00, never read (deflate.c)
131 GZ_OS_CODE = 0x03 # 0o03, default in gzip (tailor.h)
132 GZ_MAGIC_BYTES = struct.pack ("<BB", GZ_MAGIC [0], GZ_MAGIC [1])
133 GZ_MAGIC_DEFLATE = struct.pack ("<BBB", GZ_MAGIC [0], GZ_MAGIC [1],
137 TOLERANCE_RECOVER = 1 # rely on offsets in index
138 TOLERANCE_RESCUE = 2 # deduce metadata from archive contents
142 #---------------------------------------------------------
143 # archive handling mode
144 #---------------------------------------------------------
147 ARCMODE_ENCRYPT = 1 << 0
148 ARCMODE_COMPRESS = 1 << 1
149 ARCMODE_CONCAT = 1 << 2
152 if m == ARCMODE_PLAIN:
156 def chkappend (b, s):
161 if first is True: first = False
164 chkappend (ARCMODE_ENCRYPT, "ENCRYPT")
165 chkappend (ARCMODE_COMPRESS, "COMPRESS")
166 chkappend (ARCMODE_CONCAT, "CONCAT")
170 def arcmode_set (concat=False, encryption=None, comptype=None, init=ARCMODE_PLAIN):
172 if bool (concat) is True:
173 ret |= ARCMODE_CONCAT
174 if encryption is not None:
175 ret |= ARCMODE_ENCRYPT
177 ret |= ARCMODE_COMPRESS
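# Illustrative sketch of how the archive-mode bits combine, assuming
# arcmode_set() returns the accumulated mask, sets the COMPRESS bit for a
# "gz" comptype as the code above suggests, and ARCMODE_PLAIN is the zero mask:
#
#     >>> arcmode_set(concat=True, comptype="gz") == (ARCMODE_CONCAT | ARCMODE_COMPRESS)
#     True
#     >>> arcmode_set() == ARCMODE_PLAIN
#     True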
180 #---------------------------------------------------------
182 #---------------------------------------------------------
183 # File types that tarfile supports:
184 SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
185 SYMTYPE, DIRTYPE, FIFOTYPE,
186 CONTTYPE, CHRTYPE, BLKTYPE,
187 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
188 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
190 # File types that will be treated as a regular file.
191 REGULAR_TYPES = (REGTYPE, AREGTYPE,
192 CONTTYPE, GNUTYPE_SPARSE)
194 # File types that are part of the GNU tar format.
195 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
196 GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
198 # Fields from a pax header that override a TarInfo attribute.
199 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
200 "uid", "gid", "uname", "gname")
202 # Fields from a pax header that are affected by hdrcharset.
203 PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
205 # Fields in a pax header that are numbers, all other fields
206 # are treated as strings.
207 PAX_NUMBER_FIELDS = {
216 #---------------------------------------------------------
218 #---------------------------------------------------------
220 if os.name in ("nt", "ce"):
223 ENCODING = sys.getfilesystemencoding()
225 #---------------------------------------------------------
226 # Some useful functions
227 #---------------------------------------------------------
229 def stn(s, length, encoding, errors):
230 """Convert a string to a null-terminated bytes object.
232 s = s.encode(encoding, errors)
233 return s[:length] + (length - len(s)) * NUL
235 def nts(s, encoding, errors):
236 """Convert a null-terminated bytes object to a string.
241 return s.decode(encoding, errors)
243 def sbtn(s, length, encoding, errors):
"""Convert a string or a bytes object to a null-terminated bytes object
247 if isinstance(s, str):
248 s = s.encode(encoding, errors)
249 return s[:length] + (length - len(s)) * NUL
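# Illustrative sketch (values made up for the example): stn()/sbtn() pad short
# values with NULs up to the field length and silently truncate longer ones,
# so a truncated value is not null-terminated:
#
#     >>> stn("foo", 8, "utf-8", "strict")
#     b'foo\x00\x00\x00\x00\x00'
#     >>> stn("abcdefghij", 5, "utf-8", "strict")
#     b'abcde'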
252 """Convert a number field to a python number.
254 # There are two possible encodings for a number field, see
256 if s[0] in (0o200, 0o377):
258 for i in range(len(s) - 1):
262 n = -(256 ** (len(s) - 1) - n)
265 n = int(nts(s, "ascii", "strict") or "0", 8)
267 raise InvalidHeaderError("invalid header")
270 def itn(n, digits=8, format=DEFAULT_FORMAT):
271 """Convert a python number to a number field.
273 # POSIX 1003.1-1988 requires numbers to be encoded as a string of
274 # octal digits followed by a null-byte, this allows values up to
275 # (8**(digits-1))-1. GNU tar allows storing numbers greater than
# that if necessary. A leading 0o200 or 0o377 byte indicates this
277 # particular encoding, the following digits-1 bytes are a big-endian
278 # base-256 representation. This allows values up to (256**(digits-1))-1.
279 # A 0o200 byte indicates a positive number, a 0o377 byte a negative
281 if 0 <= n < 8 ** (digits - 1):
282 s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
283 elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
285 s = bytearray([0o200])
287 s = bytearray([0o377])
288 n = 256 ** digits + n
290 for i in range(digits - 1):
291 s.insert(1, n & 0o377)
294 raise ValueError("overflow in number field")
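# Illustrative sketch of round-tripping a number field through itn()/nti(),
# assuming the base-256 branch above shifts n by 8 bits per inserted byte as
# in the standard GNU encoding. Small values use the POSIX octal form, larger
# ones fall back to base-256 with a leading 0o200 byte:
#
#     >>> itn(0o755, 8)
#     b'0000755\x00'
#     >>> nti(itn(8 ** 11, 12, GNU_FORMAT)) == 8 ** 11
#     True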
298 def calc_chksums(buf):
299 """Calculate the checksum for a member's header by summing up all
300 characters except for the chksum field which is treated as if
301 it was filled with spaces. According to the GNU tar sources,
302 some tars (Sun and NeXT) calculate chksum with signed char,
303 which will be different if there are chars in the buffer with
304 the high bit set. So we calculate two checksums, unsigned and
307 unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
308 signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
309 return unsigned_chksum, signed_chksum
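# Illustrative note: the constant 256 added above accounts for the eight
# chksum bytes being treated as ASCII spaces (8 * 0x20 = 256), so for an
# all-zero block both sums are simply 256:
#
#     >>> calc_chksums(bytes(BLOCKSIZE))
#     (256, 256)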
311 def copyfileobj(src, dst, length=None):
312 """Copy length bytes from fileobj src to fileobj dst.
313 If length is None, copy the entire content.
318 shutil.copyfileobj(src, dst)
321 blocks, remainder = divmod(length, BUFSIZE)
322 for b in range(blocks):
323 buf = src.read(BUFSIZE)
325 if len(buf) < BUFSIZE:
326 raise OSError("end of file reached")
328 buf = src.read(remainder)
330 if len(buf) < remainder:
331 raise OSError("end of file reached")
335 """Deprecated in this location; use stat.filemode."""
337 warnings.warn("deprecated in favor of stat.filemode",
338 DeprecationWarning, 2)
339 return stat.filemode(mode)
341 class TarError(Exception):
342 """Base exception."""
344 class ExtractError(TarError):
345 """General exception for extract errors."""
347 class ReadError(TarError):
348 """Exception for unreadable tar archives."""
350 class CompressionError(TarError):
351 """Exception for unavailable compression methods."""
353 class StreamError(TarError):
354 """Exception for unsupported operations on stream-like TarFiles."""
356 class HeaderError(TarError):
357 """Base exception for header errors."""
359 class EmptyHeaderError(HeaderError):
360 """Exception for empty headers."""
362 class TruncatedHeaderError(HeaderError):
363 """Exception for truncated headers."""
365 class EOFHeaderError(HeaderError):
366 """Exception for end of file headers."""
368 class InvalidHeaderError(HeaderError):
369 """Exception for invalid headers."""
371 class SubsequentHeaderError(HeaderError):
372 """Exception for missing and invalid extended headers."""
374 class InvalidEncryptionError(TarError):
375 """Exception for undefined crypto modes and combinations."""
377 class DecryptionError(TarError):
378 """Exception for error during decryption."""
380 class EncryptionError(TarError):
381 """Exception for error during encryption."""
383 class EndOfFile(Exception):
"""Signal an end-of-file condition when it is not an error."""
387 #---------------------------
388 # internal stream interface
389 #---------------------------
391 """Low-level file object. Supports reading and writing.
392 It is used instead of a regular file object for streaming
396 def __init__(self, name, mode):
399 "w": os.O_RDWR | os.O_CREAT | os.O_TRUNC,
401 if hasattr(os, "O_BINARY"):
402 _mode |= os.O_BINARY # pylint: disable=no-member
403 self.fd = os.open(name, _mode, 0o666)
409 def read(self, size):
410 ret = os.read(self.fd, size)
411 self.offset += len(ret)
414 def write(self, s, pos=None):
417 os.lseek (self.fd, pos, os.SEEK_SET)
418 n = os.write(self.fd, s)
420 self.offset += len(s)
422 append = pos + n - p0
424 self.offset += append
425 os.lseek (self.fd, p0, os.SEEK_SET)
430 def seek_set (self, pos):
431 os.lseek (self.fd, pos, os.SEEK_SET)
435 def gz_header (name=None):
436 timestamp = int(time.time())
442 flags |= GZ_FLAG_FNAME
443 if type(name) is str:
444 name = name.encode("iso-8859-1", "replace")
445 if name.endswith(b".pdtcrypt"):
447 if name.endswith(b".gz"):
449 # RFC1952 says we must use ISO-8859-1 for the FNAME field.
452 hdr = struct.pack (GZ_FMT_HEADER,
453 GZ_MAGIC [0], GZ_MAGIC [1],
454 GZ_METHOD_DEFLATE, flags,
456 GZ_DEFLATE_FLAGS, GZ_OS_CODE)
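# Illustrative sketch: the fixed gzip header laid out by GZ_FMT_HEADER is
# GZ_HEADER_SIZE (10) bytes and starts with the magic and method bytes that
# _init_read_gz() checks on the read path, assuming the truncated
# GZ_MAGIC_DEFLATE definition above packs GZ_METHOD_DEFLATE as its third byte:
#
#     >>> struct.calcsize(GZ_FMT_HEADER) == GZ_HEADER_SIZE
#     True
#     >>> GZ_MAGIC_DEFLATE == b"\x1f\x8b\x08"
#     True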
462 """Class that serves as an adapter between TarFile and
463 a stream-like object. The stream-like object only
464 needs to have a read() or write() method and is accessed
465 blockwise. Use of gzip or bzip2 compression is possible.
466 A stream-like object could be for example: sys.stdin,
467 sys.stdout, a socket, a tape device etc.
469 _Stream is intended to be used only internally but is
nevertheless used externally by Deltatar.
472 When encrypting, the ``enccounter`` will be used for
473 initializing the first cryptographic context. When
474 decrypting, its value will be compared to the decrypted
475 object. Decryption fails if the value does not match.
476 In effect, this means that a ``_Stream`` whose ctor was
477 passed ``enccounter`` can only be used to encrypt or
478 decrypt a single object.
481 remainder = -1 # track size in encrypted entries
482 tolerance = TOLERANCE_STRICT
484 def __init__(self, name, mode, comptype, fileobj, bufsize,
485 concat=False, encryption=None, enccounter=None,
486 compresslevel=9, tolerance=TOLERANCE_STRICT):
487 """Construct a _Stream object.
489 self.arcmode = arcmode_set (concat, encryption, comptype)
490 self.tolerance = tolerance
492 self._extfileobj = True
494 fileobj = _LowLevelFile(name, mode)
495 self._extfileobj = False
498 # Enable transparent compression detection for the
500 fileobj = _StreamProxy(fileobj)
501 comptype = fileobj.getcomptype()
505 self.enccounter = None
506 if self.arcmode & ARCMODE_ENCRYPT:
507 self.enccounter = enccounter
509 self.name = name or ""
511 self.comptype = comptype
513 self.fileobj = fileobj
514 self.bufsize = bufsize
520 self.last_block_offset = 0
self.dbuf = b"" # buffer of already-decompressed data for reading
522 self.exception = None # communicate decompression failure
523 self.compresslevel = compresslevel
524 self.bytes_written = 0
526 self.encryption = encryption
529 if encryption is not None:
530 encryption.reset_last_iv ()
537 raise CompressionError("zlib module is not available")
540 self.exception = zlib.error
543 if not (self.arcmode & ARCMODE_CONCAT):
544 if self.arcmode & ARCMODE_ENCRYPT:
545 self._init_write_encrypt (name)
546 self._init_write_gz ()
547 self.crc = zlib.crc32(b"") & 0xFFFFffff
549 elif comptype == "bz2":
550 if self.arcmode & ARCMODE_ENCRYPT:
551 raise InvalidEncryptionError("encryption not available for "
552 "compression “%s”" % comptype)
556 raise CompressionError("bz2 module is not available")
559 self.cmp = bz2.BZ2Decompressor()
560 self.exception = OSError
562 self.cmp = bz2.BZ2Compressor()
564 elif comptype == 'xz':
565 if self.arcmode & ARCMODE_ENCRYPT:
566 raise InvalidEncryptionError("encryption not available for "
567 "compression “%s”" % comptype)
571 raise CompressionError("lzma module is not available")
574 self.cmp = lzma.LZMADecompressor()
575 self.exception = lzma.LZMAError
577 self.cmp = lzma.LZMACompressor()
579 elif comptype == "tar":
580 if not (self.arcmode & ARCMODE_CONCAT) \
582 and self.arcmode & ARCMODE_ENCRYPT:
583 self._init_write_encrypt (name)
586 if self.arcmode & ARCMODE_ENCRYPT:
587 raise InvalidEncryptionError("encryption not available for "
588 "compression “%s”" % comptype)
589 raise CompressionError("unknown compression type %r" % comptype)
592 if not self._extfileobj:
598 if hasattr(self, "closed") and not self.closed:
601 except crypto.InternalError:
602 # context already finalized due to abort but close() tried
607 def next (self, name):
608 if self.arcmode & ARCMODE_COMPRESS:
609 if getattr (self, "cmp", None) is not None:
610 self._finalize_write_gz ()
612 if self.arcmode & ~(ARCMODE_ENCRYPT | ARCMODE_COMPRESS):
613 self.last_block_offset = self.fileobj.tell()
614 if self.arcmode & ARCMODE_ENCRYPT:
615 self._finalize_write_encrypt ()
616 self._init_write_encrypt (name, set_last_block_offset=True)
617 if self.arcmode & ARCMODE_COMPRESS:
618 self._init_write_gz (set_last_block_offset =
619 not (self.arcmode & ARCMODE_ENCRYPT))
620 return self.last_block_offset
623 def next_volume (self, name):
624 # with non-concat modes, this is taken care by the _Stream
625 # ctor as invoked by the newvol handler
626 if self.arcmode & ARCMODE_COMPRESS:
627 if getattr (self, "cmp", None) is not None:
# e.g. a compressed PAX header was written
629 self._finalize_write_gz ()
630 if self.arcmode & ARCMODE_ENCRYPT:
631 self._init_write_encrypt (name)
632 if self.arcmode & ARCMODE_COMPRESS:
633 self._init_write_gz ()
636 def _init_write_encrypt (self, entry=None, set_last_block_offset=False):
638 Save position for delayed write of header; fill the header location
641 # first thing, proclaim new object to the encryption context
642 # secondly, assemble the header with the updated parameters
643 # and commit it directly to the underlying stream, bypassing the
644 # encryption layer in .__write().
645 dummyhdr = self.encryption.next (entry, counter=self.enccounter)
647 raise EncryptionError ("Crypto.next(): bad dummy header") # XXX
648 self.lasthdr = self.fileobj.tell()
649 self.__write_to_file(dummyhdr)
650 if set_last_block_offset is True:
651 self.last_block_offset = self.lasthdr
654 def _finalize_write_encrypt (self):
656 Seek back to header position, read dummy bytes, finalize crypto
657 obtaining the actual header, write header, seek back to current
660 Returns the list of IV fixed parts as used during encryption.
662 if self.lasthdr is not None:
663 pos0 = self.fileobj.tell ()
664 self.fileobj.seek_set (self.lasthdr)
665 dummy = self.fileobj.read (crypto.PDTCRYPT_HDR_SIZE)
666 pos1 = self.fileobj.tell ()
667 dpos = pos1 - self.lasthdr
668 assert dpos == crypto.PDTCRYPT_HDR_SIZE
669 self.fileobj.seek_set (pos0)
670 data, hdr, _ = self.encryption.done (dummy)
671 self.__write_to_file(hdr, pos=self.lasthdr)
672 self.__write_to_file(data) # append remainder of data
676 def _finalize_write_gz (self):
677 if self.cmp is not None:
678 chunk = self.buf + self.cmp.flush()
680 if self.comptype == "gz":
681 # The native zlib crc is an unsigned 32-bit integer, but
682 # the Python wrapper implicitly casts that to a signed C
683 # long. So, on a 32-bit box self.crc may "look negative",
684 # while the same crc on a 64-bit box may "look positive".
685 # To avoid irksome warnings from the `struct` module, force
686 # it to look positive on all boxes.
687 chunk += struct.pack("<L", self.crc & 0xffffffff)
688 chunk += struct.pack("<L", self.concat_pos & 0xffffFFFF)
689 self.__enc_write (chunk)
693 def _init_write_gz (self, set_last_block_offset=False):
Add a new gzip block, closing the last one
698 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
699 first = self.cmp is None
700 self.cmp = self.zlib.compressobj(self.compresslevel,
702 -self.zlib.MAX_WBITS,
703 self.zlib.DEF_MEM_LEVEL,
706 # if aes, we encrypt after compression
707 if set_last_block_offset is True:
708 self.last_block_offset = self.fileobj.tell()
710 self.__write(gz_header (self.name if first is True else None))
714 """Write string s to the stream.
716 if self.comptype == "gz":
717 self.crc = self.zlib.crc32(s, self.crc) & 0xFFFFffff
719 self.concat_pos += len(s)
720 if self.cmp is not None:
721 s = self.cmp.compress(s)
"""Write what's left in the buffer to the stream."""
self.__write (b"") # flush buffered data so that len(self.buf) <= self.bufsize
727 self.__enc_write (self.buf)
730 def __write(self, s):
"""Write (and encode) string s to the stream blockwise.

Encoding and writing are deferred until a full block has accumulated.
736 while len(self.buf) > self.bufsize:
737 self.__enc_write(self.buf[:self.bufsize])
738 self.buf = self.buf[self.bufsize:]
741 def __write_to_file(self, s, pos=None):
743 Writes directly to the fileobj; updates self.bytes_written. If “pos” is
744 given, the stream will seek to that position first and back afterwards,
and the total number of bytes written is not updated.
747 self.fileobj.write(s, pos)
749 self.bytes_written += len(s)
752 def __enc_write(self, s):
754 If encryption is active, the string s is encrypted before being written
759 if self.arcmode & ARCMODE_ENCRYPT:
762 n, ct = self.encryption.process(buf)
763 self.__write_to_file(ct)
766 # The entire plaintext was not consumed: The size limit
767 # for encrypted objects was reached. Transparently create
768 # a new encrypted object and continue processing the input.
769 self._finalize_write_encrypt ()
770 self._init_write_encrypt ()
772 self.__write_to_file(s)
775 def estim_file_size(self):
"""Estimate the size of the file if it were closed now.
778 The result may differ greatly from the amount of data sent to write()
779 due to compression, encryption and buffering.
781 In tests the result (before calling close()) was up to 12k smaller than
782 the final file size if compression is being used because zlib/bz2
783 compressors do not allow inspection of their buffered data :-(
785 Still, we add what close() would add: 8 bytes for gz checksum, one
786 encryption block size if encryption is used and the size of our own
790 return self.bytes_written
792 result = self.bytes_written
794 result += len(self.buf)
795 if self.comptype == 'gz':
result += 8 # 2 longs = 8 bytes (no extra info is written for bzip2)
799 def close(self, close_fileobj=True):
800 """Close the _Stream object. No operation should be
801 done on it afterwards.
807 if close_fileobj is True:
810 if self.arcmode & ARCMODE_COMPRESS:
811 self._finalize_write_gz ()
812 # end of Tar archive marker (two empty blocks) was written
813 # finalize encryption last; no writes may be performed after
816 if self.arcmode & ARCMODE_ENCRYPT:
817 self._finalize_write_encrypt ()
819 if not self._extfileobj:
822 # read the zlib crc and length and check them
823 if self.mode == "r" and self.comptype == "gz":
824 read_crc = self.__read(4)
825 read_length = self.__read(4)
826 calculated_crc = self.crc
827 if struct.unpack("<L", read_crc)[0] != calculated_crc:
828 raise CompressionError("bad gzip crc")
832 def _init_read_gz(self):
833 """Initialize for reading a gzip compressed fileobj.
835 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
837 read2 = self.__read(2)
839 raise EndOfFile ("_init_read_gz(): read returned zero bytes at pos "
840 "%d" % self.fileobj.tell())
841 # taken from gzip.GzipFile with some alterations
842 if read2 != GZ_MAGIC_BYTES:
843 raise ReadError("not a gzip file")
845 read1 = self.__read(1)
847 raise EndOfFile ("_init_read_gz(): read returned zero bytes inside "
848 "gzip header at pos %d" % self.fileobj.tell())
849 if ord (read1) != GZ_METHOD_DEFLATE:
850 raise CompressionError("unsupported compression method")
852 self.flags = flag = ord(self.__read(1))
853 self.__read(6) # discard timestamp[4], deflate flags, os code
855 if flag & GZ_FLAG_FEXTRA:
856 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
858 if flag & GZ_FLAG_FNAME:
861 if not s or s == NUL:
863 if flag & GZ_FLAG_FCOMMENT:
866 if not s or s == NUL:
868 if flag & GZ_FLAG_FHCRC:
871 def _init_read_encrypt (self):
872 """Initialize encryption for next entry in archive. Read a header and
873 notify the crypto context."""
874 if self.arcmode & ARCMODE_ENCRYPT:
875 lasthdr = self.fileobj.tell ()
877 hdr = crypto.hdr_read_stream (self.fileobj)
878 except crypto.EndOfFile:
880 except crypto.InvalidHeader as exn:
881 raise DecryptionError ("Crypto.hdr_read_stream(): error “%s” "
882 "processing %r at pos %d"
883 % (exn, self.fileobj, lasthdr)) \
885 if self.enccounter is not None:
886 # enforce that the iv counter in the header matches an
887 # explicitly requested one
888 iv = crypto.hdr_iv_counter (hdr)
889 if iv != self.enccounter:
890 raise DecryptionError ("expected IV counter %d, got %d"
891 % (self.enccounter, iv))
892 self.lasthdr = lasthdr
893 self.remainder = hdr ["ctsize"] # distance to next header
895 self.encryption.next (hdr)
896 except crypto.InvalidParameter as exn:
897 raise DecryptionError ("Crypto.next(): error “%s” "
898 "processing %r at pos %d"
899 % (exn, self.fileobj, lasthdr)) \
905 def _read_encrypt (self, buf):
907 Demote a program error to a decryption error in tolerant mode. This
908 allows recovery from corrupted headers and invalid data.
911 return self.encryption.process (buf)
912 except RuntimeError as exn:
913 if self.tolerance != TOLERANCE_STRICT:
914 raise DecryptionError (exn)
918 def _finalize_read_encrypt (self):
922 if self.arcmode & ARCMODE_ENCRYPT \
923 and self.lasthdr is not None :
924 assert self.remainder >= 0
925 if self.remainder > 0:
928 data = self.encryption.done ()
929 except crypto.InvalidGCMTag as exn:
930 raise DecryptionError ("decryption failed: %s" % exn)
935 """Return the stream's file pointer position.
939 def seek(self, pos=0):
940 """Set the stream's file pointer to pos. Negative seeking
945 elif pos - self.pos >= 0:
946 blocks, remainder = divmod(pos - self.pos, self.bufsize)
947 if self.encryption is not None:
948 # IV succession is only preserved between successive objects.
949 self.encryption.reset_last_iv ()
950 for i in range(blocks):
951 self.read(self.bufsize)
954 raise StreamError("seeking backwards is not allowed")
957 def read(self, size=None):
958 """Return the next size number of bytes from the stream.
959 If size is not defined, return all bytes of the stream
965 buf = self._read(self.bufsize)
971 buf = self._read(size)
"""Read just one line, newline character included.
# if \n is already in dbuf, no read needs to be done
979 if b'\n' in self.dbuf:
980 pos = self.dbuf.index(b'\n') + 1
981 ret = self.dbuf[:pos]
982 self.dbuf = self.dbuf[pos:]
987 chunk = self._read(self.bufsize)
989 # nothing more to read, so return the buffer
995 # if \n found, return the new line
998 pos = dbuf.index(b'\n') + 1
999 self.dbuf = dbuf[pos:] + self.dbuf
1002 def _read(self, size):
1003 """Return size bytes from the stream.
1009 buf = self.__read(self.bufsize)
1013 if self.cmp is not None:
1015 buf = self.cmp.decompress(buf)
1016 except self.exception as exn:
1017 raise ReadError("invalid compressed data (%r)" % exn)
1018 except Exception as e:
1019 # happens at the end of the file
1020 # _init_read_gz failed in the previous iteration so
1021 # self.cmp.decompress fails here
1022 if self.arcmode & ARCMODE_CONCAT:
1025 raise ReadError("invalid compressed data")
1026 if self.arcmode & ARCMODE_COMPRESS and hasattr(self, "crc"):
1027 self.crc = self.zlib.crc32(buf, self.crc) & 0xFFFFffff
1028 if self.arcmode & ARCMODE_CONCAT \
1029 and len(self.cmp.unused_data) != 0:
1030 self.buf = self.cmp.unused_data + self.buf
1031 self.close(close_fileobj=False)
1033 self._init_read_gz()
1034 except DecryptionError:
1035 if self.tolerance != TOLERANCE_STRICT:
1036 # return whatever data was processed successfully
1042 except ReadError: # gzip troubles
1043 if self.tolerance == TOLERANCE_RESCUE:
1050 # happens at the end of the file
1052 self.crc = self.zlib.crc32(b"") & 0xFFFFffff
1057 self.dbuf = t[size:]
1061 def __read(self, size):
1063 Return size bytes from stream. If internal buffer is empty, read
1064 another block from the stream.
1066 The function returns up to size bytes of data. When an error occurs
1067 during decryption, everything until the end of the last successfully
1068 finalized object is returned.
1071 t = [self.buf] if c > 0 else []
1072 good_crypto = len (t)
1077 if self.arcmode & ARCMODE_ENCRYPT:
1078 if self.remainder <= 0:
1079 # prepare next object
1080 if self._init_read_encrypt () is False: # EOF
1084 # only read up to the end of the encrypted object
1085 todo = min (size, self.remainder)
1086 buf = self.fileobj.read(todo)
1087 if self.arcmode & ARCMODE_ENCRYPT:
1089 buf = self._read_encrypt (buf)
1090 if todo == self.remainder:
1091 # at the end of a crypto object; finalization will fail if
1092 # the GCM tag does not match
1093 trailing = self._finalize_read_encrypt ()
1094 good_crypto = len (t) + 1
1095 if len (trailing) > 0:
1099 self.remainder -= todo
1100 except DecryptionError:
1101 if self.tolerance == TOLERANCE_STRICT:
1103 self.encryption.drop ()
1104 if self.tolerance == TOLERANCE_RECOVER:
1105 if good_crypto == 0:
1107 # this may occur at any of the three crypto operations above.
1108 # some objects did validate; discard all data after it; next
1109 # call will start with the bad object and error out immediately
1110 self.buf = b"".join (t [good_crypto:])
1111 return b"".join (t [:good_crypto])
1112 elif self.tolerance == TOLERANCE_RESCUE:
1113 # keep what we have so far despite the finalization issue
1118 raise RuntimeError("internal error: bad tolerance level")
1120 if not buf: ## XXX stream terminated prematurely; this should be an error
1131 class _StreamProxy(object):
1132 """Small proxy class that enables transparent compression
1133 detection for the Stream interface (mode 'r|*').
1136 def __init__(self, fileobj):
1137 self.fileobj = fileobj
1138 self.buf = self.fileobj.read(BLOCKSIZE)
1140 def read(self, size): # pylint: disable=method-hidden
1141 self.read = self.fileobj.read
1144 def getcomptype(self):
1145 if self.buf.startswith(GZ_MAGIC_DEFLATE):
1147 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
1149 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
1155 self.fileobj.close()
1158 #------------------------
1159 # Extraction file object
1160 #------------------------
1161 class _FileInFile(object):
1162 """A thin wrapper around an existing file object that
1163 provides a part of its data as an individual file
1167 def __init__(self, fileobj, offset, size, blockinfo=None):
1168 self.fileobj = fileobj
1169 self.offset = offset
1172 self.name = getattr(fileobj, "name", None)
1175 if blockinfo is None:
1176 blockinfo = [(0, size)]
1178 # Construct a map with data and zero blocks.
1182 realpos = self.offset
1183 for offset, size in blockinfo:
1184 if offset > lastpos:
1185 self.map.append((False, lastpos, offset, None))
1186 self.map.append((True, offset, offset + size, realpos))
1188 lastpos = offset + size
1189 if lastpos < self.size:
1190 self.map.append((False, lastpos, self.size, None))
1202 return self.fileobj.seekable()
1205 """Return the current file position.
1207 return self.position
1209 def seek(self, position, whence=io.SEEK_SET):
1210 """Seek to a position in the file.
1212 if whence == io.SEEK_SET:
1213 self.position = min(max(position, 0), self.size)
1214 elif whence == io.SEEK_CUR:
1216 self.position = max(self.position + position, 0)
1218 self.position = min(self.position + position, self.size)
1219 elif whence == io.SEEK_END:
1220 self.position = max(min(self.size + position, self.size), 0)
1222 raise ValueError("Invalid argument")
1223 return self.position
1225 def read(self, size=None):
1226 """Read data from the file.
1229 size = self.size - self.position
1231 size = min(size, self.size - self.position)
1236 data, start, stop, offset = self.map[self.map_index]
1237 if start <= self.position < stop:
1241 if self.map_index == len(self.map):
1243 length = min(size, stop - self.position)
1245 self.fileobj.seek(offset + (self.position - start))
1246 buf += self.fileobj.read(length)
1250 self.position += length
1253 def readinto(self, b):
1254 buf = self.read(len(b))
1263 class ExFileObject(io.BufferedReader):
1265 def __init__(self, tarfile, tarinfo):
1266 fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
1267 tarinfo.size, tarinfo.sparse)
1268 super().__init__(fileobj)
1274 class TarInfo(object):
1275 """Informational class which holds the details about an
1276 archive member given by a tar header block.
1277 TarInfo objects are returned by TarFile.getmember(),
1278 TarFile.getmembers() and TarFile.gettarinfo() and are
1279 usually created internally.
1282 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
1283 "chksum", "type", "linkname", "uname", "gname",
1284 "devmajor", "devminor", "volume_offset",
1285 "offset", "offset_data", "pax_headers", "sparse",
1286 "tarfile", "_sparse_structs", "_link_target")
1288 def __init__(self, name=""):
1289 """Construct a TarInfo object. name is the optional name
1292 self.name = name # member name
1293 self.mode = 0o644 # file permissions
1294 self.uid = 0 # user id
1295 self.gid = 0 # group id
1296 self.size = 0 # file size
1297 self.mtime = 0 # modification time
1298 self.chksum = 0 # header checksum
1299 self.type = REGTYPE # member type
1300 self.linkname = "" # link name
1301 self.uname = "" # user name
1302 self.gname = "" # group name
1303 self.devmajor = 0 # device major number
1304 self.devminor = 0 # device minor number
1306 self.offset = 0 # the tar header starts here
1307 self.offset_data = 0 # the file's data starts here
1308 self.volume_offset = 0 # the file's data corresponds with the data
1309 # starting at this position
1311 self.sparse = None # sparse member information
1312 self.pax_headers = {} # pax header information
# In pax headers the "name" and "linkname" fields are called
1315 # "path" and "linkpath".
1318 def _setpath(self, name):
1320 path = property(_getpath, _setpath)
1322 def _getlinkpath(self):
1323 return self.linkname
1324 def _setlinkpath(self, linkname):
1325 self.linkname = linkname
1326 linkpath = property(_getlinkpath, _setlinkpath)
1329 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
1331 def get_info(self, encoding=None, errors=None):
1332 """Return the TarInfo's attributes as a dictionary.
1336 "mode": self.mode & 0o7777,
1340 "mtime": self.mtime,
1341 "chksum": self.chksum,
1343 "linkname": self.linkname,
1344 "uname": self.uname,
1345 "gname": self.gname,
1346 "devmajor": self.devmajor,
1347 "devminor": self.devminor,
1348 "offset_data": self.offset_data,
1349 "volume_offset": self.volume_offset
1352 if info["type"] == DIRTYPE and not info["name"].endswith("/"):
1357 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING,
1358 errors="surrogateescape"):
1359 """Return a tar header as a string of 512 byte blocks.
1361 info = self.get_info(encoding, errors)
1363 if format == USTAR_FORMAT:
1364 return self.create_ustar_header(info, encoding, errors)
1365 elif format == GNU_FORMAT:
1366 return self.create_gnu_header(info, encoding, errors)
1367 elif format == PAX_FORMAT:
1368 return self.create_pax_header(info, encoding, errors)
1370 raise ValueError("invalid format")
1372 def create_ustar_header(self, info, encoding, errors):
1373 """Return the object as a ustar header block.
1375 info["magic"] = POSIX_MAGIC
1377 if len(info["linkname"]) > LENGTH_LINK:
1378 raise ValueError("linkname is too long")
1380 if len(info["name"]) > LENGTH_NAME:
1381 info["prefix"], info["name"] = self._posix_split_name(info["name"])
1383 return self._create_header(info, USTAR_FORMAT, encoding, errors)
1385 def create_gnu_header(self, info, encoding, errors):
1386 """Return the object as a GNU header block sequence.
1388 info["magic"] = GNU_MAGIC
1390 if self.ismultivol():
1392 itn(info.get("atime", 0), 12, GNU_FORMAT),
1393 itn(info.get("ctime", 0), 12, GNU_FORMAT),
1394 itn(self.volume_offset, 12, GNU_FORMAT),
itn(0, 119, GNU_FORMAT), # remaining space unused by this implementation, set to zero
1397 info['prefix'] = b"".join(prefix)
1398 info['size'] = info['size'] - self.volume_offset
1401 if len(info["linkname"]) > LENGTH_LINK:
1402 buf += self._create_gnu_long_header(info["linkname"],
1403 GNUTYPE_LONGLINK, encoding, errors)
1405 if len(info["name"]) > LENGTH_NAME:
1406 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME,
1409 return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
1411 def create_pax_header(self, info, encoding, errors):
1412 """Return the object as a ustar header block. If it cannot be
1413 represented this way, prepend a pax extended header sequence
with supplemental information.
1416 info["magic"] = POSIX_MAGIC
1417 pax_headers = self.pax_headers.copy()
1418 if self.ismultivol():
1419 info['size'] = info['size'] - self.volume_offset
1421 # Test string fields for values that exceed the field length or cannot
1422 # be represented in ASCII encoding.
1423 for name, hname, length in (
1424 ("name", "path", LENGTH_NAME),
1425 ("linkname", "linkpath", LENGTH_LINK),
1426 ("uname", "uname", 32),
1427 ("gname", "gname", 32)):
1429 if hname in pax_headers:
1430 # The pax header has priority.
1433 # Try to encode the string as ASCII.
1435 info[name].encode("ascii", "strict")
1436 except UnicodeEncodeError:
1437 pax_headers[hname] = info[name]
1440 if len(info[name]) > length:
1441 pax_headers[hname] = info[name]
# Test number fields for values that exceed the field limit or that
# must be stored as a float.
1445 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1446 if name in pax_headers:
1447 # The pax header has priority. Avoid overflow.
1452 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1453 pax_headers[name] = str(val)
1456 # Create a pax extended header if necessary.
1458 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
1462 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
1465 def create_pax_global_header(cls, pax_headers):
1466 """Return the object as a pax global header block sequence.
1468 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
1470 def _posix_split_name(self, name):
1471 """Split a name longer than 100 chars into a prefix
1474 prefix = name[:LENGTH_PREFIX + 1]
1475 while prefix and prefix[-1] != "/":
1476 prefix = prefix[:-1]
1478 name = name[len(prefix):]
1479 prefix = prefix[:-1]
1481 if not prefix or len(name) > LENGTH_NAME:
1482 raise ValueError("name is too long")
1486 def _create_header(info, format, encoding, errors):
1487 """Return a header block. info is a dictionary with file
1488 information, format must be one of the *_FORMAT constants.
1491 stn(info.get("name", ""), 100, encoding, errors),
1492 itn(info.get("mode", 0) & 0o7777, 8, format),
1493 itn(info.get("uid", 0), 8, format),
1494 itn(info.get("gid", 0), 8, format),
1495 itn(info.get("size", 0), 12, format),
1496 itn(info.get("mtime", 0), 12, format),
1497 b" ", # checksum field
1498 info.get("type", REGTYPE),
1499 stn(info.get("linkname", ""), 100, encoding, errors),
1500 info.get("magic", POSIX_MAGIC),
1501 stn(info.get("uname", ""), 32, encoding, errors),
1502 stn(info.get("gname", ""), 32, encoding, errors),
1503 itn(info.get("devmajor", 0), 8, format),
1504 itn(info.get("devminor", 0), 8, format),
1505 sbtn(info.get("prefix", ""), 155, encoding, errors)
1508 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
1509 chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
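# The chksum field occupies bytes 148..155 of the 512-byte block, hence the
# slice offsets below: 512 - 148 = 364 and 512 - 155 = 357. The patched-in
# value is six octal digits plus a NUL, leaving the original trailing space.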
1510 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
1514 def _create_payload(payload):
1515 """Return the string payload filled with zero bytes
1516 up to the next 512 byte border.
1518 blocks, remainder = divmod(len(payload), BLOCKSIZE)
1520 payload += (BLOCKSIZE - remainder) * NUL
1524 def _create_gnu_long_header(cls, name, type, encoding, errors):
1525 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1528 name = name.encode(encoding, errors) + NUL
1531 info["name"] = "././@LongLink"
1533 info["size"] = len(name)
1534 info["magic"] = GNU_MAGIC
1536 # create extended header + name blocks.
1537 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
1538 cls._create_payload(name)
1541 def _create_pax_generic_header(cls, pax_headers, type, encoding):
1542 """Return a POSIX.1-2008 extended or global header sequence
1543 that contains a list of keyword, value pairs. The values
1546 # Check if one of the fields contains surrogate characters and thereby
1547 # forces hdrcharset=BINARY, see _proc_pax() for more information.
1549 for keyword, value in pax_headers.items():
1551 value.encode("utf-8", "strict")
1552 except UnicodeEncodeError:
1558 # Put the hdrcharset field at the beginning of the header.
1559 records += b"21 hdrcharset=BINARY\n"
1561 for keyword, value in pax_headers.items():
1562 keyword = keyword.encode("utf-8")
1564 # Try to restore the original byte representation of `value'.
# Needless to say, the encoding must match the string.
1566 value = value.encode(encoding, "surrogateescape")
1568 value = value.encode("utf-8")
1570 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n'
1577 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1579 # We use a hardcoded "././@PaxHeader" name like star does
1580 # instead of the one that POSIX recommends.
1582 info["name"] = "././@PaxHeader"
1584 info["size"] = len(records)
1585 info["magic"] = POSIX_MAGIC
1587 # Create pax header + record blocks.
1588 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1589 cls._create_payload(records)
1592 def frombuf(cls, buf, encoding, errors):
1593 """Construct a TarInfo object from a 512 byte bytes object.
1596 raise EmptyHeaderError("empty header")
1597 if len(buf) != BLOCKSIZE:
1598 raise TruncatedHeaderError("truncated header")
1599 if buf.count(NUL) == BLOCKSIZE:
1600 raise EOFHeaderError("end of file header")
1602 chksum = nti(buf[148:156])
1603 if chksum not in calc_chksums(buf):
1604 raise InvalidHeaderError("bad checksum")
1607 obj.name = nts(buf[0:100], encoding, errors)
1608 obj.mode = nti(buf[100:108])
1609 obj.uid = nti(buf[108:116])
1610 obj.gid = nti(buf[116:124])
1611 obj.size = nti(buf[124:136])
1612 obj.mtime = nti(buf[136:148])
1614 obj.type = buf[156:157]
1615 obj.linkname = nts(buf[157:257], encoding, errors)
1616 obj.uname = nts(buf[265:297], encoding, errors)
1617 obj.gname = nts(buf[297:329], encoding, errors)
1618 obj.devmajor = nti(buf[329:337])
1619 obj.devminor = nti(buf[337:345])
1620 prefix = nts(buf[345:500], encoding, errors)
1622 # The old GNU sparse format occupies some of the unused
1623 # space in the buffer for up to 4 sparse structures.
# Save them for later processing in _proc_sparse().
1625 if obj.type == GNUTYPE_SPARSE:
1630 offset = nti(buf[pos:pos + 12])
1631 numbytes = nti(buf[pos + 12:pos + 24])
1634 structs.append((offset, numbytes))
1636 isextended = bool(buf[482])
1637 origsize = nti(buf[483:495])
1638 obj._sparse_structs = (structs, isextended, origsize)
1640 # Old V7 tar format represents a directory as a regular
1641 # file with a trailing slash.
1642 if obj.type == AREGTYPE and obj.name.endswith("/"):
1645 # Remove redundant slashes from directories.
1647 obj.name = obj.name.rstrip("/")
1649 # Reconstruct a ustar longname.
1650 if prefix and obj.type not in GNU_TYPES:
1651 obj.name = prefix + "/" + obj.name
1653 obj.offset_data = nti(buf[369:381])
1657 def fromtarfile(cls, tarfile):
1658 """Return the next TarInfo object from TarFile object
1661 buf = tarfile.fileobj.read(BLOCKSIZE)
1662 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1663 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1664 return obj._proc_member(tarfile)
1666 #--------------------------------------------------------------------------
1667 # The following are methods that are called depending on the type of a
1668 # member. The entry point is _proc_member() which can be overridden in a
1669 # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1670 # implement the following
1672 # 1. Set self.offset_data to the position where the data blocks begin,
1673 # if there is data that follows.
1674 # 2. Set tarfile.offset to the position where the next member's header will
1676 # 3. Return self or another valid TarInfo object.
1677 def _proc_member(self, tarfile):
1678 """Choose the right processing method depending on
1679 the type and call it.
1681 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1682 return self._proc_gnulong(tarfile)
1683 elif self.type == GNUTYPE_SPARSE:
1684 return self._proc_sparse(tarfile)
1685 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1686 return self._proc_pax(tarfile)
1688 return self._proc_builtin(tarfile)
1690 def _proc_builtin(self, tarfile):
1691 """Process a builtin type or an unknown type which
1692 will be treated as a regular file.
1694 self.offset_data = tarfile.fileobj.tell()
1695 offset = self.offset_data
1696 if self.isreg() or self.ismultivol() or self.type not in SUPPORTED_TYPES:
1697 # Skip the following data blocks.
1698 offset += self._block(self.size)
1699 tarfile.offset = offset
1701 # Patch the TarInfo object with saved global
1702 # header information.
1703 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1707 def _proc_gnulong(self, tarfile):
1708 """Process the blocks that hold a GNU longname
1711 buf = tarfile.fileobj.read(self._block(self.size))
1713 # Fetch the next header and process it.
1715 next = self.fromtarfile(tarfile)
1717 raise SubsequentHeaderError("missing or bad subsequent header")
1719 # Patch the TarInfo object from the next header with
1720 # the longname information.
1721 next.offset = self.offset
1722 if self.type == GNUTYPE_LONGNAME:
1723 next.name = nts(buf, tarfile.encoding, tarfile.errors)
1724 elif self.type == GNUTYPE_LONGLINK:
1725 next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1729 def _proc_sparse(self, tarfile):
1730 """Process a GNU sparse header plus extra headers.
1732 # We already collected some sparse structures in frombuf().
1733 structs, isextended, origsize = self._sparse_structs
1734 del self._sparse_structs
1736 # Collect sparse structures from extended header blocks.
1738 buf = tarfile.fileobj.read(BLOCKSIZE)
1742 offset = nti(buf[pos:pos + 12])
1743 numbytes = nti(buf[pos + 12:pos + 24])
1746 if offset and numbytes:
1747 structs.append((offset, numbytes))
1749 isextended = bool(buf[504])
1750 self.sparse = structs
1752 self.offset_data = tarfile.fileobj.tell()
1753 tarfile.offset = self.offset_data + self._block(self.size)
1754 self.size = origsize
1757 def _proc_pax(self, tarfile):
1758 """Process an extended or global header as described in
1761 # Read the header information.
1762 buf = tarfile.fileobj.read(self._block(self.size))
1764 # A pax header stores supplemental information for either
1765 # the following file (extended) or all following files
1767 if self.type == XGLTYPE:
1768 pax_headers = tarfile.pax_headers
1770 pax_headers = tarfile.pax_headers.copy()
1772 # Check if the pax header contains a hdrcharset field. This tells us
1773 # the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded, but since POSIX.1-2008, tar
# implementations have been allowed to store them as raw binary strings if
1776 # the translation to UTF-8 fails.
1777 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1778 if match is not None:
1779 pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1781 # For the time being, we don't care about anything other than "BINARY".
1782 # The only other value that is currently allowed by the standard is
1783 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1784 hdrcharset = pax_headers.get("hdrcharset")
1785 if hdrcharset == "BINARY":
1786 encoding = tarfile.encoding
# Parse pax header information. A record looks like this:
1791 # "%d %s=%s\n" % (length, keyword, value). length is the size
1792 # of the complete record including the length field itself and
1793 # the newline. keyword and value are both UTF-8 encoded strings.
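# For instance (illustrative values), the record
#     b"12 path=foo\n"
# has total length 12 counting the "12 " prefix and the trailing newline,
# keyword b"path" and value b"foo".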
1794 regex = re.compile(br"(\d+) ([^=]+)=")
1797 match = regex.match(buf, pos)
1801 length, keyword = match.groups()
1802 length = int(length)
1803 value = buf[match.end(2) + 1:match.start(1) + length - 1]
1805 # Normally, we could just use "utf-8" as the encoding and "strict"
1806 # as the error handler, but we better not take the risk. For
1807 # example, GNU tar <= 1.23 is known to store filenames it cannot
1808 # translate to UTF-8 as raw strings (unfortunately without a
1809 # hdrcharset=BINARY header).
1810 # We first try the strict standard encoding, and if that fails we
1811 # fall back on the user's encoding and error handler.
1812 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1814 if keyword in PAX_NAME_FIELDS:
1815 value = self._decode_pax_field(value, encoding, tarfile.encoding,
1818 value = self._decode_pax_field(value, "utf-8", "utf-8",
1821 pax_headers[keyword] = value
1825 # Fetch the next header.
1827 next = self.fromtarfile(tarfile)
1829 raise SubsequentHeaderError("missing or bad subsequent header")
1831 # Process GNU sparse information.
1832 if "GNU.sparse.map" in pax_headers:
1833 # GNU extended sparse format version 0.1.
1834 self._proc_gnusparse_01(next, pax_headers)
1836 elif "GNU.sparse.size" in pax_headers:
1837 # GNU extended sparse format version 0.0.
1838 self._proc_gnusparse_00(next, pax_headers, buf)
1840 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1841 # GNU extended sparse format version 1.0.
1842 self._proc_gnusparse_10(next, pax_headers, tarfile)
1844 if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1845 # Patch the TarInfo object with the extended header info.
1846 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1847 next.offset = self.offset
1849 if "size" in pax_headers:
1850 # If the extended header replaces the size field,
1851 # we need to recalculate the offset where the next
1853 offset = next.offset_data
1854 if next.isreg() or next.type not in SUPPORTED_TYPES:
1855 offset += next._block(next.size)
1856 tarfile.offset = offset
1858 if next is not None:
1859 if "GNU.volume.filename" in pax_headers:
1860 if pax_headers["GNU.volume.filename"] == next.name:
1861 if "GNU.volume.size" in pax_headers:
1862 next.size = int(pax_headers["GNU.volume.size"])
1863 if "GNU.volume.offset" in pax_headers:
1864 next.volume_offset = int(pax_headers["GNU.volume.offset"])
1866 for key in pax_headers.keys():
1867 if key.startswith("GNU.volume"):
1868 del tarfile.pax_headers[key]
1872 def _proc_gnusparse_00(self, next, pax_headers, buf):
1873 """Process a GNU tar extended sparse header, version 0.0.
1876 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1877 offsets.append(int(match.group(1)))
1879 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1880 numbytes.append(int(match.group(1)))
1881 next.sparse = list(zip(offsets, numbytes))
1883 def _proc_gnusparse_01(self, next, pax_headers):
1884 """Process a GNU tar extended sparse header, version 0.1.
1886 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1887 next.sparse = list(zip(sparse[::2], sparse[1::2]))
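# Illustrative sketch: a version 0.1 sparse map value alternates offset and
# size, and the pairing above turns it into (offset, numbytes) tuples:
#
#     >>> sparse = [int(x) for x in "0,512,1024,512".split(",")]
#     >>> list(zip(sparse[::2], sparse[1::2]))
#     [(0, 512), (1024, 512)]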
1889 def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1890 """Process a GNU tar extended sparse header, version 1.0.
1894 buf = tarfile.fileobj.read(BLOCKSIZE)
1895 fields, buf = buf.split(b"\n", 1)
1896 fields = int(fields)
1897 while len(sparse) < fields * 2:
1898 if b"\n" not in buf:
1899 buf += tarfile.fileobj.read(BLOCKSIZE)
1900 number, buf = buf.split(b"\n", 1)
1901 sparse.append(int(number))
1902 next.offset_data = tarfile.fileobj.tell()
1903 next.sparse = list(zip(sparse[::2], sparse[1::2]))
1905 def _apply_pax_info(self, pax_headers, encoding, errors):
1906 """Replace fields with supplemental information from a previous
1907 pax extended or global header.
1909 for keyword, value in pax_headers.items():
1910 if keyword == "GNU.sparse.name":
1911 setattr(self, "path", value)
1912 elif keyword == "GNU.sparse.size":
1913 setattr(self, "size", int(value))
1914 elif keyword == "GNU.sparse.realsize":
1915 setattr(self, "size", int(value))
1916 elif keyword in PAX_FIELDS:
1917 if keyword in PAX_NUMBER_FIELDS:
1919 value = PAX_NUMBER_FIELDS[keyword](value)
1922 if keyword == "path":
1923 value = value.rstrip("/") # pylint: disable=no-member
1924 setattr(self, keyword, value)
1926 self.pax_headers = pax_headers.copy()
1928 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1929 """Decode a single field from a pax record.
1932 return value.decode(encoding, "strict")
1933 except UnicodeDecodeError:
1934 return value.decode(fallback_encoding, fallback_errors)
1936 def _block(self, count):
1937 """Round up a byte count by BLOCKSIZE and return it,
1938 e.g. _block(834) => 1024.
1940 blocks, remainder = divmod(count, BLOCKSIZE)
1943 return blocks * BLOCKSIZE
1946 return self.type in REGULAR_TYPES
1950 return self.type == DIRTYPE
1952 return self.type == SYMTYPE
1954 return self.type == LNKTYPE
1956 return self.type == CHRTYPE
1958 return self.type == BLKTYPE
1960 return self.type == FIFOTYPE
1962 return self.sparse is not None
1964 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1965 def ismultivol(self):
1966 return self.type == GNUTYPE_MULTIVOL or self.volume_offset > 0 or\
1967 "GNU.volume.offset" in self.pax_headers
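# Illustrative sketch of building a header block with the TarInfo class above;
# the name and size are made up for the example, and a single ustar/GNU header
# block is always BLOCKSIZE (512) bytes:
#
#     >>> t = TarInfo("example.txt")
#     >>> t.size = 11
#     >>> len(t.tobuf()) == BLOCKSIZE
#     True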
1970 class TarFile(object):
1971 """The TarFile Class provides an interface to tar archives.
1974 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs)
1976 dereference = False # If true, add content of linked file to the
1977 # tar file, else the link.
1979 ignore_zeros = False # If true, skips empty or invalid blocks and
1980 # continues processing.
1982 max_volume_size = None # If different from None, establishes maximum
1983 # size of tar volumes
new_volume_handler = None # function handler to be executed when
1986 # a new volume is needed
1988 volume_number = 0 # current volume number, used for multi volume
1991 errorlevel = 1 # If 0, fatal errors only appear in debug
1992 # messages (if debug >= 0). If > 0, errors
1993 # are passed to the caller as exceptions.
1995 format = DEFAULT_FORMAT # The format to use when creating an archive.
1997 encoding = ENCODING # Encoding for 8-bit character strings.
1999 errors = None # Error handler for unicode conversion.
2001 tarinfo = TarInfo # The default TarInfo class to use.
2003 fileobject = ExFileObject # The file-object for extractfile().
2005 arcmode = ARCMODE_PLAIN # Object processing mode (“concat”, encryption,
save_to_members = True # Whether added members are kept in self.members. This can be disabled
2009 # if you manage lots of files and don't want
2010 # to have high memory usage
2012 cache_uid2user = {} # cache to avoid getpwuid calls. It always parses /etc/passwd.
2013 cache_gid2group = {} # same cache for groups
2015 def __init__(self, name=None, mode="r", fileobj=None, format=None,
2016 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
2017 errors="surrogateescape", pax_headers=None, debug=None,
2018 errorlevel=None, max_volume_size=None, new_volume_handler=None,
2019 concat=False, nacl=None,
2020 save_to_members=True):
2021 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
2022 read from an existing archive, 'a' to append data to an existing
2023 file or 'w' to create a new file overwriting an existing one. `mode'
2025 If `fileobj' is given, it is used for reading or writing data. If it
2026 can be determined, `mode' is overridden by `fileobj's mode.
2027 `fileobj' is not closed, when TarFile is closed.
2029 if len(mode) > 1 or mode not in "raw":
2030 raise ValueError("mode must be 'r', 'a' or 'w'")
2032 self.arcmode = arcmode_set (concat)
2034 self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
2037 if self.mode == "a" and not os.path.exists(name):
2038 # Create nonexistent files in append mode.
2041 fileobj = bltn_open(name, self._mode)
2042 self._extfileobj = False
2044 if name is None and hasattr(fileobj, "name"):
2046 # when fileobj is a gzip.GzipFile, fileobj.mode is an int (not valid for us)
2047 if hasattr(fileobj, "mode") and isinstance(fileobj.mode, str):
2048 self._mode = fileobj.mode
2049 self._extfileobj = True
2050 self.name = os.path.abspath(name) if name else None
2051 self.base_name = self.name = os.path.abspath(name) if name else None
2052 self.fileobj = fileobj
2055 if format is not None:
2056 self.format = format
2057 if tarinfo is not None:
2058 self.tarinfo = tarinfo
2059 if dereference is not None:
2060 self.dereference = dereference
2061 if ignore_zeros is not None:
2062 self.ignore_zeros = ignore_zeros
2063 if encoding is not None:
2064 self.encoding = encoding
2066 self.errors = errors
2068 if pax_headers is not None and self.format == PAX_FORMAT:
2069 self.pax_headers = pax_headers
2071 self.pax_headers = {}
2073 if debug is not None:
2075 if errorlevel is not None:
2076 self.errorlevel = errorlevel
2078 # Init datastructures.
2079 if max_volume_size and max_volume_size < 3*BLOCKSIZE:
2080 raise ValueError("max_volume_size needs to be at least %d" % (3*BLOCKSIZE))
2081 if max_volume_size and not callable(new_volume_handler):
2082 raise ValueError("new_volume_handler needs to be set and be callable for multivolume support")
2084 self.max_volume_size = int(max_volume_size)
2086 self.max_volume_size = None
2088 self.save_to_members = save_to_members
2089 self.new_volume_handler = new_volume_handler
2091 self.members = [] # list of members as TarInfo objects
2092 self._loaded = False # flag if all members have been read
2093 self.offset = self.fileobj.tell()
2094 # current position in the archive file
2095 self.inodes = {} # dictionary caching the inodes of
2096 # archive members already added
2099 if self.mode == "r":
2100 self.firstmember = None
2101 self.firstmember = self.next()
2103 if self.mode == "a":
2104 # Move to the end of the archive,
2105 # before the first empty block.
2107 self.fileobj.seek(self.offset)
2109 tarinfo = self.tarinfo.fromtarfile(self)
2110 self.members.append(tarinfo)
2111 except EOFHeaderError:
2112 self.fileobj.seek(self.offset)
2114 except HeaderError as e:
2115 raise ReadError(str(e))
2117 if self.mode in "aw":
2120 if self.pax_headers:
2121 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
2122 self.fileobj.write(buf)
2123 self.offset += len(buf)
2125 if not self._extfileobj:
2126 self.fileobj.close()
2130 #--------------------------------------------------------------------------
2131 # Below are the classmethods which act as alternate constructors to the
2132 # TarFile class. The open() method is the only one that is needed for
2133 # public use; it is the "super"-constructor and is able to select an
2134 # adequate "sub"-constructor for a particular compression using the mapping
2137 # This concept allows one to subclass TarFile without losing the comfort of
2138 # the super-constructor. A sub-constructor is registered and made available
2139 # by adding it to the mapping in OPEN_METH.
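# Illustrative sketch of the subclassing idea described above; the "lz"
# compression name and the SomeDecompressingFile wrapper are hypothetical,
# only OPEN_METH and taropen() come from this module:
#
#     class LzTarFile(TarFile):
#         OPEN_METH = dict(TarFile.OPEN_METH, lz="lzopen")
#
#         @classmethod
#         def lzopen(cls, name, mode="r", fileobj=None, **kwargs):
#             fileobj = SomeDecompressingFile(fileobj or name, mode)  # hypothetical wrapper
#             return cls.taropen(name, mode, fileobj, **kwargs)
#
#     LzTarFile.open("archive.tar.lz", "r:lz")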
2142 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE,
2143 encryption=None, compresslevel=9, tolerance=TOLERANCE_STRICT,
2145 """Open a tar archive for reading, writing or appending. Return
2146 an appropriate TarFile object.
2149 'r' or 'r:*' open for reading with transparent compression
2150 'r:' open for reading exclusively uncompressed
2151 'r:gz' open for reading with gzip compression
2152 'r:bz2' open for reading with bzip2 compression
2153 'r:xz' open for reading with lzma compression
2154 'a' or 'a:' open for appending, creating the file if necessary
2155 'w' or 'w:' open for writing without compression
2156 'w:gz' open for writing with gzip compression
2157 'w:bz2' open for writing with bzip2 compression
2158 'w:xz' open for writing with lzma compression
2160 'r|*' open a stream of tar blocks with transparent compression
2161 'r|' open an uncompressed stream of tar blocks for reading
2162 'r|gz' open a gzip compressed stream of tar blocks
2163 'r|bz2' open a bzip2 compressed stream of tar blocks
2164 'r|xz' open an lzma compressed stream of tar blocks
2165 'w|' open an uncompressed stream for writing
2166 'w|gz' open a gzip compressed stream for writing
2167 'w|bz2' open a bzip2 compressed stream for writing
2168 'w|xz' open an lzma compressed stream for writing
2170 'r#gz' open a stream of gzip compressed tar blocks for reading
2171 'w#gz' open a stream of gzip compressed tar blocks for writing
2173 if not name and not fileobj:
2174 raise ValueError("nothing to open")
2176 if mode in ("r", "r:*"):
2177 # Find out which *open() is appropriate for opening the file.
2178 for comptype in cls.OPEN_METH:
2179 func = getattr(cls, cls.OPEN_METH[comptype])
2180 if fileobj is not None:
2181 saved_pos = fileobj.tell()
2183 return func(name, "r", fileobj, **kwargs)
2184 except (ReadError, CompressionError) as e:
2185 # usually nothing exceptional, but sometimes it is
2186 if fileobj is not None:
2187 fileobj.seek(saved_pos)
2189 raise ReadError("file could not be opened successfully")
2192 filemode, comptype = mode.split(":", 1)
2193 filemode = filemode or "r"
2194 comptype = comptype or "tar"
2196 # Select the *open() function according to
2197 # given compression.
2198 if comptype in cls.OPEN_METH:
2199 func = getattr(cls, cls.OPEN_METH[comptype])
2201 raise CompressionError("unknown compression type %r" % comptype)
2203 # Pass on compression level for gzip / bzip2.
2204 if comptype == 'gz' or comptype == 'bz2':
2205 kwargs['compresslevel'] = compresslevel
2207 if 'max_volume_size' in kwargs:
2208 if comptype != 'tar' and filemode in 'wa' \
2209 and kwargs['max_volume_size']:
2211 warnings.warn('Only the first volume will be compressed '
2212 'for modes with "w:"!')
2214 return func(name, filemode, fileobj, **kwargs)
2217 filemode, comptype = mode.split("|", 1)
2218 filemode = filemode or "r"
2219 comptype = comptype or "tar"
2221 if filemode not in "rw":
2222 raise ValueError("mode must be 'r' or 'w'")
2224 t = cls(name, filemode,
2225 _Stream(name, filemode, comptype, fileobj, bufsize,
2226 compresslevel=compresslevel),
2228 t._extfileobj = False
2232 filemode, comptype = mode.split("#", 1)
2233 filemode = filemode or "r"
2235 if filemode not in "rw":
2236 raise ValueError ("mode %s not compatible with concat "
2237 "archive; must be 'r' or 'w'" % mode)
2239 stream = _Stream(name, filemode, comptype, fileobj, bufsize,
2240 concat=True, encryption=encryption,
2241 compresslevel=compresslevel, tolerance=tolerance)
2242 kwargs ["concat"] = True
2244 t = cls(name, filemode, stream, **kwargs)
2245 except: # XXX except what?
2247 raise # XXX raise what?
2248 t._extfileobj = False
2252 return cls.taropen(name, mode, fileobj, **kwargs)
2254 raise ValueError("undiscernible mode %r" % mode)
2258 def open_at_offset(cls, offset, *a, **kwa):
2260 Same as ``.open()``, but start reading at the given offset. Assumes a
2261 seekable file object. Returns *None* if opening failed due to a read
2264 fileobj = kwa.get ("fileobj")
2265 if fileobj is not None:
2266 fileobj.seek (offset)
2268 return cls.open (*a, **kwa)
2272 def taropen(cls, name, mode="r", fileobj=None, **kwargs):
2273 """Open uncompressed tar archive name for reading or writing.
2275 if len(mode) > 1 or mode not in "raw":
2276 raise ValueError("mode must be 'r', 'a' or 'w'")
2277 return cls(name, mode, fileobj, **kwargs)
2280 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2281 """Open gzip compressed tar archive name for reading or writing.
2282 Appending is not allowed.
2284 if len(mode) > 1 or mode not in "rw":
2285 raise ValueError("mode must be 'r' or 'w'")
2290 except (ImportError, AttributeError):
2291 raise CompressionError("gzip module is not available")
2293 extfileobj = fileobj is not None
2295 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
2296 t = cls.taropen(name, mode, fileobj, **kwargs)
2298 if not extfileobj and fileobj is not None:
2302 raise ReadError("not a gzip file")
2304 if not extfileobj and fileobj is not None:
2307 t._extfileobj = extfileobj
2311 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
2312 """Open bzip2 compressed tar archive name for reading or writing.
2313 Appending is not allowed.
2315 if len(mode) > 1 or mode not in "rw":
2316 raise ValueError("mode must be 'r' or 'w'.")
2321 raise CompressionError("bz2 module is not available")
2323 fileobj = bz2.BZ2File(fileobj or name, mode,
2324 compresslevel=compresslevel)
2327 t = cls.taropen(name, mode, fileobj, **kwargs)
2328 except (OSError, EOFError):
2330 raise ReadError("not a bzip2 file")
2331 t._extfileobj = False
2335 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
2336 """Open lzma compressed tar archive name for reading or writing.
2337 Appending is not allowed.
2339 if mode not in ("r", "w"):
2340 raise ValueError("mode must be 'r' or 'w'")
2345 raise CompressionError("lzma module is not available")
2347 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
2350 t = cls.taropen(name, mode, fileobj, **kwargs)
2351 except (lzma.LZMAError, EOFError):
2353 raise ReadError("not an lzma file")
2354 t._extfileobj = False
2357 # All *open() methods are registered here.
2359 "tar": "taropen", # uncompressed tar
2360 "gz": "gzopen", # gzip compressed tar
2361 "bz2": "bz2open", # bzip2 compressed tar
2362 "xz": "xzopen" # lzma compressed tar
2365 #--------------------------------------------------------------------------
2366 # The public methods which TarFile provides:
2369 """Close the TarFile. In write-mode, two finishing zero blocks are
2370 appended to the archive. A special case is empty archives, which are
2371 initialized accordingly so the two mandatory blocks of zeros are
2372 written abiding by the requested encryption and compression settings.
2377 if self.mode in "aw":
2378 if self.arcmode & ARCMODE_CONCAT and self.fileobj.tell () == 0:
2379 self.fileobj.next ("")
2380 self.fileobj.write(NUL * (BLOCKSIZE * 2))
2381 self.offset += (BLOCKSIZE * 2)
2382 # fill up the end with zero-blocks
2383 # (like option -b20 for tar does)
2384 blocks, remainder = divmod(self.offset, RECORDSIZE)
2386 self.fileobj.write(NUL * (RECORDSIZE - remainder))
2387 if not self._extfileobj:
2388 self.fileobj.close()
2391 def getmember(self, name):
2392 """Return a TarInfo object for member `name'. If `name' can not be
2393 found in the archive, KeyError is raised. If a member occurs more
2394 than once in the archive, its last occurrence is assumed to be the
2395 most up-to-date version.
2397 tarinfo = self._getmember(name)
2399 raise KeyError("filename %r not found" % name)
2402 def getmembers(self):
2403 """Return the members of the archive as a list of TarInfo objects. The
2404 list has the same order as the members in the archive.
2407 if not self._loaded: # if we want to obtain a list of
2408 self._load() # all members, we first have to
2409 # scan the whole archive.
2412 def get_last_member_offset(self):
2413 """Return the last member offset. Usually this is self.fileobj.tell(),
2414 but when there's encryption or concat compression going on it's more
2415 complicated than that.
2417 return self.last_block_offset
2420 """Return the members of the archive as a list of their names. It has
2421 the same order as the list returned by getmembers().
2423 return [tarinfo.name for tarinfo in self.getmembers()]
2425 def gettarinfo(self, name=None, arcname=None, fileobj=None):
2426 """Create a TarInfo object for either the file `name' or the file
2427 object `fileobj' (using os.fstat on its file descriptor). You can
2428 modify some of the TarInfo's attributes before you add it using
2429 addfile(). If given, `arcname' specifies an alternative name for the
2430 file in the archive.
2434 # When fileobj is given, replace name by
2435 # fileobj's real name.
2436 if fileobj is not None:
2439 # Building the name of the member in the archive.
2440 # Backward slashes are converted to forward slashes, and
2441 # absolute paths are turned into relative paths.
2444 drv, arcname = os.path.splitdrive(arcname)
2445 arcname = arcname.replace(os.sep, "/")
2446 arcname = arcname.lstrip("/")
2448 # Now, fill the TarInfo object with
2449 # information specific for the file.
2450 tarinfo = self.tarinfo()
2451 tarinfo.tarfile = self
2453 # Use os.stat or os.lstat, depending on platform
2454 # and if symlinks shall be resolved.
2456 if hasattr(os, "lstat") and not self.dereference:
2457 statres = os.lstat(name)
2459 statres = os.stat(name)
2461 statres = os.fstat(fileobj.fileno())
2464 stmd = statres.st_mode
2465 if stat.S_ISREG(stmd):
2466 inode = (statres.st_ino, statres.st_dev)
2467 if not self.dereference and statres.st_nlink > 1 and \
2468 inode in self.inodes and arcname != self.inodes[inode]:
2469 # Is it a hardlink to an already
2472 linkname = self.inodes[inode]
2474 # The inode is added only if it's valid.
2475 # For win32 it is always 0.
2477 if inode[0] and self.save_to_members:
2478 self.inodes[inode] = arcname
2479 elif stat.S_ISDIR(stmd):
2481 elif stat.S_ISFIFO(stmd):
2483 elif stat.S_ISLNK(stmd):
2485 linkname = os.readlink(name)
2486 elif stat.S_ISCHR(stmd):
2488 elif stat.S_ISBLK(stmd):
2493 # Fill the TarInfo object with all
2494 # information we can get.
2495 tarinfo.name = arcname
2497 tarinfo.uid = statres.st_uid
2498 tarinfo.gid = statres.st_gid
2500 tarinfo.size = statres.st_size
2503 tarinfo.mtime = statres.st_mtime
2505 tarinfo.linkname = linkname
2507 if tarinfo.uid in self.cache_uid2user:
2508 tarinfo.uname = self.cache_uid2user[tarinfo.uid]
2511 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
2512 self.cache_uid2user[tarinfo.uid] = tarinfo.uname
2514 # remember user does not exist:
2515 # same default value as in tarinfo class
2516 self.cache_uid2user[tarinfo.uid] = ""
2518 if tarinfo.gid in self.cache_gid2group:
2519 tarinfo.gname = self.cache_gid2group[tarinfo.gid]
2522 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
2523 self.cache_gid2group[tarinfo.gid] = tarinfo.gname
2525 # remember group does not exist:
2526 # same default value as in tarinfo class
2527 self.cache_gid2group[tarinfo.gid] = ""
2529 if type in (CHRTYPE, BLKTYPE):
2530 if hasattr(os, "major") and hasattr(os, "minor"):
2531 tarinfo.devmajor = os.major(statres.st_rdev)
2532 tarinfo.devminor = os.minor(statres.st_rdev)
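# Hedged example of the gettarinfo()/addfile() pair (paths and attribute
# values are hypothetical): build a TarInfo from a file, adjust it, then
# append header and data yourself from calling code.
#
#     tar = TarFile.open("out.tar", "w")
#     tarinfo = tar.gettarinfo("report.txt", arcname="docs/report.txt")
#     tarinfo.uname = "nobody"
#     tarinfo.gname = "nogroup"
#     with open("report.txt", "rb") as f:
#         tar.addfile(tarinfo, f)
#     tar.close()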
2535 def list(self, verbose=True):
2536 """Print a table of contents to sys.stdout. If `verbose' is False, only
2537 the names of the members are printed. If it is True, an `ls -l'-like
2542 for tarinfo in self:
2544 print(stat.filemode(tarinfo.mode), end=' ')
2545 print("%s/%s" % (tarinfo.uname or tarinfo.uid,
2546 tarinfo.gname or tarinfo.gid), end=' ')
2547 if tarinfo.ischr() or tarinfo.isblk():
2548 print("%10s" % ("%d,%d" \
2549 % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
2551 print("%10d" % tarinfo.size, end=' ')
2552 print("%d-%02d-%02d %02d:%02d:%02d" \
2553 % time.localtime(tarinfo.mtime)[:6], end=' ')
2555 print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')
2559 print("->", tarinfo.linkname, end=' ')
2561 print("link to", tarinfo.linkname, end=' ')
2564 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
2565 """Add the file `name' to the archive. `name' may be any type of file
2566 (directory, fifo, symbolic link, etc.). If given, `arcname'
2567 specifies an alternative name for the file in the archive.
2568 Directories are added recursively by default. This can be avoided by
2569 setting `recursive' to False. `exclude' is a function that should
2570 return True for each filename to be excluded. `filter' is a function
2571 that expects a TarInfo object argument and returns the changed
2572 TarInfo object; if it returns None, the TarInfo object will be
2573 excluded from the archive.
2580 # Exclude pathnames.
2581 if exclude is not None:
2583 warnings.warn("use the filter argument instead",
2584 DeprecationWarning, 2)
2586 self._dbg(2, "tarfile: Excluded %r" % name)
2589 # Skip if somebody tries to archive the archive...
2590 if self.name is not None and os.path.abspath(name) == self.name:
2591 self._dbg(2, "tarfile: Skipped %r" % name)
2596 # Create a TarInfo object from the file.
2597 tarinfo = self.gettarinfo(name, arcname)
2600 self._dbg(1, "tarfile: Unsupported type %r" % name)
2603 # Change or exclude the TarInfo object.
2604 if filter is not None:
2605 tarinfo = filter(tarinfo)
2607 self._dbg(2, "tarfile: Excluded %r" % name)
2610 # Append the tar header and data to the archive.
2612 with bltn_open(name, "rb") as f:
2613 self.addfile(tarinfo, f)
2615 elif tarinfo.isdir():
2616 self.addfile(tarinfo)
2618 for f in os.listdir(name):
2619 self.add(os.path.join(name, f), os.path.join(arcname, f),
2620 recursive, exclude, filter=filter)
2623 self.addfile(tarinfo)
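# Sketch of the `filter` hook documented above (the anonymize() function is
# hypothetical): return None to drop an entry, or return the possibly
# modified TarInfo to keep it.
#
#     def anonymize(tarinfo):
#         if tarinfo.name.endswith(".tmp"):
#             return None                      # exclude temporary files
#         tarinfo.uid = tarinfo.gid = 0
#         tarinfo.uname = tarinfo.gname = "root"
#         return tarinfo
#
#     tar.add("project/", filter=anonymize)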
2625 def _size_left_file(self):
2626 """Calculates size left in a volume with a maximum volume size.
2628 Assumes self.max_volume_size is set.
2629 If using compression through a _Stream, use _size_left_stream instead
2631 # left-over size = max_size - offset - 2 zero-blocks written in close
2632 size_left = self.max_volume_size - 2*BLOCKSIZE - self.offset
2633 # limit the size left to a whole number of blocks, because we won't
2634 # write half a block when finishing a volume
2635 # and padding with zeros
2636 return BLOCKSIZE * (size_left // BLOCKSIZE)
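# Worked example of the rounding above, assuming max_volume_size is
# 1,000,000 bytes and self.offset is 512,000:
#     size_left = 1,000,000 - 1,024 - 512,000 = 486,976
#     486,976 // 512 = 951  ->  951 * 512 = 486,912 usable bytes,
# i.e. the trailing 64 bytes that would not fill a whole block are not
# used for member data in this volume.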
2638 def _size_left_stream(self):
2639 """ Calculates size left in a volume if using comression/encryption
2641 Assumes self.max_volume_size is set and self.fileobj is a _Stream
2642 (otherwise use _size_left_file)
2644 # left-over size = max_size - bytes written - 2 zero-blocks (close)
2645 size_left = self.max_volume_size - self.fileobj.estim_file_size() \
2647 return BLOCKSIZE * (size_left // BLOCKSIZE)
2649 def addfile(self, tarinfo, fileobj=None):
2650 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2651 given, tarinfo.size bytes are read from it and added to the archive.
2652 You can create TarInfo objects using gettarinfo().
2653 On Windows platforms, `fileobj' should always be opened with mode
2654 'rb' to avoid irritation about the file size.
2658 tarinfo = copy.copy(tarinfo)
2660 if self.arcmode & ARCMODE_CONCAT:
2661 self.last_block_offset = self.fileobj.next (tarinfo.name)
2663 self.last_block_offset = self.fileobj.tell()
2665 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2666 self.fileobj.write(buf)
2667 self.offset += len(buf)
2669 if self.max_volume_size:
2670 if isinstance(self.fileobj, _Stream):
2671 _size_left = self._size_left_stream
2673 _size_left = self._size_left_file
2675 _size_left = lambda: tarinfo.size
2677 # If there's no data to follow, finish
2679 if self.save_to_members:
2680 self.members.append(tarinfo)
2683 target_size_left = _size_left()
2684 source_size_left = tarinfo.size
2685 assert tarinfo.volume_offset == 0
2687 # we only split volumes in the middle of a file, which means we have
2688 # to write at least one block
2689 if target_size_left < BLOCKSIZE:
2690 target_size_left = BLOCKSIZE
2692 # loop over multiple volumes
2693 while source_size_left > 0:
2695 # Write as much data as possible from source into target.
2696 # When compressing data, we cannot easily predict how much data we
2697 # can write until target_size_left == 0 --> need to iterate
2698 size_can_write = min(target_size_left, source_size_left)
2700 while size_can_write > 0:
2701 copyfileobj(fileobj, self.fileobj, size_can_write)
2702 self.offset += size_can_write
2703 source_size_left -= size_can_write
2704 target_size_left = _size_left()
2705 size_can_write = min(target_size_left, source_size_left)
2707 # now target_size_left == 0 or source_size_left == 0
2709 # if there is data left to write, we need to create a new volume
2710 if source_size_left > 0:
2711 # Only finalize the crypto entry here if we’re continuing with
2712 # another one; otherwise, the encryption must include the block
2714 tarinfo.type = GNUTYPE_MULTIVOL
2716 if not self.new_volume_handler or\
2717 not callable(self.new_volume_handler):
2718 raise Exception("We need to create a new volume and you "
2719 "didn't supply a new_volume_handler")
2722 # the new volume handler should do everything needed to
2723 # start working in a new volume. Usually, the handler calls
2724 # self.open_volume
2725 self.volume_number += 1
2727 # set to be used by open_volume, because in the case of a PAX
2728 # tar it needs to write information about the volume and offset
2729 # in the global header
2730 tarinfo.volume_offset = tarinfo.size - source_size_left
2731 self.volume_tarinfo = tarinfo
2733 # the “new_volume_handler” is supposed to call .close() on the
2735 self.new_volume_handler(self, self.base_name, self.volume_number)
2737 self.volume_tarinfo = None
2739 if self.arcmode & ARCMODE_CONCAT:
2740 self.fileobj.next_volume (tarinfo.name)
2742 # write new volume header
2743 buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2744 self.fileobj.write(buf)
2745 self.offset += len(buf)
2747 # adjust variables; open_volume should have reset self.offset
2748 # --> _size_left should be big again
2749 target_size_left = _size_left()
2750 size_can_write = min(target_size_left, source_size_left)
2751 self._dbg(3, 'new volume')
2753 # now, all data has been written. We may have to fill up the rest of
2754 # the block in target with 0s
2755 remainder = (tarinfo.size - tarinfo.volume_offset) % BLOCKSIZE
2757 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2758 self.offset += BLOCKSIZE - remainder
2760 if self.save_to_members:
2761 self.members.append(tarinfo)
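# End-to-end sketch of multivolume writing (the volume naming scheme is
# hypothetical): when a member does not fit into the current volume,
# addfile() above invokes new_volume_handler, which typically just calls
# open_volume() on the path of the next volume.
#
#     def new_volume(tarobj, base_name, volume_number):
#         tarobj.open_volume("%s.%d" % (base_name, volume_number))
#
#     tar = TarFile.open("backup.tar", mode="w",
#                        max_volume_size=20 * 1024 * 1024,
#                        new_volume_handler=new_volume)
#     tar.add("big-directory/")
#     tar.close()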
2763 def open_volume(self, name="", fileobj=None, encryption=None):
2765 Called by the user to change this tar file to point to a new volume.
2768 # open the file using either fileobj or name
2770 if self.mode == "a" and not os.path.exists(name):
2771 # Create nonexistent files in append mode.
2774 self._extfileobj = False
2776 if isinstance(self.fileobj, _Stream):
2777 self._dbg(3, 'open_volume: create a _Stream')
2778 fileobj = _Stream(name=name,
2779 mode=self.fileobj.mode,
2780 comptype=self.fileobj.comptype,
2782 bufsize=self.fileobj.bufsize,
2783 encryption=encryption or self.fileobj.encryption,
2784 concat=self.fileobj.arcmode & ARCMODE_CONCAT,
2785 tolerance=self.fileobj.tolerance)
2787 # here, we lose information about compression/encryption!
2788 self._dbg(3, 'open_volume: builtin open')
2789 fileobj = bltn_open(name, self._mode)
2791 if name is None and hasattr(fileobj, "name"):
2793 if hasattr(fileobj, "mode"):
2794 self._mode = fileobj.mode
2795 self._extfileobj = True
2796 self._dbg(3, 'open_volume: using external fileobj {}', fileobj)
2797 self.name = os.path.abspath(name) if name else None
2798 self.fileobj.close()
2799 self.fileobj = fileobj
2801 # init data structures
2803 self.members = [] # list of members as TarInfo objects
2804 self._loaded = False # flag if all members have been read
2805 self.offset = self.fileobj.tell()
2806 # current position in the archive file
2807 self.inodes = {} # dictionary caching the inodes of
2808 # archive members already added
2811 if self.mode == "r":
2812 self.firstmember = None
2813 self.firstmember = self.next()
2815 if self.mode == "a":
2816 # Move to the end of the archive,
2817 # before the first empty block.
2819 self.fileobj.seek(self.offset)
2821 tarinfo = self.tarinfo.fromtarfile(self)
2822 self.members.append(tarinfo)
2823 except EOFHeaderError:
2824 self.fileobj.seek(self.offset)
2826 except HeaderError as e:
2827 raise ReadError(str(e))
2829 if self.mode in "aw":
2832 if self.format == PAX_FORMAT:
2834 "GNU.volume.filename": str(self.volume_tarinfo.name),
2835 "GNU.volume.size": str(self.volume_tarinfo.size - self.volume_tarinfo.volume_offset),
2836 "GNU.volume.offset": str(self.volume_tarinfo.volume_offset),
2839 self.pax_headers.update(volume_info)
2841 if isinstance(self.fileobj, _Stream):
2842 self.fileobj._init_write_gz ()
2843 buf = self.tarinfo.create_pax_global_header(volume_info.copy())
2844 self.fileobj.write(buf)
2845 self.offset += len(buf)
2846 except Exception as exn:
2847 if not self._extfileobj:
2848 self.fileobj.close()
2852 def extractall(self, path=".", members=None, filter=None, unlink=False):
2853 """Extract all members from the archive to the current working
2854 directory and set owner, modification time and permissions on
2855 directories afterwards. `path' specifies a different directory
2856 to extract to. `members' is optional and must be a subset of the
2857 list returned by getmembers().
2864 for tarinfo in members:
2865 if self.volume_number > 0 and tarinfo.ismultivol():
2868 if filter and not filter(tarinfo):
2872 # Extract directories with a safe mode.
2873 directories.append(tarinfo)
2874 tarinfo = copy.copy(tarinfo)
2875 tarinfo.mode = 0o0700
2876 # Do not set attributes on directories, as we will do that further down
2877 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), unlink=unlink)
2879 # Reverse sort directories.
2880 directories.sort(key=lambda a: a.name)
2881 directories.reverse()
2883 # Set correct owner, mtime and filemode on directories.
2884 for tarinfo in directories:
2885 dirpath = os.path.join(path, tarinfo.name)
2887 self.chown(tarinfo, dirpath)
2888 self.utime(tarinfo, dirpath)
2889 self.chmod(tarinfo, dirpath)
2890 except ExtractError as e:
2891 if self.errorlevel > 1:
2894 self._dbg(1, "tarfile: %s" % e)
2896 def extract(self, member, path="", set_attrs=True, symlink_cb=None,
2898 """Extract a member from the archive to the current working directory,
2899 using its full name. Its file information is extracted as accurately
2900 as possible. `member' may be a filename or a TarInfo object. You can
2901 specify a different directory using `path'. File attributes (owner,
2902 mtime, mode) are set unless `set_attrs' is False.
2903 ``symlink_cb`` is a hook accepting a function that is passed the
2904 ``member``, ``path``, and ``set_attrs`` arguments if the tarinfo for
2905 ``member`` indicates a symlink, in which case only the callback
2906 passed will be applied and the actual extraction is skipped. If the
2907 callback is invoked, its return value is passed on to the caller.
2911 if isinstance(member, str):
2912 tarinfo = self.getmember(member)
2916 # Prepare the link target for makelink().
2918 tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2920 if symlink_cb is not None and tarinfo.issym():
2921 return symlink_cb(member, path, set_attrs)
2924 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2925 set_attrs=set_attrs, unlink=unlink)
2926 except EnvironmentError as e:
2927 if self.errorlevel > 0:
2930 if e.filename is None:
2931 self._dbg(1, "tarfile: %s" % e.strerror)
2933 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2934 except ExtractError as e:
2935 if self.errorlevel > 1:
2938 self._dbg(1, "tarfile: %s" % e)
2940 def extractfile(self, member):
2941 """Extract a member from the archive as a file object. `member' may be
2942 a filename or a TarInfo object. If `member' is a regular file or a
2943 link, an io.BufferedReader object is returned. Otherwise, None is
2948 if isinstance(member, str):
2949 tarinfo = self.getmember(member)
2953 if tarinfo.isreg() or tarinfo.ismultivol() or\
2954 tarinfo.type not in SUPPORTED_TYPES:
2955 # If a member's type is unknown, it is treated as a
2957 return self.fileobject(self, tarinfo)
2959 elif tarinfo.islnk() or tarinfo.issym():
2960 if isinstance(self.fileobj, _Stream):
2961 # A small but ugly workaround for the case that someone tries
2962 # to extract a (sym)link as a file-object from a non-seekable
2963 # stream of tar blocks.
2964 raise StreamError("cannot extract (sym)link as file object")
2966 # A (sym)link's file object is its target's file object.
2967 return self.extractfile(self._find_link_target(tarinfo))
2969 # If there's no data associated with the member (directory, chrdev,
2970 # blkdev, etc.), return None instead of a file object.
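# Typical extractfile() use (the member name is hypothetical): the returned
# file-like object can be read without extracting anything to disk, and
# None is returned for members that carry no data.
#
#     fobj = tar.extractfile("docs/report.txt")
#     if fobj is not None:
#         data = fobj.read()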
2973 def _extract_member(self, tarinfo, targetpath, set_attrs=True, unlink=False):
2974 """Extract the TarInfo object tarinfo to a physical
2975 file called targetpath.
2977 # Fetch the TarInfo object for the given name
2978 # and build the destination pathname, replacing
2979 # forward slashes with platform-specific separators.
2980 targetpath = targetpath.rstrip("/")
2981 targetpath = targetpath.replace("/", os.sep)
2983 # Create all upper directories.
2984 upperdirs = os.path.dirname(targetpath)
2985 if upperdirs and not os.path.exists(upperdirs):
2986 # Create directories that are not part of the archive with
2987 # default permissions.
2988 os.makedirs(upperdirs)
2990 if tarinfo.islnk() or tarinfo.issym():
2991 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2993 self._dbg(1, tarinfo.name)
2996 _unlinkfirst(targetpath)
2999 self.makefile(tarinfo, targetpath)
3000 elif tarinfo.isdir():
3001 self.makedir(tarinfo, targetpath)
3002 elif tarinfo.isfifo():
3003 self.makefifo(tarinfo, targetpath)
3004 elif tarinfo.ischr() or tarinfo.isblk():
3005 self.makedev(tarinfo, targetpath)
3006 elif tarinfo.islnk() or tarinfo.issym():
3007 self.makelink(tarinfo, targetpath)
3008 elif tarinfo.type not in SUPPORTED_TYPES:
3009 self.makeunknown(tarinfo, targetpath)
3011 self.makefile(tarinfo, targetpath)
3014 self.chown(tarinfo, targetpath)
3015 if not tarinfo.issym():
3016 self.chmod(tarinfo, targetpath)
3017 self.utime(tarinfo, targetpath)
3019 #--------------------------------------------------------------------------
3020 # Below are the different file methods. They are called via
3021 # _extract_member() when extract() is called. They can be replaced in a
3022 # subclass to implement other functionality.
3024 def makedir(self, tarinfo, targetpath):
3025 """Make a directory called targetpath.
3028 # Use a safe mode for the directory, the real mode is set
3029 # later in _extract_member().
3030 os.mkdir(targetpath, 0o0700)
3031 except FileExistsError:
3034 def makefile(self, tarinfo, targetpath):
3035 """Make a file called targetpath.
3037 source = self.fileobj
3038 source.seek(tarinfo.offset_data)
3041 target = bltn_open(targetpath, "wb")
3043 if tarinfo.sparse is not None:
3045 for offset, size in tarinfo.sparse:
3047 copyfileobj(source, target, size)
3048 target.seek(tarinfo.size)
3057 copyfileobj(source, target, tarinfo.size)
3060 # this can only be handled if we are extracting a multivolume archive
3061 if not self.new_volume_handler:
3062 raise Exception("We need to read a new volume and you"
3063 " didn't supply a new_volume_handler")
3065 # the new volume handler should do everything needed to
3066 # start working in a new volume. Usually, the handler calls
3067 # self.open_volume
3068 self.volume_number += 1
3069 self.new_volume_handler(self, self.base_name, self.volume_number)
3070 tarinfo = self.firstmember
3071 source = self.fileobj
3074 if iterate is False: target.close()
3077 def makeunknown(self, tarinfo, targetpath):
3078 """Make a file from a TarInfo object with an unknown type
3081 self.makefile(tarinfo, targetpath)
3082 self._dbg(1, "tarfile: Unknown file type %r, " \
3083 "extracted as regular file." % tarinfo.type)
3085 def makefifo(self, tarinfo, targetpath):
3086 """Make a fifo called targetpath.
3088 if hasattr(os, "mkfifo"):
3089 os.mkfifo(targetpath)
3091 raise ExtractError("fifo not supported by system")
3093 def makedev(self, tarinfo, targetpath):
3094 """Make a character or block device called targetpath.
3096 if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
3097 raise ExtractError("special devices not supported by system")
3101 mode |= stat.S_IFBLK
3103 mode |= stat.S_IFCHR
3105 os.mknod(targetpath, mode,
3106 os.makedev(tarinfo.devmajor, tarinfo.devminor))
3108 def makelink(self, tarinfo, targetpath):
3109 """Make a (symbolic) link called targetpath. If it cannot be created
3110 (platform limitation), we try to make a copy of the referenced file
3114 # For systems that support symbolic and hard links.
3116 os.symlink(tarinfo.linkname, targetpath)
3119 if os.path.exists(tarinfo._link_target):
3120 os.link(tarinfo._link_target, targetpath)
3122 self._extract_member(self._find_link_target(tarinfo),
3124 except symlink_exception:
3126 self._extract_member(self._find_link_target(tarinfo),
3129 raise ExtractError("unable to resolve link inside archive")
3131 def chown(self, tarinfo, targetpath):
3132 """Set owner of targetpath according to tarinfo.
3134 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
3135 # We have to be root to do so.
3137 g = grp.getgrnam(tarinfo.gname)[2]
3141 u = pwd.getpwnam(tarinfo.uname)[2]
3145 if tarinfo.issym() and hasattr(os, "lchown"):
3146 os.lchown(targetpath, u, g)
3148 os.chown(targetpath, u, g)
3149 except OSError as e:
3150 raise ExtractError("could not change owner")
3152 def chmod(self, tarinfo, targetpath):
3153 """Set file permissions of targetpath according to tarinfo.
3155 if hasattr(os, 'chmod'):
3157 os.chmod(targetpath, tarinfo.mode)
3158 except OSError as e:
3159 raise ExtractError("could not change mode")
3161 def utime(self, tarinfo, targetpath):
3162 """Set modification time of targetpath according to tarinfo.
3164 if not hasattr(os, 'utime'):
3167 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
3168 except OSError as e:
3169 raise ExtractError("could not change modification time")
3171 #--------------------------------------------------------------------------
3173 """Return the next member of the archive as a TarInfo object, when
3174 TarFile is opened for reading. Return None if there is no more
3178 if self.firstmember is not None:
3179 m = self.firstmember
3180 self.firstmember = None
3183 # Read the next block.
3184 self.fileobj.seek(self.offset)
3188 tarinfo = self.tarinfo.fromtarfile(self)
3189 except EOFHeaderError as e:
3190 if self.ignore_zeros:
3191 self._dbg(2, "0x%X: %s" % (self.offset, e))
3192 self.offset += BLOCKSIZE
3194 except InvalidHeaderError as e:
3195 if self.ignore_zeros:
3196 self._dbg(2, "0x%X: %s" % (self.offset, e))
3197 self.offset += BLOCKSIZE
3199 elif self.offset == 0:
3200 raise ReadError(str(e))
3201 except EmptyHeaderError:
3202 if self.offset == 0:
3203 raise ReadError("empty file")
3204 except TruncatedHeaderError as e:
3205 if self.offset == 0:
3206 raise ReadError(str(e))
3207 except SubsequentHeaderError as e:
3208 raise ReadError(str(e))
3211 if tarinfo is not None:
3212 if self.save_to_members:
3213 self.members.append(tarinfo)
3219 #--------------------------------------------------------------------------
3220 # Little helper methods:
3222 def _getmember(self, name, tarinfo=None, normalize=False):
3223 """Find an archive member by name from bottom to top.
3224 If tarinfo is given, it is used as the starting point.
3226 # Ensure that all members have been loaded.
3227 members = self.getmembers()
3229 # Limit the member search list up to tarinfo.
3230 if tarinfo is not None:
3231 members = members[:members.index(tarinfo)]
3234 name = os.path.normpath(name)
3236 for member in reversed(members):
3238 member_name = os.path.normpath(member.name)
3240 member_name = member.name
3242 if name == member_name:
3246 """Read through the entire archive file and look for readable
3250 tarinfo = self.next()
3255 def _check(self, mode=None):
3256 """Check if TarFile is still open, and if the operation's mode
3257 corresponds to TarFile's mode.
3260 raise OSError("%s is closed" % self.__class__.__name__)
3261 if mode is not None and self.mode not in mode:
3262 raise OSError("bad operation for mode %r" % self.mode)
3264 def _find_link_target(self, tarinfo):
3265 """Find the target member of a symlink or hardlink member in the
3269 # Always search the entire archive.
3270 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
3273 # Search the archive before the link, because a hard link is
3274 # just a reference to an already archived file.
3275 linkname = tarinfo.linkname
3278 member = self._getmember(linkname, tarinfo=limit, normalize=True)
3280 raise KeyError("linkname %r not found" % linkname)
3284 """Provide an iterator object.
3287 return iter(self.members)
3289 return TarIter(self)
3291 def _dbg(self, level, msg, *args):
3292 """Write debugging output to sys.stderr.
3294 if level <= self.debug:
3295 print(msg.format(*args), file=sys.stderr)
3297 def __enter__(self):
3301 def __exit__(self, type, value, traceback):
3305 # An exception occurred. We must not call close() because
3306 # it would try to write end-of-archive blocks and padding.
3307 if not self._extfileobj:
3308 self.fileobj.close()
3311 def _unlinkfirst(targetpath):
3313 os.unlink(targetpath)
3314 except OSError as e:
3315 if e.errno == errno.ENOENT or e.errno == errno.EISDIR:
3324 for tarinfo in TarFile(...):
3328 def __init__(self, tarfile):
3329 """Construct a TarIter object.
3331 self.tarfile = tarfile
3334 """Return iterator object.
3338 """Return the next item using TarFile's next() method.
3339 When all members have been read, set TarFile as _loaded.
3341 # Fix for SF #1100429: Under rare circumstances it can
3342 # happen that getmembers() is called during iteration,
3343 # which will cause TarIter to stop prematurely.
3345 if self.index == 0 and self.tarfile.firstmember is not None:
3346 tarinfo = self.tarfile.next()
3347 elif self.index < len(self.tarfile.members):
3348 tarinfo = self.tarfile.members[self.index]
3349 elif not self.tarfile._loaded:
3350 tarinfo = self.tarfile.next()
3352 self.tarfile._loaded = True
3360 #---------------------------------------------------------
3361 # support functionality for rescue mode
3362 #---------------------------------------------------------
3364 TAR_FMT_HDR = (# See tar(5):
3366 "100s" # ← char name[100]; /* 100 */
3367 "8s" # ← char mode[8]; /* 108 */
3368 "8s" # ← char uid[8]; /* 116 */
3369 "8s" # ← char gid[8]; /* 124 */
3370 "12s" # ← char size[12]; /* 136 */
3371 "12s" # ← char mtime[12]; /* 148 */
3372 "8s" # ← char checksum[8]; /* 156 */
3373 "B" # ← char typeflag[1]; /* 157 */
3374 "100s" # ← char linkname[100]; /* 257 */
3375 "6s" # ← char magic[6]; /* 263 */
3376 "2s" # ← char version[2]; /* 265 */
3377 "32s" # ← char uname[32]; /* 297 */
3378 "32s" # ← char gname[32]; /* 329 */
3379 "8s" # ← char devmajor[8]; /* 337 */
3380 "8s" # ← char devminor[8]; /* 345 */
3381 "12s" # ← char atime[12]; /* 357 */
3382 "12s" # ← char ctime[12]; /* 369 */
3383 "12s" # ← char offset[12]; /* 381 */
3384 "4s" # ← char longnames[4]; /* 385 */
3385 "B" # ← char unused[1]; /* 386 */
3387 "12s" # ← char offset[12];
3388 "12s" # ← char numbytes[12];
3389 "12s" # ← char offset[12];
3390 "12s" # ← char numbytes[12];
3391 "12s" # ← char offset[12];
3392 "12s" # ← char numbytes[12];
3393 "12s" # ← char offset[12];
3394 "12s" # ← char numbytes[12];
3395 "" # } sparse[4]; /* 482 */
3396 "B" # ← char isextended[1]; /* 483 */
3397 "12s" # ← char realsize[12]; /* 495 */
3398 "17s" # ← char pad[17]; /* 512 */
3401 # The “magic” and “version” fields are special:
3404 # magic The magic field holds the five characters “ustar” followed by a
3405 # space. Note that POSIX ustar archives have a trailing null.
3409 # /* OLDGNU_MAGIC uses both magic and version fields, which are contiguous.
3410 # Found in an archive, it indicates an old GNU header format, which will be
3411 # hopefully become obsolescent. With OLDGNU_MAGIC, uname and gname are
3412 # valid, though the header is not truly POSIX conforming. */
3415 TAR_HDR_OFF_MAGIC = 257
3416 TAR_FMT_OLDGNU_MAGIC = b"ustar "
3418 def read_gnu_tar_hdr (data):
3419 if len (data) != BLOCKSIZE: # header requires one complete block
3440 offset1, numbytes1, \
3441 offset2, numbytes2, \
3442 offset3, numbytes3, \
3443 offset4, numbytes4, \
3446 pad = struct.unpack (TAR_FMT_HDR, data)
3447 except struct.error:
3450 if magic != TAR_FMT_OLDGNU_MAGIC:
3453 # return all except “unused” and “pad”
3455 { "name" : name, "mode" : mode
3456 , "uid" : uid , "gid" : gid
3457 , "size" : size, "mtime" : mtime
3458 , "checksum" : checksum
3459 , "typeflag" : typeflag
3460 , "linkname" : linkname
3462 , "version" : version
3463 , "uname" : uname, "gname" : gname
3464 , "devmajor" : devmajor, "devminor" : devminor
3465 , "atime" : atime, "ctime" : ctime
3467 , "longnames" : longnames
3468 , "offset1" : offset1, "numbytes1" : numbytes1
3469 , "offset2" : offset2, "numbytes2" : numbytes2
3470 , "offset3" : offset3, "numbytes3" : numbytes3
3471 , "offset4" : offset4, "numbytes4" : numbytes4
3472 , "isextended" : isextended
3473 , "realsize" : realsize
3477 def tar_hdr_check_chksum (data):
3478 hdr = read_gnu_tar_hdr (data)
3481 s = calc_chksums (data)
3482 return nti (hdr ["checksum"]) in s
3485 def readable_tar_objects_offsets (ifd):
3487 Traverse blocks in file, trying to extract tar headers.
3492 mm = mmap.mmap(ifd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3493 pos = TAR_HDR_OFF_MAGIC
3496 pos = mm.find (TAR_FMT_OLDGNU_MAGIC, pos)
3499 off = pos - TAR_HDR_OFF_MAGIC
3501 blk = mm.read (BLOCKSIZE)
3502 if tar_hdr_check_chksum (blk) is True:
3503 offsets.append (off)
3509 def locate_gz_hdr_candidates (fd):
3511 Walk over instances of the GZ magic in the payload, collecting their
3512 positions. If the offset of the first found instance is not zero, the file
3513 begins with leading garbage.
3515 Note that since the GZ magic consists of only two bytes, we expect a lot of
3516 false positives inside binary data.
3518 :return: The list of offsets in the file.
3522 mm = mmap.mmap(fd, 0, mmap.MAP_SHARED, mmap.PROT_READ)
3525 pos = mm.find (GZ_MAGIC_BYTES, pos)
3529 pos += len (GZ_MAGIC_BYTES)
3534 HDR_CAND_GOOD = 0 # header marks begin of valid object
3535 HDR_CAND_FISHY = 1 # inconclusive
3536 HDR_CAND_JUNK = 2 # not a header / object unreadable
3539 def read_cstring (fd, max=-1, encoding=None):
3541 Read one NUL-terminated string from *fd* into a Python string. If *max* is
3542 non-negative, reading will terminate after the specified number of bytes.
3544 Optionally, an *encoding* may be given in which to interpret the data.
3546 :returns: *None* if parsing failed or the maximum number of bytes has been
3547 exceeded; a Python string with the data otherwise.
3556 if max >= 0 and l > max:
3560 if encoding is not None:
3561 buf = buf.decode (encoding)
3566 def inspect_gz_hdr (fd, off):
3568 Attempt to parse a Gzip header in *fd* at position *off*. The format is
3569 documented in RFC 1952.
3571 Returns a verdict about the quality of that header plus the parsed header
3572 when readable. Problematic sizes such as fields running past the EOF are
3573 treated as garbage. Properties in which the header merely doesn’t conform
3574 to the spec (garbage flag bits, bogus timestamp) are considered “fishy”. No
3575 validation is possible on embedded strings because they are single-byte
3583 verdict = HDR_CAND_GOOD
3585 os.lseek (fd, off, os.SEEK_SET)
3586 if os.lseek (fd, 0, os.SEEK_CUR) != off:
3587 return HDR_CAND_JUNK, None
3589 raw = os.read (fd, GZ_HEADER_SIZE)
3590 if len (raw) != GZ_HEADER_SIZE:
3591 return HDR_CAND_JUNK, None
3595 _m1, _m2, meth, flags, mtime, dflags, oscode = \
3596 struct.unpack (GZ_FMT_HEADER, raw)
3597 if meth != GZ_METHOD_DEFLATE: # only deflate is supported
3598 return HDR_CAND_JUNK, None
3599 except struct.error as exn:
3600 return HDR_CAND_JUNK, None
3602 if mtime > int (time.time ()):
3603 verdict = HDR_CAND_FISHY
3605 if dflags != GZ_DEFLATE_FLAGS:
3606 verdict = HDR_CAND_FISHY
3608 if oscode != GZ_OS_CODE:
3609 verdict = HDR_CAND_FISHY
3611 if flags & GZ_FLAG_FTEXT: # created by some contrarian
3612 verdict = HDR_CAND_FISHY
3613 if flags & GZ_FLAG_FEXTRA:
3614 xlen = struct.unpack ("<H", os.read (fd, 2))[0]
3615 xtra = os.read (fd, xlen)
3616 if len (xtra) != xlen: # eof inside header
3617 return HDR_CAND_JUNK, None
3618 if flags & GZ_FLAG_FNAME:
3619 # read up to the next NUL byte, not exceeding the maximum path length
3621 fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3622 encoding="iso-8859-1")
3624 return HDR_CAND_JUNK, None
3625 if flags & GZ_FLAG_FCOMMENT:
3626 fname = read_cstring (fd, max=(LENGTH_NAME + LENGTH_PREFIX),
3627 encoding="iso-8859-1")
3629 return HDR_CAND_JUNK, None
3630 if flags & GZ_FLAG_FHCRC: # half a CRC32; discarded
3631 crc16 = os.read (fd, 2)
3632 if len (crc16) != 2: # eof inside header
3633 return HDR_CAND_JUNK, None
3634 if flags & GZ_FLAG_RESERVED:
3635 # according to the RFC, these must not be set
3636 verdict = HDR_CAND_FISHY
3638 hlen = os.lseek (fd, 0, os.SEEK_CUR) - off
3650 def try_decompress (ifd, off, hdr):
3652 Attempt to process the object starting at *off* with gzip.
3654 :returns: A pair containing the values of the decompressed data and
3655 the length of the input consumed. Note that the latter value
3656 may exceed the length of the compressed data because the
3657 *zlib* module does not provide a means to query how much
3658 of the input it processed before the end of an object.
3661 decmp = zlib.decompressobj (-zlib.MAX_WBITS)
3663 dlen = 0 # size of decompressed data
3665 os.lseek (ifd, pos, os.SEEK_SET)
3667 cnk = os.read (ifd, BUFSIZE)
3670 data = decmp.decompress (cnk)
3671 except zlib.error as exn: # probably CRC32 mismatch; terminate softly
3674 if decmp.eof is True:
3676 if len (cnk) != BUFSIZE: # eof, but not end of decompressed object!
3679 return dlen, pos - off
3681 def readable_gz_objects_offsets (ifd, cands):
3683 Inspect header candidates in *ifd* for parseable gzipped objects.
3690 vdt, hdr = inspect_gz_hdr (ifd, cand)
3691 if vdt == HDR_CAND_JUNK:
3692 pass # ignore unreadable ones
3693 elif vdt in [HDR_CAND_GOOD, HDR_CAND_FISHY]:
3694 off0 = cand + hdr ["hlen"]
3695 dlen, clen = try_decompress (ifd, off0, hdr)
3696 if dlen > 0 and clen > 0:
3702 def reconstruct_offsets_gz (fname):
3704 From the given file, retrieve all GZ header-like offsets (“candidates”).
3705 Then check whether each of those locations can be processed as
3708 ifd = os.open (fname, os.O_RDONLY)
3711 cands = locate_gz_hdr_candidates (ifd)
3712 return readable_gz_objects_offsets (ifd, cands)
3717 def reconstruct_offsets_tar (fname):
3719 From the given file, retrieve all tar header-like offsets (“candidates”).
3720 Then check whether each of those locations can be processed as tar
3723 ifd = os.open (fname, os.O_RDONLY)
3726 return readable_tar_objects_offsets (ifd)
3731 def read_tarobj_at_offset (fileobj, offset, mode, secret=None,
3732 strict_validation=True):
3734 :type strict_validation: bool
3735 :param strict_validation: Enable strict IV checking in the crypto
3736 layer. Should be disabled when dealing with
3737 potentially corrupted data.
3741 if secret is not None:
3744 if ks == crypto.PDTCRYPT_SECRET_PW:
3745 decr = crypto.Decrypt (password=secret [1],
3746 strict_ivs=strict_validation)
3747 elif ks == crypto.PDTCRYPT_SECRET_KEY:
3748 key = binascii.unhexlify (secret [1])
3749 decr = crypto.Decrypt (key=key,
3750 strict_ivs=strict_validation)
3756 TarFile.open_at_offset (offset,
3762 save_to_members=False,
3763 tolerance=TOLERANCE_RESCUE)
3764 except (ReadError, EndOfFile):
3767 return tarobj.next ()
3770 def idxent_of_tarinfo (tarinfo):
3772 Scrape the information relevant for the index from a *TarInfo* object.
3773 Keys like the inode number that lack a corresponding field in a TarInfo
3774 will be set to some neutral value.
3779 , "path" : "snapshot://annotations.db"
3783 , "ctime" : 1502798115
3784 , "mtime" : 1502196423
3793 { "inode" : 0 # ignored when reading the index
3794 , "uid" : tarinfo.uid
3795 , "gid" : tarinfo.gid
3796 , "path" : tarinfo.name # keeping URI scheme
3797 , "offset" : 0 # to be added by the caller
3798 , "volume" : tarinfo.volume_offset
3799 , "mode" : tarinfo.mode
3800 , "ctime" : tarinfo.mtime
3801 , "mtime" : tarinfo.mtime
3802 , "size" : tarinfo.size
3803 , "type" : tarinfo.type
3807 def gen_rescue_index (gen_volume_name, mode, maxvol=None, password=None, key=None):
3809 psidx = [] # pseudo index, return value
3811 secret = crypto.make_secret (password=password, key=key)
3816 vpath = gen_volume_name (nvol)
3818 if secret is not None:
3819 offsets = crypto.reconstruct_offsets (vpath, secret)
3821 offsets = reconstruct_offsets_gz (vpath)
3823 offsets = reconstruct_offsets_tar (vpath)
3825 raise TarError ("no rescue handling for mode “%s”" % mode)
3826 except FileNotFoundError as exn:
3827 # volume does not exist
3828 if maxvol is not None and nvol < maxvol:
3829 continue # explicit volume number specified, ignore missing ones
3833 fileobj = bltn_open (vpath, "rb")
3836 obj = read_tarobj_at_offset (fileobj, off, mode, secret=secret,
3837 strict_validation=False)
3839 acc.append ((off, nvol, obj))
3841 infos += functools.reduce (aux, offsets, [])
3847 def aux (o, nvol, ti):
3848 ie = idxent_of_tarinfo (ti)
3850 ie ["volume"] = nvol
3853 psidx = [ aux (o, nvol, ti) for o, nvol, ti in infos ]
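# Usage sketch for the rescue index (the volume-name pattern and the mode
# string are hypothetical; the mode must be one of the values handled by
# the dispatch above):
#
#     def volume_name(n):
#         return "backup.tar.gz.%d" % n
#
#     psidx = gen_rescue_index (volume_name, mode="#gz", maxvol=3)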
3857 #--------------------
3858 # exported functions
3859 #--------------------
3860 def is_tarfile(name):
3861 """Return True if name points to a tar archive that we
3862 are able to handle, else return False.