From: Eduardo Robles Elvira
Date: Fri, 14 Jun 2013 10:55:49 +0000 (+0200)
Subject: initial tarfile multivol writing implementation
X-Git-Tag: v2.2~191
X-Git-Url: http://developer.intra2net.com/git/?a=commitdiff_plain;h=68ddf9552b2641f47115afcf0c4bfd2d3585dd76;p=python-delta-tar

initial tarfile multivol writing implementation
---

diff --git a/tarfile.py b/tarfile.py
index a81a49e..fe12671 100644
--- a/tarfile.py
+++ b/tarfile.py
@@ -87,6 +87,8 @@ CONTTYPE = "7"                  # contiguous file
 GNUTYPE_LONGNAME = "L"          # GNU tar longname
 GNUTYPE_LONGLINK = "K"          # GNU tar longlink
 GNUTYPE_SPARSE = "S"            # GNU tar sparse file
+GNUTYPE_MULTIVOL = "M"          # GNU tar continuation of a file that began on
+                                # another volume
 
 XHDTYPE = "x"                   # POSIX.1-2001 extended header
 XGLTYPE = "g"                   # POSIX.1-2001 global header
@@ -105,7 +107,7 @@ SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                    SYMTYPE, DIRTYPE, FIFOTYPE,
                    CONTTYPE, CHRTYPE, BLKTYPE,
                    GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
-                   GNUTYPE_SPARSE)
+                   GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
 
 # File types that will be treated as a regular file.
 REGULAR_TYPES = (REGTYPE, AREGTYPE,
@@ -113,7 +115,7 @@ REGULAR_TYPES = (REGTYPE, AREGTYPE,
 
 # File types that are part of the GNU tar format.
 GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
-             GNUTYPE_SPARSE)
+             GNUTYPE_SPARSE, GNUTYPE_MULTIVOL)
 
 # Fields from a pax header that override a TarInfo attribute.
 PAX_FIELDS = ("path", "linkpath", "size", "mtime",
@@ -1111,7 +1113,7 @@ class TarInfo(object):
             itn(info.get("size", 0), 12, format),
             itn(info.get("mtime", 0), 12, format),
             "        ", # checksum field
-            info.get("type", REGTYPE),
+            info.get("type", REGTYPE), # TODO change to GNUTYPE_MULTIVOL when appropriate
             stn(info.get("linkname", ""), 100),
             stn(info.get("magic", POSIX_MAGIC), 8),
             stn(info.get("uname", ""), 32),
@@ -1470,6 +1472,8 @@ class TarInfo(object):
         return self.type == GNUTYPE_SPARSE
     def isdev(self):
         return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
+    def ismultivol(self):
+        return self.type == GNUTYPE_MULTIVOL
 # class TarInfo
 
 class TarFile(object):
@@ -1484,6 +1488,15 @@ class TarFile(object):
     ignore_zeros = False        # If true, skips empty or invalid blocks and
                                 # continues processing.
 
+    max_volume_size = None      # If not None, establishes the maximum size
+                                # of tar volumes
+
+    new_volume_handler = None   # handler function to be executed when
+                                # a new volume is needed
+
+    volume_number = 0           # current volume number, used for multivolume
+                                # support
+
     errorlevel = 1              # If 0, fatal errors only appear in debug
                                 # messages (if debug >= 0). If > 0, errors
                                 # are passed to the caller as exceptions.
@@ -1500,7 +1513,8 @@ class TarFile(object):
 
     def __init__(self, name=None, mode="r", fileobj=None, format=None,
             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
-            errors=None, pax_headers=None, debug=None, errorlevel=None):
+            errors=None, pax_headers=None, debug=None, errorlevel=None,
+            max_volume_size=None, new_volume_handler=None):
         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
            read from an existing archive, 'a' to append data to an existing
            file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1560,6 +1574,8 @@ class TarFile(object):
             self.errorlevel = errorlevel
 
         # Init datastructures.
+        self.max_volume_size = max_volume_size
+        self.new_volume_handler = new_volume_handler
         self.closed = False
         self.members = []       # list of members as TarInfo objects
         self._loaded = False    # flag if all members have been read
@@ -2015,17 +2031,115 @@ class TarFile(object):
         self.fileobj.write(buf)
         self.offset += len(buf)
 
-        # If there's data to follow, append it.
-        if fileobj is not None:
-            copyfileobj(fileobj, self.fileobj, tarinfo.size)
-            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
+
+        # If there's no data to follow, finish
+        if not fileobj:
+            self.members.append(tarinfo)
+            return
+
+        is_multivol = False
+        size_left = tarinfo.size        # bytes of this member still to write
+
+        # iterate, one iteration per volume (usually only one volume)
+        while size_left > 0:
+            if is_multivol:
+                # write a continuation header at the start of the new volume
+                buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
+                self.fileobj.write(buf)
+                self.offset += len(buf)
+
+            # handle multivolume support: never write past max_volume_size
+            if self.max_volume_size:
+                max_size_to_write = min(self.max_volume_size - self.offset,
+                                        size_left)
+            else:
+                max_size_to_write = size_left
+
+            copyfileobj(fileobj, self.fileobj, max_size_to_write)
+            blocks, remainder = divmod(max_size_to_write, BLOCKSIZE)
             if remainder > 0:
                 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                 blocks += 1
             self.offset += blocks * BLOCKSIZE
+            size_left -= max_size_to_write
+
+            # create a new volume if this member still has data to write
+            if size_left > 0:
+                tarinfo.offset_data += blocks * BLOCKSIZE
+                tarinfo.type = GNUTYPE_MULTIVOL
+
+                if not callable(self.new_volume_handler):
+                    raise Exception("We need to create a new volume and you"
+                                    " didn't supply a new_volume_handler")
+
+                # the new volume handler should do everything needed to
+                # start working on a new volume. usually, the handler calls
+                # self.open_volume
+                self.new_volume_handler(self)
+                is_multivol = True
 
         self.members.append(tarinfo)
 
+    def open_volume(self, name="", fileobj=None):
+        '''
+        Called by the user to change this tar file to point to a new volume.
+        '''
+        # open the file using either fileobj or name
+        if not fileobj:
+            if self.mode == "a" and not os.path.exists(name):
+                # Create nonexistent files in append mode.
+                self.mode = "w"
+                self._mode = "wb"
+            fileobj = bltn_open(name, self._mode)
+            self._extfileobj = False
+        else:
+            if name is None and hasattr(fileobj, "name"):
+                name = fileobj.name
+            if hasattr(fileobj, "mode"):
+                self._mode = fileobj.mode
+            self._extfileobj = True
+        self.name = os.path.abspath(name) if name else None
+        self.fileobj = fileobj
+
+        # init data structures
+        self.closed = False
+        self.members = []       # list of members as TarInfo objects
+        self._loaded = False    # flag if all members have been read
+        self.offset = self.fileobj.tell()
+                                # current position in the archive file
+        self.inodes = {}        # dictionary caching the inodes of
+                                # archive members already added
+        self.volume_number += 1 # keep track of the current volume number
+
+        try:
+            if self.mode == "r":
+                self.firstmember = None
+                self.firstmember = self.next()
+
+            if self.mode == "a":
+                # Move to the end of the archive,
+                # before the first empty block.
+                while True:
+                    self.fileobj.seek(self.offset)
+                    try:
+                        tarinfo = self.tarinfo.fromtarfile(self)
+                        self.members.append(tarinfo)
+                    except EOFHeaderError:
+                        self.fileobj.seek(self.offset)
+                        break
+                    except HeaderError, e:
+                        raise ReadError(str(e))
+
+            if self.mode in "aw":
+                self._loaded = True
+
+                if self.pax_headers:
+                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
+                    self.fileobj.write(buf)
+                    self.offset += len(buf)
+        except:
+            if not self._extfileobj:
+                self.fileobj.close()
+            self.closed = True
+            raise
+
     def extractall(self, path=".", members=None):
         """Extract all members from the archive to the current working
            directory and set owner, modification time and permissions on
diff --git a/tarfile_multivol_example.py b/tarfile_multivol_example.py
new file mode 100644
index 0000000..d8e36f2
--- /dev/null
+++ b/tarfile_multivol_example.py
@@ -0,0 +1,47 @@
+
+'''
+When reading, the file being read is not going to fail, because tar will have
+written the tar file at appropriate sizes, so it's transparent for _Stream.
+
+When writing, it is tarobj that notices when the file is too big, and thus it
+is tarobj's job to close the current stream and call new_volume_handler before
+continuing to use the stream for writing. But it is still transparent from the
+stream object's POV.
+
+
+In the case of restarting gzip compression with #gz:
+
+For writing, it is tarobj's job to stop writing the current file and tell the
+_Stream object to handle the new-file event. So it is _Stream's job to do
+that.
+
+For reading, it is tarobj's job to notice the end of a file while reading, and
+to call the _Stream object to handle the new-file event, in this case for
+reading.
+
+'''
+
+from tarfile import TarFile
+
+def new_volume_handler(tarobj):
+    volume_path = "%s.%d" % (tarobj.name, tarobj.volume_number + 1)
+    print "new volume: ", volume_path
+    tarobj.open_volume(volume_path)
+
+
+# write
+tar = TarFile.open("sample.tar.gz",
+                   mode="w|gz",
+                   max_volume_size=1024**2,
+                   new_volume_handler=new_volume_handler)
+tar.add("big")
+tar.close()
+
+## read
+#tar = tarfile.open("sample.tar.gz",
+#    mode="r#gz",
+#    new_volume_handler=new_volume)
+#for name in ["foo", "bar", "quux"]:
+#    tar.add(name)
+#tar.close()
+
+# when creating a
\ No newline at end of file
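
For reference (not part of this commit): the example file drives the new code through the streaming "w|gz" mode, but, as its own notes say, restarting gzip compression on a new volume is not handled yet, and open_volume() reopens the next volume with a plain bltn_open(). A minimal sketch of the same flow against an uncompressed archive, which avoids the compression-restart question entirely, could look like the following; the file names, the "big" input file and the 1 MiB volume size are assumptions for illustration only.

    # Sketch only: assumes the patched tarfile.py is importable and that a file
    # named "big", larger than one volume, exists in the current directory.
    from tarfile import TarFile

    def new_volume_handler(tarobj):
        # The handler receives the TarFile instance and must switch it to the
        # next volume, normally by calling open_volume() with a new path.
        volume_path = "%s.%d" % (tarobj.name, tarobj.volume_number + 1)
        print "opening volume:", volume_path
        tarobj.open_volume(volume_path)

    tar = TarFile.open("sample.tar",                # first volume (assumed name)
                       mode="w",                    # plain, uncompressed tar
                       max_volume_size=1024 ** 2,   # assumed 1 MiB per volume
                       new_volume_handler=new_volume_handler)
    tar.add("big")    # split across as many volumes as the handler opens
    tar.close()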