#!/usr/bin/env python3 # Copyright (C) 2013, 2014 Intra2net AG # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published # by the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see # DELTATAR_HEADER_VERSION = 1 DELTATAR_PARAMETER_VERSION = 1 import logging import datetime import binascii import io import operator import os import copy import shutil import re import stat import json import typing from functools import partial from . import tarfile from . import crypto class NullHandler(logging.Handler): def emit(self, record): pass logging.getLogger("deltatar.DeltaTar").addHandler(NullHandler()) # match mode NO_MATCH = False MATCH = True PARENT_MATCH = 2 # encryption direction CRYPTO_MODE_ENCRYPT = 0 CRYPTO_MODE_DECRYPT = 1 # The canonical extension for encrypted backup files regardless of the actual # encryption parameters is “.pdtcrypt”. This is analogous to the encryption # header which starts with the eight ASCII bytes “PDTCRYPT”. Historical note: # Since the introduction of the versioned header there is no longer any need # for encoding encryption parameters in the file extensions (“.aes128” and # suchlike). PDTCRYPT_EXTENSION = "pdtcrypt" PDT_TYPE_ARCHIVE = 0 PDT_TYPE_AUX = 1 AUXILIARY_FILE_INDEX = 0 AUXILIARY_FILE_INFO = 1 class DeltaTar(object): ''' Backup class used to create backups ''' # list of files to exclude in the backup creation or restore operation. It # can contain python regular expressions. excluded_files = [] # list of files to include in the backup creation or restore operation. It # can contain python regular expressions. If empty, all files in the source # path will be backed up (when creating a backup) or all the files in the # backup will be restored (when restoring a backup), but if included_files # is set then only the files included in the list will be processed. included_files = [] # custom filter of files to be backed up (or restored). Unused and unset # by default. The function receives a file path and must return a boolean. filter_func = None # mode in which the delta will be created (when creating a backup) or # opened (when restoring). Accepts modes analogous to the tarfile library. mode = "" # used together with aes modes to encrypt and decrypt backups. password = None crypto_key = None nacl = None # parameter version to use when encrypting; note that this has no effect # on decryption since the required settings are determined from the headers crypto_version = DELTATAR_HEADER_VERSION crypto_paramversion = None # when encrypting or decrypting, these hold crypto handlers; created before # establishing the Tarfile stream iff a password is supplied. encryptor = None decryptor = None # python logger object. logger = None # specifies the index mode in the same format as @param mode, but without # the ':', '|' or '#' at the beginning. It doesn't make sense to specify # that the index is encrypted if no password is given in the constructor. index_mode = None # current time for this backup.
Used for file names and file creation checks current_time = None # extra data to be included in the header of the index file when creating a # backup extra_data = dict() # valid tarfile modes and their corresponding default file extension __file_extensions_dict = { '': '', ':': '', ':gz': '.gz', ':bz2': '.bz2', '|': '', '|gz': '.gz', '|bz2': '.bz2', '#gz': '.gz', '#gz.pdtcrypt': '.gz', '#pdtcrypt': '', '#': '', } # valid index modes and their corresponding default file extension __index_extensions_dict = { '': '', 'gz': '.gz', 'bz2': '.bz2', 'gz.pdtcrypt': '.gz', 'pdtcrypt': '', } # valid path prefixes __path_prefix_list = [ u'snapshot://', u'list://', u'delete://' ] def __init__(self, excluded_files=[], included_files=[], filter_func=None, mode="", password=None, crypto_key=None, nacl=None, crypto_version=DELTATAR_HEADER_VERSION, crypto_paramversion=DELTATAR_PARAMETER_VERSION, logger=None, index_mode=None, index_name_func=None, volume_name_func=None): ''' Constructor. Configures the diff engine. Parameters: - excluded_files: list of files to exclude in the backup creation or restore operation. It can contain python regular expressions. - included_files: list of files to include in the backup creation or restore operation. It can contain python regular expressions. If empty, all files in the source path will be backed up (when creating a backup) or all the files in the backup will be restored (when restoring a backup), but if included_files is set then only the files included in the list will be processed. - filter_func: custom filter of files to be backed up (or restored). Unused and unset by default. The function receives a file path and must return a boolean. - mode: mode in which the delta will be created (when creating a backup) or opened (when restoring). Accepts the same modes as the tarfile library. Valid modes are: '' open uncompressed ':' open uncompressed ':gz' open with gzip compression ':bz2' open with bzip2 compression '|' open an uncompressed stream of tar blocks '|gz' open a gzip compressed stream of tar blocks '|bz2' open a bzip2 compressed stream of tar blocks '#gz' open a stream of gzip compressed tar blocks - crypto_key: used to encrypt and decrypt backups. Encryption will be enabled automatically if a key is supplied. Requires a salt to be passed as well. - nacl: salt that was used to derive the encryption key for embedding in the PDTCRYPT header. Not needed when decrypting and when encrypting with password. - password: used to encrypt and decrypt backups. Encryption will be enabled automatically if a password is supplied. - crypto_version: version of the format, determining the kind of PDT object header. - crypto_paramversion: optionally request encryption conforming to a specific parameter version. Defaults to the standard PDT value which as of 2017 is the only one available. - logger: python logger object. Optional. - index_mode: specifies the index mode in the same format as @param mode, but without the ':', '|' or '#' at the beginning. If encryption is requested it will extend to the auxiliary (index, info) files as well. This is an optional parameter that will automatically mimic @param mode by default if not provided. Valid modes are: '' open uncompressed 'gz' open with gzip compression 'bz2' open with bzip2 compression - index_name_func: function that sets a custom name for the index file. This function receives a flag to indicate whether the name will be used for a full or diff backup. The backup path will be prepended to its return value.
- volume_name_func: function that defines the name of tar volumes. It receives the backup_path, if it's a full backup and the volume number, and must return the name for the corresponding volume name. Optional, DeltaTar has default names for tar volumes. ''' if mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension mode=[%s] requested for files' % str(mode)) self.excluded_files = excluded_files self.included_files = included_files self.filter_func = filter_func self.logger = logging.getLogger('deltatar.DeltaTar') if logger: self.logger.addHandler(logger) self.mode = mode if crypto_key is not None: self.crypto_key = crypto_key self.nacl = nacl # encryption only if password is not None: self.password = password if crypto_version is not None: self.crypto_version = crypto_version if crypto_paramversion is not None: self.crypto_paramversion = crypto_paramversion # generate index_mode if index_mode is None: index_mode = '' if 'gz' in mode: index_mode = "gz" elif 'bz2' in mode: index_mode = "bz2" elif mode not in self.__index_extensions_dict: raise Exception('Unrecognized extension mode=[%s] requested for index' % str(mode)) self.index_mode = index_mode self.current_time = datetime.datetime.now() if index_name_func is not None: self.index_name_func = index_name_func if volume_name_func is not None: self.volume_name_func = volume_name_func def pick_extension(self, kind, mode=None): """ Choose the extension depending on a) the kind of file given, b) the processing mode, and c) the current encryption settings. """ ret = "" if kind == PDT_TYPE_ARCHIVE: ret += ".tar" if mode is None: mode = self.__index_extensions_dict [self.index_mode] ret += mode if self.crypto_key is not None or self.password is not None: ret += "." + PDTCRYPT_EXTENSION return ret def index_name_func(self, is_full): # pylint: disable=method-hidden ''' Callback for setting a custom name for the index file. Depending on whether *is_full* is set, it will create a suitable name for a full or a diff backup. ''' prefix = "bfull" if is_full else "bdiff" date_str = self.current_time.strftime("%Y-%m-%d-%H%M") extension = self.pick_extension \ (PDT_TYPE_AUX, self.__index_extensions_dict [self.index_mode]) return "%s-%s.index%s" % (prefix, date_str, extension) def volume_name_func(self, backup_path, # pylint: disable=method-hidden is_full, volume_number, guess_name=False): ''' function that defines the name of tar volumes. It receives the backup_path, if it's a full backup and the volume number, and must return the name for the corresponding volume name. Optional, DeltaTar has default names for tar volumes. If guess_name is activated, the file is intended not to be created but to be found, and thus the date will be guessed. ''' prefix = "bfull" if is_full else "bdiff" extension = self.pick_extension \ (PDT_TYPE_ARCHIVE, self.__file_extensions_dict [self.mode]) if not guess_name: date_str = self.current_time.strftime("%Y-%m-%d-%H%M") return "%s-%s-%03d%s" % (prefix, date_str, volume_number + 1, extension) else: prefix = prefix + "-" postfix = "-%03d%s" % (volume_number + 1, extension) for f in os.listdir(backup_path): if f.startswith(prefix) and f.endswith(postfix): return f raise Exception("volume not found") def filter_path(self, path, source_path="", is_dir=None): ''' Filters a path, given the source_path, using the filtering properties set in the constructor. The filtering order is: 1. included_files (if any) 2. excluded_files 3. 
filter_func (which must return whether the file is accepted or not) ''' if len(source_path) > 0: # ensure that exactly one '/' at end of dir is also removed source_path = source_path.rstrip(os.sep) + os.sep path = path[len(source_path):] # 1. filter included_files match = MATCH if len(self.included_files) > 0: match = NO_MATCH for i in self.included_files: # it can be either a regexp or a string if isinstance(i, str): # if the string matches, then continue if i == path: match = MATCH break # if the string ends with / it's a directory, and if the # path is contained in it, it is included if i.endswith('/') and path.startswith(i): match = MATCH break # if the string doesn't end with /, add it and do the same # check elif path.startswith(i + '/'): match = MATCH break # check for PARENT_MATCH if is_dir: dir_path = path if not dir_path.endswith('/'): dir_path += '/' if i.startswith(dir_path): match = PARENT_MATCH # if it's a reg exp, then we just check if it matches elif isinstance(i, typing.Pattern): if i.match(path): match = MATCH break else: self.logger.warning('Invalid pattern in included_files: %s' % str(i)) if match == NO_MATCH: return NO_MATCH # when a directory is in PARENT_MATCH, it doesn't matter if it's # excluded. It's subfiles will be excluded, but the directory itself # won't if match != PARENT_MATCH: for e in self.excluded_files: # it can be either a regexp or a string if isinstance(e, str): # if the string matches, then exclude if e == path: return NO_MATCH # if the string ends with / it's a directory, and if the # path starts with the directory, then exclude if e.endswith('/') and path.startswith(e): return NO_MATCH # if the string doesn't end with /, do the same check with # the slash added elif path.startswith(e + '/'): return NO_MATCH # if it's a reg exp, then we just check if it matches elif isinstance(e, typing.Pattern): if e.match(path): return NO_MATCH else: self.logger.warning('Invalid pattern in excluded_files: %s' % str(e)) if self.filter_func: return self.filter_func(path) return match def _recursive_walk_dir(self, source_path, keep_base_dir=False): ''' Walk a directory recursively, yielding each file/directory Returns the path of an entity. If ``keep_base_dir`` is set, the path returned contains the prefix ``source_path``; otherwise it is relative to the prefix. 
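A minimal usage sketch (illustrative only; the directory shown is a hypothetical example):

    for relpath in self._recursive_walk_dir("/srv/data"):
        print(relpath)  # e.g. "etc/example.conf", relative to /srv/data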
''' source_path = source_path.rstrip(os.sep) if keep_base_dir: beginning_size = 0 else: beginning_size = len(source_path) + 1 # +1 for os.sep queue = [source_path] while queue: cur_path = queue.pop(0) try: dfd = os.open (cur_path, os.O_DIRECTORY) except FileNotFoundError as exn: self.logger.warning ("failed to open entity [%s] as directory; " "file system (error: %s); skipping" % (cur_path, str (exn))) continue try: for filename in sorted(os.listdir(dfd)): child = os.path.join(cur_path, filename) is_dir = os.path.isdir(child) status = self.filter_path(child, source_path, is_dir) if status == NO_MATCH: continue if not os.access(child, os.R_OK): self.logger.warning('Error accessing possibly locked file %s' % child) continue if status == MATCH: yield child[beginning_size:] if is_dir and (status == MATCH or status == PARENT_MATCH): queue.append(child) finally: os.close (dfd) def _stat_dict(self, path): ''' Returns a dict with the stat data used to compare files ''' stinfo = os.stat(path) mode = stinfo.st_mode ptype = None if stat.S_ISDIR(mode): ptype = u'directory' elif stat.S_ISREG(mode): ptype = u'file' elif stat.S_ISLNK(mode): ptype = u'link' return { u'type': ptype, u'path': path, u'mode': mode, u'mtime': int(stinfo.st_mtime), u'ctime': int(stinfo.st_ctime), u'uid': stinfo.st_uid, u'gid': stinfo.st_gid, u'inode': stinfo.st_ino, u'size': stinfo.st_size } def _equal_stat_dicts(self, d1, d2, listsnapshot_equal=False): ''' Return if the dicts are equal in the stat keys ''' keys = [u'type', u'mode',u'size', u'mtime', # not restored: u'inode', u'ctime' ] # only if user is root, then also check gid/uid. otherwise do not check it, # because tarfile can chown in case of being superuser only # # also, skip the check in rpmbuild since the sources end up with the # uid:gid of the packager while the extracted files are 0:0. if hasattr(os, "geteuid") and os.geteuid() == 0 \ and os.getenv ("RPMBUILD_OPTIONS") is None: keys.append('gid') keys.append('uid') if (not d1 and d2 != None) or (d1 != None and not d2): return False if self.prefixed(d1.get('path', -1), listsnapshot_equal) != self.prefixed(d2.get('path', -2), listsnapshot_equal): return False type = d1.get('type', '') for key in keys: # size doesn't matter for directories if type == 'directory' and key == 'size': continue if d1.get(key, -1) != d2.get(key, -2): return False return True def prefixed(self, path, listsnapshot_equal=False): ''' if a path is not prefixed, return it prefixed ''' for prefix in self.__path_prefix_list: if path.startswith(prefix): if listsnapshot_equal and prefix == u'list://': return u'snapshot://' + path[len(prefix):] return path return u'snapshot://' + path def unprefixed(self, path): ''' remove a path prefix if any ''' for prefix in self.__path_prefix_list: if path.startswith(prefix): return path[len(prefix):] return path def initialize_encryption (self, mode, strict_validation=True): """ :type strict_validation: bool :param strict_validation: Enable strict IV checking in the crypto layer. Should be disabled when dealing with potentially corrupted data. 
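Returns ``None`` when neither a crypto key nor a password is configured; otherwise a ``crypto.Encrypt`` or ``crypto.Decrypt`` context is returned depending on *mode*. A minimal sketch of both directions:

    encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT)
    decryptor = self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=False)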
""" password = self.password key = self.crypto_key nacl = self.nacl if key is None and password is None: return if mode == CRYPTO_MODE_ENCRYPT: return crypto.Encrypt (password=password, key=key, nacl=nacl, version=self.crypto_version, paramversion=self.crypto_paramversion) if mode == CRYPTO_MODE_DECRYPT: return crypto.Decrypt (password=password, key=key, strict_ivs=strict_validation) raise Exception ("invalid encryption mode [%r]" % mode) def open_auxiliary_file(self, path, mode='r', kind=AUXILIARY_FILE_INDEX, strict_validation=True): ''' Given the specified configuration, opens a file for reading or writing, inheriting the encryption and compression settings from the backup. Returns a file object ready to use. :param mode: IO mode (read or write, ``"r"`` and ``"w"``, respectively). :type mode: str :param kind: Role of the file, see AUXILIARY_FILE_* constants. Both the info and the auxiliary file have a globally unique, constant counter value. :type kind: str ''' if self.index_mode.startswith('gz'): comptype = 'gz' elif self.index_mode.startswith('bz2'): comptype = 'bz2' else: comptype = 'tar' crypto_ctx = None enccounter = None if mode == "w": crypto_ctx = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) elif mode == "r": crypto_ctx = self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=strict_validation) if crypto_ctx is not None: if kind == AUXILIARY_FILE_INFO: enccounter = crypto.AES_GCM_IV_CNT_INFOFILE elif kind == AUXILIARY_FILE_INDEX: enccounter = crypto.AES_GCM_IV_CNT_INDEX else: raise Exception ("invalid kind of aux file %r" % kind) sink = tarfile._Stream(name=path, mode=mode, comptype=comptype, bufsize=tarfile.RECORDSIZE, fileobj=None, encryption=crypto_ctx, enccounter=enccounter) return sink def create_full_backup(self, source_path, backup_path, max_volume_size=None, extra_data=dict()): ''' Creates a full backup. Parameters: - source_path: source path to the directory to back up. - backup_path: path where the back up will be stored. Backup path will be created if not existent. - max_volume_size: maximum volume size in megabytes. Used to split the backup in volumes. Optional (won't split in volumes by default). 
- extra_data: a json-serializable dictionary with information that you want to be included in the header of the index file ''' # check input if not isinstance(source_path, str): raise Exception('Source path must be a string') if not isinstance(backup_path, str): raise Exception('Backup path must be a string') if not os.path.exists(source_path) or not os.path.isdir(source_path): raise Exception('Source path "%s" does not exist or is not a '\ 'directory' % source_path) if max_volume_size != None and (not isinstance(max_volume_size, int) or\ max_volume_size < 1): raise Exception('max_volume_size must be a positive integer') if max_volume_size != None: max_volume_size = max_volume_size*1024*1024 if not isinstance(extra_data, dict): raise Exception('extra_data must be a dictionary') try: extra_data_str = json.dumps(extra_data) except: raise Exception('extra_data is not json-serializable') if not os.access(source_path, os.R_OK): raise Exception('Source path "%s" is not readable' % source_path) # try to create backup path if needed os.makedirs(backup_path, exist_ok=True) if not os.access(backup_path, os.W_OK): raise Exception('Backup path "%s" is not writeable' % backup_path) if source_path.endswith('/'): source_path = source_path[:-1] if backup_path.endswith('/'): backup_path = backup_path[:-1] # update current time self.current_time = datetime.datetime.now() if self.mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension') # setup for encrypting payload if self.encryptor is None: self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) # some initialization self.vol_no = 0 # generate the first volume name vol_name = self.volume_name_func(backup_path, True, 0) tarfile_path = os.path.join(backup_path, vol_name) # init index index_name = self.index_name_func(True) index_path = os.path.join(backup_path, index_name) index_sink = self.open_auxiliary_file(index_path, 'w') cwd = os.getcwd() def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, True, volume_number) volume_path = os.path.join(backup_path, volume_name) deltarobj.vol_no = volume_number # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) if tarobj.fileobj is not None: tarobj.fileobj.close() deltarobj.logger.debug("opening volume %s" % volume_path) tarobj.open_volume(volume_path, encryption=encryption) # wraps some args from context into the handler new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.encryptor) index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "full", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') # calculate checksum and write into the stream crc = binascii.crc32(s) & 0xFFFFffff index_sink.write(s) # start creating the tarfile tarobj = tarfile.TarFile.open(tarfile_path, mode='w' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.encryptor, max_volume_size=max_volume_size, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) os.chdir(source_path) # for each file to be in the backup, do: for path in self._recursive_walk_dir('.'): try: # backup file # calculate stat dict for current file statd = self._stat_dict(path) statd['path'] = u'snapshot://' + statd['path'] statd['volume'] = self.vol_no # backup 
file tarobj.add(path, arcname = statd['path'], recursive=False) except FileNotFoundError as exn: # file vanished since the call to access(3) above self.logger.warning ("object [%s] no longer available in " "file system (error: %s); skipping" % (path, str (exn))) continue # prevent indexing # retrieve file offset statd['offset'] = tarobj.get_last_member_offset() self.logger.debug("backup %s" % statd['path']) # store the stat dict in the index s = bytes(json.dumps(statd) + '\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') index_sink.write(s) os.chdir(cwd) tarobj.close() index_sink.close (close_fileobj=True) def create_diff_backup(self, source_path, backup_path, previous_index_path, max_volume_size=None, extra_data=dict()): ''' Creates a backup. Parameters: - source_path: source path to the directory to back up. - backup_path: path where the back up will be stored. Backup path will be created if not existent. - previous_index_path: index of the previous backup, needed to know which files changed since then. - max_volume_size: maximum volume size in megabytes (MB). Used to split the backup in volumes. Optional (won't split in volumes by default). NOTE: previous index is assumed to follow exactly the same format as the index_mode setup in the constructor. ''' # check/sanitize input if not isinstance(source_path, str): raise Exception('Source path must be a string') if not isinstance(backup_path, str): raise Exception('Backup path must be a string') if not os.path.exists(source_path) or not os.path.isdir(source_path): raise Exception('Source path "%s" does not exist or is not a '\ 'directory' % source_path) if not isinstance(extra_data, dict): raise Exception('extra_data must be a dictionary') try: extra_data_str = json.dumps(extra_data) except: raise Exception('extra_data is not json-serializable') if not os.access(source_path, os.R_OK): raise Exception('Source path "%s" is not readable' % source_path) if max_volume_size != None and (not isinstance(max_volume_size, int) or\ max_volume_size < 1): raise Exception('max_volume_size must be a positive integer') if max_volume_size != None: max_volume_size = max_volume_size*1024*1024 if not isinstance(previous_index_path, str): raise Exception('previous_index_path must be A string') if not os.path.exists(previous_index_path) or not os.path.isfile(previous_index_path): raise Exception('Index path "%s" does not exist or is not a '\ 'file' % previous_index_path) if not os.access(previous_index_path, os.R_OK): raise Exception('Index path "%s" is not readable' % previous_index_path) # try to create backup path if needed os.makedirs(backup_path, exist_ok=True) if not os.access(backup_path, os.W_OK): raise Exception('Backup path "%s" is not writeable' % backup_path) if source_path.endswith('/'): source_path = source_path[:-1] if backup_path.endswith('/'): backup_path = backup_path[:-1] # update current time self.current_time = datetime.datetime.now() if self.mode not in self.__file_extensions_dict: raise Exception('Unrecognized extension') # setup for encrypting payload if self.encryptor is None: self.encryptor = self.initialize_encryption (CRYPTO_MODE_ENCRYPT) # some initialization self.vol_no = 0 # generate the first volume name vol_name = self.volume_name_func(backup_path, is_full=False, volume_number=0) tarfile_path = 
os.path.join(backup_path, vol_name) # init index cwd = os.getcwd() index_name = self.index_name_func(is_full=False) index_path = os.path.join(backup_path, index_name) index_sink = self.open_auxiliary_file(index_path, 'w') def new_volume_handler(deltarobj, cwd, backup_path, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, is_full=False, volume_number=volume_number) volume_path = os.path.join(backup_path, volume_name) deltarobj.vol_no = volume_number # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) deltarobj.logger.debug("opening volume %s" % volume_path) tarobj.open_volume(volume_path) # wraps some args from context into the handler new_volume_handler = partial(new_volume_handler, self, cwd, backup_path) index_sink.write(bytes('{"type": "python-delta-tar-index", "version": 1, "backup-type": "diff", "extra_data": %s}\n' % extra_data_str, 'UTF-8')) s = bytes('{"type": "BEGIN-FILE-LIST"}\n', 'UTF-8') # calculate checksum and write into the stream crc = binascii.crc32(s) & 0xFFFFffff index_sink.write(s) # start creating the tarfile tarobj = tarfile.TarFile.open(tarfile_path, mode='w' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.encryptor, max_volume_size=max_volume_size, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) # create the iterators, first the previous index iterator, then the # source path directory iterator and collate and iterate them if not os.path.isabs(previous_index_path): previous_index_path = os.path.join(cwd, previous_index_path) index_it = self.iterate_index_path(previous_index_path) os.chdir(source_path) dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) def pr(path): if not path: return "None" else: return path["path"] # for each file to be in the backup, do: for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): action = None # if file is not in the index, it means it's a new file, so we have # to take a snapshot if not ipath: action = 'snapshot' # if the file is not in the directory iterator, it means that it has # been deleted, so we need to mark it as such elif not dpath: action = 'delete' # if the file is in both iterators, it means it might have either # not changed (in which case we will just list it in our index but # it will not be included in the tar file), or it might have # changed, in which case we will snapshot it. elif ipath and dpath: if self._equal_stat_dicts(ipath, dpath): action = 'list' else: action = 'snapshot' # TODO: when creating chained backups (i.e. 
diffing from another # diff), we will need to detect the type of action in the previous # index, because if it was delete and dpath is None, we should # discard the file if action == 'snapshot': # calculate stat dict for current file stat = dpath.copy() stat['path'] = "snapshot://" + dpath['path'] stat['volume'] = self.vol_no self.logger.debug("[STORE] %s" % dpath['path']) try: # backup file tarobj.add(dpath['path'], arcname=stat['path'], recursive=False) # retrieve file offset stat['offset'] = tarobj.get_last_member_offset() except FileNotFoundError as exn: # file vanished since the call to access(3) above self.logger.warning ("object [%s] no longer available in " "file system (error: %s); skipping" % (dpath ["path"], str (exn))) stat = None # prevent indexing elif action == 'delete': path = self.unprefixed(ipath['path']) stat = { u'path': u'delete://' + path, u'type': ipath['type'] } self.logger.debug("[DELETE] %s" % path) # mark it as deleted in the backup tarobj.add("/dev/null", arcname=stat['path']) elif action == 'list': stat = dpath.copy() path = self.unprefixed(ipath['path']) stat['path'] = u'list://' + path # unchanged files do not enter in the backup, only in the index self.logger.debug("[UNCHANGED] %s" % path) else: # should not happen self.logger.warning('unknown action in create_diff_backup: {0}' ''.format(action)) stat = None if stat: # store the stat dict in the index s = bytes(json.dumps(stat) + '\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "END-FILE-LIST"}\n', 'UTF-8') crc = binascii.crc32(s, crc) & 0xffffffff index_sink.write(s) s = bytes('{"type": "file-list-checksum", "checksum": %d}\n' % crc, 'UTF-8') index_sink.write(s) index_it.release() os.chdir(cwd) tarobj.close() index_sink.close() def iterate_index_path(self, index_path, strict_validation=True): ''' Returns an index iterator. Internally, it uses a classic iterator class. We do that instead of just yielding so that the iterator object can have an additional function to close the file descriptor that is opened in the constructor. 
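A usage sketch (the index file name is a hypothetical example):

    index_it = self.iterate_index_path("backup/bfull-2014-01-01-0000.index")
    try:
        for stat_dict, l_no in index_it:
            print(stat_dict["path"], stat_dict["type"])
    finally:
        index_it.release()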
''' class IndexPathIterator(object): def __init__(self, delta_tar, index_path): self.delta_tar = delta_tar self.index_path = index_path self.f = None self.extra_data = dict() self.__enter__() def __iter__(self): return self def release(self): if self.f: self.f.close() def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' if self.f is None: self.f = self.delta_tar.open_auxiliary_file \ (self.index_path, 'r', strict_validation=strict_validation) # check index header j, l_no = self.delta_tar._parse_json_line(self.f, 0) if j.get("type", '') != 'python-delta-tar-index' or\ j.get('version', -1) != 1: raise Exception("invalid index file format: %s" % json.dumps(j)) self.extra_data = j.get('extra_data', dict()) # find BEGIN-FILE-LIST, ignore other headers while True: j, l_no = self.delta_tar._parse_json_line(self.f, l_no) if j.get('type', '') == 'BEGIN-FILE-LIST': break return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' if self.f: self.f.close() self.f = None def __next__(self): # read each file in the index and process it to do the restore j = {} l_no = -1 try: j, l_no = self.delta_tar._parse_json_line(self.f, l_no) except Exception as e: if self.f: self.f.close() raise e op_type = j.get('type', '') # when we detect the end of the list, break the loop if op_type == 'END-FILE-LIST': if self.f: self.f.close() raise StopIteration # check input if op_type not in ['directory', 'file', 'link']: self.delta_tar.logger.warning('unrecognized type to be ' 'restored: %s, line %d' % (op_type, l_no)) # iterate again return self.__next__() return j, l_no return IndexPathIterator(self, index_path) def iterate_tar_path(self, tar_path, new_volume_handler=None): ''' Returns a tar iterator that iterates jsonized member items that contain an additional "member" field, used by RestoreHelper. 
''' class TarPathIterator(object): def __init__(self, delta_tar, tar_path, new_volume_handler=None): self.delta_tar = delta_tar self.tar_path = tar_path self.tar_obj = None self.last_member = None self.new_volume_handler = new_volume_handler self.__enter__() def __iter__(self): return self def release(self): if self.tar_obj: self.tar_obj.close() def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' if self.tar_obj is None: decryptor = None if self.delta_tar.password is not None: decryptor = crypto.Decrypt \ (password=self.delta_tar.password, key=self.delta_tar.crypto_key, strict_ivs=False) self.tar_obj = tarfile.TarFile.open(self.tar_path, mode='r' + self.delta_tar.mode, format=tarfile.GNU_FORMAT, concat='#' in self.delta_tar.mode, encryption=decryptor, new_volume_handler=self.new_volume_handler, save_to_members=False, dereference=True) return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' if self.tar_obj: self.tar_obj.close() self.tar_obj = None def __next__(self): ''' Read each member and return it as a stat dict ''' tarinfo = self.tar_obj.__iter__().__next__() # NOTE: here we compare if tarinfo.path is the same as before # instead of comparing the tarinfo object itself because the # object itself might change for multivol tarinfos if tarinfo is None or (self.last_member is not None and\ self.delta_tar.unprefixed(tarinfo.path) == self.delta_tar.unprefixed(self.last_member.path)): raise StopIteration self.last_member = tarinfo ptype = 'unknown' if tarinfo.isfile(): ptype = 'file' elif tarinfo.isdir(): ptype = 'directory' elif tarinfo.islnk() or tarinfo.issym(): ptype = 'link' return { u'type': ptype, u'path': tarinfo.path, u'mode': tarinfo.mode, u'mtime': tarinfo.mtime, u'ctime': -1, # cannot restore u'uid': tarinfo.uid, u'gid': tarinfo.gid, u'inode': -1, # cannot restore u'size': tarinfo.size, u'member': tarinfo }, 0 return TarPathIterator(self, tar_path, new_volume_handler) def jsonize_path_iterator(self, iter, strip=0): ''' converts the yielded items of an iterator into json path lines. strip: Strip the smallest prefix containing num leading slashes from the file path. ''' while True: try: path = iter.__next__() if strip == 0: yield self._stat_dict(path), 0 else: st = self._stat_dict(path) st['path'] = "/".join(path.split("/")[strip:]) yield st, 0 except StopIteration: break def iterate_disaster_index (self, index): """ Mimick the behavior of the other object iterators, just with the inputs supplied directly as *index*. """ class RawIndexIterator(object): def __init__(self, delta_tar, index): self.delta_tar = delta_tar self.index = index self.__enter__() def __iter__(self): return self def release(self): pass def __enter__(self): ''' Allows this iterator to be used with the "with" statement ''' self.iter = self.index.__iter__ () return self def __exit__(self, type, value, tb): ''' Allows this iterator to be used with the "with" statement ''' def __next__(self): idxent = self.iter.__next__ () return idxent, 0 return RawIndexIterator(self, index) def collate_iterators(self, it1, it2): ''' Collate two iterators, so that it returns pairs of the items of each iterator (if the items are the same), or (None, elem2) or (elem1, None) when there's no match for the items in the other iterator. It assumes that the items in both lists are ordered in the same way. 
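For instance, assuming both iterators yield entries for flat, lexicographically ordered paths, with it1 covering a, b, d and it2 covering a, c, d, the collation yields (a, a), (b, None), (None, c) and (d, d), each accompanied by the current line number.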
''' l_no = 0 elem1, elem2 = None, None while True: if not elem1: try: elem1, l_no = it1.__next__() except StopIteration: if elem2: yield (None, elem2, l_no) for elem2 in it2: if isinstance(elem2, tuple): elem2 = elem2[0] yield (None, elem2, l_no) break if not elem2: try: elem2 = it2.__next__() if isinstance(elem2, tuple): elem2 = elem2[0] except StopIteration: if elem1: yield (elem1, None, l_no) for elem1, l_no in it1: yield (elem1, None, l_no) break index1 = self.unprefixed(elem1['path']) index2 = self.unprefixed(elem2['path']) i1, i2 = self.compare_indexes(index1, index2) yield1 = yield2 = None if i1 is not None: yield1 = elem1 elem1 = None if i2 is not None: yield2 = elem2 elem2 = None yield (yield1, yield2, l_no) def compare_indexes(self, index1, index2): ''' Compare iterator indexes and return a tuple in the following form: if index1 < index2, returns (index1, None) if index1 == index2 returns (index1, index2) else: returns (None, index2) ''' l1 = index1.split('/') l2 = index2.split('/') length = len(l2) - len(l1) if length > 0: return (index1, None) elif length < 0: return (None, index2) for i1, i2 in zip(l1, l2): if i1 < i2: return (index1, None) elif i1 > i2: return (None, index2) return (index1, index2) def list_backup(self, backup_tar_path, list_func=None): if not isinstance(backup_tar_path, str): raise Exception('Backup tar path must be a string') if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): raise Exception('Source path "%s" does not exist or is not a '\ 'file' % backup_tar_path) if not os.access(backup_tar_path, os.R_OK): raise Exception('Source path "%s" is not readable' % backup_tar_path) cwd = os.getcwd() def new_volume_handler(deltarobj, cwd, backup_path, encryption, tarobj, base_name, volume_number): ''' Handles the new volumes ''' volume_name = deltarobj.volume_name_func(backup_path, True, volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) tarobj.open_volume(volume_path, encryption=encryption) if self.decryptor is None: self.decryptor = \ self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=False) backup_path = os.path.dirname(backup_tar_path) if not os.path.isabs(backup_path): backup_path = os.path.join(cwd, backup_path) new_volume_handler = partial(new_volume_handler, self, cwd, backup_path, self.decryptor) tarobj = tarfile.TarFile.open(backup_tar_path, mode='r' + self.mode, format=tarfile.GNU_FORMAT, concat='#' in self.mode, encryption=self.decryptor, new_volume_handler=new_volume_handler, save_to_members=False, dereference=True) def filter(cls, list_func, tarinfo): if list_func is None: self.logger.info(tarinfo.path) else: list_func(tarinfo) return False filter = partial(filter, self, list_func) tarobj.extractall(filter=filter, unlink=True) tarobj.close() def restore_backup(self, target_path, backup_indexes_paths=[], backup_tar_path=None, restore_callback=None, disaster=tarfile.TOLERANCE_STRICT, backup_index=None, strict_validation=True): ''' Restores a backup. Parameters: - target_path: path to restore. - backup_indexes_paths: path to backup indexes, in descending date order. The indexes indicate the location of their respective backup volumes, and multiple indexes are needed to be able to restore diff backups. Note that this is an optional parameter: if not suplied, it will try to restore directly from backup_tar_path. 
- backup_tar_path: path to the backup tar file. Used as an alternative to backup_indexes_paths to restore directly from a tar file without using any file index. If it's a multivol tarfile, volume_name_func will be called. - restore_callback: callback function to be called during restore. This is passed to the helper and gets called for every file. NOTE: If you want to use an index to restore a backup, this function only supports doing so when the tarfile mode is either uncompressed or uses concat compression mode, because otherwise it would be very slow. NOTE: Indices are assumed to follow the same format as the index_mode specified in the constructor. Returns the list of files that could not be restored, if there were any. ''' # check/sanitize input if not isinstance(target_path, str): raise Exception('Target path must be a string') if backup_indexes_paths == [] and backup_tar_path is None: raise Exception("You have to either provide index paths or a tar path") if isinstance (backup_index, list) is True: mode = "disaster" elif len(backup_indexes_paths) == 0: mode = "tar" else: mode = "diff" if mode == "tar": if not isinstance(backup_tar_path, str): raise Exception('Backup tar path must be a string') if not os.path.exists(backup_tar_path) or not os.path.isfile(backup_tar_path): raise Exception('Source path "%s" does not exist or is not a '\ 'file' % backup_tar_path) if not os.access(backup_tar_path, os.R_OK): raise Exception('Source path "%s" is not readable' % backup_tar_path) else: if not isinstance(backup_indexes_paths, list): raise Exception('backup_indexes_paths must be a list') if self.mode.startswith(':') or self.mode.startswith('|'): raise Exception('Restore only supports either uncompressed tars' ' or concat compression when restoring from an index, and ' ' the open mode you provided is "%s"' % self.mode) for index in backup_indexes_paths: if not isinstance(index, str): raise Exception('indices must be strings') if not os.path.exists(index) or not os.path.isfile(index): raise Exception('Index path "%s" does not exist or is not a '\ 'file' % index) if not os.access(index, os.R_OK): raise Exception('Index path "%s" is not readable' % index) # try to create backup path if needed os.makedirs(target_path, exist_ok=True) # make backup_tar_path absolute so that iterate_tar_path works fine if backup_tar_path and not os.path.isabs(backup_tar_path): backup_tar_path = os.path.abspath(backup_tar_path) cwd = os.getcwd() os.chdir(target_path) # setup for decrypting payload if self.decryptor is None: self.decryptor = \ self.initialize_encryption (CRYPTO_MODE_DECRYPT, strict_validation=strict_validation) if mode == 'tar': index_it = self.iterate_tar_path(backup_tar_path) helper = RestoreHelper(self, cwd, backup_path=backup_tar_path, tarobj=index_it.tar_obj) elif mode == "diff": helper = RestoreHelper(self, cwd, backup_indexes_paths, disaster=disaster) try: # get iterator from newest index at _data[0] index1 = helper._data[0]["path"] index_it = \ self.iterate_index_path(index1, strict_validation=strict_validation) except tarfile.DecryptionError as exn: self.logger.error("failed to decrypt file [%s]: %s; is this an " "actual encrypted index file?" % (index1, str (exn))) return [(index1, exn)] except Exception as exn: # compressed files self.logger.error("failed to read file [%s]: %s; is this an " "actual index file?"
% (index1, str (exn))) return [(index1, exn)] elif mode == "disaster": index_it = self.iterate_disaster_index (backup_index) helper = RestoreHelper (self, cwd, backup_path=backup_tar_path, backup_index=backup_index, disaster=disaster) index_decryptor = helper._data[0]["decryptor"] dir_it = self._recursive_walk_dir('.') dir_path_it = self.jsonize_path_iterator(dir_it) failed = [] # irrecoverable files # for each file to be restored, do: for ipath, dpath, l_no in self.collate_iterators(index_it, dir_path_it): if not ipath: upath = dpath['path'] op_type = dpath['type'] else: upath = self.unprefixed(ipath['path']) op_type = ipath['type'] # filter paths if self.filter_path(upath, '', op_type == 'directory') == NO_MATCH: continue # if types of the file mismatch, the file needs to be deleted # and re-restored if ipath is not None and dpath is not None and\ dpath['type'] != ipath['type']: helper.delete(upath) # if file not found in dpath, we can directly restore from index if not dpath: # if the file doesn't exist and it needs to be deleted, it # means that work is already done if ipath['path'].startswith('delete://'): continue try: self.logger.debug("restore %s" % ipath['path']) helper.restore(ipath, l_no, restore_callback) except Exception as e: iipath = ipath.get ("path", "") self.logger.error("FAILED to restore: {} ({})" .format(iipath, e)) if disaster != tarfile.TOLERANCE_STRICT: failed.append ((iipath, e)) continue # if both files are equal, we have nothing to restore if self._equal_stat_dicts(ipath, dpath, listsnapshot_equal=True): continue # we have to restore the file, but first we need to delete the # current existing file. # we don't delete the file if it's a directory, because it might # just have changed mtime, so it's quite inefficient to remove # it if ipath: if ipath['type'] != 'directory' or ipath['path'].startswith('delete://'): helper.delete(upath) self.logger.debug("restore %s" % ipath['path']) try: helper.restore(ipath, l_no, restore_callback) except Exception as e: if disaster == tarfile.TOLERANCE_STRICT: raise failed.append ((ipath.get ("path", ""), e)) continue # if the file is not in the index (so it comes from the target # directory) then we have to delete it else: self.logger.debug("delete %s" % upath) helper.delete(upath) helper.restore_directories_permissions() index_it.release() os.chdir(cwd) helper.cleanup() return failed def recover_backup(self, target_path, backup_indexes_paths=[], restore_callback=None): """ Walk the index, extracting objects in disaster mode. Bad files are reported along with a reason. *Security considerations*: In *recovery mode* the headers of encrypted objects are assumed damaged and GCM tags are not validated so modification of cryptographically relevant parts of the header (more specifically, the initialization vectors) can no longer be detected. If an attacker can manipulate the encrypted backup set and has access to the plaintext of some of the contents, they may be able to obtain the plaintext of other encrypted objects by injecting initialization vectors. For this reason *recovery mode* should only be used in emergency situations and the contents of the resulting files should be validated manually if possible and not be disclosed to untrusted parties.
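A usage sketch (the target path and index file name are hypothetical examples):

    failed = deltatar.recover_backup("/tmp/restore",
                                     backup_indexes_paths=["backup/bdiff-2014-02-01-0000.index"])
    for path, error in failed:
        print("unrecoverable:", path, error)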
""" return self.restore_backup(target_path, backup_indexes_paths=backup_indexes_paths, disaster=tarfile.TOLERANCE_RECOVER, strict_validation=False) def rescue_backup(self, target_path, backup_tar_path, restore_callback=None): """ More aggressive “unfsck” mode: do not rely on the index data as the files may be corrupt; skim files for header-like information and attempt to retrieve the data. *Security considerations*: As with *recovery mode*, in *rescue mode* the headers of encrypted objects are assumed damaged and GCM tags are not validated so modification of cryptographically relevant parts of the header (more specifically, the initalization vectors) can no longer be detected. If an attacker can manipulate the encrypted backup set and has access to the plaintext of some of the contents, they may be able to obtain the plaintext of other encrypted objects by injecting initialization vectors. For this reason *rescue mode* should only be used to emergency situations and the contents of the resulting files should be validated manually if possible and not be disclosed to untrusted parties. """ def gen_volume_name (nvol): return os.path.join (os.path.dirname (backup_tar_path), self.volume_name_func (backup_tar_path, True, nvol)) backup_index = tarfile.gen_rescue_index (gen_volume_name, self.mode, password=self.password, key=self.crypto_key) return self.restore_backup(target_path, backup_index=backup_index, backup_tar_path=backup_tar_path, disaster=tarfile.TOLERANCE_RESCUE, strict_validation=False) def _parse_json_line(self, f, l_no): ''' Read line from file like object and process it as JSON. ''' l = f.readline() l_no += 1 try: j = json.loads(l.decode('UTF-8')) except UnicodeDecodeError as e: if tuple (l [0:2]) == tarfile.GZ_MAGIC: raise Exception \ ("error parsing line #%d as json: looks like a compressed file (%d B: [%s..])" % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ from e raise Exception \ ("error parsing line #%d as json: not a text file (%d B: [%s..])" % (l_no, len (l), binascii.hexlify (l [:16]).decode ())) \ from e except ValueError as e: raise Exception("error parsing this json line " "(line number %d): %s" % (l_no, l)) return j, l_no class RestoreHelper(object): ''' Class used to help to restore files from indices ''' # holds the dicts of data _data = [] _deltatar = None _cwd = None # list of directories to be restored. This is done as a last step, see # tarfile.extractall for details. _directories = [] _disaster = tarfile.TOLERANCE_STRICT def __init__(self, deltatar, cwd, index_list=None, backup_path=False, backup_index=None, tarobj=None, disaster=tarfile.TOLERANCE_STRICT): ''' Constructor opens the tars and init the data structures. Assumptions: - Index list must be provided in reverse order (newer first). - “newer first” apparently means that if there are n backups provided, the last full backup is at index n-1 and the most recent diff backup is at index 0. - Only the first, the second, and the last elements of ``index_list`` are relevant, others will not be accessed. - If no ``index_list`` is provided, both ``tarobj`` and ``backup_path`` must be passed. - If ``index_list`` is provided, the values of ``tarobj`` and ``backup_path`` are ignored. ''' self._data = [] self._directories = [] self._deltatar = deltatar self._cwd = cwd self._password = deltatar.password self._crypto_key = deltatar.crypto_key self._decryptors = [] self._disaster = disaster # Disable strict checking for linearly increasing IVs when running # in rescue or recover mode. 
strict_validation = disaster == tarfile.TOLERANCE_STRICT try: import grp, pwd except ImportError: grp = pwd = None if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: self.canchown = True else: self.canchown = False if isinstance (backup_index, list) is True: decryptor = self._deltatar.decryptor self._data = \ [{ "curr_vol_no" : None , "vol_fd" : None , "offset" : -1 , "tarobj" : None , "path" : backup_path , "is_full" : True , "iterator" : None , "last_itelement" : None , "last_lno" : 0 , "new_volume_handler" : partial(self.new_volume_handler, self._deltatar, self._cwd, True, os.path.dirname(backup_path), decryptor) , "decryptor" : decryptor }] elif index_list is not None: for index in index_list: is_full = index == index_list[-1] decryptor = None if self._password is not None: decryptor = crypto.Decrypt (password=self._password, key=self._crypto_key, strict_ivs=strict_validation) # make paths absolute to avoid cwd problems if not os.path.isabs(index): index = os.path.normpath(os.path.join(cwd, index)) s = dict( curr_vol_no = None, vol_fd = None, offset = -1, tarobj = None, path = index, is_full = is_full, iterator = None, last_itelement = None, last_lno = 0, new_volume_handler = partial(self.new_volume_handler, self._deltatar, self._cwd, is_full, os.path.dirname(index), decryptor), decryptor = decryptor ) self._data.append(s) else: # make paths absolute to avoid cwd problems if not os.path.isabs(backup_path): backup_path = os.path.normpath(os.path.join(cwd, backup_path)) # update the new_volume_handler of tar_obj tarobj.new_volume_handler = partial(self.new_volume_handler, self._deltatar, self._cwd, True, os.path.dirname(backup_path), self._deltatar.decryptor) s = dict( curr_vol_no = None, vol_fd = None, offset = -1, tarobj = tarobj, path = backup_path, is_full = True, iterator = None, last_itelement = None, last_lno = 0, new_volume_handler = tarobj.new_volume_handler, decryptor = self._deltatar.decryptor ) self._data.append(s) def cleanup(self): ''' Closes all open files ''' for data in self._data: if data['vol_fd']: data['vol_fd'].close() data['vol_fd'] = None if data['tarobj']: data['tarobj'].close() data['tarobj'] = None def delete(self, path): ''' Delete a file ''' if not os.path.exists(path): return # to preserve parent directory mtime, we save it parent_dir = os.path.dirname(path) or os.getcwd() parent_dir_mtime = int(os.stat(parent_dir).st_mtime) if os.path.isdir(path) and not os.path.islink(path): shutil.rmtree(path) else: os.unlink(path) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) def restore(self, itpath, l_no, callback=None): ''' Restore the path from the appropriate backup. Receives the current path from the newest (=first) index iterator. itpath must be not null. callback is a custom function that gets called for every file. NB: This function takes the attribute ``_data`` as input but will only ever use its first and, if available, second element. Anything else in ``._data[]`` will be ignored. 
''' path = itpath['path'] # Calls the callback function if callback: callback() if path.startswith('delete://'): # the file has previously been deleted already in restore_backup in # all cases so we just need to finish return # get data from newest index (_data[0]) data = self._data[0] upath = self._deltatar.unprefixed(path) # to preserve parent directory mtime, we save it parent_dir = os.path.dirname(upath) or os.getcwd() os.makedirs(parent_dir, exist_ok=True) parent_dir_mtime = int(os.stat(parent_dir).st_mtime) # if path is found in the newest index as to be snapshotted, deal with it # and finish if path.startswith('snapshot://'): self.restore_file(itpath, data, path, l_no, upath) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) return # we go from index to index, finding the path in the index, then finding # the index with the most recent snapshot of the file being restored # # Right now we support diff backups, only. No incremental backups. # As a result _data[0] is always the diff backup index # and _data[1] the full backup index. if len(self._data) == 2: data = self._data[1] d, l_no, dpath = self.find_path_in_index(data, upath) if not d: self._deltatar.logger.warning('Error restoring file %s from ' 'index, not found in index %s' % (path, data['path'])) return cur_path = d.get('path', '') if cur_path.startswith('delete://'): self._deltatar.logger.warning(('Strange thing happened, file ' '%s was listed in first index but deleted by another ' 'one. Path was ignored and untouched.') % path) return elif cur_path.startswith('snapshot://'): # this code path is reached when the file is unchanged # in the newest index and therefore of type 'list://' self.restore_file(d, data, path, l_no, dpath) # now we restore parent_directory mtime os.utime(parent_dir, (parent_dir_mtime, parent_dir_mtime)) return # error code path is reached when: # a) we have more than two indexes (unsupported atm) # b) both indexes contain a list:// entry (logic error) # c) we have just one index and it also contains list:// self._deltatar.logger.warning(('Error restoring file %s from index, ' 'snapshot not found in any index') % path) def find_path_in_index(self, data, upath): # NOTE: we restart the iterator sometimes because the iterator can be # walked over completely multiple times, for example if one path is not # found in one index and we have to go to the next index. it = data['iterator'] if it is None: it = data['iterator'] = self._deltatar.iterate_index_path(data["path"]) d, l_no = it.__next__() else: d = data['last_itelement'] l_no = data['last_lno'] while True: dpath = self._deltatar.unprefixed(d.get('path', '')) if upath == dpath: data['last_itelement'] = d data['last_lno'] = l_no return d, l_no, dpath up, dp = self._deltatar.compare_indexes(upath, dpath) # any time upath should have appeared before current dpath, it means # upath is just not in this index and we should stop if dp is None: data['last_itelement'] = d data['last_lno'] = l_no return None, 0, '' try: d, l_no = it.__next__() except StopIteration: data['last_itelement'] = d data['last_lno'] = l_no return None, 0, '' def restore_directories_permissions(self): ''' Restore directory permissions when everything has been restored ''' try: import grp, pwd except ImportError: grp = pwd = None self._directories.sort(key=operator.attrgetter('name')) self._directories.reverse() # Set correct owner, mtime and filemode on directories.
for member in self._directories: dirpath = member.name try: os.chmod(dirpath, member.mode) os.utime(dirpath, (member.mtime, member.mtime)) if self.canchown: # We have to be root to do so. try: g = grp.getgrnam(member.gname)[2] except KeyError: g = member.gid try: u = pwd.getpwnam(member.uname)[2] except KeyError: u = member.uid try: if member.issym and hasattr(os, "lchown"): os.lchown(dirpath, u, g) else: os.chown(dirpath, u, g) except EnvironmentError: raise tarfile.ExtractError("could not change owner") except tarfile.ExtractError as e: self._deltatar.logger.warning('tarfile: %s' % e) @staticmethod def new_volume_handler(deltarobj, cwd, is_full, backup_path, decryptor, tarobj, base_name, volume_number): ''' Set up a new volume and perform the tasks necessary for transitioning to the next one. ''' volume_name = deltarobj.volume_name_func(backup_path, is_full, volume_number, guess_name=True) volume_path = os.path.join(backup_path, volume_name) # we convert relative paths into absolute because CWD is changed if not os.path.isabs(volume_path): volume_path = os.path.join(cwd, volume_path) tarobj.open_volume(volume_path, encryption=decryptor) def restore_file(self, file_data, index_data, path, l_no, unprefixed_path): ''' Restores a snapshot of a file from a specific backup ''' op_type = file_data.get('type', -1) member = file_data.get('member', None) ismember = bool(member) # when member is set, then we can assume everything is right and we # just have to restore the path if member is None: vol_no = file_data.get('volume', -1) # sanity check if not isinstance(vol_no, int) or vol_no < 0: self._deltatar.logger.warning('unrecognized type to be restored: ' '%s, line %d' % (op_type, l_no)) # setup the volume that needs to be read. only needed when member is # not set if index_data['curr_vol_no'] != vol_no: index_data['curr_vol_no'] = vol_no backup_path = os.path.dirname(index_data['path']) vol_name = self._deltatar.volume_name_func(backup_path, index_data['is_full'], vol_no, guess_name=True) vol_path = os.path.join(backup_path, vol_name) if index_data['vol_fd']: index_data['vol_fd'].close() index_data['vol_fd'] = open(vol_path, 'rb') # force reopen of the tarobj because of new volume if index_data['tarobj']: index_data['tarobj'].close() index_data['tarobj'] = None # seek tarfile if needed offset = file_data.get('offset', -1) if index_data['tarobj']: if self._disaster == tarfile.TOLERANCE_RESCUE: # force a seek and reopen index_data['tarobj'].close() index_data['tarobj'] = None else: try: member = index_data['tarobj'].__iter__().__next__() except tarfile.DecryptionError: pass except tarfile.CompressionError: pass if not member or member.path != file_data['path']: # force a seek and reopen index_data['tarobj'].close() index_data['tarobj'] = None # open the tarfile if needed if not index_data['tarobj']: index_data['vol_fd'].seek(offset) index_data['tarobj'] = tarfile.open(mode="r" + self._deltatar.mode, fileobj=index_data['vol_fd'], format=tarfile.GNU_FORMAT, concat='#' in self._deltatar.mode, encryption=index_data["decryptor"], new_volume_handler=index_data['new_volume_handler'], save_to_members=False, tolerance=self._disaster) member = index_data['tarobj'].__iter__().__next__() member.path = unprefixed_path member.name = unprefixed_path if op_type == 'directory': self.add_member_dir(member) member = copy.copy(member) member.mode = 0o0700 # if it's an existing directory, we then don't need to recreate it # just set the right permissions, mtime and that kind of stuff if os.path.exists(member.path): 
return if not ismember: # set current volume number in tarobj, otherwise the extraction of the # file might fail when trying to extract a multivolume member index_data['tarobj'].volume_number = index_data['curr_vol_no'] def ignore_symlink (member, *_args): self._deltatar.logger.warning("Ignoring symlink %s" % member.name) # finally, restore the file index_data['tarobj'].extract(member, symlink_cb=ignore_symlink, unlink=True) def add_member_dir(self, member): ''' Add member dir to be restored at the end ''' if not self.canchown: self._directories.append(DirItem(name=member.name, mode=member.mode, mtime=member.mtime)) else: self._directories.append(DirItem(name=member.name, mode=member.mode, mtime=member.mtime, gname=member.gname, uname=member.uname, uid=member.uid, gid=member.gid, issym=member.issym())) class DirItem(object): def __init__(self, **kwargs): for k, v in kwargs.items(): setattr(self, k, v)
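# Illustrative end-to-end sketch (not part of the library; the import path,
# directories, password and index file names below are hypothetical examples):
#
#   from deltatar.deltatar import DeltaTar
#
#   dtar = DeltaTar(mode='#gz', password='example-password')
#   dtar.create_full_backup('/srv/data', '/mnt/backups/full')
#   dtar.create_diff_backup('/srv/data', '/mnt/backups/diff',
#                           '/mnt/backups/full/bfull-<date>.index.gz.pdtcrypt')
#   # indexes are passed newest first when restoring
#   dtar.restore_backup('/srv/data.restored',
#                       backup_indexes_paths=[
#                           '/mnt/backups/diff/bdiff-<date>.index.gz.pdtcrypt',
#                           '/mnt/backups/full/bfull-<date>.index.gz.pdtcrypt'])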